/*
 * Respond to any signals the startup process has received since the last
 * check: SIGHUP (reload configuration) and SIGTERM (exit before recovery
 * completes), plus the emergency bailout when the postmaster is gone.
 */
void
HandleStartupProcInterrupts(void)
{
	/* A SIGHUP asks us to re-read the configuration file. */
	if (got_SIGHUP)
	{
		got_SIGHUP = false;
		ProcessConfigFile(PGC_SIGHUP);
	}

	/* A SIGTERM asks us to stop without finishing recovery. */
	if (shutdown_requested)
		proc_exit(1);

	/*
	 * If the postmaster has died, exit immediately so that nobody has to
	 * clean up its children by hand.
	 */
	if (IsUnderPostmaster)
	{
		if (!PostmasterIsAlive())
			exit(1);
	}
}
/* * TaskTrackerRunning checks if the task tracker process is running. To do this, * the function checks if the task tracker is configured to start up, and infers * from shared memory that the tracker hasn't received a shut down request. */ static bool TaskTrackerRunning(void) { WorkerTask *workerTask = NULL; bool postmasterAlive = true; bool taskTrackerRunning = true; /* if postmaster shut down, infer task tracker shut down from it */ postmasterAlive = PostmasterIsAlive(); if (!postmasterAlive) { return false; } /* * When the task tracker receives a termination signal, it inserts a special * marker task to the shared hash. We need to look up this marker task since * the postmaster doesn't send a terminate signal to running backends. */ LWLockAcquire(&WorkerTasksSharedState->taskHashLock, LW_SHARED); workerTask = WorkerTasksHashFind(RESERVED_JOB_ID, SHUTDOWN_MARKER_TASK_ID); if (workerTask != NULL) { taskTrackerRunning = false; } LWLockRelease(&WorkerTasksSharedState->taskHashLock); return taskTrackerRunning; }
/* * pgarch_ArchiverCopyLoop * * Archives all outstanding xlogs then returns */ static void pgarch_ArchiverCopyLoop(void) { char xlog[MAX_XFN_CHARS + 1]; if (!XLogArchiveCommandSet()) { ereport(WARNING, (errmsg("archive_mode enabled, yet archive_command is not set"))); /* can't do anything if no command ... */ return; } /* * loop through all xlogs with archive_status of .ready and archive * them...mostly we expect this to be a single file, though it is possible * some backend will add files onto the list of those that need archiving * while we are still copying earlier archives */ while (pgarch_readyXlog(xlog)) { int failures = 0; for (;;) { /* * Do not initiate any more archive commands after receiving * SIGTERM, nor after the postmaster has died unexpectedly. The * first condition is to try to keep from having init SIGKILL the * command, and the second is to avoid conflicts with another * archiver spawned by a newer postmaster. */ if (got_SIGTERM || !PostmasterIsAlive(true)) return; if (pgarch_archiveXlog(xlog)) { /* successful */ pgarch_archiveDone(xlog); break; /* out of inner retry loop */ } else { if (++failures >= NUM_ARCHIVE_RETRIES) { ereport(WARNING, (errmsg("transaction log file \"%s\" could not be archived: too many failures", xlog))); return; /* give up archiving for now */ } pg_usleep(1000000L); /* wait a bit before retrying */ } } } }
inline void body() { /* * We run the copy loop immediately upon entry, in case there are * unarchived files left over from a previous database run (or maybe * the archiver died unexpectedly). After that we wait for a signal * or timeout before doing more. */ wakend = true; while(1) { /* Check for config update */ if (got_SIGHUP>0) { got_SIGHUP = 0; ProcessConfigFile(PGC_SIGHUP); __rho_3_ = XLogArchivingActive(); // assume(__rho_3_>0); int tt = __rho_3_; if (tt<=0) break; /* user wants us to shut down */ } /* Do what we're here for */ if (wakend>0) { wakend = 0; pgarch_ArchiverCopyLoop(); __rho_4_ = time(NULL); last_copy_time = __rho_4_; } /* * There shouldn't be anything for the archiver to do except to * wait for a signal, ... however, the archiver exists to * protect our data, so she wakes up occasionally to allow * herself to be proactive. In particular this avoids getting * stuck if a signal arrives just before we sleep. */ if (wakend<=0) { //pg_usleep(PGARCH_AUTOWAKE_INTERVAL * 1000000L); curtime = time(NULL); if ((curtime - last_copy_time) >= PGARCH_AUTOWAKE_INTERVAL) wakend = true; } __rho_5_ = PostmasterIsAlive(true); dummy = __rho_5_; if (dummy<=0) break; } while(1) { dummy=dummy; } L_return: return 0; }
/*
 * pgarch_MainLoop
 *
 * Main loop for archiver.  Repeatedly reloads config on SIGHUP, runs the
 * copy loop when wakened, and otherwise naps; exits when archiving is
 * turned off or the postmaster dies.
 */
static void
pgarch_MainLoop(void)
{
	time_t		last_copy_time = 0;

	/*
	 * We run the copy loop immediately upon entry, in case there are
	 * unarchived files left over from a previous database run (or maybe the
	 * archiver died unexpectedly). After that we wait for a signal or
	 * timeout before doing more.
	 */
	wakened = true;
	do
	{
		/* Check for config update */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			if (!XLogArchivingActive())
				break;			/* user wants us to shut down */
		}

		/* Do what we're here for */
		if (wakened)
		{
			wakened = false;
			pgarch_ArchiverCopyLoop();
			/* remember when we last did real work, for the autowake check */
			last_copy_time = time(NULL);
		}

		/*
		 * There shouldn't be anything for the archiver to do except to wait
		 * for a signal ... however, the archiver exists to protect our data,
		 * so she wakes up occasionally to allow herself to be proactive.
		 *
		 * On some platforms, signals won't interrupt the sleep. To ensure we
		 * respond reasonably promptly when someone signals us, break down the
		 * sleep into 1-second increments, and check for interrupts after each
		 * nap.
		 */
		while (!(wakened || got_SIGHUP))
		{
			time_t		curtime;

			pg_usleep(1000000L);
			curtime = time(NULL);
			/* unsigned compare guards against clock going backwards */
			if ((unsigned int) (curtime - last_copy_time) >=
				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
				wakened = true;
		}
	} while (PostmasterIsAlive(true));
}
/*
 * pgarch_MainLoop
 *
 * Main loop for archiver.  Variant that sleeps the whole autowake interval
 * in one pg_usleep call.
 *
 * NOTE(review): on platforms where a signal does not interrupt pg_usleep,
 * this version can sleep the full PGARCH_AUTOWAKE_INTERVAL before noticing
 * a SIGHUP or wakeup — compare the 1-second-increment variant of this
 * function elsewhere in the file; confirm which one is intended.
 */
static void
pgarch_MainLoop(void)
{
	time_t		last_copy_time = 0;
	time_t		curtime;

	/*
	 * We run the copy loop immediately upon entry, in case there are
	 * unarchived files left over from a previous database run (or maybe the
	 * archiver died unexpectedly). After that we wait for a signal or
	 * timeout before doing more.
	 */
	wakened = true;
	do
	{
		/* Check for config update */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			if (!XLogArchivingActive())
				break;			/* user wants us to shut down */
		}

		/* Do what we're here for */
		if (wakened)
		{
			wakened = false;
			pgarch_ArchiverCopyLoop();
			last_copy_time = time(NULL);
		}

		/*
		 * There shouldn't be anything for the archiver to do except to wait
		 * for a signal, ... however, the archiver exists to protect our data,
		 * so she wakes up occasionally to allow herself to be proactive. In
		 * particular this avoids getting stuck if a signal arrives just
		 * before we sleep.
		 */
		if (!wakened)
		{
			pg_usleep(PGARCH_AUTOWAKE_INTERVAL * 1000000L);
			curtime = time(NULL);
			/* unsigned compare guards against clock going backwards */
			if ((unsigned int) (curtime - last_copy_time) >=
				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
				wakened = true;
		}
	} while (PostmasterIsAlive(true));
}
/**
 * Main loop of the sender process. It wakes up every
 * gp_perfmon_segment_interval ms to send segment
 * information to perfmon.
 *
 * Exits via break when a shutdown is requested, or terminates the process
 * outright when the postmaster has died.
 */
static void
SegmentInfoSenderLoop(void)
{
	int			rc;
	int			counter;

	/*
	 * counter accumulates elapsed sleep milliseconds; when it reaches
	 * gp_perfmon_segment_interval we send and reset it.
	 * NOTE(review): WaitLatch may return early on latch set, in which case
	 * counter overstates elapsed time — confirm that is acceptable here.
	 */
	for (counter = 0;; counter += SEGMENT_INFO_LOOP_SLEEP_MS)
	{
		CHECK_FOR_INTERRUPTS();

		if (senderShutdownRequested)
		{
			break;
		}

		/* no need to live on if postmaster has died */
		if (!PostmasterIsAlive())
			exit(1);

		/* optional plugin hook for extra cluster-state collection */
		if (cluster_state_collect_hook)
			cluster_state_collect_hook();

		if (gp_enable_gpperfmon && counter >= gp_perfmon_segment_interval)
		{
			SegmentInfoSender();
			counter = 0;
		}

		/* Sleep a while. */
		rc = WaitLatch(&MyProc->procLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   SEGMENT_INFO_LOOP_SLEEP_MS);
		ResetLatch(&MyProc->procLatch);

		/* emergency bailout if postmaster has died */
		if (rc & WL_POSTMASTER_DEATH)
			proc_exit(1);
	}							/* end server loop */

	return;
}
/* * pgarch_ArchiverCopyLoop * * Archives all outstanding xlogs then returns */ static void pgarch_ArchiverCopyLoop(void) { char xlog[MAX_XFN_CHARS + 1]; /* * loop through all xlogs with archive_status of .ready and archive * them...mostly we expect this to be a single file, though it is possible * some backend will add files onto the list of those that need archiving * while we are still copying earlier archives */ while (pgarch_readyXlog(xlog)) { int failures = 0; for (;;) { /* Abandon processing if we notice our postmaster has died */ if (!PostmasterIsAlive(true)) return; if (pgarch_archiveXlog(xlog)) { /* successful */ pgarch_archiveDone(xlog); break; /* out of inner retry loop */ } else { if (++failures >= NUM_ARCHIVE_RETRIES) { ereport(WARNING, (errmsg("transaction log file \"%s\" could not be archived: too many failures", xlog))); return; /* give up archiving for now */ } pg_usleep(1000000L); /* wait a bit before retrying */ } } } }
/** * Main loop of the sweeper process. It wakes up once in a while, marks backends as active * or not and re-calculates CPU usage among active backends. */ void BackoffSweeperLoop(void) { for (;;) { CHECK_FOR_INTERRUPTS(); if (sweeperShutdownRequested) break; /* no need to live on if postmaster has died */ if (!PostmasterIsAlive()) exit(1); if (gp_enable_resqueue_priority) BackoffSweeper(); Assert(gp_resqueue_priority_sweeper_interval > 0.0); /* Sleep a while. */ pg_usleep(gp_resqueue_priority_sweeper_interval * 1000.0); } /* end server loop */ return; }
/*
 * Wait using linux's epoll_wait(2).
 *
 * This is the preferrable wait method, as several readiness notifications are
 * delivered, without having to iterate through all of set->events. The return
 * epoll_event struct contain a pointer to our events, making association
 * easy.
 *
 * Returns the number of events written to occurred_events, 0 on EINTR
 * (caller should retry), or -1 on timeout.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct epoll_event *cur_epoll_event;

	/* Sleep */
	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
					nevents, cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("epoll_wait() failed: %m")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned epoll events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_epoll_event = set->epoll_ret_events;
		 cur_epoll_event < (set->epoll_ret_events + rc) &&
		 returned_events < nevents;
		 cur_epoll_event++)
	{
		/* epoll's data pointer is set to the associated WaitEvent */
		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			/* only report the latch if it is actually set */
			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/*
			 * We expect an EPOLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to have
			 * any errors either, treat those cases as postmaster death, too.
			 *
			 * As explained in the WAIT_USE_SELECT implementation, select(2)
			 * may spuriously return. Be paranoid about that here too, a
			 * spurious WL_POSTMASTER_DEATH would be painful.
			 */
			if (!PostmasterIsAlive())
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			/* only advance the output cursor if something was recorded */
			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
/*
 * Wait using Windows' WaitForMultipleObjects().
 *
 * Unfortunately this will only ever return a single readiness notification at
 * a time. Note that while the official documentation for
 * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
 * with a single bWaitAll = FALSE call,
 * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
 * that only one event is "consumed".
 *
 * Returns the number of events written to occurred_events (0 or 1), 0 after
 * servicing queued signals (caller should retry), or -1 on timeout.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	DWORD		rc;
	WaitEvent  *cur_event;

	/*
	 * Sleep.
	 *
	 * Need to wait for ->nevents + 1, because signal handle is in [0].
	 */
	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
								cur_timeout);

	/* Check return code */
	if (rc == WAIT_FAILED)
		elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
			 GetLastError());
	else if (rc == WAIT_TIMEOUT)
	{
		/* timeout exceeded */
		return -1;
	}

	if (rc == WAIT_OBJECT_0)
	{
		/* Service newly-arrived signals */
		pgwin32_dispatch_queued_signals();
		return 0;				/* retry */
	}

	/*
	 * With an offset of one, due to the always present pgwin32_signal_event,
	 * the handle offset directly corresponds to a wait event.
	 */
	cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];

	occurred_events->pos = cur_event->pos;
	occurred_events->user_data = cur_event->user_data;
	occurred_events->events = 0;

	if (cur_event->events == WL_LATCH_SET)
	{
		/* manual-reset latch event must be cleared before reporting */
		if (!ResetEvent(set->latch->event))
			elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());

		if (set->latch->is_set)
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_LATCH_SET;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events == WL_POSTMASTER_DEATH)
	{
		/*
		 * Postmaster apparently died. Since the consequences of falsely
		 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
		 * the trouble to positively verify this with PostmasterIsAlive(),
		 * even though there is no known reason to think that the event could
		 * be falsely set on Windows.
		 */
		if (!PostmasterIsAlive())
		{
			occurred_events->fd = PGINVALID_SOCKET;
			occurred_events->events = WL_POSTMASTER_DEATH;
			occurred_events++;
			returned_events++;
		}
	}
	else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
	{
		WSANETWORKEVENTS resEvents;
		HANDLE		handle = set->handles[cur_event->pos + 1];

		Assert(cur_event->fd);

		occurred_events->fd = cur_event->fd;

		/* ask winsock which network events fired on this socket */
		ZeroMemory(&resEvents, sizeof(resEvents));
		if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
			elog(ERROR, "failed to enumerate network events: error code %u",
				 WSAGetLastError());
		if ((cur_event->events & WL_SOCKET_READABLE) &&
			(resEvents.lNetworkEvents & FD_READ))
		{
			/* data available in socket */
			occurred_events->events |= WL_SOCKET_READABLE;
		}
		if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
			(resEvents.lNetworkEvents & FD_WRITE))
		{
			/* writeable */
			occurred_events->events |= WL_SOCKET_WRITEABLE;
		}
		if (resEvents.lNetworkEvents & FD_CLOSE)
		{
			/* EOF: report whichever directions the caller asked about */
			if (cur_event->events & WL_SOCKET_READABLE)
				occurred_events->events |= WL_SOCKET_READABLE;
			if (cur_event->events & WL_SOCKET_WRITEABLE)
				occurred_events->events |= WL_SOCKET_WRITEABLE;
		}

		if (occurred_events->events != 0)
		{
			occurred_events++;
			returned_events++;
		}
	}

	return returned_events;
}
/*
 * Wait using select(2).
 *
 * XXX: On at least older linux kernels select(), in violation of POSIX,
 * doesn't reliably return a socket as writable if closed - but we rely on
 * that. So far all the known cases of this problem are on platforms that also
 * provide a poll() implementation without that bug. If we find one where
 * that's not the case, we'll need to add a workaround.
 *
 * Returns the number of events written to occurred_events, 0 on EINTR
 * (caller should retry), or -1 on timeout.
 *
 * Fixes vs. previous revision:
 *  - hifd was read before ever being assigned (undefined behavior); it is
 *    now initialized to PGINVALID_SOCKET before the max-fd scan.
 *  - the mask-preparation used "==" against WL_SOCKET_READABLE/WRITEABLE,
 *    so an event registered for BOTH flags matched neither branch and was
 *    added to neither fd_set; it now tests each bit with "&", matching the
 *    dispatch loop below which handles both bits independently.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	fd_set		input_mask;
	fd_set		output_mask;
	int			hifd = PGINVALID_SOCKET;	/* highest fd seen so far */
	struct timeval tv;
	struct timeval *tvp = NULL;

	FD_ZERO(&input_mask);
	FD_ZERO(&output_mask);

	/*
	 * Prepare input/output masks. We do so every loop iteration as there's no
	 * entirely portable way to copy fd_sets.
	 */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->events == WL_LATCH_SET)
			FD_SET(cur_event->fd, &input_mask);
		else if (cur_event->events == WL_POSTMASTER_DEATH)
			FD_SET(cur_event->fd, &input_mask);
		else
		{
			Assert(cur_event->events & (WL_SOCKET_READABLE |
										WL_SOCKET_WRITEABLE));
			/* an event may ask for both directions; test each bit */
			if (cur_event->events & WL_SOCKET_READABLE)
				FD_SET(cur_event->fd, &input_mask);
			if (cur_event->events & WL_SOCKET_WRITEABLE)
				FD_SET(cur_event->fd, &output_mask);
		}

		if (cur_event->fd > hifd)
			hifd = cur_event->fd;
	}

	/* Sleep */
	if (cur_timeout >= 0)
	{
		tv.tv_sec = cur_timeout / 1000L;
		tv.tv_usec = (cur_timeout % 1000L) * 1000L;
		tvp = &tv;
	}
	rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("select() failed: %m")));
		}
		return 0;				/* retry */
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * To associate events with select's masks, we have to check the status of
	 * the file descriptors associated with an event; by looping through all
	 * events.
	 */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++)
	{
		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			FD_ISSET(cur_event->fd, &input_mask))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 FD_ISSET(cur_event->fd, &input_mask))
		{
			/*
			 * According to the select(2) man page on Linux, select(2) may
			 * spuriously return and report a file descriptor as readable,
			 * when it's not; and presumably so can poll(2). It's not clear
			 * that the relevant cases would ever apply to the postmaster
			 * pipe, but since the consequences of falsely returning
			 * WL_POSTMASTER_DEATH could be pretty unpleasant, we take the
			 * trouble to positively verify EOF with PostmasterIsAlive().
			 */
			if (!PostmasterIsAlive())
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				FD_ISSET(cur_event->fd, &input_mask))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				FD_ISSET(cur_event->fd, &output_mask))
			{
				/* socket is writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
/*
 * Wait using poll(2).
 *
 * This allows to receive readiness notifications for several events at once,
 * but requires iterating through all of set->pollfds.
 *
 * Returns the number of events written to occurred_events, 0 on EINTR
 * (caller should retry), or -1 on timeout.
 *
 * Fix vs. previous revision: the socket-event assertion was
 * "Assert(cur_event->fd >= PGINVALID_SOCKET)", which is trivially true
 * (PGINVALID_SOCKET is -1 on Unix) and so checked nothing.  It now asserts
 * "!= PGINVALID_SOCKET", consistent with the epoll and select variants of
 * this function.  Assert is compiled out of production builds, so runtime
 * behavior is unchanged.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct pollfd *cur_pollfd;

	/* Sleep */
	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("poll() failed: %m")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	for (cur_event = set->events, cur_pollfd = set->pollfds;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++, cur_pollfd++)
	{
		/* no activity on this FD, skip */
		if (cur_pollfd->revents == 0)
			continue;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/*
			 * We expect an POLLHUP when the remote end is closed, but because
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 *
			 * As explained in the WAIT_USE_SELECT implementation, select(2)
			 * may spuriously return. Be paranoid about that here too, a
			 * spurious WL_POSTMASTER_DEATH would be painful.
			 */
			if (!PostmasterIsAlive())
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
/*
 * Wait for synchronous replication, if requested by user.
 *
 * Initially backends start in state SYNC_REP_NOT_WAITING and then
 * change that state to SYNC_REP_WAITING before adding ourselves
 * to the wait queue. During SyncRepWakeQueue() a WALSender changes
 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
 * This backend then resets its state to SYNC_REP_NOT_WAITING.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
	char	   *new_status = NULL;
	const char *old_status;

	/*
	 * Fast exit if user has not requested sync replication, or
	 * there are no sync replication standby names defined.
	 * Note that those standbys don't need to be connected.
	 */
	if (!SyncRepRequested() || !SyncStandbysDefined())
		return;

	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	Assert(WalSndCtl != NULL);

	/* Reset the latch before adding ourselves to the queue. */
	ResetLatch(&MyProc->waitLatch);

	/*
	 * Set our waitLSN so WALSender will know when to wake us, and add
	 * ourselves to the queue.
	 */
	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
	Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);

	if (!WalSndCtl->sync_standbys_defined)
	{
		/*
		 * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is
		 * not set. See SyncRepUpdateSyncStandbysDefined.
		 */
		LWLockRelease(SyncRepLock);
		return;
	}

	MyProc->waitLSN = XactCommitLSN;
	MyProc->syncRepState = SYNC_REP_WAITING;
	SyncRepQueueInsert();
	Assert(SyncRepQueueIsOrderedByLSN());
	LWLockRelease(SyncRepLock);

	/* Alter ps display to show waiting for sync rep. */
	if (update_process_title)
	{
		int			len;

		old_status = get_ps_display(&len);
		new_status = (char *) palloc(len + 32 + 1);
		memcpy(new_status, old_status, len);
		sprintf(new_status + len, " waiting for %X/%X",
				XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
		set_ps_display(new_status, false);
		/* keep the original text so we can restore it on exit */
		new_status[len] = '\0'; /* truncate off " waiting ..." */
	}

	/*
	 * Wait for specified LSN to be confirmed.
	 *
	 * Each proc has its own wait latch, so we perform a normal latch
	 * check/wait loop here.
	 */
	for (;;)
	{
		int			syncRepState;

		/*
		 * Wait on latch for up to 60 seconds. This allows us to
		 * check for postmaster death regularly while waiting.
		 * Note that timeout here does not necessarily release from loop.
		 */
		WaitLatch(&MyProc->waitLatch, 60000000L);

		/* Must reset the latch before testing state. */
		ResetLatch(&MyProc->waitLatch);

		/*
		 * Try checking the state without the lock first. There's no guarantee
		 * that we'll read the most up-to-date value, so if it looks like we're
		 * still waiting, recheck while holding the lock. But if it looks like
		 * we're done, we must really be done, because once walsender changes
		 * the state to SYNC_REP_WAIT_COMPLETE, it will never update it again,
		 * so we can't be seeing a stale value in that case.
		 */
		syncRepState = MyProc->syncRepState;
		if (syncRepState == SYNC_REP_WAITING)
		{
			LWLockAcquire(SyncRepLock, LW_SHARED);
			syncRepState = MyProc->syncRepState;
			LWLockRelease(SyncRepLock);
		}
		if (syncRepState == SYNC_REP_WAIT_COMPLETE)
			break;

		/*
		 * If a wait for synchronous replication is pending, we can neither
		 * acknowledge the commit nor raise ERROR or FATAL. The latter
		 * would lead the client to believe that that the transaction
		 * aborted, which is not true: it's already committed locally.
		 * The former is no good either: the client has requested
		 * synchronous replication, and is entitled to assume that an
		 * acknowledged commit is also replicated, which may not be true.
		 * So in this case we issue a WARNING (which some clients may
		 * be able to interpret) and shut off further output. We do NOT
		 * reset ProcDiePending, so that the process will die after the
		 * commit is cleaned up.
		 */
		if (ProcDiePending)
		{
			ereport(WARNING,
					(errcode(ERRCODE_ADMIN_SHUTDOWN),
					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
					 errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}

		/*
		 * It's unclear what to do if a query cancel interrupt arrives. We
		 * can't actually abort at this point, but ignoring the interrupt
		 * altogether is not helpful, so we just terminate the wait with
		 * a suitable warning.
		 */
		if (QueryCancelPending)
		{
			QueryCancelPending = false;
			ereport(WARNING,
					(errmsg("canceling wait for synchronous replication due to user request"),
					 errdetail("The transaction has already committed locally, but may not have been replicated to the standby.")));
			SyncRepCancelWait();
			break;
		}

		/*
		 * If the postmaster dies, we'll probably never get an acknowledgement,
		 * because all the wal sender processes will exit. So just bail out.
		 */
		if (!PostmasterIsAlive(true))
		{
			ProcDiePending = true;
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}
	}

	/*
	 * WalSender has checked our LSN and has removed us from queue. Clean up
	 * state and leave. It's OK to reset these shared memory fields without
	 * holding SyncRepLock, because any walsenders will ignore us anyway when
	 * we're not on the queue.
	 */
	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
	MyProc->waitLSN.xlogid = 0;
	MyProc->waitLSN.xrecoff = 0;

	if (new_status)
	{
		/* Reset ps display */
		set_ps_display(new_status, false);
		pfree(new_status);
	}
}
/*
 * Main loop of walsender process.  Streams WAL to the standby until a send
 * failure or a requested shutdown; never returns normally (exits via
 * proc_exit or exit).
 */
static int
WalSndLoop(void)
{
	char	   *output_message;
	bool		caughtup = false;

	/*
	 * Allocate buffer that will be used for each output message. We do this
	 * just once to reduce palloc overhead. The buffer must be made large
	 * enough for maximum-sized messages.
	 */
	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/* Loop forever, unless we get an error */
	for (;;)
	{
		/*
		 * Emergency bailout if postmaster has died. This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/* Process any requests or signals received recently */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}

		/*
		 * When SIGUSR2 arrives, we send all outstanding logs up to the
		 * shutdown checkpoint record (i.e., the latest record) and exit.
		 */
		if (ready_to_stop)
		{
			if (!XLogSend(output_message, &caughtup))
				break;
			if (caughtup)
				shutdown_requested = true;
		}

		/* Normal exit from the walsender is here */
		if (shutdown_requested)
		{
			/* Inform the standby that XLOG streaming was done */
			pq_puttextmessage('C', "COPY 0");
			pq_flush();

			proc_exit(0);
		}

		/*
		 * If we had sent all accumulated WAL in last round, nap for the
		 * configured time before retrying.
		 */
		if (caughtup)
		{
			/*
			 * Even if we wrote all the WAL that was available when we started
			 * sending, more might have arrived while we were sending this
			 * batch. We had the latch set while sending, so we have not
			 * received any signals from that time. Let's arm the latch
			 * again, and after that check that we're still up-to-date.
			 */
			ResetLatch(&MyWalSnd->latch);

			if (!XLogSend(output_message, &caughtup))
				break;
			if (caughtup && !got_SIGHUP && !ready_to_stop && !shutdown_requested)
			{
				/*
				 * XXX: We don't really need the periodic wakeups anymore,
				 * WaitLatchOrSocket should reliably wake up as soon as
				 * something interesting happens.
				 */

				/* Sleep */
				WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
								  WalSndDelay * 1000L);
			}

			/* Check if the connection was closed */
			CheckClosedConnection();
		}
		else
		{
			/* Attempt to send the log once every loop */
			if (!XLogSend(output_message, &caughtup))
				break;
		}
	}

	/*
	 * Get here on send failure. Clean up and exit.
	 *
	 * Reset whereToSendOutput to prevent ereport from attempting to send any
	 * more messages to the standby.
	 */
	if (whereToSendOutput == DestRemote)
		whereToSendOutput = DestNone;

	proc_exit(0);
	return 1;					/* keep the compiler quiet */
}
/*
 * FileRepSubProcess_ProcessSignals()
 *
 * Handle any pending signals/requests for a filerep subprocess: config
 * reload, shutdown request, postmaster/parent death, and segment state
 * changes.  Returns true when the caller should exit the process.
 *
 * NOTE(review): fileRepRole, segmentState and dataState used in the state
 * loop appear to be file-scope variables filled by getFileRepRoleAndState
 * (the local segmentState above is a different, shadowing variable) —
 * confirm against the rest of the file.
 */
bool
FileRepSubProcess_ProcessSignals()
{
	bool		processExit = false;

	if (reloadConfigFile)
	{
		reloadConfigFile = false;
		ProcessConfigFile(PGC_SIGHUP);

		FileRep_SetFileRepRetry();
	}

	if (shutdownRequested)
	{
		SegmentState_e segmentState;

		getPrimaryMirrorStatusCodes(NULL, &segmentState, NULL, NULL);

		shutdownRequested = false;

		if (segmentState == SegmentStateShutdownFilerepBackends)
		{
			/* only backend subprocesses exit at this stage */
			processExit = FileRepIsBackendSubProcess(fileRepProcessType);
			FileRepSubProcess_SetState(FileRepStateShutdownBackends);
		}
		else
		{
			processExit = true;
			FileRepSubProcess_SetState(FileRepStateShutdown);
		}
	}

	/*
	 * Immediate shutdown if postmaster or main filerep process (parent) is
	 * not alive to avoid manual cleanup.
	 */
	if (!PostmasterIsAlive(false /* amDirectChild */ ) || !ParentProcIsAlive())
	{
		quickdie_impl();
	}

	for (;;)
	{
		/* check to see if change required */
		sig_atomic_t curStateChangeRequestCounter = stateChangeRequestCounter;

		if (curStateChangeRequestCounter == lastChangeRequestProcessCounterValue)
			break;
		lastChangeRequestProcessCounterValue = curStateChangeRequestCounter;

		/* do the change in local memory */
		getFileRepRoleAndState(&fileRepRole, &segmentState, &dataState, NULL, NULL);
		switch (segmentState)
		{
			case SegmentStateNotInitialized:
				FileRepSubProcess_SetState(FileRepStateNotInitialized);
				break;

			case SegmentStateInitialization:
				FileRepSubProcess_SetState(FileRepStateInitialization);
				break;

			case SegmentStateInResyncTransition:
				FileRepSubProcess_SetState(FileRepStateInitialization);
				break;

			case SegmentStateInChangeTrackingTransition:
			case SegmentStateInSyncTransition:
				/* fileRepState remains Ready */
				break;

			case SegmentStateChangeTrackingDisabled:
			case SegmentStateReady:
				FileRepSubProcess_SetState(FileRepStateReady);
				break;

			case SegmentStateFault:
				FileRepSubProcess_SetState(FileRepStateFault);
				break;

			case SegmentStateShutdownFilerepBackends:
				if (fileRepRole == FileRepPrimaryRole)
				{
					FileRepSubProcess_SetState(FileRepStateShutdownBackends);
				}
				else
				{
					processExit = true;
					FileRepSubProcess_SetState(FileRepStateShutdown);
				}
				break;

			case SegmentStateImmediateShutdown:
			case SegmentStateShutdown:
				processExit = true;
				FileRepSubProcess_SetState(FileRepStateShutdown);
				break;

			default:
				Assert(0);
				break;
		}						/* switch (segmentState) */

		if (processExit == true)
		{
			/* wake up anything blocked on filerep IPC before exiting */
			FileRep_IpcSignalAll();
		}
	}

	return (processExit);
}
/*
 * Main loop of walsender process.
 *
 * Streams WAL to the connected standby until shutdown is requested or a
 * send failure occurs.  Returns only to keep the compiler quiet; every
 * exit path goes through proc_exit() or exit().
 */
static int
WalSndLoop(void)
{
	char	   *output_message;
	bool		caughtup = false;

	/*
	 * Allocate buffer that will be used for each output message.  We do this
	 * just once to reduce palloc overhead.  The buffer must be made large
	 * enough for maximum-sized messages.
	 */
	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/*
	 * Allocate buffer that will be used for processing reply messages.  As
	 * above, do this just once to reduce palloc overhead.
	 */
	initStringInfo(&reply_message);

	/* Initialize the last reply timestamp */
	last_reply_timestamp = GetCurrentTimestamp();

	/* Loop forever, unless we get an error */
	for (;;)
	{
		/*
		 * Clear any already-pending wakeups.  Resetting the latch *before*
		 * checking the state flags below closes the race where a signal
		 * arrives between the checks and the WaitLatchOrSocket() call.
		 */
		ResetLatch(&MyWalSnd->latch);

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive())
			exit(1);

		/* Process any requests or signals received recently */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			SyncRepInitConfig();
		}

		/* Normal exit from the walsender is here */
		if (walsender_shutdown_requested)
		{
			/* Inform the standby that XLOG streaming is done */
			pq_puttextmessage('C', "COPY 0");
			pq_flush();

			proc_exit(0);
		}

		/* Check for input from the client */
		ProcessRepliesIfAny();

		/*
		 * If we don't have any pending data in the output buffer, try to send
		 * some more.  If there is some, we don't bother to call XLogSend
		 * again until we've flushed it ... but we'd better assume we are not
		 * caught up.
		 */
		if (!pq_is_send_pending())
			XLogSend(output_message, &caughtup);
		else
			caughtup = false;

		/* Try to flush pending output to the client */
		if (pq_flush_if_writable() != 0)
			break;

		/* If nothing remains to be sent right now ... */
		if (caughtup && !pq_is_send_pending())
		{
			/*
			 * If we're in catchup state, move to streaming.  This is an
			 * important state change for users to know about, since before
			 * this point data loss might occur if the primary dies and we
			 * need to failover to the standby.  The state change is also
			 * important for synchronous replication, since commits that
			 * started to wait at that point might wait for some time.
			 */
			if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
			{
				ereport(DEBUG1,
						(errmsg("standby \"%s\" has now caught up with primary",
								application_name)));
				WalSndSetState(WALSNDSTATE_STREAMING);
			}

			/*
			 * When SIGUSR2 arrives, we send any outstanding logs up to the
			 * shutdown checkpoint record (i.e., the latest record) and exit.
			 * This may be a normal termination at shutdown, or a promotion,
			 * the walsender is not sure which.
			 */
			if (walsender_ready_to_stop)
			{
				/* ... let's just be real sure we're caught up ... */
				XLogSend(output_message, &caughtup);
				if (caughtup && !pq_is_send_pending())
				{
					walsender_shutdown_requested = true;
					continue;	/* don't want to wait more */
				}
			}
		}

		/*
		 * We don't block if not caught up, unless there is unsent data
		 * pending in which case we'd better block until the socket is
		 * write-ready.  This test is only needed for the case where XLogSend
		 * loaded a subset of the available data but then pq_flush_if_writable
		 * flushed it all --- we should immediately try to send more.
		 */
		if (caughtup || pq_is_send_pending())
		{
			TimestampTz finish_time = 0;
			long		sleeptime = -1;
			int			wakeEvents;

			wakeEvents = WL_LATCH_SET | WL_POSTMASTER_DEATH |
				WL_SOCKET_READABLE;
			if (pq_is_send_pending())
				wakeEvents |= WL_SOCKET_WRITEABLE;

			/* Determine time until replication timeout */
			if (replication_timeout > 0)
			{
				long		secs;
				int			usecs;

				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
														  replication_timeout);
				TimestampDifference(GetCurrentTimestamp(),
									finish_time, &secs, &usecs);
				sleeptime = secs * 1000 + usecs / 1000;
				/* Avoid Assert in WaitLatchOrSocket if timeout is past */
				if (sleeptime < 0)
					sleeptime = 0;
				wakeEvents |= WL_TIMEOUT;
			}

			/* Sleep until something happens or replication timeout */
			WaitLatchOrSocket(&MyWalSnd->latch, wakeEvents,
							  MyProcPort->sock, sleeptime);

			/*
			 * Check for replication timeout.  Note we ignore the corner case
			 * possibility that the client replied just as we reached the
			 * timeout ... he's supposed to reply *before* that.
			 */
			if (replication_timeout > 0 &&
				GetCurrentTimestamp() >= finish_time)
			{
				/*
				 * Since typically expiration of replication timeout means
				 * communication problem, we don't send the error message to
				 * the standby.
				 */
				ereport(COMMERROR,
						(errmsg("terminating walsender process due to replication timeout")));
				break;
			}
		}
	}

	/*
	 * Get here on send failure.  Clean up and exit.
	 *
	 * Reset whereToSendOutput to prevent ereport from attempting to send any
	 * more messages to the standby.
	 */
	if (whereToSendOutput == DestRemote)
		whereToSendOutput = DestNone;

	proc_exit(0);
	return 1;					/* keep the compiler quiet */
}
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 *
 * Never returns: exits via proc_exit() on shutdown request, or exit(1) if
 * the postmaster dies.  Runtime errors longjmp back into the sigsetjmp
 * recovery block below rather than terminating the process.
 */
void
BackgroundWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext bgwriter_context;

	/* Advertise our PID in shared memory so backends can signal us. */
	BgWriterShmem->bgwriter_pid = MyProcPid;
	am_bg_writer = true;

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.  (bgwriter probably never has any
	 * child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * Note: we deliberately ignore SIGTERM, because during a standard Unix
	 * system shutdown cycle, init will SIGTERM all processes at once.  We
	 * want to wait for the backends to exit, whereupon the postmaster will
	 * tell us it's okay to shut down (via SIGUSR2).
	 *
	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
	 * process to participate in sinval messaging.
	 */
	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, ReqCheckpointHandler);		/* request checkpoint */
	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN); /* reserve for sinval */
	pqsignal(SIGUSR2, ReqShutdownHandler);		/* request shutdown */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
#ifdef HAVE_SIGPROCMASK
	sigdelset(&BlockSig, SIGQUIT);
#else
	BlockSig &= ~(sigmask(SIGQUIT));
#endif

	/*
	 * Initialize so that first time-driven event happens at the correct time.
	 */
	last_checkpoint_time = last_xlog_switch_time = time(NULL);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	bgwriter_context = AllocSetContextCreate(TopMemoryContext,
											 "Background Writer",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(bgwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().	We don't have very many resources to worry
		 * about in bgwriter, but we do have LWLocks, buffers, and temp files.
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/* Warn any waiting backends that the checkpoint failed. */
		if (ckpt_active)
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile BgWriterShmemStruct *bgs = BgWriterShmem;

			/*
			 * Bump the failure counter and mark the in-progress checkpoint
			 * as done, so that backends sleeping in RequestCheckpoint stop
			 * waiting for it.
			 */
			SpinLockAcquire(&bgs->ckpt_lck);
			bgs->ckpt_failed++;
			bgs->ckpt_done = bgs->ckpt_started;
			SpinLockRelease(&bgs->ckpt_lck);

			ckpt_active = false;
		}

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(bgwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(bgwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Loop forever
	 */
	for (;;)
	{
		bool		do_checkpoint = false;
		int			flags = 0;
		time_t		now;
		int			elapsed_secs;

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Process any requests or signals received recently.
		 */
		AbsorbFsyncRequests();

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (checkpoint_requested)
		{
			checkpoint_requested = false;
			do_checkpoint = true;
			BgWriterStats.m_requested_checkpoints++;
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Close down the database */
			ShutdownXLOG(0, 0);
			DumpFreeSpaceMap(0, 0);
			/* Normal exit from the bgwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Force a checkpoint if too much time has elapsed since the last one.
		 * Note that we count a timed checkpoint in stats only when this
		 * occurs without an external request, but we set the CAUSE_TIME flag
		 * bit even if there is also an external request.
		 */
		now = time(NULL);
		elapsed_secs = now - last_checkpoint_time;
		if (elapsed_secs >= CheckPointTimeout)
		{
			if (!do_checkpoint)
				BgWriterStats.m_timed_checkpoints++;
			do_checkpoint = true;
			flags |= CHECKPOINT_CAUSE_TIME;
		}

		/*
		 * Do a checkpoint if requested, otherwise do one cycle of
		 * dirty-buffer writing.
		 */
		if (do_checkpoint)
		{
			/* use volatile pointer to prevent code rearrangement */
			volatile BgWriterShmemStruct *bgs = BgWriterShmem;

			/*
			 * Atomically fetch the request flags to figure out what kind of a
			 * checkpoint we should perform, and increase the started-counter
			 * to acknowledge that we've started a new checkpoint.
			 */
			SpinLockAcquire(&bgs->ckpt_lck);
			flags |= bgs->ckpt_flags;
			bgs->ckpt_flags = 0;
			bgs->ckpt_started++;
			SpinLockRelease(&bgs->ckpt_lck);

			/*
			 * We will warn if (a) too soon since last checkpoint (whatever
			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
			 * since the last checkpoint start.  Note in particular that this
			 * implementation will not generate warnings caused by
			 * CheckPointTimeout < CheckPointWarning.
			 */
			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
				elapsed_secs < CheckPointWarning)
				ereport(LOG,
						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
								elapsed_secs),
						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));

			/*
			 * Initialize bgwriter-private variables used during checkpoint.
			 */
			ckpt_active = true;
			ckpt_start_recptr = GetInsertRecPtr();
			ckpt_start_time = now;
			ckpt_cached_elapsed = 0;

			/*
			 * Do the checkpoint.
			 */
			CreateCheckPoint(flags);

			/*
			 * After any checkpoint, close all smgr files.	This is so we
			 * won't hang onto smgr references to deleted files indefinitely.
			 */
			smgrcloseall();

			/*
			 * Indicate checkpoint completion to any waiting backends.
			 */
			SpinLockAcquire(&bgs->ckpt_lck);
			bgs->ckpt_done = bgs->ckpt_started;
			SpinLockRelease(&bgs->ckpt_lck);

			ckpt_active = false;

			/*
			 * Note we record the checkpoint start time not end time as
			 * last_checkpoint_time.  This is so that time-driven checkpoints
			 * happen at a predictable spacing.
			 */
			last_checkpoint_time = now;
		}
		else
			BgBufferSync();

		/* Check for archive_timeout and switch xlog files if necessary. */
		CheckArchiveTimeout();

		/* Nap for the configured time. */
		BgWriterNap();
	}
}
int MainHandlerLoop_RMSEG(void) { int res = FUNC_RETURN_OK; uint64_t curtime = 0; int errorcode = FUNC_RETURN_OK; char errorbuf[1024]; while( DRMGlobalInstance->ResManagerMainKeepRun ) { if (!PostmasterIsAlive(true)) { DRMGlobalInstance->ResManagerMainKeepRun = false; elog(LOG, "Postmaster is not alive, resource manager exits"); break; } /* PART1. Handle socket server inputs. */ res = processAllCommFileDescs(); if ( res != FUNC_RETURN_OK ) { /* * The possible error here is the failure of poll(), we won't keep * running HAWQ RM any longer, graceful quit is requested. */ DRMGlobalInstance->ResManagerMainKeepRun = false; elog(LOG, "System error cause resource manager not possible to track " "network communications."); } /* PART2. Handle all BE submitted requests. */ processSubmittedRequests(); /* PART3. Fresh local host info and send IMAlive message to resource * manager server. */ curtime = gettime_microsec(); if ( DRMGlobalInstance->LocalHostStat == NULL || curtime - DRMGlobalInstance->LocalHostLastUpdateTime > SEGMENT_HOSTCHECK_INTERVAL ) { refreshLocalHostInstance(); checkLocalPostmasterStatus(); } if ( DRMGlobalInstance->SendIMAlive ) { if (DRMGlobalInstance->LocalHostStat != NULL && curtime - DRMGlobalInstance->HeartBeatLastSentTime > SEGMENT_HEARTBEAT_INTERVAL ) { sendIMAlive(&errorcode, errorbuf, sizeof(errorbuf)); DRMGlobalInstance->HeartBeatLastSentTime = gettime_microsec(); } } /* PART4. Send responses back to the clients. */ sendResponseToClients(); /* PART5. Resource enforcement work thread quit */ if (g_enforcement_thread_quited) { elog(ERROR, "Resource enforcement thread quited"); } } elog(RMLOG, "Resource manager main event handler exits."); return res; }
/*
 * Main entry point for walwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 *
 * Never returns: exits via proc_exit() on shutdown request, or exit(1) if
 * the postmaster dies.  Errors longjmp back into the sigsetjmp block.
 */
void
WalWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext walwriter_context;

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.  (walwriter probably never has any
	 * child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * We have no particular use for SIGINT at the moment, but seems
	 * reasonable to treat like SIGTERM.
	 */
	pqsignal(SIGHUP, WalSigHupHandler); /* set flag to read config file */
	pqsignal(SIGINT, WalShutdownHandler);		/* request shutdown */
	pqsignal(SIGTERM, WalShutdownHandler);		/* request shutdown */
	pqsignal(SIGQUIT, wal_quickdie);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */
	pqsignal(SIGUSR2, SIG_IGN); /* not used */

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Create a resource owner to keep track of our resources (not clear that
	 * we need this, but may as well have one).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Writer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	walwriter_context = AllocSetContextCreate(TopMemoryContext,
											  "Wal Writer",
											  ALLOCSET_DEFAULT_MINSIZE,
											  ALLOCSET_DEFAULT_INITSIZE,
											  ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(walwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * This code is heavily based on bgwriter.c, q.v.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().	We don't have very many resources to worry
		 * about in walwriter, but we do have LWLocks, and perhaps buffers?
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(walwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(walwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Loop forever
	 */
	for (;;)
	{
		long		udelay;

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Process any requests or signals received recently.
		 */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}
		if (shutdown_requested)
		{
			/* Normal exit from the walwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Do what we're here for...
		 */
		XLogBackgroundFlush();

		/*
		 * Delay until time to do something more, but fall out of delay
		 * reasonably quickly if signaled.  The delay is broken into
		 * one-second naps so a pending SIGHUP or shutdown request is
		 * noticed within about a second even on platforms where signals
		 * don't interrupt the sleep.
		 */
		udelay = WalWriterDelay * 1000L;
		while (udelay > 999999L)
		{
			if (got_SIGHUP || shutdown_requested)
				break;
			pg_usleep(1000000L);
			udelay -= 1000000L;
		}
		/* Sleep off the sub-second remainder, unless a signal arrived. */
		if (!(got_SIGHUP || shutdown_requested))
			pg_usleep(udelay);
	}
}
/*
 * pgarch_MainLoop
 *
 * Main loop for archiver
 *
 * Repeatedly runs pgarch_ArchiverCopyLoop() when wakened (or proactively
 * every PGARCH_AUTOWAKE_INTERVAL seconds), until the postmaster dies or
 * one final cycle completes after SIGUSR2.
 */
static void
pgarch_MainLoop(void)
{
	time_t		last_copy_time = 0;
	bool		time_to_stop;

	/*
	 * We run the copy loop immediately upon entry, in case there are
	 * unarchived files left over from a previous database run (or maybe the
	 * archiver died unexpectedly).  After that we wait for a signal or
	 * timeout before doing more.
	 */
	wakened = true;

	do
	{
		/* When we get SIGUSR2, we do one more archive cycle, then exit */
		time_to_stop = ready_to_stop;

		/* Check for config update */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}

		/*
		 * If we've gotten SIGTERM, we normally just sit and do nothing until
		 * SIGUSR2 arrives.  However, that means a random SIGTERM would
		 * disable archiving indefinitely, which doesn't seem like a good
		 * idea.  If more than 60 seconds pass since SIGTERM, exit anyway, so
		 * that the postmaster can start a new archiver if needed.
		 */
		if (got_SIGTERM)
		{
			time_t		curtime = time(NULL);

			/* remember when the first SIGTERM was seen */
			if (last_sigterm_time == 0)
				last_sigterm_time = curtime;
			else if ((unsigned int) (curtime - last_sigterm_time) >=
					 (unsigned int) 60)
				break;
		}

		/* Do what we're here for */
		if (wakened || time_to_stop)
		{
			wakened = false;
			pgarch_ArchiverCopyLoop();
			last_copy_time = time(NULL);
		}

		/*
		 * There shouldn't be anything for the archiver to do except to wait
		 * for a signal ... however, the archiver exists to protect our data,
		 * so she wakes up occasionally to allow herself to be proactive.
		 *
		 * On some platforms, signals won't interrupt the sleep.  To ensure we
		 * respond reasonably promptly when someone signals us, break down the
		 * sleep into 1-second increments, and check for interrupts after each
		 * nap.
		 */
		while (!(wakened || ready_to_stop || got_SIGHUP ||
				 !PostmasterIsAlive(true)))
		{
			time_t		curtime;

			pg_usleep(1000000L);

			/*
			 * Self-wake if the auto-wake interval has elapsed since the last
			 * copy cycle, so archiving is attempted even without a signal.
			 */
			curtime = time(NULL);
			if ((unsigned int) (curtime - last_copy_time) >=
				(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
				wakened = true;
		}

		/*
		 * The archiver quits either when the postmaster dies (not expected)
		 * or after completing one more archiving cycle after receiving
		 * SIGUSR2.
		 */
	} while (PostmasterIsAlive(true) && !time_to_stop);
}
/*
 * Main entry point for walreceiver process.
 *
 * Registers itself in the WalRcv shared-memory struct, sets up signals,
 * connects to the primary via the dynamically loaded libpqwalreceiver
 * functions, and then loops receiving and processing WAL until shutdown,
 * postmaster death, or end of recovery.  Never returns normally.
 */
void
WalReceiverMain(void)
{
	char		conninfo[MAXCONNINFO];
	XLogRecPtr	startpoint;

	/* use volatile pointer to prevent code rearrangement */
	volatile WalRcvData *walrcv = WalRcv;

	am_walreceiver = true;

	/*
	 * WalRcv should be set up already (if we are a backend, we inherit this
	 * by fork() or EXEC_BACKEND mechanism from the postmaster).
	 */
	Assert(walrcv != NULL);

	/*
	 * Mark walreceiver as running in shared memory.
	 *
	 * Do this as early as possible, so that if we fail later on, we'll set
	 * state to STOPPED.  If we die before this, the startup process will keep
	 * waiting for us to start up, until it times out.
	 */
	SpinLockAcquire(&walrcv->mutex);
	Assert(walrcv->pid == 0);
	switch (walrcv->walRcvState)
	{
		case WALRCV_STOPPING:
			/* If we've already been requested to stop, don't start up. */
			walrcv->walRcvState = WALRCV_STOPPED;
			/* fall through */

		case WALRCV_STOPPED:
			SpinLockRelease(&walrcv->mutex);
			proc_exit(1);
			break;

		case WALRCV_STARTING:
			/* The usual case */
			break;

		case WALRCV_RUNNING:
			/* Shouldn't happen */
			elog(PANIC, "walreceiver still running according to shared memory state");
	}
	/* Advertise our PID so that the startup process can kill us */
	walrcv->pid = MyProcPid;
	walrcv->walRcvState = WALRCV_RUNNING;

	/* Fetch information required to start streaming */
	strlcpy(conninfo, (char *) walrcv->conninfo, MAXCONNINFO);
	startpoint = walrcv->receivedUpto;
	SpinLockRelease(&walrcv->mutex);

	/* Arrange to clean up at walreceiver exit */
	on_shmem_exit(WalRcvDie, 0);

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.  (walreceiver probably never has
	 * any child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/* Properly accept or ignore signals the postmaster might send us */
	pqsignal(SIGHUP, WalRcvSigHupHandler);		/* set flag to read config
												 * file */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, WalRcvShutdownHandler);	/* request shutdown */
	pqsignal(SIGQUIT, WalRcvQuickDieHandler);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN);
	pqsignal(SIGUSR2, SIG_IGN);

	/* Reset some signals that are accepted by postmaster but not here */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/* Load the libpq-specific functions */
	load_file("libpqwalreceiver", false);
	if (walrcv_connect == NULL || walrcv_receive == NULL ||
		walrcv_disconnect == NULL)
		elog(ERROR, "libpqwalreceiver didn't initialize correctly");

	/*
	 * Create a resource owner to keep track of our resources (not clear that
	 * we need this, but may as well have one).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Receiver");

	/* Unblock signals (they were blocked when the postmaster forked us) */
	PG_SETMASK(&UnBlockSig);

	/* Establish the connection to the primary for XLOG streaming */
	EnableWalRcvImmediateExit();
	walrcv_connect(conninfo, startpoint);
	DisableWalRcvImmediateExit();

	/* Loop until end-of-streaming or error */
	for (;;)
	{
		unsigned char type;
		char	   *buf;
		int			len;

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Exit walreceiver if we're not in recovery.  This should not happen,
		 * but cross-check the status here.
		 */
		if (!RecoveryInProgress())
			ereport(FATAL,
					(errmsg("cannot continue WAL streaming, recovery has already ended")));

		/* Process any requests or signals received recently */
		ProcessWalRcvInterrupts();

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}

		/* Wait a while for data to arrive */
		if (walrcv_receive(NAPTIME_PER_CYCLE, &type, &buf, &len))
		{
			/* Accept the received data, and process it */
			XLogWalRcvProcessMsg(type, buf, len);

			/* Receive any more data we can without sleeping */
			while (walrcv_receive(0, &type, &buf, &len))
				XLogWalRcvProcessMsg(type, buf, len);

			/*
			 * If we've written some records, flush them to disk and let the
			 * startup process know about them.
			 */
			XLogWalRcvFlush();
		}
	}
}
/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, WL_SOCKET_READABLE *must* be included in
 * 'wakeEvents'; WL_SOCKET_WRITEABLE is optional.  The reason for this is
 * that EOF and error conditions are reported only via WL_SOCKET_READABLE.
 *
 * Returns a bitmask of the WL_* events that were satisfied.  Waiting is
 * implemented with poll(2) where available, otherwise select(2); latch
 * wakeups are delivered through the self-pipe, and postmaster death is
 * detected via the postmaster-alive pipe fd.
 */
int
WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout)
{
	int			result = 0;
	int			rc;
	instr_time	start_time,
				cur_time;
	long		cur_timeout;

#ifdef HAVE_POLL
	struct pollfd pfds[3];
	int			nfds;
#else
	struct timeval tv,
			   *tvp;
	fd_set		input_mask;
	fd_set		output_mask;
	int			hifd;
#endif

	/* Ignore WL_SOCKET_* events if no valid socket is given */
	if (sock == PGINVALID_SOCKET)
		wakeEvents &= ~(WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);

	Assert(wakeEvents != 0);	/* must have at least one wake event */
	/* Cannot specify WL_SOCKET_WRITEABLE without WL_SOCKET_READABLE */
	Assert((wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) != WL_SOCKET_WRITEABLE);

	if ((wakeEvents & WL_LATCH_SET) && latch->owner_pid != MyProcPid)
		elog(ERROR, "cannot wait on a latch owned by another process");

	/*
	 * Initialize timeout if requested.  We must record the current time so
	 * that we can determine the remaining timeout if the poll() or select()
	 * is interrupted.	(On some platforms, select() will update the contents
	 * of "tv" for us, but unfortunately we can't rely on that.)
	 */
	if (wakeEvents & WL_TIMEOUT)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;

#ifndef HAVE_POLL
		tv.tv_sec = cur_timeout / 1000L;
		tv.tv_usec = (cur_timeout % 1000L) * 1000L;
		tvp = &tv;
#endif
	}
	else
	{
		cur_timeout = -1;		/* wait forever */

#ifndef HAVE_POLL
		tvp = NULL;
#endif
	}

	waiting = true;
	do
	{
		/*
		 * Clear the pipe, then check if the latch is set already.  If someone
		 * sets the latch between this and the poll()/select() below, the
		 * setter will write a byte to the pipe (or signal us and the signal
		 * handler will do that), and the poll()/select() will return
		 * immediately.
		 *
		 * Note: we assume that the kernel calls involved in drainSelfPipe()
		 * and SetLatch() will provide adequate synchronization on machines
		 * with weak memory ordering, so that we cannot miss seeing is_set if
		 * the signal byte is already in the pipe when we drain it.
		 */
		drainSelfPipe();

		if ((wakeEvents & WL_LATCH_SET) && latch->is_set)
		{
			result |= WL_LATCH_SET;

			/*
			 * Leave loop immediately, avoid blocking again.  We don't attempt
			 * to report any other events that might also be satisfied.
			 */
			break;
		}

		/* Must wait ... we use poll(2) if available, otherwise select(2) */
#ifdef HAVE_POLL
		nfds = 0;
		if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			/* socket, if used, is always in pfds[0] */
			pfds[0].fd = sock;
			pfds[0].events = 0;
			if (wakeEvents & WL_SOCKET_READABLE)
				pfds[0].events |= POLLIN;
			if (wakeEvents & WL_SOCKET_WRITEABLE)
				pfds[0].events |= POLLOUT;
			pfds[0].revents = 0;
			nfds++;
		}

		/* self-pipe read end is always watched, for latch wakeups */
		pfds[nfds].fd = selfpipe_readfd;
		pfds[nfds].events = POLLIN;
		pfds[nfds].revents = 0;
		nfds++;

		if (wakeEvents & WL_POSTMASTER_DEATH)
		{
			/* postmaster fd, if used, is always in pfds[nfds - 1] */
			pfds[nfds].fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
			pfds[nfds].events = POLLIN;
			pfds[nfds].revents = 0;
			nfds++;
		}

		/* Sleep */
		rc = poll(pfds, nfds, (int) cur_timeout);

		/* Check return code */
		if (rc < 0)
		{
			/* EINTR is okay, otherwise complain */
			if (errno != EINTR)
			{
				waiting = false;
				ereport(ERROR,
						(errcode_for_socket_access(),
						 errmsg("poll() failed: %m")));
			}
		}
		else if (rc == 0)
		{
			/* timeout exceeded */
			if (wakeEvents & WL_TIMEOUT)
				result |= WL_TIMEOUT;
		}
		else
		{
			/* at least one event occurred, so check revents values */
			if ((wakeEvents & WL_SOCKET_READABLE) &&
				(pfds[0].revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
			{
				/* data available in socket, or EOF/error condition */
				result |= WL_SOCKET_READABLE;
			}
			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
				(pfds[0].revents & POLLOUT))
			{
				result |= WL_SOCKET_WRITEABLE;
			}

			/*
			 * We expect a POLLHUP when the remote end is closed, but because
			 * we don't expect the pipe to become readable or to have any
			 * errors either, treat those cases as postmaster death, too.
			 */
			if ((wakeEvents & WL_POSTMASTER_DEATH) &&
				(pfds[nfds - 1].revents & (POLLHUP | POLLIN | POLLERR | POLLNVAL)))
			{
				/*
				 * According to the select(2) man page on Linux, select(2) may
				 * spuriously return and report a file descriptor as readable,
				 * when it's not; and presumably so can poll(2).  It's not
				 * clear that the relevant cases would ever apply to the
				 * postmaster pipe, but since the consequences of falsely
				 * returning WL_POSTMASTER_DEATH could be pretty unpleasant,
				 * we take the trouble to positively verify EOF with
				 * PostmasterIsAlive().
				 */
				if (!PostmasterIsAlive())
					result |= WL_POSTMASTER_DEATH;
			}
		}
#else							/* !HAVE_POLL */

		FD_ZERO(&input_mask);
		FD_ZERO(&output_mask);

		/* self-pipe read end is always watched, for latch wakeups */
		FD_SET(selfpipe_readfd, &input_mask);
		hifd = selfpipe_readfd;

		if (wakeEvents & WL_POSTMASTER_DEATH)
		{
			FD_SET(postmaster_alive_fds[POSTMASTER_FD_WATCH], &input_mask);
			if (postmaster_alive_fds[POSTMASTER_FD_WATCH] > hifd)
				hifd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
		}

		if (wakeEvents & WL_SOCKET_READABLE)
		{
			FD_SET(sock, &input_mask);
			if (sock > hifd)
				hifd = sock;
		}

		if (wakeEvents & WL_SOCKET_WRITEABLE)
		{
			FD_SET(sock, &output_mask);
			if (sock > hifd)
				hifd = sock;
		}

		/* Sleep */
		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);

		/* Check return code */
		if (rc < 0)
		{
			/* EINTR is okay, otherwise complain */
			if (errno != EINTR)
			{
				waiting = false;
				ereport(ERROR,
						(errcode_for_socket_access(),
						 errmsg("select() failed: %m")));
			}
		}
		else if (rc == 0)
		{
			/* timeout exceeded */
			if (wakeEvents & WL_TIMEOUT)
				result |= WL_TIMEOUT;
		}
		else
		{
			/* at least one event occurred, so check masks */
			if ((wakeEvents & WL_SOCKET_READABLE) &&
				FD_ISSET(sock, &input_mask))
			{
				/* data available in socket, or EOF */
				result |= WL_SOCKET_READABLE;
			}
			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
				FD_ISSET(sock, &output_mask))
			{
				result |= WL_SOCKET_WRITEABLE;
			}
			if ((wakeEvents & WL_POSTMASTER_DEATH) &&
				FD_ISSET(postmaster_alive_fds[POSTMASTER_FD_WATCH],
						 &input_mask))
			{
				/*
				 * According to the select(2) man page on Linux, select(2) may
				 * spuriously return and report a file descriptor as readable,
				 * when it's not; and presumably so can poll(2).  It's not
				 * clear that the relevant cases would ever apply to the
				 * postmaster pipe, but since the consequences of falsely
				 * returning WL_POSTMASTER_DEATH could be pretty unpleasant,
				 * we take the trouble to positively verify EOF with
				 * PostmasterIsAlive().
				 */
				if (!PostmasterIsAlive())
					result |= WL_POSTMASTER_DEATH;
			}
		}
#endif   /* HAVE_POLL */

		/* If we're not done, update cur_timeout for next iteration */
		if (result == 0 && cur_timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			/* clamp so an interrupted wait can't extend past the deadline */
			if (cur_timeout < 0)
				cur_timeout = 0;

#ifndef HAVE_POLL
			tv.tv_sec = cur_timeout / 1000L;
			tv.tv_usec = (cur_timeout % 1000L) * 1000L;
#endif
		}
	} while (result == 0);
	waiting = false;

	return result;
}
/*
 * Main loop of walsender process (socket-polling variant).
 *
 * NOTE(review): this variant calls WaitLatchOrSocket with a
 * (latch, sock, forRead, forWrite, timeout) argument list, which differs
 * from the 4-argument wakeEvents-based signature seen elsewhere in this
 * file — presumably it pairs with an older latch API; confirm against the
 * latch header actually compiled with this file.
 *
 * Streams WAL to the standby until shutdown or send failure; all exit
 * paths go through proc_exit().
 */
static int
WalSndLoop(void)
{
	char	   *output_message;
	bool		caughtup = false;

	/*
	 * Allocate buffer that will be used for each output message.  We do this
	 * just once to reduce palloc overhead.  The buffer must be made large
	 * enough for maximum-sized messages.
	 */
	output_message = palloc(1 + sizeof(WalDataMessageHeader) + MAX_SEND_SIZE);

	/*
	 * Allocate buffer that will be used for processing reply messages.  As
	 * above, do this just once to reduce palloc overhead.
	 */
	initStringInfo(&reply_message);

	/* Initialize the last reply timestamp */
	last_reply_timestamp = GetCurrentTimestamp();

	/* Loop forever, unless we get an error */
	for (;;)
	{
		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/* Process any requests or signals received recently */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			SyncRepInitConfig();
		}

		/* Normal exit from the walsender is here */
		if (walsender_shutdown_requested)
		{
			/* Inform the standby that XLOG streaming was done */
			pq_puttextmessage('C', "COPY 0");
			pq_flush();

			proc_exit(0);
		}

		/*
		 * If we don't have any pending data in the output buffer, try to send
		 * some more.
		 */
		if (!pq_is_send_pending())
		{
			XLogSend(output_message, &caughtup);

			/*
			 * Even if we wrote all the WAL that was available when we started
			 * sending, more might have arrived while we were sending this
			 * batch.  We had the latch set while sending, so we have not
			 * received any signals from that time.  Let's arm the latch
			 * again, and after that check that we're still up-to-date.
			 */
			if (caughtup && !pq_is_send_pending())
			{
				ResetLatch(&MyWalSnd->latch);

				XLogSend(output_message, &caughtup);
			}
		}

		/* Flush pending output to the client */
		if (pq_flush_if_writable() != 0)
			break;

		/*
		 * When SIGUSR2 arrives, we send any outstanding logs up to the
		 * shutdown checkpoint record (i.e., the latest record) and exit.
		 */
		if (walsender_ready_to_stop && !pq_is_send_pending())
		{
			XLogSend(output_message, &caughtup);
			ProcessRepliesIfAny();
			if (caughtup && !pq_is_send_pending())
				walsender_shutdown_requested = true;
		}

		/*
		 * Sleep only if caught up (or blocked on the socket) and no state
		 * change is pending that the loop should act on first.
		 */
		if ((caughtup || pq_is_send_pending()) &&
			!got_SIGHUP &&
			!walsender_shutdown_requested)
		{
			TimestampTz finish_time = 0;
			long		sleeptime;

			/* Reschedule replication timeout */
			if (replication_timeout > 0)
			{
				long		secs;
				int			usecs;

				finish_time = TimestampTzPlusMilliseconds(last_reply_timestamp,
														  replication_timeout);
				TimestampDifference(GetCurrentTimestamp(),
									finish_time, &secs, &usecs);
				sleeptime = secs * 1000 + usecs / 1000;
				/* never sleep past the next periodic wakeup */
				if (WalSndDelay < sleeptime)
					sleeptime = WalSndDelay;
			}
			else
			{
				/*
				 * XXX: Without timeout, we don't really need the periodic
				 * wakeups anymore, WaitLatchOrSocket should reliably wake up
				 * as soon as something interesting happens.
				 */
				sleeptime = WalSndDelay;
			}

			/* Sleep */
			WaitLatchOrSocket(&MyWalSnd->latch, MyProcPort->sock,
							  true, pq_is_send_pending(),
							  sleeptime);

			/* Check for replication timeout */
			if (replication_timeout > 0 &&
				GetCurrentTimestamp() >= finish_time)
			{
				/*
				 * Since typically expiration of replication timeout means
				 * communication problem, we don't send the error message to
				 * the standby.
				 */
				ereport(COMMERROR,
						(errmsg("terminating walsender process due to replication timeout")));
				break;
			}
		}

		/*
		 * If we're in catchup state, see if its time to move to streaming.
		 * This is an important state change for users, since before this
		 * point data loss might occur if the primary dies and we need to
		 * failover to the standby.  The state change is also important for
		 * synchronous replication, since commits that started to wait at that
		 * point might wait for some time.
		 */
		if (MyWalSnd->state == WALSNDSTATE_CATCHUP && caughtup)
		{
			ereport(DEBUG1,
					(errmsg("standby \"%s\" has now caught up with primary",
							application_name)));
			WalSndSetState(WALSNDSTATE_STREAMING);
		}

		ProcessRepliesIfAny();
	}

	/*
	 * Get here on send failure.  Clean up and exit.
	 *
	 * Reset whereToSendOutput to prevent ereport from attempting to send any
	 * more messages to the standby.
	 */
	if (whereToSendOutput == DestRemote)
		whereToSendOutput = DestNone;

	proc_exit(0);
	return 1;					/* keep the compiler quiet */
}
/*
 * WaitLatchOrSocket (Windows implementation)
 *
 * Wait until the latch is set, the given socket becomes readable/writable,
 * the timeout expires, or (optionally) the postmaster dies — whichever of
 * these was requested in 'wakeEvents'.  Returns a bitmask of the WL_* events
 * that were satisfied.
 *
 * 'timeout' is in milliseconds (only honored when WL_TIMEOUT is requested;
 * it is passed straight through to WaitForMultipleObjects, a milliseconds
 * API).
 */
int
WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout)
{
	DWORD		rc;
	HANDLE		events[4];		/* signal + latch + optional socket + optional postmaster */
	HANDLE		latchevent;
	HANDLE		sockevent = WSA_INVALID_EVENT;
	int			numevents;
	int			result = 0;
	int			pmdeath_eventno = 0;	/* slot of PostmasterHandle, if used */

	/* Ignore WL_SOCKET_* events if no valid socket is given */
	if (sock == PGINVALID_SOCKET)
		wakeEvents &= ~(WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);

	Assert(wakeEvents != 0);	/* must have at least one wake event */
	/* Cannot specify WL_SOCKET_WRITEABLE without WL_SOCKET_READABLE */
	Assert((wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) != WL_SOCKET_WRITEABLE);

	/* Waiting on someone else's latch would never be woken correctly */
	if ((wakeEvents & WL_LATCH_SET) && latch->owner_pid != MyProcPid)
		elog(ERROR, "cannot wait on a latch owned by another process");

	/* Convert timeout to form used by WaitForMultipleObjects() */
	if (wakeEvents & WL_TIMEOUT)
		Assert(timeout >= 0);
	else
		timeout = INFINITE;

	/*
	 * Construct an array of event handles for WaitforMultipleObjects().
	 *
	 * Note: pgwin32_signal_event should be first to ensure that it will be
	 * reported when multiple events are set.  We want to guarantee that
	 * pending signals are serviced.
	 *
	 * The slot layout is relied on below: 0 = signal, 1 = latch, 2 = socket
	 * (when requested), and the postmaster handle goes wherever
	 * pmdeath_eventno records.
	 */
	latchevent = latch->event;

	events[0] = pgwin32_signal_event;
	events[1] = latchevent;
	numevents = 2;
	if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
	{
		/* Need an event object to represent events on the socket */
		int			flags = 0;

		if (wakeEvents & WL_SOCKET_READABLE)
			flags |= (FD_READ | FD_CLOSE);
		if (wakeEvents & WL_SOCKET_WRITEABLE)
			flags |= FD_WRITE;

		sockevent = WSACreateEvent();
		if (sockevent == WSA_INVALID_EVENT)
			elog(ERROR, "failed to create event for socket: error code %u",
				 WSAGetLastError());
		if (WSAEventSelect(sock, sockevent, flags) != 0)
			elog(ERROR, "failed to set up event for socket: error code %u",
				 WSAGetLastError());
		events[numevents++] = sockevent;
	}
	if (wakeEvents & WL_POSTMASTER_DEATH)
	{
		pmdeath_eventno = numevents;
		events[numevents++] = PostmasterHandle;
	}

	/* Ensure that signals are serviced even if latch is already set */
	pgwin32_dispatch_queued_signals();

	do
	{
		/*
		 * Reset the event, and check if the latch is set already. If someone
		 * sets the latch between this and the WaitForMultipleObjects() call
		 * below, the setter will set the event and WaitForMultipleObjects()
		 * will return immediately.
		 */
		if (!ResetEvent(latchevent))
			elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
		if ((wakeEvents & WL_LATCH_SET) && latch->is_set)
		{
			result |= WL_LATCH_SET;

			/*
			 * Leave loop immediately, avoid blocking again. We don't attempt
			 * to report any other events that might also be satisfied.
			 */
			break;
		}

		rc = WaitForMultipleObjects(numevents, events, FALSE, timeout);

		if (rc == WAIT_FAILED)
			elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
				 GetLastError());
		else if (rc == WAIT_TIMEOUT)
		{
			result |= WL_TIMEOUT;
		}
		else if (rc == WAIT_OBJECT_0)
		{
			/* Service newly-arrived signals */
			pgwin32_dispatch_queued_signals();
		}
		else if (rc == WAIT_OBJECT_0 + 1)
		{
			/* Latch is set, we'll handle that on next iteration of loop */
		}
		else if ((wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) &&
				 rc == WAIT_OBJECT_0 + 2)		/* socket is at event slot 2 */
		{
			WSANETWORKEVENTS resEvents;

			ZeroMemory(&resEvents, sizeof(resEvents));
			if (WSAEnumNetworkEvents(sock, sockevent, &resEvents) != 0)
				elog(ERROR, "failed to enumerate network events: error code %u",
					 WSAGetLastError());
			if ((wakeEvents & WL_SOCKET_READABLE) &&
				(resEvents.lNetworkEvents & (FD_READ | FD_CLOSE)))
			{
				result |= WL_SOCKET_READABLE;
			}
			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
				(resEvents.lNetworkEvents & FD_WRITE))
			{
				result |= WL_SOCKET_WRITEABLE;
			}
		}
		else if ((wakeEvents & WL_POSTMASTER_DEATH) &&
				 rc == WAIT_OBJECT_0 + pmdeath_eventno)
		{
			/*
			 * Postmaster apparently died.  Since the consequences of falsely
			 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
			 * take the trouble to positively verify this with
			 * PostmasterIsAlive(), even though there is no known reason to
			 * think that the event could be falsely set on Windows.
			 */
			if (!PostmasterIsAlive())
				result |= WL_POSTMASTER_DEATH;
		}
		else
			elog(ERROR, "unexpected return code from WaitForMultipleObjects(): %lu", rc);
	} while (result == 0);		/* loop until some requested event fired */

	/* Clean up the event object we created for the socket */
	if (sockevent != WSA_INVALID_EVENT)
	{
		WSAEventSelect(sock, NULL, 0);
		WSACloseEvent(sockevent);
	}

	return result;
}
/*
 * Main entry point for bgwriter process
 *
 * This is invoked from BootstrapMain, which has already created the basic
 * execution environment, but not enabled signals yet.
 *
 * Never returns normally: exits via proc_exit(0) on a shutdown request, or
 * exit(1) if the postmaster dies.
 */
void
BackgroundWriterMain(void)
{
	sigjmp_buf	local_sigjmp_buf;
	MemoryContext bgwriter_context;

	am_bg_writer = true;

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.  (bgwriter probably never has any
	 * child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 *
	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
	 * process to participate in ProcSignal signalling.
	 */
	pqsignal(SIGHUP, BgSigHupHandler);	/* set flag to read config file */
	pqsignal(SIGINT, SIG_IGN);	/* as of 9.2 no longer requests checkpoint */
	pqsignal(SIGTERM, ReqShutdownHandler);		/* shutdown */
	pqsignal(SIGQUIT, bg_quickdie);		/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN); /* reserve for ProcSignal */
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
	sigdelset(&BlockSig, SIGQUIT);

	/*
	 * Create a resource owner to keep track of our resources (currently only
	 * buffer pins).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	bgwriter_context = AllocSetContextCreate(TopMemoryContext,
											 "Background Writer",
											 ALLOCSET_DEFAULT_MINSIZE,
											 ALLOCSET_DEFAULT_INITSIZE,
											 ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(bgwriter_context);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * See notes in postgres.c about the design of this coding.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* Since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevent interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().	We don't have very many resources to worry
		 * about in bgwriter, but we do have LWLocks, buffers, and temp files.
		 */
		LWLockReleaseAll();
		AbortBufferIO();
		UnlockBuffers();
		/* buffer pins are released here: */
		ResourceOwnerRelease(CurrentResourceOwner,
							 RESOURCE_RELEASE_BEFORE_LOCKS,
							 false, true);
		/* we needn't bother with the other ResourceOwnerRelease phases */
		AtEOXact_Buffers(false);
		AtEOXact_Files();
		AtEOXact_HashTables(false);

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(bgwriter_context);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(bgwriter_context);

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  A write error is likely
		 * to be repeated, and we don't want to be filling the error logs as
		 * fast as we can.
		 */
		pg_usleep(1000000L);

		/*
		 * Close all open files after any error.  This is helpful on Windows,
		 * where holding deleted files open causes various strange errors.
		 * It's not clear we need it elsewhere, but shouldn't hurt.
		 */
		smgrcloseall();
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Use the recovery target timeline ID during recovery
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = GetRecoveryTargetTLI();

	/*
	 * Loop forever
	 */
	for (;;)
	{
		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive())
			exit(1);

		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);	/* update global shmem state for
											 * sync rep */
		}
		if (shutdown_requested)
		{
			/*
			 * From here on, elog(ERROR) should end with exit(1), not send
			 * control back to the sigsetjmp block above
			 */
			ExitOnAnyError = true;
			/* Normal exit from the bgwriter is here */
			proc_exit(0);		/* done */
		}

		/*
		 * Do one cycle of dirty-buffer writing.
		 */
		BgBufferSync();

		/* Nap for the configured time. */
		BgWriterNap();
	}
}
/* * Execute commands from walreceiver, until we enter streaming mode. */ static void WalSndHandshake(void) { StringInfoData input_message; bool replication_started = false; initStringInfo(&input_message); while (!replication_started) { int firstchar; WalSndSetState(WALSNDSTATE_STARTUP); set_ps_display("idle", false); /* Wait for a command to arrive */ firstchar = pq_getbyte(); /* * Emergency bailout if postmaster has died. This is to avoid the * necessity for manual cleanup of all postmaster children. */ if (!PostmasterIsAlive()) exit(1); /* * Check for any other interesting events that happened while we * slept. */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } if (firstchar != EOF) { /* * Read the message contents. This is expected to be done without * blocking because we've been able to get message type code. */ if (pq_getmessage(&input_message, 0)) firstchar = EOF; /* suitable message already logged */ } /* Handle the very limited subset of commands expected in this phase */ switch (firstchar) { case 'Q': /* Query message */ { const char *query_string; query_string = pq_getmsgstring(&input_message); pq_getmsgend(&input_message); if (HandleReplicationCommand(query_string)) replication_started = true; } break; case 'X': /* standby is closing the connection */ proc_exit(0); case EOF: /* standby disconnected unexpectedly */ ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("unexpected EOF on standby connection"))); proc_exit(0); default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid standby handshake message type %d", firstchar))); } } }
/*
 * Execute commands from walreceiver, until we enter streaming mode.
 *
 * Handles the two inline handshake commands: IDENTIFY_SYSTEM (replies with a
 * one-row result set holding system identifier and timeline) and
 * START_REPLICATION %X/%X (sends a CopyOutResponse and sets sentPtr to the
 * requested start position).  Exits the process on disconnect, postmaster
 * death, or protocol violation.
 */
static void
WalSndHandshake(void)
{
	StringInfoData input_message;
	bool		replication_started = false;

	initStringInfo(&input_message);

	while (!replication_started)
	{
		int			firstchar;

		/* Wait for a command to arrive */
		firstchar = pq_getbyte();

		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Check for any other interesting events that happened while we
		 * slept.
		 */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
		}

		if (firstchar != EOF)
		{
			/*
			 * Read the message contents.  This is expected to be done without
			 * blocking because we've been able to get message type code.
			 */
			if (pq_getmessage(&input_message, 0))
				firstchar = EOF;	/* suitable message already logged */
		}

		/* Handle the very limited subset of commands expected in this phase */
		switch (firstchar)
		{
			case 'Q':			/* Query message */
				{
					const char *query_string;
					XLogRecPtr	recptr;

					query_string = pq_getmsgstring(&input_message);
					pq_getmsgend(&input_message);

					if (strcmp(query_string, "IDENTIFY_SYSTEM") == 0)
					{
						StringInfoData buf;
						char		sysid[32];
						char		tli[11];

						/*
						 * Reply with a result set with one row, two columns.
						 * First col is system ID, and second is timeline ID
						 */

						snprintf(sysid, sizeof(sysid), UINT64_FORMAT,
								 GetSystemIdentifier());
						snprintf(tli, sizeof(tli), "%u", ThisTimeLineID);

						/* Send a RowDescription message */
						pq_beginmessage(&buf, 'T');
						pq_sendint(&buf, 2, 2); /* 2 fields */

						/* first field: systemid, sent as text */
						pq_sendstring(&buf, "systemid");		/* col name */
						pq_sendint(&buf, 0, 4); /* table oid */
						pq_sendint(&buf, 0, 2); /* attnum */
						pq_sendint(&buf, TEXTOID, 4);	/* type oid */
						pq_sendint(&buf, -1, 2);		/* typlen */
						pq_sendint(&buf, 0, 4); /* typmod */
						pq_sendint(&buf, 0, 2); /* format code */

						/* second field: timeline, sent as int4 */
						pq_sendstring(&buf, "timeline");		/* col name */
						pq_sendint(&buf, 0, 4); /* table oid */
						pq_sendint(&buf, 0, 2); /* attnum */
						pq_sendint(&buf, INT4OID, 4);	/* type oid */
						pq_sendint(&buf, 4, 2); /* typlen */
						pq_sendint(&buf, 0, 4); /* typmod */
						pq_sendint(&buf, 0, 2); /* format code */
						pq_endmessage(&buf);

						/* Send a DataRow message */
						pq_beginmessage(&buf, 'D');
						pq_sendint(&buf, 2, 2); /* # of columns */
						pq_sendint(&buf, strlen(sysid), 4);		/* col1 len */
						pq_sendbytes(&buf, (char *) &sysid, strlen(sysid));
						pq_sendint(&buf, strlen(tli), 4);		/* col2 len */
						pq_sendbytes(&buf, (char *) tli, strlen(tli));
						pq_endmessage(&buf);

						/* Send CommandComplete and ReadyForQuery messages */
						EndCommand("SELECT", DestRemote);
						ReadyForQuery(DestRemote);
						/* ReadyForQuery did pq_flush for us */
					}
					else if (sscanf(query_string, "START_REPLICATION %X/%X",
									&recptr.xlogid, &recptr.xrecoff) == 2)
					{
						StringInfoData buf;

						/*
						 * Check that we're logging enough information in the
						 * WAL for log-shipping.
						 *
						 * NOTE: This only checks the current value of
						 * wal_level.  Even if the current setting is not
						 * 'minimal', there can be old WAL in the pg_xlog
						 * directory that was created with 'minimal'.  So this
						 * is not bulletproof, the purpose is just to give a
						 * user-friendly error message that hints how to
						 * configure the system correctly.
						 */
						if (wal_level == WAL_LEVEL_MINIMAL)
							ereport(FATAL,
									(errcode(ERRCODE_CANNOT_CONNECT_NOW),
									 errmsg("standby connections not allowed because wal_level=minimal")));

						/* Send a CopyOutResponse message, and start streaming */
						pq_beginmessage(&buf, 'H');
						pq_sendbyte(&buf, 0);
						pq_sendint(&buf, 0, 2);
						pq_endmessage(&buf);
						pq_flush();

						/*
						 * Initialize position to the received one, then the
						 * xlog records begin to be shipped from that position
						 */
						sentPtr = recptr;

						/* break out of the loop */
						replication_started = true;
					}
					else
					{
						ereport(FATAL,
								(errcode(ERRCODE_PROTOCOL_VIOLATION),
								 errmsg("invalid standby query string: %s",
										query_string)));
					}
					break;
				}

			case 'X':
				/* standby is closing the connection */
				proc_exit(0);

			case EOF:
				/* standby disconnected unexpectedly */
				ereport(COMMERROR,
						(errcode(ERRCODE_PROTOCOL_VIOLATION),
						 errmsg("unexpected EOF on standby connection")));
				proc_exit(0);

			default:
				ereport(FATAL,
						(errcode(ERRCODE_PROTOCOL_VIOLATION),
						 errmsg("invalid standby handshake message type %d",
								firstchar)));
		}
	}
}
/* * pgarch_MainLoop * * Main loop for archiver */ static void pgarch_MainLoop(void) { pg_time_t last_copy_time = 0; bool time_to_stop; /* * We run the copy loop immediately upon entry, in case there are * unarchived files left over from a previous database run (or maybe the * archiver died unexpectedly). After that we wait for a signal or * timeout before doing more. */ wakened = true; /* * There shouldn't be anything for the archiver to do except to wait * for a signal ... however, the archiver exists to protect our data, * so she wakes up occasionally to allow herself to be proactive. */ do { ResetLatch(&mainloop_latch); /* When we get SIGUSR2, we do one more archive cycle, then exit */ time_to_stop = ready_to_stop; /* Check for config update */ if (got_SIGHUP) { got_SIGHUP = false; ProcessConfigFile(PGC_SIGHUP); } /* * If we've gotten SIGTERM, we normally just sit and do nothing until * SIGUSR2 arrives. However, that means a random SIGTERM would * disable archiving indefinitely, which doesn't seem like a good * idea. If more than 60 seconds pass since SIGTERM, exit anyway, so * that the postmaster can start a new archiver if needed. */ if (got_SIGTERM) { time_t curtime = time(NULL); if (last_sigterm_time == 0) last_sigterm_time = curtime; else if ((unsigned int) (curtime - last_sigterm_time) >= (unsigned int) 60) break; } /* Do what we're here for */ if (wakened || time_to_stop) { wakened = false; pgarch_ArchiverCopyLoop(); last_copy_time = time(NULL); } /* * Sleep until a signal is received, or until a poll is forced by * PGARCH_AUTOWAKE_INTERVAL having passed since last_copy_time, or * until postmaster dies. 
*/ if (!time_to_stop) /* Don't wait during last iteration */ { pg_time_t curtime = (pg_time_t) time(NULL); int timeout; timeout = PGARCH_AUTOWAKE_INTERVAL - (curtime - last_copy_time); if (timeout > 0) { int rc; rc = WaitLatch(&mainloop_latch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, timeout * 1000000L); if (rc & WL_TIMEOUT) wakened = true; } else wakened = true; } /* * The archiver quits either when the postmaster dies (not expected) * or after completing one more archiving cycle after receiving * SIGUSR2. */ } while (PostmasterIsAlive() && !time_to_stop); }
/*
 * Wait for synchronous replication, if requested by user.
 *
 * Initially backends start in state SYNC_REP_NOT_WAITING and then
 * change that state to SYNC_REP_WAITING before adding ourselves
 * to the wait queue. During SyncRepWakeQueue() a WALSender changes
 * the state to SYNC_REP_WAIT_COMPLETE once replication is confirmed.
 * This backend then resets its state to SYNC_REP_NOT_WAITING.
 *
 * The wait can also end without confirmation: on administrator shutdown
 * (ProcDiePending), query cancel, or postmaster death, we cancel the wait
 * with a WARNING — the transaction is already committed locally either way.
 */
void
SyncRepWaitForLSN(XLogRecPtr XactCommitLSN)
{
	char	   *new_status = NULL;
	const char *old_status;
	int			mode = SyncRepWaitMode;

	/*
	 * Fast exit if user has not requested sync replication, or there are no
	 * sync replication standby names defined. Note that those standbys don't
	 * need to be connected.
	 */
	if (!SyncRepRequested() || !SyncStandbysDefined())
		return;

	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	Assert(WalSndCtl != NULL);

	LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
	Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);

	/*
	 * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
	 * set.  See SyncRepUpdateSyncStandbysDefined.
	 *
	 * Also check that the standby hasn't already replied. Unlikely race
	 * condition but we'll be fetching that cache line anyway so its likely to
	 * be a low cost check.
	 */
	if (!WalSndCtl->sync_standbys_defined ||
		XLByteLE(XactCommitLSN, WalSndCtl->lsn[mode]))
	{
		LWLockRelease(SyncRepLock);
		return;
	}

	/*
	 * Set our waitLSN so WALSender will know when to wake us, and add
	 * ourselves to the queue.
	 */
	MyProc->waitLSN = XactCommitLSN;
	MyProc->syncRepState = SYNC_REP_WAITING;
	SyncRepQueueInsert(mode);
	Assert(SyncRepQueueIsOrderedByLSN(mode));
	LWLockRelease(SyncRepLock);

	/* Alter ps display to show waiting for sync rep. */
	if (update_process_title)
	{
		int			len;

		old_status = get_ps_display(&len);
		/* 32 extra bytes is room for " waiting for %X/%X" */
		new_status = (char *) palloc(len + 32 + 1);
		memcpy(new_status, old_status, len);
		sprintf(new_status + len, " waiting for %X/%X",
				XactCommitLSN.xlogid, XactCommitLSN.xrecoff);
		set_ps_display(new_status, false);
		/* truncate in place now, so new_status can restore the display later */
		new_status[len] = '\0'; /* truncate off " waiting ..." */
	}

	/*
	 * Wait for specified LSN to be confirmed.
	 *
	 * Each proc has its own wait latch, so we perform a normal latch
	 * check/wait loop here.
	 */
	for (;;)
	{
		int			syncRepState;

		/* Must reset the latch before testing state. */
		ResetLatch(&MyProc->procLatch);

		/*
		 * Try checking the state without the lock first.  There's no
		 * guarantee that we'll read the most up-to-date value, so if it looks
		 * like we're still waiting, recheck while holding the lock.  But if
		 * it looks like we're done, we must really be done, because once
		 * walsender changes the state to SYNC_REP_WAIT_COMPLETE, it will
		 * never update it again, so we can't be seeing a stale value in that
		 * case.
		 *
		 * Note: on machines with weak memory ordering, the acquisition of the
		 * lock is essential to avoid race conditions: we cannot be sure the
		 * sender's state update has reached main memory until we acquire the
		 * lock.  We could get rid of this dance if SetLatch/ResetLatch
		 * contained memory barriers.
		 */
		syncRepState = MyProc->syncRepState;
		if (syncRepState == SYNC_REP_WAITING)
		{
			LWLockAcquire(SyncRepLock, LW_SHARED);
			syncRepState = MyProc->syncRepState;
			LWLockRelease(SyncRepLock);
		}
		if (syncRepState == SYNC_REP_WAIT_COMPLETE)
			break;

		/*
		 * If a wait for synchronous replication is pending, we can neither
		 * acknowledge the commit nor raise ERROR or FATAL.  The latter would
		 * lead the client to believe that the transaction aborted, which
		 * is not true: it's already committed locally. The former is no good
		 * either: the client has requested synchronous replication, and is
		 * entitled to assume that an acknowledged commit is also replicated,
		 * which might not be true. So in this case we issue a WARNING (which
		 * some clients may be able to interpret) and shut off further output.
		 * We do NOT reset ProcDiePending, so that the process will die after
		 * the commit is cleaned up.
		 */
		if (ProcDiePending)
		{
			ereport(WARNING,
					(errcode(ERRCODE_ADMIN_SHUTDOWN),
					 errmsg("canceling the wait for synchronous replication and terminating connection due to administrator command"),
					 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}

		/*
		 * It's unclear what to do if a query cancel interrupt arrives.  We
		 * can't actually abort at this point, but ignoring the interrupt
		 * altogether is not helpful, so we just terminate the wait with a
		 * suitable warning.
		 */
		if (QueryCancelPending)
		{
			QueryCancelPending = false;
			ereport(WARNING,
					(errmsg("canceling wait for synchronous replication due to user request"),
					 errdetail("The transaction has already committed locally, but might not have been replicated to the standby.")));
			SyncRepCancelWait();
			break;
		}

		/*
		 * If the postmaster dies, we'll probably never get an
		 * acknowledgement, because all the wal sender processes will exit. So
		 * just bail out.
		 */
		if (!PostmasterIsAlive())
		{
			ProcDiePending = true;
			whereToSendOutput = DestNone;
			SyncRepCancelWait();
			break;
		}

		/*
		 * Wait on latch.  Any condition that should wake us up will set the
		 * latch, so no need for timeout.
		 */
		WaitLatch(&MyProc->procLatch, WL_LATCH_SET | WL_POSTMASTER_DEATH, -1);
	}

	/*
	 * WalSender has checked our LSN and has removed us from queue. Clean up
	 * state and leave.  It's OK to reset these shared memory fields without
	 * holding SyncRepLock, because any walsenders will ignore us anyway when
	 * we're not on the queue.
	 */
	Assert(SHMQueueIsDetached(&(MyProc->syncRepLinks)));
	MyProc->syncRepState = SYNC_REP_NOT_WAITING;
	MyProc->waitLSN.xlogid = 0;
	MyProc->waitLSN.xrecoff = 0;

	if (new_status)
	{
		/* Reset ps display */
		set_ps_display(new_status, false);
		pfree(new_status);
	}
}