/*
 * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
 * conditions.
 *
 * When waiting on a socket, WL_SOCKET_READABLE *must* be included in
 * 'wakeEvents'; WL_SOCKET_WRITEABLE is optional.  The reason for this is
 * that EOF and error conditions are reported only via WL_SOCKET_READABLE.
 */
int
WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
				  long timeout)
{
	int			result = 0;
	int			rc;
	instr_time	start_time,
				cur_time;
	long		cur_timeout;

#ifdef HAVE_POLL
	struct pollfd pfds[3];
	int			nfds;
#else
	struct timeval tv,
			   *tvp;
	fd_set		input_mask;
	fd_set		output_mask;
	int			hifd;
#endif

	/* Ignore WL_SOCKET_* events if no valid socket is given */
	if (sock == PGINVALID_SOCKET)
		wakeEvents &= ~(WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE);

	Assert(wakeEvents != 0);	/* must have at least one wake event */
	/* Cannot specify WL_SOCKET_WRITEABLE without WL_SOCKET_READABLE */
	Assert((wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) !=
		   WL_SOCKET_WRITEABLE);

	if ((wakeEvents & WL_LATCH_SET) && latch->owner_pid != MyProcPid)
		elog(ERROR, "cannot wait on a latch owned by another process");

	/*
	 * Initialize timeout if requested.  We must record the current time so
	 * that we can determine the remaining timeout if the poll() or select()
	 * is interrupted.  (On some platforms, select() will update the
	 * contents of "tv" for us, but unfortunately we can't rely on that.)
	 */
	if (wakeEvents & WL_TIMEOUT)
	{
		INSTR_TIME_SET_CURRENT(start_time);
		Assert(timeout >= 0 && timeout <= INT_MAX);
		cur_timeout = timeout;

#ifndef HAVE_POLL
		tv.tv_sec = cur_timeout / 1000L;
		tv.tv_usec = (cur_timeout % 1000L) * 1000L;
		tvp = &tv;
#endif
	}
	else
	{
		cur_timeout = -1;

#ifndef HAVE_POLL
		tvp = NULL;
#endif
	}

	waiting = true;
	do
	{
		/*
		 * Clear the pipe, then check if the latch is set already.  If
		 * someone sets the latch between this and the poll()/select()
		 * below, the setter will write a byte to the pipe (or signal us
		 * and the signal handler will do that), and the poll()/select()
		 * will return immediately.
		 *
		 * Note: we assume that the kernel calls involved in drainSelfPipe()
		 * and SetLatch() will provide adequate synchronization on machines
		 * with weak memory ordering, so that we cannot miss seeing is_set
		 * if the signal byte is already in the pipe when we drain it.
		 */
		drainSelfPipe();

		if ((wakeEvents & WL_LATCH_SET) && latch->is_set)
		{
			result |= WL_LATCH_SET;

			/*
			 * Leave loop immediately, avoid blocking again.  We don't
			 * attempt to report any other events that might also be
			 * satisfied.
			 */
			break;
		}

		/* Must wait ... we use poll(2) if available, otherwise select(2) */
#ifdef HAVE_POLL
		nfds = 0;
		if (wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
		{
			/* socket, if used, is always in pfds[0] */
			pfds[0].fd = sock;
			pfds[0].events = 0;
			if (wakeEvents & WL_SOCKET_READABLE)
				pfds[0].events |= POLLIN;
			if (wakeEvents & WL_SOCKET_WRITEABLE)
				pfds[0].events |= POLLOUT;
			pfds[0].revents = 0;
			nfds++;
		}

		pfds[nfds].fd = selfpipe_readfd;
		pfds[nfds].events = POLLIN;
		pfds[nfds].revents = 0;
		nfds++;

		if (wakeEvents & WL_POSTMASTER_DEATH)
		{
			/* postmaster fd, if used, is always in pfds[nfds - 1] */
			pfds[nfds].fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
			pfds[nfds].events = POLLIN;
			pfds[nfds].revents = 0;
			nfds++;
		}

		/* Sleep */
		rc = poll(pfds, nfds, (int) cur_timeout);

		/* Check return code */
		if (rc < 0)
		{
			/* EINTR is okay, otherwise complain */
			if (errno != EINTR)
			{
				waiting = false;
				ereport(ERROR,
						(errcode_for_socket_access(),
						 errmsg("poll() failed: %m")));
			}
		}
		else if (rc == 0)
		{
			/* timeout exceeded */
			if (wakeEvents & WL_TIMEOUT)
				result |= WL_TIMEOUT;
		}
		else
		{
			/* at least one event occurred, so check revents values */
			if ((wakeEvents & WL_SOCKET_READABLE) &&
				(pfds[0].revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
			{
				/* data available in socket, or EOF/error condition */
				result |= WL_SOCKET_READABLE;
			}
			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
				(pfds[0].revents & POLLOUT))
			{
				result |= WL_SOCKET_WRITEABLE;
			}

			/*
			 * We expect a POLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to
			 * have any errors either, treat those cases as postmaster
			 * death, too.
			 */
			if ((wakeEvents & WL_POSTMASTER_DEATH) &&
				(pfds[nfds - 1].revents & (POLLHUP | POLLIN | POLLERR | POLLNVAL)))
			{
				/*
				 * According to the select(2) man page on Linux, select(2)
				 * may spuriously return and report a file descriptor as
				 * readable, when it's not; and presumably so can poll(2).
				 * It's not clear that the relevant cases would ever apply
				 * to the postmaster pipe, but since the consequences of
				 * falsely returning WL_POSTMASTER_DEATH could be pretty
				 * unpleasant, we take the trouble to positively verify EOF
				 * with PostmasterIsAlive().
				 */
				if (!PostmasterIsAlive())
					result |= WL_POSTMASTER_DEATH;
			}
		}
#else							/* !HAVE_POLL */

		FD_ZERO(&input_mask);
		FD_ZERO(&output_mask);

		FD_SET(selfpipe_readfd, &input_mask);
		hifd = selfpipe_readfd;

		if (wakeEvents & WL_POSTMASTER_DEATH)
		{
			FD_SET(postmaster_alive_fds[POSTMASTER_FD_WATCH], &input_mask);
			if (postmaster_alive_fds[POSTMASTER_FD_WATCH] > hifd)
				hifd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
		}

		if (wakeEvents & WL_SOCKET_READABLE)
		{
			FD_SET(sock, &input_mask);
			if (sock > hifd)
				hifd = sock;
		}

		if (wakeEvents & WL_SOCKET_WRITEABLE)
		{
			FD_SET(sock, &output_mask);
			if (sock > hifd)
				hifd = sock;
		}

		/* Sleep */
		rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);

		/* Check return code */
		if (rc < 0)
		{
			/* EINTR is okay, otherwise complain */
			if (errno != EINTR)
			{
				waiting = false;
				ereport(ERROR,
						(errcode_for_socket_access(),
						 errmsg("select() failed: %m")));
			}
		}
		else if (rc == 0)
		{
			/* timeout exceeded */
			if (wakeEvents & WL_TIMEOUT)
				result |= WL_TIMEOUT;
		}
		else
		{
			/* at least one event occurred, so check masks */
			if ((wakeEvents & WL_SOCKET_READABLE) &&
				FD_ISSET(sock, &input_mask))
			{
				/* data available in socket, or EOF */
				result |= WL_SOCKET_READABLE;
			}
			if ((wakeEvents & WL_SOCKET_WRITEABLE) &&
				FD_ISSET(sock, &output_mask))
			{
				result |= WL_SOCKET_WRITEABLE;
			}
			if ((wakeEvents & WL_POSTMASTER_DEATH) &&
				FD_ISSET(postmaster_alive_fds[POSTMASTER_FD_WATCH],
						 &input_mask))
			{
				/*
				 * According to the select(2) man page on Linux, select(2)
				 * may spuriously return and report a file descriptor as
				 * readable, when it's not; and presumably so can poll(2).
				 * It's not clear that the relevant cases would ever apply
				 * to the postmaster pipe, but since the consequences of
				 * falsely returning WL_POSTMASTER_DEATH could be pretty
				 * unpleasant, we take the trouble to positively verify EOF
				 * with PostmasterIsAlive().
				 */
				if (!PostmasterIsAlive())
					result |= WL_POSTMASTER_DEATH;
			}
		}
#endif   /* HAVE_POLL */

		/* If we're not done, update cur_timeout for next iteration */
		if (result == 0 && cur_timeout >= 0)
		{
			INSTR_TIME_SET_CURRENT(cur_time);
			INSTR_TIME_SUBTRACT(cur_time, start_time);
			cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
			if (cur_timeout < 0)
				cur_timeout = 0;

#ifndef HAVE_POLL
			tv.tv_sec = cur_timeout / 1000L;
			tv.tv_usec = (cur_timeout % 1000L) * 1000L;
#endif
		}
	} while (result == 0);

	waiting = false;

	return result;
}
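
/*
 * Illustrative caller sketch (not compiled into this file; guarded by a
 * hypothetical LATCH_USAGE_EXAMPLE symbol).  It shows the typical wait
 * loop against a latch plus a socket: the handle_work()/handle_input()
 * helpers are hypothetical stand-ins, and the 1-second timeout is
 * arbitrary.  Note that a real caller must ResetLatch() before rechecking
 * its wake condition, or it can miss a wakeup that arrived in between.
 */
#ifdef LATCH_USAGE_EXAMPLE
extern void handle_work(void);			/* hypothetical */
extern void handle_input(pgsocket sock);	/* hypothetical */

static void
example_wait_loop(volatile Latch *latch, pgsocket sock)
{
	for (;;)
	{
		int			rc;

		rc = WaitLatchOrSocket(latch,
							   WL_LATCH_SET | WL_SOCKET_READABLE | WL_TIMEOUT,
							   sock,
							   1000L);	/* 1 second, in milliseconds */

		if (rc & WL_LATCH_SET)
		{
			ResetLatch(latch);
			handle_work();
		}
		if (rc & WL_SOCKET_READABLE)
			handle_input(sock);
		if (rc & WL_TIMEOUT)
			break;
	}
}
#endif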
/*
 * Wait using Linux's epoll_wait(2).
 *
 * This is the preferable wait method, as several readiness notifications
 * are delivered without having to iterate through all of set->events.  The
 * returned epoll_event structs contain a pointer to our events, making
 * association easy.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct epoll_event *cur_epoll_event;

	/* Sleep */
	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
					nevents, cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("epoll_wait() failed: %m")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * At least one event occurred, iterate over the returned epoll events
	 * until they're either all processed, or we've returned all the events
	 * the caller desired.
	 */
	for (cur_epoll_event = set->epoll_ret_events;
		 cur_epoll_event < (set->epoll_ret_events + rc) &&
		 returned_events < nevents;
		 cur_epoll_event++)
	{
		/* epoll's data pointer is set to the associated WaitEvent */
		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
		{
			/*
			 * We expect an EPOLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to
			 * have any errors either, treat those cases as postmaster
			 * death, too.
			 *
			 * As explained in the WAIT_USE_SELECT implementation, select(2)
			 * may spuriously return.  Be paranoid about that here too, a
			 * spurious WL_POSTMASTER_DEATH would be painful.
			 */
			if (!PostmasterIsAlive())
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
			{
				/* writable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
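
/*
 * Illustrative sketch (not compiled into this file; guarded by a
 * hypothetical LATCH_USAGE_EXAMPLE symbol) of how a caller might use the
 * WaitEventSet API that these WaitEventSetWaitBlock() variants serve.
 * CreateWaitEventSet(), AddWaitEventToSet(), WaitEventSetWait() and
 * FreeWaitEventSet() are the real entry points as of the version this code
 * is from; the socket and the 1-second timeout are arbitrary.
 */
#ifdef LATCH_USAGE_EXAMPLE
static void
example_wait_event_set(pgsocket sock)
{
	WaitEventSet *set;
	WaitEvent	event;
	int			nevents;

	/* room for three events: latch, postmaster death, one socket */
	set = CreateWaitEventSet(CurrentMemoryContext, 3);
	AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
	AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
	AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);

	/* wait up to 1000 ms for any one event to be reported */
	nevents = WaitEventSetWait(set, 1000L, &event, 1);
	if (nevents > 0 && (event.events & WL_LATCH_SET))
		ResetLatch(MyLatch);

	FreeWaitEventSet(set);
}
#endif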
/*
 * Wait using poll(2).
 *
 * This allows receiving readiness notifications for several events at
 * once, but requires iterating through all of set->pollfds.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	struct pollfd *cur_pollfd;

	/* Sleep */
	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("poll() failed: %m")));
		}
		return 0;
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	for (cur_event = set->events, cur_pollfd = set->pollfds;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++, cur_pollfd++)
	{
		/* no activity on this FD, skip */
		if (cur_pollfd->revents == 0)
			continue;

		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
		{
			/*
			 * We expect a POLLHUP when the remote end is closed, but
			 * because we don't expect the pipe to become readable or to
			 * have any errors either, treat those cases as postmaster
			 * death, too.
			 *
			 * As explained in the WAIT_USE_SELECT implementation, select(2)
			 * may spuriously return.  Be paranoid about that here too, a
			 * spurious WL_POSTMASTER_DEATH would be painful.
			 */
			if (!PostmasterIsAlive())
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE))
		{
			int			errflags = POLLHUP | POLLERR | POLLNVAL;

			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				(cur_pollfd->revents & (POLLIN | errflags)))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				(cur_pollfd->revents & (POLLOUT | errflags)))
			{
				/* writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
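
/*
 * Minimal standalone sketch of the self-pipe trick that all of these wait
 * implementations rely on (not compiled into this file; guarded by a
 * hypothetical LATCH_USAGE_EXAMPLE symbol, and the example_* names are
 * illustrative).  poll()/select() cannot wait on "a signal handler set a
 * flag", so SetLatch() makes the state change visible to the kernel by
 * writing one byte to a pipe that the wait primitive watches, and
 * drainSelfPipe() empties it before the flag is rechecked.  The sketch
 * assumes <poll.h> and <unistd.h>, which this file already includes, and
 * that both pipe ends were made non-blocking (as initSelfPipe() does), so
 * that draining an empty pipe returns instead of blocking.
 */
#ifdef LATCH_USAGE_EXAMPLE
static int	example_pipe[2];	/* [0] = read end, [1] = write end */

/* async-signal-safe wakeup, callable from a signal handler */
static void
example_set(void)
{
	char		b = 0;

	/* if the pipe is already full, a wakeup is already guaranteed */
	(void) write(example_pipe[1], &b, 1);
}

/* empty the pipe so the next wait blocks until the next wakeup */
static void
example_drain(void)
{
	char		buf[16];

	while (read(example_pipe[0], buf, sizeof(buf)) > 0)
		;
}

/* block until a wakeup byte arrives */
static void
example_block(void)
{
	struct pollfd pfd;

	pfd.fd = example_pipe[0];
	pfd.events = POLLIN;
	pfd.revents = 0;
	(void) poll(&pfd, 1, -1);
}
#endif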
/*
 * Wait using select(2).
 *
 * XXX: On at least older Linux kernels select(), in violation of POSIX,
 * doesn't reliably return a socket as writable if closed - but we rely on
 * that.  So far all the known cases of this problem are on platforms that
 * also provide a poll() implementation without that bug.  If we find one
 * where that's not the case, we'll need to add a workaround.
 */
static inline int
WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
					  WaitEvent *occurred_events, int nevents)
{
	int			returned_events = 0;
	int			rc;
	WaitEvent  *cur_event;
	fd_set		input_mask;
	fd_set		output_mask;
	int			hifd = 0;		/* highest fd seen, for select()'s 1st arg */
	struct timeval tv;
	struct timeval *tvp = NULL;

	FD_ZERO(&input_mask);
	FD_ZERO(&output_mask);

	/*
	 * Prepare input/output masks.  We do so every loop iteration as
	 * there's no entirely portable way to copy fd_sets.
	 */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents);
		 cur_event++)
	{
		if (cur_event->events == WL_LATCH_SET)
			FD_SET(cur_event->fd, &input_mask);
		else if (cur_event->events == WL_POSTMASTER_DEATH)
			FD_SET(cur_event->fd, &input_mask);
		else
		{
			Assert(cur_event->events & (WL_SOCKET_READABLE |
										WL_SOCKET_WRITEABLE));
			if (cur_event->events & WL_SOCKET_READABLE)
				FD_SET(cur_event->fd, &input_mask);
			if (cur_event->events & WL_SOCKET_WRITEABLE)
				FD_SET(cur_event->fd, &output_mask);
		}

		if (cur_event->fd > hifd)
			hifd = cur_event->fd;
	}

	/* Sleep */
	if (cur_timeout >= 0)
	{
		tv.tv_sec = cur_timeout / 1000L;
		tv.tv_usec = (cur_timeout % 1000L) * 1000L;
		tvp = &tv;
	}
	rc = select(hifd + 1, &input_mask, &output_mask, NULL, tvp);

	/* Check return code */
	if (rc < 0)
	{
		/* EINTR is okay, otherwise complain */
		if (errno != EINTR)
		{
			waiting = false;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("select() failed: %m")));
		}
		return 0;				/* retry */
	}
	else if (rc == 0)
	{
		/* timeout exceeded */
		return -1;
	}

	/*
	 * To associate events with select's masks we check the status of the
	 * file descriptor associated with each event, looping through all
	 * events.
	 */
	for (cur_event = set->events;
		 cur_event < (set->events + set->nevents) &&
		 returned_events < nevents;
		 cur_event++)
	{
		occurred_events->pos = cur_event->pos;
		occurred_events->user_data = cur_event->user_data;
		occurred_events->events = 0;

		if (cur_event->events == WL_LATCH_SET &&
			FD_ISSET(cur_event->fd, &input_mask))
		{
			/* There's data in the self-pipe, clear it. */
			drainSelfPipe();

			if (set->latch->is_set)
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_LATCH_SET;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events == WL_POSTMASTER_DEATH &&
				 FD_ISSET(cur_event->fd, &input_mask))
		{
			/*
			 * According to the select(2) man page on Linux, select(2) may
			 * spuriously return and report a file descriptor as readable,
			 * when it's not; and presumably so can poll(2).  It's not
			 * clear that the relevant cases would ever apply to the
			 * postmaster pipe, but since the consequences of falsely
			 * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we
			 * take the trouble to positively verify EOF with
			 * PostmasterIsAlive().
			 */
			if (!PostmasterIsAlive())
			{
				occurred_events->fd = PGINVALID_SOCKET;
				occurred_events->events = WL_POSTMASTER_DEATH;
				occurred_events++;
				returned_events++;
			}
		}
		else if (cur_event->events & (WL_SOCKET_READABLE |
									  WL_SOCKET_WRITEABLE))
		{
			Assert(cur_event->fd != PGINVALID_SOCKET);

			if ((cur_event->events & WL_SOCKET_READABLE) &&
				FD_ISSET(cur_event->fd, &input_mask))
			{
				/* data available in socket, or EOF */
				occurred_events->events |= WL_SOCKET_READABLE;
			}

			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
				FD_ISSET(cur_event->fd, &output_mask))
			{
				/* socket is writeable, or EOF */
				occurred_events->events |= WL_SOCKET_WRITEABLE;
			}

			if (occurred_events->events != 0)
			{
				occurred_events->fd = cur_event->fd;
				occurred_events++;
				returned_events++;
			}
		}
	}

	return returned_events;
}
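
/*
 * Illustrative sketch (not compiled into this file; guarded by a
 * hypothetical LATCH_USAGE_EXAMPLE symbol) of the retry contract shared by
 * all three WaitEventSetWaitBlock() variants: -1 on timeout, 0 when the
 * caller should simply retry (e.g. after EINTR), and otherwise the number
 * of events stored in occurred_events.  The caller is expected to shrink
 * the remaining timeout across retries, roughly as WaitLatchOrSocket()
 * above does with INSTR_TIME; this sketch assumes a finite timeout.
 */
#ifdef LATCH_USAGE_EXAMPLE
static int
example_wait_with_deadline(WaitEventSet *set, long timeout_ms,
						   WaitEvent *occurred, int nevents)
{
	instr_time	start_time,
				cur_time;
	long		cur_timeout = timeout_ms;
	int			rc;

	INSTR_TIME_SET_CURRENT(start_time);

	for (;;)
	{
		rc = WaitEventSetWaitBlock(set, (int) cur_timeout,
								   occurred, nevents);
		if (rc != 0)
			return rc;			/* events (> 0) or timeout (-1) */

		/* interrupted: recompute how much of the timeout is left */
		INSTR_TIME_SET_CURRENT(cur_time);
		INSTR_TIME_SUBTRACT(cur_time, start_time);
		cur_timeout = timeout_ms - (long) INSTR_TIME_GET_MILLISEC(cur_time);
		if (cur_timeout <= 0)
			return -1;
	}
}
#endif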
/*
 * Like WaitLatch, but will also return when there's data available in
 * 'sock' for reading.  Returns 0 if timeout was reached, 1 if the latch
 * was set, or 2 if the socket became readable.  The timeout is given in
 * microseconds; a negative timeout means wait indefinitely.
 */
int
WaitLatchOrSocket(volatile Latch *latch, pgsocket sock, long timeout)
{
	struct timeval tv,
			   *tvp = NULL;
	fd_set		input_mask;
	int			rc;
	int			result = 0;

	if (latch->owner_pid != MyProcPid)
		elog(ERROR, "cannot wait on a latch owned by another process");

	/* Initialize timeout */
	if (timeout >= 0)
	{
		tv.tv_sec = timeout / 1000000L;
		tv.tv_usec = timeout % 1000000L;
		tvp = &tv;
	}

	waiting = true;
	for (;;)
	{
		int			hifd;

		/*
		 * Clear the pipe, and check if the latch is set already.  If
		 * someone sets the latch between this and the select() below, the
		 * setter will write a byte to the pipe (or signal us and the
		 * signal handler will do that), and the select() will return
		 * immediately.
		 */
		drainSelfPipe();

		if (latch->is_set)
		{
			result = 1;
			break;
		}

		FD_ZERO(&input_mask);
		FD_SET(selfpipe_readfd, &input_mask);
		hifd = selfpipe_readfd;
		if (sock != PGINVALID_SOCKET)
		{
			FD_SET(sock, &input_mask);
			if (sock > hifd)
				hifd = sock;
		}

		rc = select(hifd + 1, &input_mask, NULL, NULL, tvp);
		if (rc < 0)
		{
			if (errno == EINTR)
				continue;
			ereport(ERROR,
					(errcode_for_socket_access(),
					 errmsg("select() failed: %m")));
		}
		if (rc == 0)
		{
			/* timeout exceeded */
			result = 0;
			break;
		}
		if (sock != PGINVALID_SOCKET && FD_ISSET(sock, &input_mask))
		{
			result = 2;
			break;				/* data available in socket */
		}
	}
	waiting = false;

	return result;
}
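
/*
 * Illustrative caller sketch for this older, select()-only variant (not
 * compiled into this file; guarded by a hypothetical LATCH_USAGE_EXAMPLE
 * symbol, with hypothetical handle_work()/handle_input() helpers).  Note
 * two differences from the newer WaitLatchOrSocket() above: the timeout is
 * in microseconds rather than milliseconds, and only a single condition is
 * reported per call, so a caller interested in both the latch and the
 * socket must loop and recheck after every return.
 */
#ifdef LATCH_USAGE_EXAMPLE
extern void handle_work(void);			/* hypothetical */
extern void handle_input(pgsocket sock);	/* hypothetical */

static void
example_old_wait(volatile Latch *latch, pgsocket sock)
{
	for (;;)
	{
		int			rc = WaitLatchOrSocket(latch, sock, 5000000L); /* 5 s */

		if (rc == 1)			/* latch was set */
		{
			ResetLatch(latch);
			handle_work();
		}
		else if (rc == 2)		/* socket became readable */
			handle_input(sock);
		else					/* rc == 0: timeout reached */
			break;
	}
}
#endif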