__attribute_noinline__ /*(most client time is spent waiting)*/ static int retry_poll_fd (const int fd, const short events, const int timeout) { struct pollfd pfd = { .fd = fd, .events = events, .revents = 0 }; int n; /*EINTR results in retrying poll with same timeout again and again*/ retry_eintr_do_while(n = poll(&pfd, 1, timeout), -1 == n); if (0 == n) errno = ETIME; /* specific for bsock; not generic */ return n; } __attribute_nonnull__ static int bsock_bind_send_addr_and_recv (const int fd, const struct addrinfo * const restrict ai, const int sfd) { /* bsock_unix_recv_fds() fills errnum to indicate remote success/failure * (no poll before sending addrinfo since this is first write to socket) * (dup2 rfd to fd if rfd != -1; indicates persistent reserved addr,port) * (persistent reserved addr does not preserve setsockopt() before bind())*/ int rfd = -1; unsigned int nrfd = 1; int errnum = 0; struct iovec iov = { .iov_base = &errnum, .iov_len = sizeof(errnum) }; if (!MSG_DONTWAIT) (void)fcntl(sfd, F_SETFL, fcntl(sfd, F_GETFL, 0) | O_NONBLOCK); if (bsock_addrinfo_send(sfd, ai, fd) && 1 == retry_poll_fd(sfd, POLLIN, BSOCK_POLL_TIMEOUT) && -1 != bsock_unix_recv_fds(sfd, &rfd, &nrfd, &iov, 1)) { if (-1 != rfd) { /* assert(rfd != fd); *//*(should not happen that they are same)*/ if (0 == errnum) { const int flflags = fcntl(fd, F_GETFL, 0); const int fdflags = fcntl(fd, F_GETFD, 0); do { errnum = dup2(rfd, fd); } while (errnum == -1 && (errno == EINTR || errno == EBUSY)); if (0 == (errnum = (errnum == fd) ? 0 : errno)) { (void)fcntl(fd, F_SETFL, flflags); (void)fcntl(fd, F_SETFD, fdflags); } } nointr_close(rfd); } } else { errnum = errno; /* server might have responded and closed socket before client sendmsg*/ if (EPIPE == errnum && -1 == bsock_unix_recv_fds(sfd,NULL,NULL,&iov,1)) errnum = EPIPE; } return errnum; } __attribute_nonnull__ static bool bsock_bind_viafork (const int fd, const struct addrinfo * const restrict ai) { /* (ai->ai_next is ignored) */ int sv[2]; int errnum; pid_t pid; struct stat st; if (0 != stat(BSOCK_EXE, &st)) return false; if (!(st.st_mode & S_ISUID)) return (errno = EPERM, false); if (0 != socketpair(AF_UNIX, SOCK_STREAM, 0, sv)) return false; pid = fork(); /*(bsock_bind_resvaddr() retries on EAGAIN)*/ if (0 == pid) { /* child; no retry if child signalled, errno==EINTR */ static char bsock_exe[] = BSOCK_EXE; static char *args[] = { bsock_exe, NULL }; if ( dup2(sv[0], STDIN_FILENO) != STDIN_FILENO || (sv[0] != STDIN_FILENO && 0 != close(sv[0])) || (sv[1] != STDIN_FILENO && 0 != close(sv[1]))) _exit(errno); (void)fcntl(STDIN_FILENO, F_SETFD, 0);/*unset fdflags, incl FD_CLOEXEC*/ execve(args[0], args, environ); _exit(errno); /*(not reached unless execve() failed)*/ } else if (-1 != pid) { /* parent */ nointr_close(sv[0]); errnum = bsock_bind_send_addr_and_recv(fd, ai, sv[1]); retry_eintr_while(pid != waitpid(pid,NULL,0)); /* reap child process but ignore exit status; program might be ignoring * SIGCHLD or might have custom SIGCHLD handler, either of which would * prevent waitpid() above from reliably obtaining child status */ } else { /* fork() error */ errnum = errno; nointr_close(sv[0]); } nointr_close(sv[1]); errno = errnum; return (0 == errnum); } __attribute_nonnull__ static bool bsock_bind_viasock (const int fd, const struct addrinfo * const restrict ai) { int errnum; int sfd; do { sfd = bsock_unix_socket_connect(BSOCK_SOCKET); if (-1 == sfd) return false; errnum = bsock_bind_send_addr_and_recv(fd, ai, sfd); nointr_close(sfd); if (errnum == EAGAIN) { /*(sched_yield() results in non-productive spin on my uniprocessor * during performance tests sending lots of requests by same uid, * since bsock defers if uid already has request in progress)*/ static const struct timespec ts = { 0, 10L }; nanosleep(&ts, NULL); } } while (errnum == EAGAIN || errnum == ETIME); errno = errnum; return (0 == errnum); } __attribute_nonnull__ int bsock_bind_addrinfo (const int fd, const struct addrinfo * const restrict ai) { /* (return value 0 for success, -1 upon error; match return value of bind()) * (ai->ai_next is ignored) */ if (bsock_bind_viasock(fd, ai) || bsock_bind_viafork(fd, ai)) return 0; switch (errno) { default: errno = EACCES; /*FALLTHRU*/ case EACCES: case EADDRINUSE: case EBADF: case EINVAL: case ENOTSOCK: return -1; } }
/* nointr_close() - make effort to avoid leaking open file descriptors */ static int nointr_close (const int fd) { int r; do { r = close(fd); } while (r != 0 && errno == EINTR); return r; } static int __attribute__((noinline)) /*(most client time is spent waiting)*/ retry_poll_fd (const int fd, const short events, const int timeout) { struct pollfd pfd = { .fd = fd, .events = events, .revents = 0 }; int n; /*EINTR results in retrying poll with same timeout again and again*/ do { n = poll(&pfd, 1, timeout); } while (-1 == n && errno == EINTR); if (0 == n) errno = ETIME; /* specific for bsock; not generic */ return n; } static int __attribute__((nonnull)) bsock_bind_send_addr_and_recv (const int fd, const struct addrinfo * const restrict ai, const int sfd) { /* bsock_unix_recv_fds() fills errnum to indicate remote success/failure * (no poll before sending addrinfo since this is first write to socket) * (dup2 rfd to fd if rfd != -1; indicates persistent reserved addr,port) */ int rfd = -1; unsigned int nrfd = 1; int errnum = 0; struct iovec iov = { .iov_base = &errnum, .iov_len = sizeof(errnum) }; if (bsock_addrinfo_send(sfd, ai, fd) && 1 == retry_poll_fd(sfd, POLLIN, BSOCK_POLL_TIMEOUT) && -1 != bsock_unix_recv_fds(sfd, &rfd, &nrfd, &iov, 1)) { if (-1 != rfd) { /* assert(rfd != fd); *//*(should not happen)*/ if (0 == errnum) { do { errnum = dup2(rfd, fd); } while (errnum == -1 && (errno == EINTR || errno == EBUSY)); errnum = (errnum == fd) ? 0 : errno; } nointr_close(rfd); } } else { errnum = errno; /* server might have responded and closed socket before client sendmsg*/ if (EPIPE == errnum && -1 == bsock_unix_recv_fds(sfd,NULL,NULL,&iov,1)) errnum = EPIPE; } return errnum; } static bool __attribute__((nonnull)) bsock_bind_viafork (const int fd, const struct addrinfo * const restrict ai) { /* (ai->ai_next is ignored) */ int sv[2]; int errnum; pid_t pid; struct stat st; if (0 != stat(BSOCK_EXE, &st)) return false; if (!(st.st_mode & S_ISUID)) return (errno = EPERM, false); if (0 != socketpair(AF_UNIX, SOCK_STREAM, 0, sv)) return false; pid = fork(); /*(bsock_bind_resvaddr() retries on EAGAIN)*/ if (0 == pid) { /* child; no retry if child signalled, errno==EINTR */ static char bsock_exe[] = BSOCK_EXE; static char *args[] = { bsock_exe, NULL }; if ( dup2(sv[0], STDIN_FILENO) != STDIN_FILENO || (sv[0] != STDIN_FILENO && 0 != close(sv[0])) || (sv[1] != STDIN_FILENO && 0 != close(sv[1]))) _exit(errno); fcntl(STDIN_FILENO, F_SETFD, 0);/* unset all fdflags, incl FD_CLOEXEC */ execve(args[0], args, environ); _exit(errno); /*(not reached unless execve() failed)*/ } else if (-1 != pid) { /* parent */ nointr_close(sv[0]); errnum = bsock_bind_send_addr_and_recv(fd, ai, sv[1]); while (pid != waitpid(pid,NULL,0) && errno == EINTR) ; /* reap child process but ignore exit status; program might be ignoring * SIGCHLD or might have custom SIGCHLD handler, either of which would * prevent waitpid() above from reliably obtaining child status */ } else { /* fork() error */ errnum = errno; nointr_close(sv[0]); } nointr_close(sv[1]); errno = errnum; return (0 == errnum); } static bool __attribute__((nonnull)) bsock_bind_viasock (const int fd, const struct addrinfo * const restrict ai) { int errnum; int sfd; do { sfd = bsock_unix_socket_connect(BSOCK_SOCKET); if (-1 == sfd) return false; errnum = bsock_bind_send_addr_and_recv(fd, ai, sfd); nointr_close(sfd); if (errnum == EAGAIN) { /*(sched_yield() results in non-productive spin on my uniprocessor * during performance tests sending lots of requests by same uid, * since bsock defers if uid already has request in progress)*/ static const struct timespec ts = { 0, 10L }; nanosleep(&ts, NULL); } } while (errnum == EAGAIN || errnum == ETIME); errno = errnum; return (0 == errnum); } int __attribute__((nonnull)) bsock_bind_addrinfo (const int fd, const struct addrinfo * const restrict ai) { /* (return value 0 for success, -1 upon error; match return value of bind()) * (ai->ai_next is ignored) */ if (bsock_bind_viasock(fd, ai) || bsock_bind_viafork(fd, ai)) return 0; switch (errno) { default: errno = EACCES; /*FALLTHRU*/ case EACCES: case EADDRINUSE: case EBADF: case EINVAL: case ENOTSOCK: return -1; } } static int (*bind_rtld_next)(int, const struct sockaddr *, socklen_t); static int __attribute__((nonnull)) bind_rtld_findnext (int sockfd, const struct sockaddr *addr, socklen_t addrlen) { bind_rtld_next = (int(*)(int,const struct sockaddr *,socklen_t))(uintptr_t) dlsym((void *)-1L, "bind"); /* RTLD_NEXT=(void *)-1L is glibc extension */ return (NULL != bind_rtld_next) ? bind_rtld_next(sockfd, addr, addrlen) : (bind_rtld_next = bind_rtld_findnext, errno = ENOSYS, -1); } static int (*bind_rtld_next)(int, const struct sockaddr *, socklen_t) = bind_rtld_findnext; int __attribute__((nonnull)) bsock_bind_intercept (int sockfd, const struct sockaddr *addr, const socklen_t addrlen) { struct addrinfo ai = { .ai_flags = 0, .ai_family = addr->sa_family, .ai_socktype = 0, .ai_protocol = 0, .ai_addrlen = addrlen, .ai_addr = (struct sockaddr *)(uintptr_t)addr, .ai_canonname= NULL, .ai_next = NULL }; socklen_t optlen; /* bsock supports only AF_INET, AF_INET6, AF_UNIX; * simply bind if address family is otherwise */ if (ai.ai_family == AF_INET || ai.ai_family == AF_INET6) { /* simply bind if port < IPPORT_RESERVED; no root privileges needed */ const int port = (ai.ai_family == AF_INET) ? ntohs(((struct sockaddr_in *)ai.ai_addr)->sin_port) : ntohs(((struct sockaddr_in6 *)ai.ai_addr)->sin6_port); if (port >= IPPORT_RESERVED && 0 == bind_rtld_next(sockfd, ai.ai_addr, ai.ai_addrlen)) return 0; /*(fall through if bind() fails in case persistent reserved addr)*/ #if 0 /* getnameinfo() is overkill for simple port check */ char host[INET6_ADDRSTRLEN]; char port[6]; switch (getnameinfo(ai.ai_addr, ai.ai_addrlen, host, sizeof(host), port, sizeof(port), NI_NUMERICHOST|NI_NUMERICSERV)) { case 0: if (atoi(port) < IPPORT_RESERVED) break; else return bind_rtld_next(sockfd, ai.ai_addr, ai.ai_addrlen); case default: errno = EINVAL; return -1; case EAI_MEMORY: errno = ENOMEM; return -1; case EAI_SYSTEM: return -1; } #endif } else if (ai.ai_family != AF_UNIX) return bind_rtld_next(sockfd, ai.ai_addr, ai.ai_addrlen); if (0 == geteuid() && 0 == bind_rtld_next(sockfd,ai.ai_addr,ai.ai_addrlen)) return 0; /*(fall through if bind() fails in case persistent reserved addr)*/ optlen = sizeof(ai.ai_socktype); if (-1 == getsockopt(sockfd,SOL_SOCKET,SO_TYPE,&ai.ai_socktype,&optlen)) return -1; #ifdef SO_PROTOCOL optlen = sizeof(ai.ai_protocol); if (-1 == getsockopt(sockfd,SOL_SOCKET,SO_PROTOCOL,&ai.ai_socktype,&optlen)) return -1; #else /* else pass ai_protocol == 0, which will typically work as expected (tcp) * since bsock calls getaddrinfo() and uses first entry returned */ #endif return bsock_bind_addrinfo(sockfd, &ai); }