/*
 * Demo: pass open file descriptors from a forked child to its parent over
 * an AF_UNIX stream socket pair.
 *
 * The child opens three files, seeks each one to a distinct offset and
 * hands all three descriptors to the parent with send_fds().  The parent
 * collects them with recv_fds() and prints every descriptor number together
 * with its current file position, which shows that the file offset travels
 * with the descriptor.
 *
 * Each process closes the end of the socket pair it does not use.  Without
 * that, the parent itself keeps the child's end alive and would never see
 * end-of-file on its own end if the child died before sending anything.
 *
 * Returns EXIT_SUCCESS in both processes.
 */
int main()
{
	int pid;
	int spair[2];

	sys_chk(socketpair(AF_UNIX, SOCK_STREAM, 0, spair));
	sys_chk(pid = fork());

	if (pid == 0) {
		// Child.
		int fds[3];

		/* The child only ever writes to spair[0]. */
		sys_chk(close(spair[1]));

		sys_chk(fds[0] = open("/etc/passwd", O_RDONLY));
		sys_chk(lseek(fds[0], 5, SEEK_SET));
		sys_chk(fds[1] = open("/etc/group", O_RDONLY));
		sys_chk(lseek(fds[1], 10, SEEK_SET));
		sys_chk(fds[2] = open("/bin/sh", O_RDONLY));
		sys_chk(lseek(fds[2], 20, SEEK_SET));

		send_fds(spair[0], fds, 3);
		return EXIT_SUCCESS;
	}

	// Parent.
	int *received;
	int received_len;

	/* The parent only ever reads from spair[1]. */
	sys_chk(close(spair[0]));

	recv_fds(spair[1], &received, &received_len);

	printf("parent: got fds:");
	for (int i = 0; i < received_len; ++i) {
		printf(" %d (fp=%lld)", received[i],
		       (long long)lseek(received[i], 0, SEEK_CUR));
	}
	printf("\n");

	return EXIT_SUCCESS;
}
/*
 * Serialize a CriuResp message and send it over socket_fd.
 *
 * When fd is non-negative the packed message is sent through send_fds()
 * together with that descriptor; otherwise the packed bytes are written
 * to the socket with write().
 *
 * Returns 0 on success, -ENOMEM if the packing buffer cannot be allocated,
 * and -1 if packing or sending fails.
 */
static int send_criu_msg_with_fd(int socket_fd, CriuResp *msg, int fd)
{
	int status = -1;
	int sent;
	int packed_len;
	unsigned char *packed;

	packed_len = criu_resp__get_packed_size(msg);
	packed = xmalloc(packed_len);
	if (!packed)
		return -ENOMEM;

	if (criu_resp__pack(msg, packed) != packed_len) {
		pr_perror("Failed packing response");
		goto out;
	}

	if (fd < 0)
		sent = write(socket_fd, packed, packed_len);
	else
		sent = send_fds(socket_fd, NULL, 0, &fd, 1, packed, packed_len);

	if (sent < 0) {
		pr_perror("Can't send response");
		goto out;
	}

	status = 0;
out:
	/* Single release point for the packing buffer on every path. */
	xfree(packed);
	return status;
}
/*
 * Send the descriptors listed in args (args->fds, args->nr_fds entries)
 * over tsock via send_fds().
 *
 * Returns whatever send_fds() returned; a non-zero result is logged.
 */
static int drain_fds(struct parasite_drain_fd *args)
{
	const int rc = send_fds(tsock, NULL, 0, args->fds, args->nr_fds, true);

	if (rc != 0)
		pr_err("send_fds failed\n");

	return rc;
}
/*
 * Handle a sendmsg() request on a UNIX domain socket.
 *
 * The msg_control structure is copied in from the caller through the grant
 * in dev_m_in, the destination socket is located, the control data is read
 * into the destination's ancillary_data and the descriptors are then handed
 * over with send_fds().
 *
 * Returns:
 *   EIO           - the msg_control structure could not be copied in
 *   EDESTADDRREQ  - datagram socket without a usable AF_UNIX target
 *   ENOENT        - no datagram socket is bound to the target address
 *   ENOTCONN      - non-datagram socket that has no peer
 *   otherwise the result of msg_control_read() (if not OK) or send_fds().
 */
int do_sendmsg(message *dev_m_in, message *dev_m_out)
{
	int self, dest, status, slot;
	struct msg_control ctl;

#if DEBUG == 1
	static int call_count = 0;
	printf("(uds) [%d] do_sendmsg() call_count=%d\n",
		uds_minor(dev_m_in), ++call_count);
#endif

	self = uds_minor(dev_m_in);

	memset(&ctl, '\0', sizeof(struct msg_control));

	status = sys_safecopyfrom(VFS_PROC_NR,
		(cp_grant_id_t) dev_m_in->IO_GRANT, (vir_bytes) 0,
		(vir_bytes) &ctl, sizeof(struct msg_control));
	if (status != OK) {
		return EIO;
	}

	/* locate peer */
	if (uds_fd_table[self].type != SOCK_DGRAM) {
		/* the socket already knows its peer (or has none) */
		dest = uds_fd_table[self].peer;
		if (dest == -1) {
			return ENOTCONN;
		}
	} else {
		/* a datagram socket needs a non-empty AF_UNIX target */
		if (!(uds_fd_table[self].target.sun_path[0] != '\0' &&
			uds_fd_table[self].target.sun_family == AF_UNIX)) {
			return EDESTADDRREQ;
		}

		/* first SOCK_DGRAM socket bound on the target address wins */
		dest = -1;
		for (slot = 0; dest == -1 && slot < NR_FDS; slot++) {
			if (uds_fd_table[slot].type != SOCK_DGRAM) {
				continue;
			}
			if (uds_fd_table[slot].addr.sun_family != AF_UNIX) {
				continue;
			}
			if (strncmp(uds_fd_table[self].target.sun_path,
				uds_fd_table[slot].addr.sun_path,
				UNIX_PATH_MAX) != 0) {
				continue;
			}
			dest = slot;
		}

		if (dest == -1) {
			return ENOENT;
		}
	}

#if DEBUG == 1
	printf("(uds) [%d] sendmsg() -- peer=%d\n", self, dest);
#endif

	/* The destination may still hold descriptors from an earlier
	 * sendmsg() that it has not collected with recvmsg() yet. That is
	 * fine: the receiver will get those plus the new ones.
	 */
	status = msg_control_read(&ctl, &uds_fd_table[dest].ancillary_data,
		self);
	if (status != OK) {
		return status;
	}

	return send_fds(self, &uds_fd_table[dest].ancillary_data);
}
/*
 * Entry point of the socket proxy server.
 *
 * Command line: [-p port] [-u socket-path]; -h prints usage and exits.
 * The TCP listen port defaults to 9134 when -p is not given.
 *
 * Outline of what the code below does:
 *
 *  1. Control socket (only with -u): a PF_UNIX/SOCK_SEQPACKET socket is
 *     created and connect()ed to socket-path.
 *       - connect fails with ECONNREFUSED or ENOENT: this process becomes
 *         the listener on that path (on ECONNREFUSED the path is unlinked
 *         first) and sets ctrl_socket_mode_listen.
 *       - connect succeeds: "unlisten" is sent to the instance on the other
 *         side, a reply starting with "unlistening" is awaited with a
 *         blocking recv(), and the connection is kept as ctrl_socket_conn.
 *  2. Listening sockets: one TCP socket per getaddrinfo(AI_PASSIVE) result
 *     is created, bound and listen()ed on (backlog 50), with IPV6_V6ONLY on
 *     AF_INET6 sockets and SO_REUSEADDR on all of them. Each is stored in
 *     server_sockets with a printable "addr:port" label in its buf and
 *     registered via poll_in().
 *  3. Event loop, running while total_sockets is non-zero, using
 *     epoll_wait() with a 1000 ms timeout:
 *       - got_sigusr1 set: all listening sockets are closed.
 *       - roughly every 5 seconds a status line is written to stderr.
 *       - event on ctrl_socket: one control client is accepted and the
 *         listening control socket is removed from epoll.
 *       - event on ctrl_socket_conn while in listen mode: on "unlisten" the
 *         listening sockets are closed, "unlistening" is written back, the
 *         identified peer sockets are handed over with send_fds() and
 *         decay_mode is switched on.
 *       - event on ctrl_socket_conn otherwise: recvmsg() is used; a "desc"
 *         message carries SCM_RIGHTS descriptors plus one int uid each, and
 *         every descriptor is registered as a new fd_ctx; an "exit" message
 *         closes the connection and makes this process listen on the
 *         control path itself.
 *       - event on a TCP listening socket: the connection is accepted and a
 *         new fd_ctx is registered.
 *       - event on any other fd_ctx: in decay_mode with an empty buffer the
 *         descriptor is handed over with send_fd(); otherwise data is read
 *         and parsed as messages with a 4-byte size header. The first
 *         complete message of an unidentified connection sets its faf_uid;
 *         later messages are forwarded to the peer selected by destuid with
 *         a rewritten header, unless decay_mode is on, in which case they
 *         are dropped. Unconsumed bytes are moved to the buffer start.
 *  4. After the loop: in decay_mode with a control path, the control socket
 *     is closed, the path is unlinked and "exit" is written to the control
 *     connection. The process then exits with status 0.
 *
 * total_sockets, total_connections, got_sigusr1 and the helpers poll_in(),
 * ctrl_socket_listen(), get_ip_str(), send_fds() and sigusr1() are not
 * defined inside this function.
 *
 * NOTE(review): items that look wrong or fragile and should be confirmed:
 *  - strncpy() into sun.sun_path with sizeof(sun.sun_path) does not
 *    guarantee NUL termination, and sun is not zeroed beforehand.
 *  - hints is passed to getaddrinfo() with only four fields assigned; the
 *    remaining fields are uninitialized.
 *  - the result of the blocking recv() is not checked before buf is
 *    compared and before n is used as the fwrite() size.
 *  - in the "desc" loop, when epoll_ctl() fails cp is deleted and then
 *    cp->faf_uid is still read and cp may be inserted into peer_sockets.
 *  - CMSG_FIRSTHDR() is dereferenced without a NULL check.
 *  - bufp falls back to control.value, not buf.value - possibly a typo.
 *  - the status line uses total_connections while the loop counts
 *    total_sockets, and prints size()-based values with %d.
 *  - h->size is read before the "buf_len < 4" check, and the "message to
 *    big" test compares buf_len rather than in_msg_size.
 *  - server_sockets is not cleared after its sockets are closed on
 *    SIGUSR1, so a later "unlisten" would close and count them again.
 */
int main(int argc, char ** argv) { int listen_port = -1; char listen_port_str[8]; const char * ctrl_socket_path = NULL; { int opt; while ((opt = getopt(argc, argv, "p:hu:")) != EOF) { switch (opt) { case 'p' : listen_port = atoi(optarg); break; case 'h' : fprintf(stderr, "%s [-p port] [-u socket-path]\n", argv[0]); fprintf(stderr, "default: -p 9134\n"); exit(0); case 'u' : ctrl_socket_path = optarg; break; } } argc -= optind; argv += optind; } if (listen_port == -1) { listen_port = 9134; } sprintf(listen_port_str, "%d", listen_port); typedef std::vector<fd_ctx> server_sockets_t; server_sockets_t server_sockets; peer_sockets_t peer_sockets; fd_ctx ctrl_socket, ctrl_socket_conn; bool ctrl_socket_mode_listen = false; bool decay_mode = false; ctrl_socket.fd = -1; ctrl_socket_conn.fd = -1; int sockets_inherited = 0; int epoll = epoll_create(1024); if (epoll < 0) { VPERROR("epoll_create"); exit(1); } if (ctrl_socket_path) { int s = socket(PF_UNIX, SOCK_SEQPACKET, 0); if (s < 0) { VPERROR("socket(AF_UNIX)"); exit(1); } struct sockaddr_un sun; sun.sun_family = AF_UNIX; strncpy(sun.sun_path, ctrl_socket_path, sizeof(sun.sun_path)); if (connect(s, (sockaddr *) &sun, sizeof(sun))) { if (errno == ECONNREFUSED || errno == ENOENT) { if (errno == ECONNREFUSED) { if (unlink(ctrl_socket_path) < 0) { fprintf(stderr, "unlink(%s): %s\n", ctrl_socket_path, strerror(errno)); exit(1); } } ctrl_socket_listen(s, ctrl_socket_path); ctrl_socket.fd = s; poll_in(epoll, &ctrl_socket); ctrl_socket_mode_listen = true; } else { fprintf(stderr, "connect(%s): %s\n", ctrl_socket_path, strerror(errno)); } } else { char buf[16]; ssize_t n = send(s, "unlisten", sizeof("unlisten") - 1, 0); if (n < 0) { VPERROR("sendmsg"); exit(1); } else if (n == 0) { fprintf(stderr, "unexpected EOF\n"); exit(1); } // blocking read n = recv(s, buf, sizeof(buf), 0); if (strncmp(buf, "unlistening", strlen("unlistening")) != 0) { fprintf(stderr, "running server reported: "); fwrite(buf, n, 1, stderr); exit(1); } 
/* End of control-socket setup; then creation of the TCP listening sockets,
 * signal setup and the top of the event loop (SIGUSR1 handling). */
ctrl_socket_conn.fd = s; poll_in(epoll, &ctrl_socket_conn); } } { struct addrinfo hints, * ai_res; hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; hints.ai_protocol = IPPROTO_TCP; hints.ai_flags = AI_PASSIVE; int r = getaddrinfo(NULL, listen_port_str, &hints, &ai_res); if (r) { fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(r)); exit(1); } for (struct addrinfo * ai = ai_res; ai; ai = ai->ai_next) { int s = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); if (s < 0) { VPERROR("socket"); exit(1); } if (ai->ai_family == AF_INET6) { int on = 1; if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&on, sizeof(on)) == -1) { VPERROR("setsockopt(IPV6_ONLY)"); exit(1); } } { int on = 1; if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *) &on, sizeof(on)) == -1) { VPERROR("setsockopt(REUSEADDR)"); exit(1); } } if (bind(s, ai->ai_addr, ai->ai_addrlen) < 0) { VPERROR("bind"); exit(1); } if (listen(s, 50) < 0) { VPERROR("listen"); exit(1); } fd_ctx c; c.fd = s; c.is_server = true; c.protocol = ai->ai_protocol; char * strp = c.buf; int slen = sizeof(c.buf); if (ai->ai_family == AF_INET6) { *strp++ = '['; slen -= 2; } get_ip_str(ai->ai_addr, strp, slen); if (ai->ai_family == AF_INET6) { strcat(c.buf, "]"); } sprintf(c.buf + strlen(c.buf), ":%d", listen_port); server_sockets.push_back(c); } freeaddrinfo(ai_res); } for (int i = 0; i < server_sockets.size(); ++i) { poll_in(epoll, &server_sockets[i]); } epoll_event epoll_events[32]; const int epoll_max_events = 32; fd_ctx fd_ctx_finder; signal(SIGUSR1, sigusr1); signal(SIGPIPE, SIG_IGN); total_sockets = server_sockets.size(); time_t status_time = time(NULL); while (total_sockets) { if (unlikely(got_sigusr1)) { // close listening sockets for (int i = 0; i < server_sockets.size(); ++i) { fprintf(stderr, "close server %s\n", server_sockets[i].buf); if (epoll_ctl(epoll, EPOLL_CTL_DEL, server_sockets[i].fd, NULL) < 0) { VPERROR("epoll_ctl"); } close(server_sockets[i].fd); --total_sockets; } got_sigusr1 = false; } 
/* Periodic status output, epoll_wait(), and handling of events on the
 * control listening socket and on the control connection in listen mode. */
if (unlikely(status_time + 5 < time(NULL))) { fprintf(stderr, "%d connections, %d identified peers\n", total_connections - server_sockets.size(), peer_sockets.size()); status_time = time(NULL); } int ep_num = epoll_wait(epoll, epoll_events, epoll_max_events, 1000); if (unlikely(ep_num < 0)) { if (errno == EINTR) continue; VPERROR("epoll_wait"); continue; } bool epoll_restart = false; for (int epi = 0; epi < ep_num && ! epoll_restart; ++epi) { fd_ctx * ctxp = (fd_ctx *) epoll_events[epi].data.ptr; if (unlikely(ctxp == &ctrl_socket)) { sockaddr_storage ss; socklen_t sl = sizeof(ss); int nsock = accept(ctxp->fd, (sockaddr *) &ss, &sl); if (nsock < 0) { VPERROR("accept"); continue; } epoll_event ev; ev.events = EPOLLIN; ev.data.ptr = (void *) &ctrl_socket_conn; if (epoll_ctl(epoll, EPOLL_CTL_ADD, nsock, &ev) < 0) { VPERROR("epoll_ctl"); close(nsock); continue; } // we only ever accept one ctrl client if (epoll_ctl(epoll, EPOLL_CTL_DEL, ctrl_socket.fd, NULL) < 0) { VPERROR("epoll_ctl"); close(nsock); continue; } ctrl_socket_conn.fd = nsock; } else if (unlikely(ctxp == &ctrl_socket_conn)) { if (ctrl_socket_mode_listen) { char buf[1024]; int n = read(ctxp->fd, buf, sizeof(buf)); if (n < 0) { if (errno == EINTR || errno == EAGAIN) continue; VPERROR("read"); close(ctxp->fd); poll_in(epoll, &ctrl_socket); } else if (n == 0) { close(ctxp->fd); poll_in(epoll, &ctrl_socket); } else { if (strncmp(buf, "unlisten", sizeof("unlisten") - 1) == 0) { for (int i = 0; i < server_sockets.size(); ++i) { fprintf(stderr, "close server %s\n", server_sockets[i].buf); if (epoll_ctl(epoll, EPOLL_CTL_DEL, server_sockets[i].fd, NULL) < 0) { VPERROR("epoll_ctl"); } close(server_sockets[i].fd); --total_sockets; } if (write(ctrl_socket_conn.fd, "unlistening", sizeof("unlistening") - 1) < 0) { VPERROR("write"); } else { int nsent = 0; do { nsent = send_fds(ctrl_socket_conn.fd, epoll, peer_sockets.begin(), peer_sockets.end(), &peer_sockets); if (nsent) { fprintf(stderr, "bulk send: %d\n", nsent); } } 
/* Tail of the bulk hand-over loop; then the control-connection path taken
 * when this process is not in listen mode: recvmsg() of "desc" and "exit"
 * messages. */
while (nsent && ! peer_sockets.empty()); epoll_restart = true; decay_mode = true; } } } } else { msghdr msg; iovec iov; optional_buf<MAX_CONTROL_MESSAGE_CONTROL_SIZE, (MAX_CONTROL_MESSAGE_TOTAL_SIZE > FDCTX_BUFFER_SIZE)> control; char * controlp = control.placeholder ? ctxp->buf + MAX_CONTROL_MESSAGE_SIZE : control.value; optional_buf<MAX_CONTROL_MESSAGE_SIZE, (MAX_CONTROL_MESSAGE_SIZE > FDCTX_BUFFER_SIZE)> buf; char * bufp = buf.placeholder ? ctxp->buf : control.value; iov.iov_base = bufp; iov.iov_len = MAX_CONTROL_MESSAGE_SIZE; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = (void *) controlp; msg.msg_controllen = MAX_CONTROL_MESSAGE_CONTROL_SIZE; msg.msg_flags = 0; int n = recvmsg(ctxp->fd, &msg, 0); if (n < 0) { VPERROR("recvmsg"); } else if (n == 0) { fprintf(stderr, "unexpected close\n"); close(ctxp->fd); } else { if (strncmp((const char *) iov.iov_base, "desc", std::min(4, n)) == 0) { cmsghdr * cmp = CMSG_FIRSTHDR(&msg); if (cmp->cmsg_level != SOL_SOCKET || cmp->cmsg_type != SCM_RIGHTS) { fprintf(stderr, "malformed control message: wrong type\n"); exit(1); } int * uidp = (int *) ((char *) iov.iov_base + 4); int * uidpend = (int *) ((char *) iov.iov_base + n); int fd_count = 0; for (; uidp < uidpend; ++uidp, ++fd_count) { int fd = * ((int *) CMSG_DATA(cmp) + fd_count); ++sockets_inherited; ++total_sockets; fd_ctx * cp = new fd_ctx; cp->fd = fd; cp->faf_uid = *uidp; cp->is_server = false; cp->protocol = IPPROTO_TCP; cp->buf_len = 0; epoll_event ev; ev.events = EPOLLIN; ev.data.ptr = (void *) cp; if (epoll_ctl(epoll, EPOLL_CTL_ADD, cp->fd, &ev) < 0) { VPERROR("epoll_ctl"); --total_sockets; close(cp->fd); delete cp; } if (cp->faf_uid != -1) { peer_sockets.insert(cp); } } } else if (strncmp((const char *) iov.iov_base, "exit", std::min(4, n)) == 0) { close(ctxp->fd); int s = socket(PF_UNIX, SOCK_SEQPACKET, 0); if (s < 0) { VPERROR("socket(PF_UNIX)"); } else { ctrl_socket_listen(s, ctrl_socket_path); 
/* End of "exit" handling; then the accept path for TCP listening sockets
 * and the start of data handling for ordinary connections. */
ctrl_socket.fd = s; poll_in(epoll, &ctrl_socket); ctrl_socket_mode_listen = true; } fprintf(stderr, "%d sockets inherited from the dead\n", sockets_inherited); } } } } else if (unlikely(ctxp->is_server && ctxp->protocol == IPPROTO_TCP)) { sockaddr_storage saddr; socklen_t saddrlen = sizeof(saddr); int nsock = accept(ctxp->fd, (sockaddr *) &saddr, &saddrlen); if (nsock < 0) { VPERROR("accept"); } else { ++total_sockets; fd_ctx * cp = new fd_ctx; cp->fd = nsock; cp->faf_uid = -1; cp->is_server = false; cp->protocol = IPPROTO_TCP; cp->buf_len = 0; epoll_event ev; ev.events = EPOLLIN; ev.data.ptr = (void *) cp; if (epoll_ctl(epoll, EPOLL_CTL_ADD, nsock, &ev) < 0) { VPERROR("epoll_ctl"); --total_sockets; close(nsock); delete cp; } } } else { if (unlikely(decay_mode && ctxp->buf_len == 0)) { fprintf(stderr, "single send\n"); send_fd(ctrl_socket_conn.fd, epoll, ctxp); if (ctxp->faf_uid != -1) { peer_sockets.erase(ctxp); } continue; // -> next epoll result } int n = read(ctxp->fd, ctxp->buf + ctxp->buf_len, PEER_CTX_BUF_SIZE - ctxp->buf_len); if (unlikely(n < 0)) { if (errno != ECONNRESET && errno != EAGAIN && errno != EINTR) { VPERROR("read"); } continue; } else if (unlikely(n == 0)) { close(ctxp->fd); --total_sockets; if (ctxp->faf_uid != -1) { peer_sockets.erase(ctxp); } ctxp->remove_myself_from_peer_caches(); --ctxp->refcount; if (ctxp->refcount == 0) { delete ctxp; } else { ctxp->faf_uid = -1; } } else { ctxp->buf_len += n; char * buf_head = ctxp->buf; bool postprocess = true; while (buf_head < ctxp->buf + ctxp->buf_len) { proxy_msg_header * h = (proxy_msg_header *) buf_head; const int buf_len = ctxp->buf + ctxp->buf_len - buf_head; const int in_msg_size = ntohl(h->size); if (buf_len < 4) { break; } if (unlikely(buf_len > PEER_CTX_BUF_SIZE)) { // message to big if (epoll_ctl(epoll, EPOLL_CTL_DEL, ctxp->fd, NULL) < 0) { VPERROR("epoll_ctl"); } close(ctxp->fd); --total_sockets; if (ctxp->faf_uid != -1) { peer_sockets.erase(ctxp); } 
/* Rest of the per-message loop: uid assignment for a connection's first
 * message, forwarding to the destination peer, and buffer compaction. */
ctxp->remove_myself_from_peer_caches(); --ctxp->refcount; if (ctxp->refcount == 0) { delete ctxp; } else { ctxp->faf_uid = -1; } postprocess = false; break; } if (in_msg_size + 4 > buf_len) { break; } if (unlikely(ctxp->faf_uid == -1)) { proxy_msg_header_set_uid * hu = (proxy_msg_header_set_uid *) h; ctxp->faf_uid = ntohs(hu->uid); peer_sockets.insert(ctxp); buf_head += in_msg_size + 4; continue; // -> next message from this fd_ctx } // in decay mode we always drop, because we expect our // caches and refcounts to be inconsistent // we can decay without bookkeeping if we never send any packets // out (== we never expect a context to exists unless epoll still // knows about it) if (! decay_mode) { int uid = ntohs(h->destuid); fd_ctx * peer = ctxp->peers.find(uid); if (unlikely(! peer)) { fd_ctx_finder.faf_uid = uid; peer_sockets_t::iterator iter = peer_sockets.find(&fd_ctx_finder); if (iter != peer_sockets.end()) { peer = *iter; ctxp->peers.add(peer); } else { buf_head += in_msg_size + 4; continue; } } int in_port = ntohs(h->port); proxy_msg_header_to_peer * hout = (proxy_msg_header_to_peer *) (buf_head + OUT_HEADER_OFFSET_ADJ); hout->port = htons(in_port); const int out_size = in_msg_size - OUT_HEADER_OFFSET_ADJ; hout->size = htonl(out_size); { int n = write(peer->fd, (char *) hout, out_size + 4); if (unlikely(n < 0)) { if (errno != ECONNRESET && errno != EPIPE) { VPERROR("write"); } } else if (unlikely(n != out_size + 4)) { fprintf(stderr, "short write (%d of %d\n", n, out_size + 4); } } } buf_head += in_msg_size + 4; } if (likely(postprocess)) { int new_buflen = ctxp->buf + ctxp->buf_len - buf_head; if (unlikely(new_buflen && ctxp->buf != buf_head)) { for (char * p = ctxp->buf; buf_head < ctxp->buf + ctxp->buf_len; ++p, ++buf_head) { *p = *buf_head; } } ctxp->buf_len = new_buflen; } // we want to get rid of clients as soon as possible and // dont wait for them to send the next message to trigger it if (unlikely(decay_mode && ctxp->buf_len == 0)) { 
/* Hand-over of a drained connection in decay mode, end of the event loop,
 * and shutdown. */
send_fd(ctrl_socket_conn.fd, epoll, ctxp); if (ctxp->faf_uid != -1) { peer_sockets.erase(ctxp); } } } } } } if (decay_mode && ctrl_socket_path) { close(ctrl_socket.fd); unlink(ctrl_socket_path); if (write(ctrl_socket_conn.fd, "exit", strlen("exit")) < 0) { VPERROR("send"); } } fprintf(stderr, "exit due to %d sockets left to serve\n", total_sockets); exit(0); }
int send_fd(int ctrlsock, int epoll, fd_ctx * ctxp) { return send_fds(ctrlsock, epoll, &ctxp, &ctxp + 1, (dummy_erase_container<fd_ctx *> *) NULL); }