static void citp_fdinfo_do_handover(citp_fdinfo* fdi, int fdt_locked)
{
  int rc;
  citp_fdinfo* epoll_fdi = NULL;
  int os_fd = fdi->fd;
#ifndef NDEBUG
  /* Yuk: does for UDP too. */
  volatile citp_fdinfo_p* p_fdip;
  p_fdip = &citp_fdtable.table[fdi->fd].fdip;
  ci_assert(fdip_is_busy(*p_fdip));
#endif

  Log_V(ci_log("%s: fd=%d nonb_switch=%d", __FUNCTION__, fdi->fd,
               fdi->on_rcz.handover_nonb_switch));

  if( fdi->epoll_fd >= 0 ) {
    epoll_fdi = citp_epoll_fdi_from_member(fdi, fdt_locked);
    if( epoll_fdi->protocol->type == CITP_EPOLLB_FD )
      citp_epollb_on_handover(epoll_fdi, fdi);
  }
  rc = fdtable_fd_move(fdi->fd, OO_IOC_TCP_HANDOVER);
  if( rc == -EBUSY && fdi->epoll_fd >= 0 ) {
    ci_assert(fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags &
              CI_SB_AFLAG_MOVED_AWAY);
    /* If this is our epoll, we can do full handover: we manually add os
     * fd into the epoll set.
     * Fixme: ensure we are not in _other_ epoll sets */
    ci_bit_clear(&fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags,
                 CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);
    rc = fdtable_fd_move(fdi->fd, OO_IOC_FILE_MOVED);
  }
  if( rc != 0 ) {
    citp_fdinfo* new_fdi;
    if( ! fdt_locked ) CITP_FDTABLE_LOCK();
    new_fdi = citp_fdtable_probe_locked(fdi->fd, CI_TRUE, CI_TRUE);
    citp_fdinfo_release_ref(new_fdi, 1);
    if( ! fdt_locked ) CITP_FDTABLE_UNLOCK();
    ci_assert_equal(citp_fdinfo_get_type(new_fdi), CITP_PASSTHROUGH_FD);
    os_fd = fdi_to_alien_fdi(new_fdi)->os_socket;
  }
  if( fdi->on_rcz.handover_nonb_switch >= 0 ) {
    int on_off = !! fdi->on_rcz.handover_nonb_switch;
    int rc = ci_sys_ioctl(os_fd, FIONBIO, &on_off);
    if( rc < 0 )
      Log_E(ci_log("%s: ioctl failed on_off=%d", __FUNCTION__, on_off));
  }
  if( rc != 0 )
    goto exit;
  citp_fdtable_busy_clear(fdi->fd, fdip_passthru, fdt_locked);
 exit:
  citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
  if( epoll_fdi != NULL && epoll_fdi->protocol->type == CITP_EPOLL_FD )
    citp_epoll_on_handover(epoll_fdi, fdi, fdt_locked);
  if( epoll_fdi != NULL )
    citp_fdinfo_release_ref(epoll_fdi, fdt_locked);
  citp_fdinfo_free(fdi);
}
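/* Editor's sketch (not original source): the handover path above replays the
 * socket's non-blocking state onto the OS fd with FIONBIO.  This is what that
 * amounts to at the plain sockets API level; the helper name is hypothetical.
 */
#if 0
#include <sys/ioctl.h>

static int set_nonblocking_example(int fd, int on)
{
  /* FIONBIO with a non-zero argument has the same effect as setting
   * O_NONBLOCK via fcntl(F_SETFL); the handover code uses the ioctl form
   * because it already has an on/off integer to hand. */
  return ioctl(fd, FIONBIO, &on);
}
#endif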
static int citp_udp_setsockopt(citp_fdinfo* fdinfo, int level, int optname,
                               const void* optval, socklen_t optlen)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  citp_socket* ep = &epi->sock;
  ci_sock_cmn* s = ep->s;
  int rc;

  Log_VSC(log("%s("EF_FMT", %d, %d)", __FUNCTION__,
              EF_PRI_ARGS(epi, fdinfo->fd), level, optname));

  rc = ci_udp_setsockopt(&epi->sock, fdinfo->fd, level, optname,
                         optval, optlen);
  Log_V(log(LPF "setsockopt: fd=%d rc=%d", fdinfo->fd, rc));

  if( rc == CI_SOCKET_HANDOVER ) {
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_setsockopt);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  if( ci_opt_is_setting_reuseport(level, optname, optval, optlen) != 0 &&
      ! CI_SOCK_NOT_BOUND(s) ) {
    ci_log("%s: setting reuseport after binding on udp not supported",
           __FUNCTION__);
    return -ENOSYS;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
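/* Editor's sketch (not original source): the check above rejects SO_REUSEPORT
 * once a UDP socket is already bound, so applications must set the option
 * first.  A minimal illustration using the plain sockets API; the function
 * name is hypothetical and error handling is abbreviated. */
#if 0
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static int bind_with_reuseport_example(uint16_t port)
{
  int one = 1;
  struct sockaddr_in sa;
  int fd = socket(AF_INET, SOCK_DGRAM, 0);

  /* SO_REUSEPORT must precede bind(); setting it afterwards is the case
   * citp_udp_setsockopt() fails with -ENOSYS. */
  setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

  memset(&sa, 0, sizeof(sa));
  sa.sin_family = AF_INET;
  sa.sin_port = htons(port);
  return bind(fd, (struct sockaddr*) &sa, sizeof(sa));
}
#endif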
static int citp_udp_sendmmsg(citp_fdinfo* fdinfo, struct mmsghdr* mmsg,
                             unsigned vlen, int flags)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;
  int i, rc;

  Log_V(log(LPF "sendmmsg(%d, msg, %u, %#x)", fdinfo->fd, vlen,
            (unsigned) flags));

  if( vlen == 0 )
    return 0;

  a.ep = &epi->sock;
  a.fd = fdinfo->fd;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);

  i = 0;
  do {
    rc = ci_udp_sendmsg(&a, &mmsg[i].msg_hdr, flags);
    if(CI_LIKELY( rc >= 0 ))
      mmsg[i].msg_len = rc;
    ++i;
  } while( rc >= 0 && i < vlen );

  return (rc >= 0) ? i : rc;
}
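/* Editor's sketch (not original source): how a caller batches datagrams for
 * sendmmsg().  The handler above applies ci_udp_sendmsg() to each msg_hdr in
 * turn and writes the per-message byte count back to msg_len, mirroring the
 * kernel contract.  Payload contents here are placeholders. */
#if 0
#define _GNU_SOURCE
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int sendmmsg_example(int fd)
{
  struct mmsghdr mmsg[2];
  struct iovec iov[2];
  static char p0[] = "first", p1[] = "second";

  memset(mmsg, 0, sizeof(mmsg));
  iov[0].iov_base = p0;  iov[0].iov_len = sizeof(p0);
  iov[1].iov_base = p1;  iov[1].iov_len = sizeof(p1);
  mmsg[0].msg_hdr.msg_iov = &iov[0];  mmsg[0].msg_hdr.msg_iovlen = 1;
  mmsg[1].msg_hdr.msg_iov = &iov[1];  mmsg[1].msg_hdr.msg_iovlen = 1;

  /* Returns the number of messages sent; on return mmsg[i].msg_len holds
   * the bytes transmitted for each message. */
  return sendmmsg(fd, mmsg, 2, 0);
}
#endif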
static int citp_udp_connect(citp_fdinfo* fdinfo, const struct sockaddr* sa,
                            socklen_t sa_len, citp_lib_context_t* lib_context)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log(LPF "connect(%d, sa, %d)", fdinfo->fd, sa_len));

  if( (epi->sock.s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY) != 0 ) {
    log("ERROR: connect of socket with SO_REUSEPORT not supported unless "
        "supported by the OS.");
    return -1;
  }

  ci_netif_lock_fdi(epi);
  rc = ci_udp_connect(&epi->sock, fdinfo->fd, sa, sa_len);
  ci_netif_unlock_fdi(epi);

  if( rc == CI_SOCKET_HANDOVER ) {
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_connect);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
static int citp_udp_getsockname(citp_fdinfo* fdinfo, struct sockaddr* sa,
                                socklen_t* p_sa_len)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);

  Log_VSC(log(LPF "getsockname("EF_FMT")", EF_PRI_ARGS(epi, fdinfo->fd)));

  __citp_getsockname(epi->sock.s, sa, p_sa_len);
  return 0;
}
static int citp_udp_bind(citp_fdinfo* fdinfo, const struct sockaddr* sa,
                         socklen_t sa_len)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  citp_socket* ep = &epi->sock;
  ci_sock_cmn* s = ep->s;
  int rc;

  Log_V(log(LPF "bind(%d, sa, %d)", fdinfo->fd, sa_len));

  ci_udp_handle_force_reuseport(fdinfo->fd, ep, sa, sa_len);

  if( (s->s_flags & CI_SOCK_FLAG_REUSEPORT) != 0 ) {
    if( (rc = ci_udp_reuseport_bind(ep, fdinfo->fd, sa, sa_len)) == 0 ) {
      /* The socket has moved, so we need to reprobe the fd.  This also
       * maps the new stack into the user space of the executing process. */
      fdinfo = citp_fdtable_lookup(fdinfo->fd);
      fdinfo = citp_reprobe_moved(fdinfo, CI_FALSE);
      epi = fdi_to_sock_fdi(fdinfo);
      ep = &epi->sock;
      ci_netif_cluster_prefault(ep->netif);
    }
    else {
      goto done;
    }
  }

  ci_netif_lock_fdi(epi);
  rc = ci_udp_bind(ep, fdinfo->fd, sa, sa_len);
  ci_netif_unlock_fdi(epi);

 done:
  if( rc == CI_SOCKET_HANDOVER ) {
    ci_assert_equal(s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY, 0);
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_bind);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
static int citp_udp_getpeername(citp_fdinfo* fdinfo, struct sockaddr* sa,
                                socklen_t* p_sa_len)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log("%s("EF_FMT")", __FUNCTION__, EF_PRI_ARGS(epi, fdinfo->fd)));

  ci_netif_lock_fdi(epi);
  rc = ci_udp_getpeername(&epi->sock, sa, p_sa_len);
  ci_netif_unlock_fdi(epi);
  return rc;
}
static int citp_udp_shutdown(citp_fdinfo* fdinfo, int how)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(ci_log("%s("EF_FMT", %d)", __FUNCTION__,
               EF_PRI_ARGS(epi, fdinfo->fd), how));

  ci_netif_lock_fdi(epi);
  rc = ci_udp_shutdown(&epi->sock, fdinfo->fd, how);
  ci_netif_unlock_fdi(epi);
  Log_V(log(LPF "shutdown: fd=%d rc=%d", fdinfo->fd, rc));
  return rc;
}
static int citp_udp_recv(citp_fdinfo* fdinfo, struct msghdr* msg, int flags)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;

  Log_V(log(LPF "recv(%d, msg, %#x)", fdinfo->fd, (unsigned) flags));

  a.fd = fdinfo->fd;
  a.ep = &epi->sock;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);
  return ci_udp_recvmsg(&a, msg, flags);
}
static int citp_udp_ioctl(citp_fdinfo* fdinfo, int request, void* arg)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log("%s("EF_FMT", %d, 0x%lx)", __FUNCTION__,
            EF_PRI_ARGS(epi, fdinfo->fd), request, (long) arg));

  rc = ci_udp_ioctl(&epi->sock, fdinfo->fd, request, arg);
  Log_V(log(LPF "ioctl()=%d", rc));
  if( rc < 0 )
    CI_SET_ERROR(rc, -rc);
  return rc;
}
static int citp_udp_getsockopt(citp_fdinfo* fdinfo, int level, int optname,
                               void* optval, socklen_t* optlen)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log("%s("EF_FMT", %d, %d)", __FUNCTION__,
            EF_PRI_ARGS(epi, fdinfo->fd), level, optname));

  ci_netif_lock_fdi(epi);
  rc = ci_udp_getsockopt(&epi->sock, fdinfo->fd, level, optname,
                         optval, optlen);
  ci_netif_unlock_fdi(epi);
  return rc;
}
static int citp_udp_recvmmsg(citp_fdinfo* fdinfo, struct mmsghdr* msg,
                             unsigned vlen, int flags,
                             const struct timespec* timeout)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;

  Log_V(log(LPF "recvmmsg(%d, msg, %u, %#x)", fdinfo->fd, vlen,
            (unsigned) flags));

  a.fd = fdinfo->fd;
  a.ep = &epi->sock;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);
  return ci_udp_recvmmsg(&a, msg, vlen, flags, timeout);
}
static int citp_udp_send(citp_fdinfo* fdinfo, const struct msghdr* msg,
                         int flags)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;
  int rc;

  ci_assert(msg != NULL);

  a.ep = &epi->sock;
  a.fd = fdinfo->fd;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);

  /* NB. msg_name[len] validated in ci_udp_sendmsg(). */
  if(CI_LIKELY( msg->msg_iov != NULL || msg->msg_iovlen == 0 )) {
    rc = ci_udp_sendmsg(&a, msg, flags);
  }
  else {
    rc = -1;
    errno = EFAULT;
  }
  return rc;
}
#if CI_CFG_USERSPACE_SELECT

static int citp_udp_select(citp_fdinfo* fdi, int* n, int rd, int wr, int ex,
                           struct oo_ul_select_state*__restrict__ ss)
{
  citp_sock_fdi* epi;
  ci_udp_state* us;
  unsigned mask;
  ci_netif* ni;

  epi = fdi_to_sock_fdi(fdi);
  us = SOCK_TO_UDP(epi->sock.s);
  ni = epi->sock.netif;

  citp_poll_if_needed(ni, ss->now_frc, ss->ul_select_spin);

  mask = ci_udp_poll_events(ni, us);
  if( rd && (mask & SELECT_RD_SET) ) {
    FD_SET(fdi->fd, ss->rdu);
    ++*n;
  }
  if( wr && (mask & SELECT_WR_SET) ) {
    FD_SET(fdi->fd, ss->wru);
    ++*n;
  }
  /* The original text was truncated here; the exception branch and return
   * below are reconstructed by analogy with the rd/wr cases above, so the
   * SELECT_EX_SET and ss->exu names are assumptions. */
  if( ex && (mask & SELECT_EX_SET) ) {
    FD_SET(fdi->fd, ss->exu);
    ++*n;
  }
  return 1;
}
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs,
                            int iovecs_len,
                            enum onload_zc_buffer_type_flags flags)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt *pkt;
  unsigned max_len;

  Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs,
                  iovecs_len, flags));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < iovecs_len; ++i ) {
        max_len = CI_CFG_PKT_BUF_SIZE;
        pkt = ci_netif_pkt_tx_tcp_alloc(ni);
        if( pkt == NULL ) {
          while( --i >= 0 )
            ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf);
          rc = -ENOMEM;
          ci_netif_unlock(ni);
          goto out;
        }
        /* Make sure this is clear as it affects behaviour when freeing */
        pkt->pf.udp.rx_flags = 0;
        iovecs[i].buf = (struct oo_zc_buf *)pkt;
        if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) {
          if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) &&
              (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) {
            ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s);
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) +
              ts->outgoing_hdrs_len;
            max_len = tcp_eff_mss(ts);
          }
          else {
            /* Best guess.  We can fix it up later.  Magic 12 leaves
             * space for time stamp option (common case) */
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) +
              sizeof(ci_tcp_hdr) + 12;
          }
        }
        else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) {
          oo_tx_pkt_layout_init(pkt);
          iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) +
            sizeof(ci_udp_hdr);
        }
        else
          iovecs[i].iov_base = PKT_START(pkt);
        iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE -
          ((char *)iovecs[i].iov_base - (char *)pkt);
        if( iovecs[i].iov_len > max_len )
          iovecs[i].iov_len = max_len;
      }
      ni->state->n_async_pkts += iovecs_len;
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    case CITP_PASSTHROUGH_FD:
      rc = -ESOCKTNOSUPPORT;
      break;
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__,
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  }
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

 out:
  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
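/* Editor's sketch (not original source): application-side use of the
 * zero-copy allocator above, assuming an Onload-accelerated UDP socket and
 * the declarations from onload/extensions_zc.h.  The function name is
 * hypothetical and error handling is abbreviated. */
#if 0
#include <onload/extensions_zc.h>

static int zc_alloc_example(int fd)
{
  struct onload_zc_iovec iov[4];
  int rc;

  /* Ask for four packet buffers laid out with room for UDP headers; on
   * success each iov[i].iov_base/iov_len describes writable payload space
   * and iov[i].buf is the opaque handle used to send or release it. */
  rc = onload_zc_alloc_buffers(fd, iov, 4, ONLOAD_ZC_BUFFER_HDR_UDP);
  if( rc < 0 )
    return rc;

  /* ... fill iov[i].iov_base with payload, then send or release ... */
  return 0;
}
#endif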
static void ufd_fmt(int fd, char* buf, int* buf_n, int buf_len)
{
  citp_fdinfo_p fdip;
  citp_fdinfo* fdi;
  char s[30];

  if( fd >= citp_fdtable.inited_count ) {
    bprintf("unknown");
    return;
  }

  fdip = citp_fdtable.table[fd].fdip;
  if( fdip_is_passthru(fdip) ) {
    bprintf("passthru");
    return;
  }
  else if( fdip_is_busy(fdip) ) {
    bprintf("busy");
    return;
  }
  else if( fdip_is_unknown(fdip) ) {
    bprintf("unknown");
    return;
  }

  fdi = fdip_to_fdi(fdip);
#if CI_CFG_FD_CACHING
  sprintf(s, "%s%s", fdi->is_special ? "Special" : "",
          fdi->can_cache ? "Cancache" : "");
#else
  sprintf(s, "%s", fdi->is_special ? "Special" : "");
#endif
  if( fdi == &citp_the_closed_fd ) {
    bprintf("closed_fd[%s]", s);
    return;
  }
  else if( fdi == &citp_the_reserved_fd ) {
    bprintf("reserved_fd[%s]", s);
    return;
  }

  if( fdi->protocol == &citp_tcp_protocol_impl ) {
    citp_sock_fdi* t = fdi_to_sock_fdi(fdi);
    if( citp_fdtable_dump_verbose )
      citp_waitable_dump(t->sock.netif, &t->sock.s->b, "");
    bprintf("tcp[%s]", s);
  }
  else if( fdi->protocol == &citp_udp_protocol_impl ) {
    citp_sock_fdi* u = fdi_to_sock_fdi(fdi);
    if( citp_fdtable_dump_verbose )
      citp_waitable_dump(u->sock.netif, &u->sock.s->b, "");
    bprintf("udp[%s]", s);
  }
#if CI_CFG_USERSPACE_EPOLL
  else if( fdi->protocol == &citp_epoll_protocol_impl ) {
    bprintf("epoll[%s]", s);
  }
#endif
  else {
    bprintf("bad[%s,%p] *****", s, fdi->protocol);
  }
}
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i, rx_pkt, released;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt* pkt;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i ) {
          pkt = (ci_ip_pkt_fmt*)bufs[i];
          /* If we are releasing a packet without the RX flag then the user
           * allocated and then freed the packet (without using it).  We
           * detect this in order to decrement n_async_pkts.  RX packets
           * (kept via ONLOAD_ZC_KEEP) are counted differently, so don't
           * decrement for them here (but do release them). */
          rx_pkt = pkt->flags & CI_PKT_FLAG_RX;
          released = ci_netif_pkt_release_check_keep(ni, pkt);
          if( ! rx_pkt ) {
            ci_assert(released == 1);
            (void) released;
            --ni->state->n_async_pkts;
          }
        }
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__,
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  }
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
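/* Editor's sketch (not original source): releasing buffers obtained from
 * onload_zc_alloc_buffers() when they turn out not to be needed.  Assumes
 * onload/extensions_zc.h; the function name is hypothetical and error
 * handling is abbreviated. */
#if 0
#include <onload/extensions_zc.h>

static int zc_alloc_then_release_example(int fd)
{
  struct onload_zc_iovec iov[4];
  onload_zc_handle handles[4];
  int i, rc;

  rc = onload_zc_alloc_buffers(fd, iov, 4, ONLOAD_ZC_BUFFER_HDR_UDP);
  if( rc < 0 )
    return rc;

  /* Unused buffers must be handed back, or they leak from the stack's
   * packet pool (the n_async_pkts accounting above tracks this). */
  for( i = 0; i < 4; ++i )
    handles[i] = iov[i].buf;
  return onload_zc_release_buffers(fd, handles, 4);
}
#endif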
static int citp_udp_fcntl(citp_fdinfo* fdinfo, int cmd, long arg)
{
  return citp_sock_fcntl(fdi_to_sock_fdi(fdinfo), fdinfo->fd, cmd, arg);
}