static void citp_fdinfo_do_handover(citp_fdinfo* fdi, int fdt_locked)
{
  int rc;
  citp_fdinfo* epoll_fdi = NULL;
  int os_fd = fdi->fd;
#ifndef NDEBUG
  /* Yuk: does for UDP too. */
  volatile citp_fdinfo_p* p_fdip;
  p_fdip = &citp_fdtable.table[fdi->fd].fdip;
  ci_assert(fdip_is_busy(*p_fdip));
#endif

  Log_V(ci_log("%s: fd=%d nonb_switch=%d", __FUNCTION__, fdi->fd,
               fdi->on_rcz.handover_nonb_switch));

  if( fdi->epoll_fd >= 0 ) {
    epoll_fdi = citp_epoll_fdi_from_member(fdi, fdt_locked);
    if( epoll_fdi->protocol->type == CITP_EPOLLB_FD )
      citp_epollb_on_handover(epoll_fdi, fdi);
  }
  rc = fdtable_fd_move(fdi->fd, OO_IOC_TCP_HANDOVER);
  if( rc == -EBUSY && fdi->epoll_fd >= 0 ) {
    ci_assert(fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags &
              CI_SB_AFLAG_MOVED_AWAY);
    /* If this is our epoll, we can do full handover: we manually add os
     * fd into the epoll set.
     * Fixme: ensure we are not in _other_ epoll sets */
    ci_bit_clear(&fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags,
                 CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);
    rc = fdtable_fd_move(fdi->fd, OO_IOC_FILE_MOVED);
  }
  if( rc != 0 ) {
    citp_fdinfo* new_fdi;
    if( ! fdt_locked ) CITP_FDTABLE_LOCK();
    new_fdi = citp_fdtable_probe_locked(fdi->fd, CI_TRUE, CI_TRUE);
    citp_fdinfo_release_ref(new_fdi, 1);
    if( ! fdt_locked ) CITP_FDTABLE_UNLOCK();
    ci_assert_equal(citp_fdinfo_get_type(new_fdi), CITP_PASSTHROUGH_FD);
    os_fd = fdi_to_alien_fdi(new_fdi)->os_socket;
  }
  if( fdi->on_rcz.handover_nonb_switch >= 0 ) {
    int on_off = !! fdi->on_rcz.handover_nonb_switch;
    int rc = ci_sys_ioctl(os_fd, FIONBIO, &on_off);
    if( rc < 0 )
      Log_E(ci_log("%s: ioctl failed on_off=%d", __FUNCTION__, on_off));
  }
  if( rc != 0 )
    goto exit;
  citp_fdtable_busy_clear(fdi->fd, fdip_passthru, fdt_locked);
 exit:
  citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
  if( epoll_fdi != NULL && epoll_fdi->protocol->type == CITP_EPOLL_FD )
    citp_epoll_on_handover(epoll_fdi, fdi, fdt_locked);
  if( epoll_fdi != NULL )
    citp_fdinfo_release_ref(epoll_fdi, fdt_locked);
  citp_fdinfo_free(fdi);
}
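/* Illustrative sketch (hypothetical helper, not from the Onload sources):
 * the FIONBIO step above re-applies the application's non-blocking mode to
 * the OS fd after handover.  From user space the same state can be set with
 * plain ioctl()/fcntl(), since FIONBIO is equivalent to toggling O_NONBLOCK. */
#include <sys/ioctl.h>
#include <fcntl.h>

static int example_set_nonblock(int fd, int on)
{
  /* ioctl(FIONBIO) toggles non-blocking mode directly. */
  if( ioctl(fd, FIONBIO, &on) == 0 )
    return 0;
  /* Fall back to fcntl() for fds whose driver lacks FIONBIO support. */
  int flags = fcntl(fd, F_GETFL);
  if( flags < 0 )
    return -1;
  return fcntl(fd, F_SETFL, on ? (flags | O_NONBLOCK)
                               : (flags & ~O_NONBLOCK));
}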
static int ci_udp_ioctl_slow(ci_netif* ni, ci_udp_state* us, ci_fd_t fd,
                             int request, void* arg)
{
  int os_rc, rc = 0;

  /* Keep the O/S socket in sync.  Also checks that this is a valid ioctl()
   * for a UDP socket on this kernel. */
  if( request != FIOASYNC &&
      (os_rc = oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL)) < 0 )
    return os_rc;

  switch( request ) {
  case FIONBIO:
    /* Set non-blocking (*arg != 0) or blocking (*arg == 0) I/O.
     * Want this to stay efficient, so we don't do the extra call to the
     * common ioctl handler. */
    CI_CMN_IOCTL_FIONBIO(&us->s, arg);
    break;

  case FIOASYNC:
    /* Need to apply this to [fd] so that our fasync file-op will be
     * invoked. */
    rc = ci_sys_ioctl(fd, request, arg);
    if( rc < 0 ) {
      /* This is very unexpected, as it worked on the OS socket. */
      LOG_E(ci_log("%s: ERROR: FIOASYNC failed on fd=%d rc=%d errno=%d",
                   __FUNCTION__, fd, rc, errno));
      rc = -errno;
    }
    break;

  case SIOCSPGRP:
    /* Need to apply this to [fd] to get signal delivery to work.  However,
     * SIOCSPGRP is only supported on sockets, so we need to convert to
     * fcntl(). */
    rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg));
    if( rc < 0 )
      /* This is very unexpected, as it worked on the OS socket. */
      LOG_E(ci_log("%s: ERROR: fcntl(F_SETOWN) failed on fd=%d rc=%d "
                   "errno=%d", __FUNCTION__, fd, rc, errno));
    rc = ci_cmn_ioctl(ni, &us->s, request, arg, os_rc, 1);
    break;

  default:
    rc = ci_cmn_ioctl(ni, &us->s, request, arg, os_rc, 1);
  }

  return rc;
}
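/* Illustrative sketch (hypothetical helper, not from the Onload sources):
 * the SIOCSPGRP conversion above mirrors what an application can do itself,
 * since SIOCSPGRP on a socket and fcntl(F_SETOWN) are two routes to the same
 * setting: which process receives SIGIO/SIGURG for the fd. */
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

static int example_set_sigio_owner(int sock_fd)
{
  int owner = getpid();
  /* Either call directs SIGIO/SIGURG for sock_fd to this process. */
  if( ioctl(sock_fd, SIOCSPGRP, &owner) == 0 )
    return 0;
  return fcntl(sock_fd, F_SETOWN, owner);
}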
int oo_version_check_ul(ci_fd_t fd)
{
  int rc;
  oo_version_check_t vc;

  strncpy(vc.in_version, ONLOAD_VERSION, sizeof(vc.in_version));
  strncpy(vc.in_uk_intf_ver, OO_UK_INTF_VER, sizeof(vc.in_uk_intf_ver));
  vc.debug =
#ifdef NDEBUG
    0;
#else
    1;
#endif

  rc = ci_sys_ioctl(fd, OO_IOC_CHECK_VERSION, &vc);
  if( rc == -1 )
    return -errno;
  return rc;
}
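/* The struct below is only a guess at the shape of oo_version_check_t
 * implied by the code above (two fixed-size version strings plus a debug
 * flag); field names match the usage, but the sizes are assumptions and the
 * real definition lives in the Onload driver headers. */
typedef struct {
  char     in_version[32];      /* ONLOAD_VERSION string */
  char     in_uk_intf_ver[40];  /* user/kernel interface version hash */
  ci_int32 debug;               /* 1 for debug builds, 0 otherwise */
} oo_version_check_t_sketch;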
dev_t oo_onloadfs_dev_t(void)
{
  static ci_uint32 onloadfs_dev_t = 0;

  if( onloadfs_dev_t == 0 ) {
    int fd;
    if( ef_onload_driver_open(&fd, OO_STACK_DEV, 1) != 0 ) {
      fprintf(stderr, "%s: Failed to open /dev/onload\n", __FUNCTION__);
      return 0;
    }
    if( ci_sys_ioctl(fd, OO_IOC_GET_ONLOADFS_DEV, &onloadfs_dev_t) != 0 )
      LOG_E(ci_log("%s: Failed to find onloadfs dev_t", __FUNCTION__));
    ci_sys_close(fd);
  }
  return onloadfs_dev_t;
}
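/* Illustrative sketch (hypothetical helper, not from the Onload sources):
 * the dev_t above lets user-level code recognise onloadfs-backed fds, e.g.
 * by comparing the st_dev that fstat() reports for an arbitrary fd. */
#include <sys/stat.h>

static int example_fd_is_onload(int fd)
{
  struct stat st;
  if( fstat(fd, &st) != 0 )
    return 0;
  /* An fd whose inode lives on onloadfs is an Onload-accelerated fd. */
  return st.st_dev == oo_onloadfs_dev_t();
}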
void citp_oo_get_cpu_khz(ci_uint32* cpu_khz)
{
  ef_driver_handle fd;

  /* Set up a fallback value for the case where everything goes wrong. */
  *cpu_khz = 1000;

  if( ef_onload_driver_open(&fd, OO_STACK_DEV, 1) != 0 ) {
    fprintf(stderr, "%s: Failed to open /dev/onload\n", __FUNCTION__);
    ci_get_cpu_khz(cpu_khz);
    return;
  }
  if( ci_sys_ioctl(fd, OO_IOC_GET_CPU_KHZ, cpu_khz) != 0 ) {
    Log_E(log("%s: Failed to query cpu_khz", __FUNCTION__));
    ci_get_cpu_khz(cpu_khz);
  }
  ef_onload_driver_close(fd);
}
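/* Illustrative sketch (hypothetical helper): cpu_khz is kilocycles per
 * second, i.e. cycles per millisecond, so a TSC delta converts to
 * microseconds as below.  Multiplying before dividing preserves precision
 * for small deltas; very large deltas would need 128-bit arithmetic. */
static ci_uint64 example_tsc_to_usec(ci_uint64 tsc_delta, ci_uint32 cpu_khz)
{
  return tsc_delta * 1000 / cpu_khz;
}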
int citp_pipe_splice_read(citp_fdinfo* fdi, int alien_fd, loff_t* alien_off,
                          size_t len, int flags,
                          citp_lib_context_t* lib_context)
{
  citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdi);
  int rc;
  int read_len = 0;
  int non_block = (flags & SPLICE_F_NONBLOCK) ||
      (epi->pipe->aflags &
       (CI_PFD_AFLAG_NONBLOCK << CI_PFD_AFLAG_READER_SHIFT));

  if( ! fdi_is_reader(fdi) ) {
    errno = EINVAL;
    return -1;
  }
  if( alien_off ) {
    /* TODO support this */
    errno = ENOTSUP;
    return -1;
  }
  if( len == 0 )
    return 0;

  do {
    struct oo_splice_read_context ctx = {
      .alien_fd = alien_fd,
      .len = len,
      .lib_context = lib_context
    };
    rc = ci_pipe_zc_read(epi->ni, epi->pipe, len,
                         non_block ? MSG_DONTWAIT : 0,
                         oo_splice_read_cb, &ctx);
    if( rc > 0 )
      read_len += rc;
  } while( 0 );

  if( rc < 0 && errno == EPIPE && ! (flags & MSG_NOSIGNAL) ) {
    ci_sys_ioctl(ci_netif_get_driver_handle(epi->ni),
                 OO_IOC_KILL_SELF_SIGPIPE, NULL);
    return rc;
  }
  if( rc > 0 )
    return read_len;
  return rc;
}
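/* Illustrative usage (hypothetical fds, not from the Onload sources): this
 * function backs splice() calls where the Onload pipe's read end is the
 * source, e.g. moving bytes from the pipe into a socket without copying
 * through user space. */
#define _GNU_SOURCE
#include <fcntl.h>

static long example_pipe_to_sock(int pipe_rd_fd, int sock_fd, size_t n)
{
  /* Returns bytes moved, 0 at EOF, or -1 with errno set. */
  return splice(pipe_rd_fd, NULL, sock_fd, NULL, n, SPLICE_F_NONBLOCK);
}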
int oo_os_sock_ioctl(ci_netif* ni, oo_sp sock_p, int request, void* arg,
                     int* ioctl_rc)
{
  oo_os_file os_sock_fd;
  int rc;

  if( (rc = oo_os_sock_get(ni, sock_p, &os_sock_fd)) == 0 ) {
    rc = ci_sys_ioctl(os_sock_fd, request, arg);
    if( rc < 0 )
      rc = -errno;
    oo_os_sock_release(ni, os_sock_fd);
    if( ioctl_rc != NULL ) {
      *ioctl_rc = rc;
      rc = 0;
    }
  }
  else {
    LOG_E(ci_log("%s: [%d:%d] ERROR: failed to get kernel sock fd "
                 "(rc=%d req=%d)", __FUNCTION__, NI_ID(ni),
                 OO_SP_FMT(sock_p), rc, request));
  }
  return rc;
}
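/* Illustrative sketch of the calling convention above (hypothetical caller):
 * when ioctl_rc is supplied, the return value reports only infrastructure
 * failures (no kernel socket available), while *ioctl_rc carries the ioctl's
 * own result, allowing callers to distinguish the two failure modes. */
static int example_os_sock_nread(ci_netif* ni, oo_sp sock_p)
{
  int nread = 0, ioctl_rc;
  int rc = oo_os_sock_ioctl(ni, sock_p, FIONREAD, &nread, &ioctl_rc);
  if( rc != 0 )
    return rc;        /* could not reach the OS socket at all */
  if( ioctl_rc < 0 )
    return ioctl_rc;  /* the ioctl itself failed (-errno) */
  return nread;
}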
static int citp_passthrough_ioctl(citp_fdinfo* fdi, int request, void* arg)
{
  return ci_sys_ioctl(fdi_to_alien_fdi(fdi)->os_socket, request, arg);
}
/* NOTE: in the kernel version [fd] is unused and, if it's a ptr, [arg] will
 * be in user-space and may need to be fetched into kernel memory. */
static int ci_tcp_ioctl_lk(citp_socket* ep, ci_fd_t fd, int request,
                           void* arg)
{
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = NULL;
  int rc = 0;
  int os_socket_exists = s->b.sb_aflags & CI_SB_AFLAG_OS_BACKED;

  if( s->b.state != CI_TCP_LISTEN )
    ts = SOCK_TO_TCP(s);

  /* Keep the os socket in sync.  If this is a "get" request then the
   * return will be based on our support, not the os's (except for EFAULT
   * handling which we get for free).
   * Exceptions:
   * - FIONBIO is applied just in time on handover if needed (listening
   *   sockets always have a non-blocking OS socket)
   * - FIONREAD, TIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are useless on an OS
   *   socket, so let's avoid the syscall.
   */
  if( os_socket_exists && request != FIONREAD && request != SIOCATMARK &&
      request != FIOASYNC && request != TIOCOUTQ && request != SIOCOUTQNSD &&
      request != (int) FIONBIO ) {
    rc = oo_os_sock_ioctl(netif, s->b.bufid, request, arg, NULL);
    if( rc < 0 )
      return rc;
  }

  /* ioctl defines are listed in `man ioctl_list` and the CI equivalents
   * are defined in include/ci/net/ioctls.h. */
  LOG_TV( ci_log("%s: request = %d, arg = %ld", __FUNCTION__,
                 request, (long) arg));

  switch( request ) {
  case FIONBIO:
    if( CI_IOCTL_ARG_OK(int, arg) ) {
      CI_CMN_IOCTL_FIONBIO(ep->s, arg);
      rc = 0;
      break;
    }
    goto fail_fault;

  case FIONREAD: /* synonym of SIOCINQ */
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state == CI_TCP_SYN_SENT ) {
      CI_IOCTL_SETARG((int*) arg, 0);
    }
    else {
      /* In inline mode, return the total number of bytes in the receive
       * queue.  If SO_OOBINLINE isn't set, then return the number of bytes
       * up to the mark but without counting the mark. */
      int bytes_in_rxq = tcp_rcv_usr(ts);
      if( bytes_in_rxq && ! (ts->s.s_flags & CI_SOCK_FLAG_OOBINLINE) ) {
        if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID ) {
          /*! \TODO: what if FIN has been received? */
          unsigned int readnxt = tcp_rcv_nxt(ts) - bytes_in_rxq;
          if( SEQ_LT(readnxt, tcp_rcv_up(ts)) )
            bytes_in_rxq = tcp_rcv_up(ts) - readnxt;
          else if( SEQ_EQ(readnxt, tcp_rcv_up(ts)) )
            bytes_in_rxq--;
        }
      }
      CI_IOCTL_SETARG((int*) arg, bytes_in_rxq);
    }
    break;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
  case SIOCOUTQNSD:
  {
    CI_BUILD_ASSERT(TIOCOUTQ == SIOCOUTQ);
    int outq_bytes = 0;

    if( ! CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state != CI_TCP_SYN_SENT ) {
      /* TIOCOUTQ counts all unacknowledged data, so includes the
       * retransmit queue. */
      if( request == TIOCOUTQ )
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_una(ts));
      else
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_nxt(ts));
    }
    CI_IOCTL_SETARG((int*) arg, outq_bytes);
  }
    break;

  case SIOCATMARK:
  {
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;

    /* Return true if we are at the out-of-band byte. */
    CI_IOCTL_SETARG((int*) arg, 0);
    if( s->b.state != CI_TCP_LISTEN ) {
      int readnxt;

      readnxt = SEQ_SUB(tcp_rcv_nxt(ts), tcp_rcv_usr(ts));
      if( ~ts->s.b.state & CI_TCP_STATE_ACCEPT_DATA )
        readnxt = SEQ_SUB(readnxt, 1);
      if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID )
        CI_IOCTL_SETARG((int*) arg, readnxt == tcp_rcv_up(ts));
      LOG_URG(log(NTS_FMT "SIOCATMARK atmark=%d readnxt=%u rcv_up=%u%s",
                  NTS_PRI_ARGS(ep->netif, ts), readnxt == tcp_rcv_up(ts),
                  readnxt, tcp_rcv_up(SOCK_TO_TCP(ep->s)),
                  (tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID) ?
                  "" : " (invalid)"));
    }
    break;
  }

#ifndef __KERNEL__
  case FIOASYNC:
    /* Need to apply this to [fd] so that our fasync file-op will be
     * invoked. */
    rc = ci_sys_ioctl(fd, request, arg);
    break;

  case SIOCSPGRP:
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    /* Need to apply this to [fd] to get signal delivery to work.  However,
     * SIOCSPGRP is only supported on sockets, so we need to convert to
     * fcntl(). */
    rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg));
    if( rc == 0 )
      rc = ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
    else
      CI_SET_ERROR(rc, -rc);
    break;
#endif

  default:
    return ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
  }

  return rc;

  /* The fail_* labels are referenced by the argument checks above, but
   * their bodies were missing from this excerpt; this minimal tail assumes
   * the conventional EFAULT/EINVAL returns. */
 fail_fault:
  CI_SET_ERROR(rc, EFAULT);
  return rc;
 fail_inval:
  CI_SET_ERROR(rc, EINVAL);
  return rc;
}
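/* Illustrative usage (hypothetical helper, not from the Onload sources):
 * the SIOCATMARK case above is what an application exercises when it walks
 * the stream up to the urgent mark before collecting the out-of-band byte. */
#include <sys/ioctl.h>
#include <sys/socket.h>

static int example_read_to_oob_mark(int sock_fd, char* buf, size_t len)
{
  int at_mark = 0;
  while( ioctl(sock_fd, SIOCATMARK, &at_mark) == 0 && ! at_mark ) {
    /* Consume in-band data until the read pointer reaches the mark. */
    if( recv(sock_fd, buf, len, 0) <= 0 )
      return -1;
  }
  /* At the mark: fetch the single out-of-band byte. */
  return recv(sock_fd, buf, 1, MSG_OOB);
}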
static int citp_epoll_ioctl(citp_fdinfo* fdi, int cmd, void* arg)
{
  return ci_sys_ioctl(fdi->fd, cmd, arg);
}
int ef_onload_driver_open(ef_driver_handle* pfd,
                          enum oo_device_type dev_type,
                          int do_cloexec)
{
  int rc;
  int flags = 0;
  int saved_errno = errno;

#ifdef O_CLOEXEC
  if( do_cloexec )
    flags = O_CLOEXEC;
#endif

  ci_assert(pfd);
  rc = oo_open(pfd, dev_type, flags);
  if( rc != 0 && errno != EMFILE && fd_is_saved[dev_type] >= 0 ) {
    ci_clone_fd_t op;
    op.do_cloexec = do_cloexec;
    LOG_NV(ci_log("%s: open failed, but cloning from saved fd", __func__));
    rc = ci_sys_ioctl((ci_fd_t) saved_fd[dev_type],
                      clone_ioctl[dev_type], &op);
    if( rc < 0 )
      return rc;
    errno = saved_errno;
    *pfd = op.fd;
  }
  if( rc != 0 )
    return rc;

  /* Our internal driver handles are not visible to the application.  It may
   * make assumptions about the fd space available to it, and try to dup2/3
   * onto one of our driver fds.  To try and minimise this we allow the user
   * to specify a minimum value for us to use, to try and keep out of their
   * way.
   *
   * We have to be able to cope with them coming along and trying to dup
   * onto one of these fds anyway, as they may not have set the option up.
   * As such we treat failure to shift the fd as acceptable, and just retain
   * the old one. */
  if( *pfd < CITP_OPTS.fd_base )
    if( ef_onload_handle_move_and_do_cloexec(pfd, do_cloexec) == 0 )
      return 0;

  if( do_cloexec ) {
#if defined(O_CLOEXEC)
    static int o_cloexec_fails = -1;
    if( o_cloexec_fails < 0 ) {
      /* fcntl(F_GETFD) reports the flags in its return value. */
      int arg = ci_sys_fcntl(*(int*) pfd, F_GETFD);
      if( arg >= 0 && (arg & FD_CLOEXEC) )
        o_cloexec_fails = 0;
      else
        o_cloexec_fails = 1;
    }
#else
    static const int o_cloexec_fails = 1;
#endif
    if( o_cloexec_fails )
      CI_DEBUG_TRY(ci_sys_fcntl(*(int*) pfd, F_SETFD, FD_CLOEXEC));
  }

  return 0;
}
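/* Illustrative usage (hypothetical caller): opening a private driver handle
 * with close-on-exec requested, as internal Onload code does, and releasing
 * it with ef_onload_driver_close() as seen in citp_oo_get_cpu_khz() above. */
static int example_open_driver(void)
{
  ef_driver_handle fd;
  if( ef_onload_driver_open(&fd, OO_STACK_DEV, 1 /* do_cloexec */) != 0 )
    return -1;
  /* ... use fd for driver ioctls ... */
  ef_onload_driver_close(fd);
  return 0;
}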
int citp_pipe_splice_write(citp_fdinfo* fdi, int alien_fd, loff_t* alien_off,
                           size_t olen, int flags,
                           citp_lib_context_t* lib_context)
{
  citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdi);
  int len_in_bufs = OO_PIPE_SIZE_TO_BUFS(olen);
  struct iovec iov_on_stack[CITP_PIPE_SPLICE_WRITE_STACK_IOV_LEN];
  struct iovec* iov = iov_on_stack;
  int want_buf_count;
  int rc;
  int bytes_to_read;
  int len = olen;
  int no_more = 1; /* for now we only run a single loop */
  int written_total = 0;
  int non_block = (flags & SPLICE_F_NONBLOCK) ||
      (epi->pipe->aflags &
       (CI_PFD_AFLAG_NONBLOCK << CI_PFD_AFLAG_WRITER_SHIFT));

  if( fdi_is_reader(fdi) ) {
    errno = EINVAL;
    return -1;
  }
  if( alien_off ) {
    /* TODO support this */
    errno = ENOTSUP;
    return -1;
  }

  do {
    int count;
    int iov_num;
    int bytes_to_write;
    struct ci_pipe_pkt_list pkts = {};
    struct ci_pipe_pkt_list pkts2;

    want_buf_count = len_in_bufs;
    /* We might need to wait for buffers here on the first iteration */
    rc = ci_pipe_zc_alloc_buffers(epi->ni, epi->pipe, want_buf_count,
                                  MSG_NOSIGNAL |
                                  (non_block || written_total ?
                                   MSG_DONTWAIT : 0),
                                  &pkts);
    if( rc < 0 && written_total ) {
      /* Whatever the error, we need to report the bytes already written. */
      rc = written_total;
      break;
    }
    else if( rc < 0 )
      break;
    else if( pkts.count == 0 && non_block ) {
      errno = EAGAIN;
      rc = -1;
      break;
    }
    else
      ci_assert_gt(pkts.count, 0);

    count = pkts.count;
    if( count > CITP_PIPE_SPLICE_WRITE_STACK_IOV_LEN ) {
      void* niov = realloc(iov == iov_on_stack ? NULL : iov,
                           sizeof(*iov) * len_in_bufs);
      if( niov == NULL )
        /* we can still move quite a few pkts */
        count = CITP_PIPE_SPLICE_WRITE_STACK_IOV_LEN;
      else
        iov = niov;
    }

    ci_assert_ge(count, 1);
    iov_num = count;
    pkts2 = pkts;
    bytes_to_read = ci_pipe_list_to_iovec(epi->ni, epi->pipe, iov, &iov_num,
                                          &pkts2, len);

    citp_exit_lib_if(lib_context, TRUE);
    /* Note: the following call might be non-blocking as well as blocking */
    rc = readv(alien_fd, iov, count);
    citp_reenter_lib(lib_context);

    if( rc > 0 ) {
      bytes_to_write = rc;
      written_total += bytes_to_write;
      len -= bytes_to_write;
      no_more |= bytes_to_write < bytes_to_read;
    }
    else {
      bytes_to_write = 0;
      no_more = 1;
    }

    {
      /* pipe zc_write will write the non-empty buffers and release the
       * empty ones */
      int rc2 = ci_pipe_zc_write(epi->ni, epi->pipe, &pkts, bytes_to_write,
                                 CI_PIPE_ZC_WRITE_FLAG_FORCE |
                                 MSG_DONTWAIT | MSG_NOSIGNAL);
      (void) rc2;
      ci_assert_equal(rc2, bytes_to_write);
    }

    /* For now we will not do a second iteration; to allow that we'd need a
     * guarantee that the read will not block, e.g. insight into the type of
     * fd and a non-blocking operation (to name a valid case: socket,
     * recvmsg). */
  } while( ! no_more );

  if( iov != iov_on_stack )
    free(iov);
  if( rc > 0 )
    return written_total;
  if( rc < 0 && errno == EPIPE && ! (flags & MSG_NOSIGNAL) ) {
    ci_sys_ioctl(ci_netif_get_driver_handle(epi->ni),
                 OO_IOC_KILL_SELF_SIGPIPE, NULL);
  }
  return rc;
}
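/* Illustrative usage (hypothetical fds): this function backs splice() calls
 * where the Onload pipe's write end is the destination, i.e. the mirror of
 * the read-side example earlier. */
#define _GNU_SOURCE
#include <fcntl.h>

static long example_sock_to_pipe(int sock_fd, int pipe_wr_fd, size_t n)
{
  /* Moves up to n bytes from the socket into the pipe; may block unless
   * SPLICE_F_NONBLOCK is passed. */
  return splice(sock_fd, NULL, pipe_wr_fd, NULL, n, 0);
}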