static void citp_fdinfo_do_handover(citp_fdinfo* fdi, int fdt_locked)
{
  int rc;
  citp_fdinfo* epoll_fdi = NULL;
  int os_fd = fdi->fd;
#ifndef NDEBUG
  /* Yuk: does for UDP too. */
  volatile citp_fdinfo_p* p_fdip;
  p_fdip = &citp_fdtable.table[fdi->fd].fdip;
  ci_assert(fdip_is_busy(*p_fdip));
#endif

  Log_V(ci_log("%s: fd=%d nonb_switch=%d", __FUNCTION__, fdi->fd,
               fdi->on_rcz.handover_nonb_switch));

  if( fdi->epoll_fd >= 0 ) {
    epoll_fdi = citp_epoll_fdi_from_member(fdi, fdt_locked);
    if( epoll_fdi->protocol->type == CITP_EPOLLB_FD )
      citp_epollb_on_handover(epoll_fdi, fdi);
  }
  rc = fdtable_fd_move(fdi->fd, OO_IOC_TCP_HANDOVER);
  if( rc == -EBUSY && fdi->epoll_fd >= 0 ) {
    ci_assert(fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags &
              CI_SB_AFLAG_MOVED_AWAY);
    /* If this is our epoll, we can do full handover: we manually add os
     * fd into the epoll set.
     * Fixme: ensure we are not in _other_ epoll sets */
    ci_bit_clear(&fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags,
                 CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);
    rc = fdtable_fd_move(fdi->fd, OO_IOC_FILE_MOVED);
  }
  if( rc != 0 ) {
    citp_fdinfo* new_fdi;
    if( ! fdt_locked ) CITP_FDTABLE_LOCK();
    new_fdi = citp_fdtable_probe_locked(fdi->fd, CI_TRUE, CI_TRUE);
    citp_fdinfo_release_ref(new_fdi, 1);
    if( ! fdt_locked ) CITP_FDTABLE_UNLOCK();
    ci_assert_equal(citp_fdinfo_get_type(new_fdi), CITP_PASSTHROUGH_FD);
    os_fd = fdi_to_alien_fdi(new_fdi)->os_socket;
  }
  if( fdi->on_rcz.handover_nonb_switch >= 0 ) {
    int on_off = !! fdi->on_rcz.handover_nonb_switch;
    int rc = ci_sys_ioctl(os_fd, FIONBIO, &on_off);
    if( rc < 0 )
      Log_E(ci_log("%s: ioctl failed on_off=%d", __FUNCTION__, on_off));
  }
  if( rc != 0 )
    goto exit;
  citp_fdtable_busy_clear(fdi->fd, fdip_passthru, fdt_locked);
 exit:
  citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
  if( epoll_fdi != NULL && epoll_fdi->protocol->type == CITP_EPOLL_FD )
    citp_epoll_on_handover(epoll_fdi, fdi, fdt_locked);
  if( epoll_fdi != NULL )
    citp_fdinfo_release_ref(epoll_fdi, fdt_locked);
  citp_fdinfo_free(fdi);
}
static int citp_udp_connect(citp_fdinfo* fdinfo,
                            const struct sockaddr* sa, socklen_t sa_len,
                            citp_lib_context_t* lib_context)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log(LPF "connect(%d, sa, %d)", fdinfo->fd, sa_len));

  if( (epi->sock.s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY) != 0 ) {
    log("ERROR: connect of socket with SO_REUSEPORT not supported unless "
        "supported by the OS.");
    return -1;
  }

  ci_netif_lock_fdi(epi);
  rc = ci_udp_connect(&epi->sock, fdinfo->fd, sa, sa_len);
  ci_netif_unlock_fdi(epi);

  if( rc == CI_SOCKET_HANDOVER ) {
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_connect);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
static int citp_passthrough_listen(citp_fdinfo* fdi, int backlog)
{
  int rc = ci_sys_listen(fdi_to_alien_fdi(fdi)->os_socket, backlog);
  citp_fdinfo_release_ref(fdi, 0);
  return rc;
}
static int citp_udp_setsockopt(citp_fdinfo* fdinfo, int level, int optname,
                               const void* optval, socklen_t optlen)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  citp_socket* ep    = &epi->sock;
  ci_sock_cmn* s     = ep->s;
  int rc;

  Log_VSC(log("%s("EF_FMT", %d, %d)", __FUNCTION__,
              EF_PRI_ARGS(epi, fdinfo->fd), level, optname));

  rc = ci_udp_setsockopt(&epi->sock, fdinfo->fd, level, optname,
                         optval, optlen);
  Log_V(log(LPF "setsockopt: fd=%d rc=%d", fdinfo->fd, rc));

  if( rc == CI_SOCKET_HANDOVER ) {
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_setsockopt);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  if( ci_opt_is_setting_reuseport(level, optname, optval, optlen) != 0 &&
      ! CI_SOCK_NOT_BOUND(s) ) {
    ci_log("%s: setting reuseport after binding on udp not supported",
           __FUNCTION__);
    return -ENOSYS;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
void citp_fdinfo_handover(citp_fdinfo* fdi, int nonb_switch)
{
  /* Please see comments in internal.h. */

  volatile citp_fdinfo_p* p_fdip;
  citp_fdinfo_p fdip;
  unsigned fd = fdi->fd;

  /* We're about to free some user-level state, so we need to interlock
  ** against select and poll.
  */
  CITP_FDTABLE_LOCK();

  p_fdip = &citp_fdtable.table[fd].fdip;
 again:
  fdip = *p_fdip;
  if( fdip_is_busy(fdip) )
    fdip = citp_fdtable_busy_wait(fd, 1);

  if( fdip == fdi_to_fdip(fdi) ) {
    if( fdip_cas_fail(p_fdip, fdip, fdip_busy) )
      goto again;
  }
  else {
    /* [fd] must have changed meaning under our feet.  It must be closing,
    ** so do nothing except drop the ref passed in.
    */
    ci_assert(fdip_is_closing(fdip));
    ci_assert_nequal(fdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
  }

  if( fdip == fdi_to_fdip(fdi) ) {
    ci_assert_equal(fdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    fdi->on_ref_count_zero = FDI_ON_RCZ_HANDOVER;
    fdi->on_rcz.handover_nonb_switch = nonb_switch;
    /* Drop the fdtable ref.  When the ref count goes to zero, the handover
    ** will be done.  We return without waiting, because the caller
    ** shouldn't do anything more with this socket anyway.
    */
    citp_fdinfo_release_ref(fdi, 1);
  }

  /* Drop the ref passed in. */
  citp_fdinfo_release_ref(fdi, 1);

  CITP_FDTABLE_UNLOCK();
}
int onload_zc_send(struct onload_zc_mmsg* msgs, int mlen, int flags)
{
  int done = 0, last_fd = -1, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi = NULL;

  Log_CALL(ci_log("%s(%p, %d, %x)", __FUNCTION__, msgs, mlen, flags));

  citp_enter_lib(&lib_context);

  for( i = 0; i < mlen; ++i ) {
    if( msgs[i].fd != last_fd ) {
      if( fdi != NULL )
        citp_fdinfo_release_ref(fdi, 0);
      fdi = citp_fdtable_lookup(msgs[i].fd);
      if( fdi == NULL ) {
        msgs[i].rc = -ESOCKTNOSUPPORT;
        ++done;
        goto out;
      }
      last_fd = msgs[i].fd;
    }

    CI_TRY_EQ( citp_fdinfo_get_ops(fdi)->zc_send(fdi, &msgs[i], flags), 1);
    /* If we got an error, return the number of msgs that have had
     * rc set and exit.  fd_op should have updated msgs.rc appropriately
     */
    ++done;
    if( msgs[i].rc < 0 )
      goto out;
  }

 out:
  if( fdi != NULL )
    citp_fdinfo_release_ref(fdi, 0);

  citp_exit_lib(&lib_context, TRUE);

  ci_assert_gt(done, 0);
  ci_assert_le(done, mlen);

  Log_CALL_RESULT(done);
  return done;
}
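/* Illustrative only: a minimal sketch of how an application might drive
 * onload_zc_send() on a connected, accelerated UDP socket.  It assumes the
 * struct layout published in onload/extensions_zc.h (onload_zc_mmsg with
 * fd/rc/msg, onload_zc_msg with iov/msghdr, onload_zc_iovec with
 * iov_base/iov_len/buf), <string.h> for memcpy/memset, and the usual
 * ownership rule: buffers that are successfully sent belong to Onload,
 * while buffers that fail to send must be released by the caller.  The
 * helper name is hypothetical and not part of the library. */
#if 0
static int example_zc_send(int fd, const void* data, size_t len)
{
  struct onload_zc_iovec iov;
  struct onload_zc_mmsg mmsg;
  int rc;

  /* Obtain one packet buffer owned by the socket's stack. */
  rc = onload_zc_alloc_buffers(fd, &iov, 1, ONLOAD_ZC_BUFFER_HDR_UDP);
  if( rc < 0 )
    return rc;

  if( len > iov.iov_len )
    len = iov.iov_len;                 /* single-buffer sketch: truncate */
  memcpy(iov.iov_base, data, len);
  iov.iov_len = len;

  mmsg.fd = fd;
  mmsg.msg.iov = &iov;
  memset(&mmsg.msg.msghdr, 0, sizeof(mmsg.msg.msghdr));
  mmsg.msg.msghdr.msg_iovlen = 1;      /* msg_name left NULL: connected fd */

  /* On per-message failure the buffer is still ours, so give it back. */
  rc = onload_zc_send(&mmsg, 1, 0);
  if( rc != 1 || mmsg.rc < 0 )
    onload_zc_release_buffers(fd, &iov.buf, 1);
  return rc == 1 ? mmsg.rc : rc;
}
#endif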
static int citp_passthrough_setsockopt(citp_fdinfo* fdi, int level,
                                       int optname, const void* optval,
                                       socklen_t optlen)
{
  int rc = ci_sys_setsockopt(fdi_to_alien_fdi(fdi)->os_socket,
                             level, optname, optval, optlen);
  citp_fdinfo_release_ref(fdi, 0);
  return rc;
}
int citp_passthrough_bind(citp_fdinfo* fdi,
                          const struct sockaddr* sa, socklen_t sa_len)
{
  int rc = ci_sys_bind(fdi_to_alien_fdi(fdi)->os_socket, sa, sa_len);
  citp_fdinfo_release_ref(fdi, 0);
  return rc;
}
static int citp_passthrough_connect(citp_fdinfo* fdi,
                                    const struct sockaddr* sa,
                                    socklen_t sa_len,
                                    citp_lib_context_t* lib_context)
{
  int rc = ci_sys_connect(fdi_to_alien_fdi(fdi)->os_socket, sa, sa_len);
  citp_fdinfo_release_ref(fdi, 0);
  return rc;
}
static int citp_udp_bind(citp_fdinfo* fdinfo, const struct sockaddr* sa,
                         socklen_t sa_len)
{
  citp_sock_fdi *epi = fdi_to_sock_fdi(fdinfo);
  citp_socket* ep    = &epi->sock;
  ci_sock_cmn* s     = ep->s;
  int rc;

  Log_V(log(LPF "bind(%d, sa, %d)", fdinfo->fd, sa_len));

  ci_udp_handle_force_reuseport(fdinfo->fd, ep, sa, sa_len);

  if( (s->s_flags & CI_SOCK_FLAG_REUSEPORT) != 0 ) {
    if( (rc = ci_udp_reuseport_bind(ep, fdinfo->fd, sa, sa_len)) == 0 ) {
      /* The socket has moved so need to reprobe the fd.  This will also
       * map the new stack into user space of the executing process.
       */
      fdinfo = citp_fdtable_lookup(fdinfo->fd);
      fdinfo = citp_reprobe_moved(fdinfo, CI_FALSE, CI_FALSE);
      epi = fdi_to_sock_fdi(fdinfo);
      ep = &epi->sock;
      ci_netif_cluster_prefault(ep->netif);
    }
    else {
      goto done;
    }
  }

  ci_netif_lock_fdi(epi);
  rc = ci_udp_bind(ep, fdinfo->fd, sa, sa_len);
  ci_netif_unlock_fdi(epi);

 done:
  if( rc == CI_SOCKET_HANDOVER ) {
    ci_assert_equal(s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY, 0);
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_bind);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i, rx_pkt, released;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt* pkt;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i ) {
          pkt = (ci_ip_pkt_fmt*)bufs[i];
          /* If we are releasing a packet without the RX_FLAG then the user
           * allocated and then freed the packet (without using it).
           * We detect this to decrement n_async_pkts.
           * RX packets (kept via ONLOAD_ZC_KEEP) are counted differently,
           * so don't decrement here (but they may still be released).
           */
          rx_pkt = pkt->flags & CI_PKT_FLAG_RX;
          released = ci_netif_pkt_release_check_keep(ni, pkt);
          if( ! rx_pkt ) {
            ci_assert(released == 1);
            (void) released;
            --ni->state->n_async_pkts;
          }
        }
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__,
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  }
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
citp_fdinfo* citp_fdtable_lookup_fast(citp_lib_context_t* ctx, unsigned fd)
{
  /* Note that if we haven't yet initialised this module, then
  ** [inited_count] will be zero, and the following test will fail.  So the
  ** test for initialisation is done further down...
  **
  ** This is highly performance critical.  DO NOT add any code between here
  ** and the first [return] statement.
  */
  citp_fdinfo* fdi;

  /* Try to avoid entering lib. */
  ctx->thread = NULL;

  if( fd < citp_fdtable.inited_count ) {
    volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
    citp_fdinfo_p fdip;

  again:
    fdip = *p_fdip;
    if( fdip_is_normal(fdip) ) {

      citp_enter_lib_if(ctx);
      if( citp_fdtable_is_mt_safe() ) {
        /* No need to use atomic ops or add a ref to the fdi when MT-safe.
         * The definition of "fds_mt_safe" is that the app does not change
         * the meaning of a file descriptor in one thread when it is being
         * used in another thread.
         */
        fdi = fdip_to_fdi(fdip);
        if( ! citp_fdinfo_is_consistent(fdi) )
          fdi = citp_reprobe_moved(fdi, CI_TRUE, CI_FALSE);

        return fdi;
      }
      else {
        /* Swap in the busy marker. */
        if( fdip_cas_succeed(p_fdip, fdip, fdip_busy) ) {
          fdi = fdip_to_fdi(fdip);

          ci_assert(fdi);
          ci_assert_gt(oo_atomic_read(&fdi->ref_count), 0);
          ci_assert(fdip_is_closing(fdip) || fdip_is_reserved(fdip) ||
                    fdi->fd == fd);
          /* Bump the reference count. */
          citp_fdinfo_ref(fdi);

          if( ! citp_fdinfo_is_consistent(fdi) )
            fdi = citp_reprobe_moved(fdi, CI_FALSE, CI_TRUE);
          else {
            /* Swap the busy marker out again. */
            citp_fdtable_busy_clear(fd, fdip, 0);
          }
          return fdi;
        }
        goto again;
      }
    }

    /* Not normal! */
    if( fdip_is_passthru(fdip) )
      return NULL;

    citp_enter_lib_if(ctx);
    if( fdip_is_busy(fdip) ) {
      citp_fdtable_busy_wait(fd, 0);
      goto again;
    }

    ci_assert(fdip_is_unknown(fdip));
    goto probe;
  }

  if( citp.init_level < CITP_INIT_FDTABLE ) {
    if( _citp_do_init_inprogress == 0 )
      CI_TRY(citp_do_init(CITP_INIT_ALL));
    else
      CI_TRY(citp_do_init(CITP_INIT_FDTABLE));  /* get what we need */
  }

  if( fd >= citp_fdtable.size )
    return NULL;

 probe:
  citp_enter_lib_if(ctx);
  fdi = citp_fdtable_probe(fd);
  if( fdi && citp_fdtable_is_mt_safe() )
    citp_fdinfo_release_ref(fdi, 0);

  return fdi;
}
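/* Illustrative only: a minimal sketch of the calling convention a socket
 * call intercept might use around citp_fdtable_lookup_fast().  The shape of
 * the per-protocol recv op (fdi, msghdr, flags), the ci_sys_recv()
 * passthrough wrapper and citp_exit_lib_if() are assumptions based on the
 * other intercepts in this library; only citp_fdinfo_release_ref_fast(),
 * citp_exit_lib() and the lookup itself appear in the code above.  The
 * helper name is hypothetical. */
#if 0
static ssize_t example_recv_intercept(int fd, void* buf, size_t len, int flags)
{
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  struct msghdr m;
  struct iovec io;
  ssize_t rc;

  fdi = citp_fdtable_lookup_fast(&lib_context, fd);
  if( fdi == NULL ) {
    /* Not handled by Onload: leave the lib if the lookup entered it and
     * fall back to the kernel. */
    citp_exit_lib_if(&lib_context, TRUE);       /* assumed helper */
    return ci_sys_recv(fd, buf, len, flags);    /* assumed passthrough */
  }

  io.iov_base = buf;
  io.iov_len = len;
  memset(&m, 0, sizeof(m));
  m.msg_iov = &io;
  m.msg_iovlen = 1;

  rc = citp_fdinfo_get_ops(fdi)->recv(fdi, &m, flags);  /* assumed op shape */

  /* Drop the reference taken by the lookup; none is taken in the MT-safe
   * case, which the _fast release helper understands. */
  citp_fdinfo_release_ref_fast(fdi);
  citp_exit_lib(&lib_context, rc >= 0);
  return rc;
}
#endif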
/* Re-probe fdinfo after endpoint was moved to another stack.
 * The function assumes that fdinfo was obtained via citp_fdtable_lookup()
 * or from citp_fdtable_lookup_fast().
 */
citp_fdinfo* citp_reprobe_moved(citp_fdinfo* fdinfo, int from_fast_lookup,
                                int fdip_is_already_busy)
{
  int fd = fdinfo->fd;
  citp_fdinfo* new_fdinfo = NULL;

  CITP_FDTABLE_LOCK();

  if( ! fdip_is_already_busy ) {
    volatile citp_fdinfo_p* p_fdip;
    citp_fdinfo_p fdip;

    p_fdip = &citp_fdtable.table[fd].fdip;
   again:
    fdip = *p_fdip;
    if( fdip_is_busy(fdip) )
      fdip = citp_fdtable_busy_wait(fd, 1);
    ci_assert( fdip_is_normal(fdip) || fdip_is_passthru(fdip) );
    if( fdip_cas_fail(p_fdip, fdip, fdip_busy) )
      goto again;

    /* Possibly, a parallel thread has already called
     * citp_reprobe_moved() for us. */
    if( fdip_is_passthru(fdip) ) {
      citp_fdtable_busy_clear(fd, fdip, 1);
      if( new_fdinfo != NULL )
        citp_fdinfo_ref(new_fdinfo);
      goto done;
    }
    ci_assert( fdip_is_normal(fdip) );
    new_fdinfo = fdip_to_fdi(fdip);
    if( new_fdinfo != fdinfo ) {
      citp_fdtable_busy_clear(fd, fdip, 1);
      if( new_fdinfo != NULL )
        citp_fdinfo_ref(new_fdinfo);
      goto done;
    }
  }
  else
    ci_assert(fdip_is_busy(citp_fdtable.table[fd].fdip));

  /* re-probe new fd */
  new_fdinfo = citp_fdtable_probe_locked(fd, CI_TRUE, CI_TRUE);

  if( fdinfo->epoll_fd >= 0 ) {
    citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(fdinfo, 1);
    if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
      citp_epoll_on_move(epoll_fdi, fdinfo, new_fdinfo, 1);
    else
      citp_epollb_on_handover(epoll_fdi, fdinfo);
    citp_fdinfo_release_ref(epoll_fdi, 1);
  }

  /* Drop refcount from fdtable */
  fdinfo->on_ref_count_zero = FDI_ON_RCZ_MOVED;
  citp_fdinfo_release_ref(fdinfo, 1);

 done:
  /* One refcount from the caller */
  if( from_fast_lookup )
    citp_fdinfo_release_ref_fast(fdinfo);
  else
    citp_fdinfo_release_ref(fdinfo, 1);

  CITP_FDTABLE_UNLOCK();
  if( new_fdinfo == NULL )
    return NULL;
  if( from_fast_lookup ) {
    citp_fdinfo_ref_fast(new_fdinfo);
    citp_fdinfo_release_ref(new_fdinfo, 0);
  }

  return new_fdinfo;
}
int citp_ep_close(unsigned fd)
{
  volatile citp_fdinfo_p* p_fdip;
  citp_fdinfo_p fdip;
  int rc, got_lock;
  citp_fdinfo* fdi;

  /* Do not touch shared fdtable when in vfork child. */
  if( oo_per_thread_get()->in_vfork_child )
    return ci_tcp_helper_close_no_trampoline(fd);

  /* Interlock against other closes, against the fdtable being extended,
  ** and against select and poll.
  */
  CITP_FDTABLE_LOCK();
  got_lock = 1;

  __citp_fdtable_extend(fd);

  if( fd >= citp_fdtable.inited_count ) {
    rc = ci_sys_close(fd);
    goto done;
  }

  p_fdip = &citp_fdtable.table[fd].fdip;
 again:
  fdip = *p_fdip;
  if( fdip_is_busy(fdip) )
    fdip = citp_fdtable_busy_wait(fd, 1);

  if( fdip_is_closing(fdip) | fdip_is_reserved(fdip) ) {
    /* Concurrent close or attempt to close reserved. */
    Log_V(ci_log("%s: fd=%d closing=%d reserved=%d", __FUNCTION__, fd,
                 fdip_is_closing(fdip), fdip_is_reserved(fdip)));
    errno = EBADF;
    rc = -1;
    goto done;
  }

#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(fdip) ) {
    fdi = citp_fdtable_probe_locked(fd, CI_FALSE, CI_FALSE);
    if( fdi == &citp_the_closed_fd ) {
      citp_fdinfo_release_ref(fdi, CI_TRUE);
      errno = EBADF;
      rc = -1;
      goto done;
    }
    if( fdi )
      citp_fdinfo_release_ref(fdi, CI_TRUE);
  }
#endif

  ci_assert(fdip_is_normal(fdip) | fdip_is_passthru(fdip) |
            fdip_is_unknown(fdip));

  /* Swap in the "closed" pseudo-fdinfo.  This lets any other thread know
  ** that we're in the middle of closing this fd.
  */
  if( fdip_cas_fail(p_fdip, fdip, fdip_closing) )
    goto again;

  if( fdip_is_normal(fdip) ) {
    fdi = fdip_to_fdi(fdip);

    CITP_FDTABLE_UNLOCK();
    got_lock = 0;

    if( fdi->is_special ) {
      Log_V(ci_log("%s: fd=%d is_special, returning EBADF", __FUNCTION__, fd));
      errno = EBADF;
      rc = -1;
      fdtable_swap(fd, fdip_closing, fdip, 0);
      goto done;
    }

    Log_V(ci_log("%s: fd=%d u/l socket", __FUNCTION__, fd));
    ci_assert_equal(fdi->fd, fd);
    ci_assert_equal(fdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    fdi->on_ref_count_zero = FDI_ON_RCZ_CLOSE;

    if( fdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(fdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, fdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }

    citp_fdinfo_release_ref(fdi, 0);
    rc = 0;
  }
  else {
    ci_assert(fdip_is_passthru(fdip) || fdip_is_unknown(fdip));
    if( ! fdtable_strict() ) {
      CITP_FDTABLE_UNLOCK();
      got_lock = 0;
    }
    Log_V(ci_log("%s: fd=%d passthru=%d unknown=%d", __FUNCTION__, fd,
                 fdip_is_passthru(fdip), fdip_is_unknown(fdip)));
    fdtable_swap(fd, fdip_closing, fdip_unknown, fdtable_strict());
    rc = ci_tcp_helper_close_no_trampoline(fd);
  }

 done:
  if( got_lock )
    CITP_FDTABLE_UNLOCK();
  FDTABLE_ASSERT_VALID();
  return rc;
}
int citp_ep_dup3(unsigned fromfd, unsigned tofd, int flags)
{
  volatile citp_fdinfo_p* p_tofdip;
  citp_fdinfo_p tofdip;
  unsigned max;

  Log_V(log("%s(%d, %d)", __FUNCTION__, fromfd, tofd));

  /* Must be checked by callers. */
  ci_assert(fromfd != tofd);

  /* Hack: if [tofd] is the fd we're using for logging, we'd better choose
  ** a different one!
  */
  if( tofd == citp.log_fd )  citp_log_change_fd();

  ci_assert(citp.init_level >= CITP_INIT_FDTABLE);

  max = CI_MAX(fromfd, tofd);
  if( max >= citp_fdtable.inited_count ) {
    ci_assert(max < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(max);
    CITP_FDTABLE_UNLOCK();
  }

  /* Bug1151: Concurrent threads doing dup2(x,y) and dup2(y,x) can deadlock
  ** against one another.  So we take out a fat lock to prevent concurrent
  ** dup2()s.
  */
  /* Lock tofd.  We need to interlock against select and poll etc, so we
  ** also grab the exclusive lock.  Also grab the bug1151 lock.
  */
  pthread_mutex_lock(&citp_dup_lock);
  CITP_FDTABLE_LOCK();
  p_tofdip = &citp_fdtable.table[tofd].fdip;
 lock_tofdip_again:
  tofdip = *p_tofdip;
  if( fdip_is_busy(tofdip) )
    tofdip = citp_fdtable_busy_wait(tofd, 1);
  if( fdip_is_closing(tofdip) )
    tofdip = citp_fdtable_closing_wait(tofd, 1);
  if( fdip_is_reserved(tofdip) ) {
    /* ?? FIXME: we can't cope with this at the moment */
    CITP_FDTABLE_UNLOCK();
    Log_U(log("%s(%d, %d): target is reserved", __FUNCTION__, fromfd, tofd));
    errno = EBUSY;
    tofd = -1;
    goto out;
  }
  if( fdip_cas_fail(p_tofdip, tofdip, fdip_busy) )
    goto lock_tofdip_again;
  CITP_FDTABLE_UNLOCK();
  ci_assert(fdip_is_normal(tofdip) | fdip_is_passthru(tofdip) |
            fdip_is_unknown(tofdip));

  if( fdip_is_normal(tofdip) ) {
    /* We're duping onto a user-level socket. */
    citp_fdinfo* tofdi = fdip_to_fdi(tofdip);
    if( tofdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(tofdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, tofdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }
    ci_assert_equal(tofdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    tofdi->on_ref_count_zero = FDI_ON_RCZ_DUP2;
    tofdi->on_rcz.dup3_args.fd = fromfd;
    tofdi->on_rcz.dup3_args.flags = flags;
    citp_fdinfo_release_ref(tofdi, 0);
    {
      int i = 0;
      /* We need to free this fdi.  If someone is using it right now,
       * we are in trouble.  So, we spin for a while and interrupt the
       * user.  See bug 28123. */
      while( tofdi->on_ref_count_zero != FDI_ON_RCZ_DONE ) {
        if( ci_is_multithreaded() && i % 10000 == 9999 ) {
          pthread_t pth = tofdi->thread_id;
          if( pth != pthread_self() && pth != PTHREAD_NULL ) {
            pthread_kill(pth, SIGONLOAD);
            sleep(1);
          }
        }
        ci_spinloop_pause();
        i++;
      }
      ci_rmb();
    }
    if( tofdi->on_rcz.dup2_result < 0 ) {
      errno = -tofdi->on_rcz.dup2_result;
      /* Need to re-insert [tofdi] into the table. */
      ci_assert_equal(oo_atomic_read(&tofdi->ref_count), 0);
      oo_atomic_set(&tofdi->ref_count, 1);
      CI_DEBUG(tofdi->on_ref_count_zero = FDI_ON_RCZ_NONE);
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else {
      ci_assert(tofdi->on_rcz.dup2_result == tofd);
      citp_fdinfo_get_ops(tofdi)->dtor(tofdi, 0);
      citp_fdinfo_free(tofdi);
    }
    goto out;
  }

  ci_assert(fdip_is_passthru(tofdip) | fdip_is_unknown(tofdip));

  {
    /* We're dupping onto an O/S descriptor, or it may be closed.  Create a
    ** dummy [citp_fdinfo], just so we can share code with the case above.
    */
    citp_fdinfo fdi;
    fdi.fd = tofd;
    fdi.on_rcz.dup3_args.fd = fromfd;
    fdi.on_rcz.dup3_args.flags = flags;
    dup2_complete(&fdi, tofdip, 0);
    if( fdi.on_rcz.dup2_result < 0 ) {
      errno = -fdi.on_rcz.dup2_result;
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else
      ci_assert(fdi.on_rcz.dup2_result == tofd);
  }

 out:
  pthread_mutex_unlock(&citp_dup_lock);
  return tofd;
}
static void dup2_complete(citp_fdinfo* prev_tofdi,
                          citp_fdinfo_p prev_tofdip, int fdt_locked)
{
  volatile citp_fdinfo_p *p_fromfdip;
  unsigned fromfd = prev_tofdi->on_rcz.dup3_args.fd;
  unsigned tofd = prev_tofdi->fd;
  citp_fdinfo_p fromfdip;
  int rc;
#if CI_LIBC_HAS_dup3 || !defined(NDEBUG)
  int flags = prev_tofdi->on_rcz.dup3_args.flags;
#endif

#ifndef NDEBUG
  volatile citp_fdinfo_p* p_tofdip;
  p_tofdip = &citp_fdtable.table[tofd].fdip;
  ci_assert(fdip_is_busy(*p_tofdip));
#endif
  citp_fdinfo* fromfdi;

  p_fromfdip = &citp_fdtable.table[fromfd].fdip;
 lock_fromfdip_again:
  fromfdip = *p_fromfdip;
  if( fdip_is_busy(fromfdip) )
    fromfdip = citp_fdtable_busy_wait(fromfd, fdt_locked);
  if( fdip_is_closing(fromfdip) | fdip_is_reserved(fromfdip) ) {
    prev_tofdi->on_rcz.dup2_result = -EBADF;
    ci_wmb();
    prev_tofdi->on_ref_count_zero = FDI_ON_RCZ_DONE;
    return;
  }
#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(fromfdip) ) {
    if( !fdt_locked ) CITP_FDTABLE_LOCK();
    fromfdi = citp_fdtable_probe_locked(fromfd, CI_FALSE, CI_FALSE);
    if( !fdt_locked ) CITP_FDTABLE_UNLOCK();
    if( fromfdi == &citp_the_closed_fd ) {
      prev_tofdi->on_rcz.dup2_result = -EBADF;
      ci_wmb();
      prev_tofdi->on_ref_count_zero = FDI_ON_RCZ_DONE;
      citp_fdinfo_release_ref(fromfdi, CI_TRUE);
      return;
    }
    if( fromfdi )
      citp_fdinfo_release_ref(fromfdi, CI_TRUE);
  }
#endif
  if( fdip_cas_fail(p_fromfdip, fromfdip, fdip_busy) )
    goto lock_fromfdip_again;

  oo_rwlock_lock_write(&citp_dup2_lock);
#if CI_LIBC_HAS_dup3
  rc = ci_sys_dup3(fromfd, tofd, flags);
#else
  ci_assert_equal(flags, 0);
  rc = ci_sys_dup2(fromfd, tofd);
#endif
  oo_rwlock_unlock_write(&citp_dup2_lock);
  if( rc < 0 ) {
    citp_fdtable_busy_clear(fromfd, fromfdip, fdt_locked);
    prev_tofdi->on_rcz.dup2_result = -errno;
    ci_wmb();
    prev_tofdi->on_ref_count_zero = FDI_ON_RCZ_DONE;
    return;
  }

  ci_assert(fdip_is_normal(fromfdip) | fdip_is_passthru(fromfdip) |
            fdip_is_unknown(fromfdip));

  if( fdip_is_normal(fromfdip) &&
      (((fromfdi = fdip_to_fdi(fromfdip))->protocol->type) == CITP_EPOLL_FD) ) {
    citp_fdinfo* newfdi = citp_fdinfo_get_ops(fromfdi)->dup(fromfdi);
    if( newfdi ) {
      citp_fdinfo_init(newfdi, fdip_to_fdi(fromfdip)->protocol);
      citp_fdtable_insert(newfdi, tofd, fdt_locked);
    }
    else {
      /* Out of memory.  Can't probe epoll1 fd later on, so fail. */
      citp_fdtable_busy_clear(fromfd, fromfdip, fdt_locked);
      prev_tofdi->on_rcz.dup2_result = -ENOMEM;
      ci_wmb();
      prev_tofdi->on_ref_count_zero = FDI_ON_RCZ_DONE;
      return;
    }
  }
  else {
    /* Mark newfd as unknown.  When used, it'll get probed.
     *
     * We are not just being lazy here: Setting to unknown rather than
     * installing a proper fdi (when oldfd is accelerated) is essential to
     * vfork()+dup2()+exec() working properly.  Reason is that child and
     * parent share address space, so child is modifying the parent's
     * fdtable.  Setting an entry to unknown is safe.
     */
    citp_fdtable_busy_clear(tofd, fdip_unknown, fdt_locked);

#if CI_CFG_FD_CACHING
    /* Multiple refs to this now, don't allow it to be cached. */
    if( fdip_is_normal(fromfdip) )
      fdip_to_fdi(fromfdip)->can_cache = 0;
#endif
  }

  citp_fdtable_busy_clear(fromfd, fromfdip, fdt_locked);
  prev_tofdi->on_rcz.dup2_result = tofd;
  ci_wmb();
  prev_tofdi->on_ref_count_zero = FDI_ON_RCZ_DONE;
}
/*
** Why do these live here?  Because they need to hack into the low-level
** dirty nastiness of the fdtable.
*/
int citp_ep_dup(unsigned oldfd, int (*syscall)(int oldfd, long arg),
                long arg)
{
  /* This implements dup(oldfd) and fcntl(oldfd, F_DUPFD, arg). */

  volatile citp_fdinfo_p* p_oldfdip;
  citp_fdinfo_p oldfdip;
  citp_fdinfo* newfdi = 0;
  citp_fdinfo* oldfdi;
  int newfd;

  Log_V(log("%s(%d)", __FUNCTION__, oldfd));

  if(CI_UNLIKELY( citp.init_level < CITP_INIT_FDTABLE ||
                  oo_per_thread_get()->in_vfork_child ))
    /* Lib not initialised, so no U/L state, and therefore system dup()
    ** will do just fine.
    */
    return syscall(oldfd, arg);

  if( oldfd >= citp_fdtable.inited_count ) {
    /* NB. We can't just pass through in this case because we need to worry
    ** about other threads racing with us.  So we need to be able to lock
    ** this fd while we do the dup.
    */
    ci_assert(oldfd < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(oldfd);
    CITP_FDTABLE_UNLOCK();
  }

  p_oldfdip = &citp_fdtable.table[oldfd].fdip;
 again:
  oldfdip = *p_oldfdip;
  if( fdip_is_busy(oldfdip) )
    oldfdip = citp_fdtable_busy_wait(oldfd, 0);
  if( fdip_is_closing(oldfdip) | fdip_is_reserved(oldfdip) ) {
    errno = EBADF;
    return -1;
  }
#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(oldfdip) ) {
    CITP_FDTABLE_LOCK();
    oldfdi = citp_fdtable_probe_locked(oldfd, CI_FALSE, CI_FALSE);
    CITP_FDTABLE_UNLOCK();
    if( oldfdi == &citp_the_closed_fd ) {
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
      errno = EBADF;
      return -1;
    }
    if( oldfdi )
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
  }
#endif
  if( fdip_cas_fail(p_oldfdip, oldfdip, fdip_busy) )
    goto again;

#if CI_CFG_FD_CACHING
  /* May end up with multiple refs to this, don't allow it to be cached. */
  if( fdip_is_normal(oldfdip) )
    fdip_to_fdi(oldfdip)->can_cache = 0;
#endif

  if( fdip_is_normal(oldfdip) &&
      (((oldfdi = fdip_to_fdi(oldfdip))->protocol->type) == CITP_EPOLL_FD) ) {
    newfdi = citp_fdinfo_get_ops(oldfdi)->dup(oldfdi);
    if( ! newfdi ) {
      citp_fdtable_busy_clear(oldfd, oldfdip, 0);
      errno = ENOMEM;
      return -1;
    }

    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 )
      citp_fdtable_new_fd_set(newfd, fdip_busy, fdtable_strict());
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
    if( newfd >= 0 ) {
      citp_fdtable_insert(newfdi, newfd, 0);
      newfdi = 0;
    }
  }
  else {
    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 && newfd < citp_fdtable.inited_count ) {
      /* Mark newfd as unknown.  When used, it'll get probed.
       *
       * We are not just being lazy here: Setting to unknown rather than
       * installing a proper fdi (when oldfd is accelerated) is essential to
       * vfork()+dup()+exec() working properly.  Reason is that child and
       * parent share address space, so child is modifying the parent's
       * fdtable.  Setting an entry to unknown is safe.
       */
      citp_fdtable_new_fd_set(newfd, fdip_unknown, fdtable_strict());
    }
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
  }

  citp_fdtable_busy_clear(oldfd, oldfdip, 0);
  if( newfdi )  citp_fdinfo_free(newfdi);
  return newfd;
}
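/* Illustrative only: a minimal sketch of how dup()/fcntl(F_DUPFD) intercepts
 * might drive citp_ep_dup() through its syscall-thunk parameter.  The thunk
 * and intercept names are hypothetical; ci_sys_dup() and ci_sys_fcntl() are
 * assumed to be the usual passthrough wrappers for the real system calls,
 * and <fcntl.h> is assumed for F_DUPFD.  Not part of the library. */
#if 0
static int example_sys_dup_thunk(int oldfd, long arg_unused)
{
  (void) arg_unused;                  /* plain dup() ignores the argument */
  return ci_sys_dup(oldfd);
}

static int example_sys_dupfd_thunk(int oldfd, long arg)
{
  return ci_sys_fcntl(oldfd, F_DUPFD, arg);   /* fcntl(F_DUPFD, arg) flavour */
}

static int example_dup_intercept(int oldfd)
{
  /* citp_ep_dup() locks [oldfd] in the fdtable, invokes the thunk to create
   * the new descriptor, then marks the new entry (or inserts a duped epoll
   * fdi) before unlocking. */
  return citp_ep_dup(oldfd, example_sys_dup_thunk, 0);
}

static int example_dupfd_intercept(int oldfd, long minfd)
{
  return citp_ep_dup(oldfd, example_sys_dupfd_thunk, minfd);
}
#endif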
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs,
                            int iovecs_len,
                            enum onload_zc_buffer_type_flags flags)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt *pkt;
  unsigned max_len;

  Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs,
                  iovecs_len, flags));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < iovecs_len; ++i ) {
        max_len = CI_CFG_PKT_BUF_SIZE;
        pkt = ci_netif_pkt_tx_tcp_alloc(ni);
        if( pkt == NULL ) {
          while( --i >= 0 )
            ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf);
          rc = -ENOMEM;
          ci_netif_unlock(ni);
          goto out;
        }
        /* Make sure this is clear as it affects behaviour when freeing */
        pkt->pf.udp.rx_flags = 0;
        iovecs[i].buf = (struct oo_zc_buf *)pkt;
        if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) {
          if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) &&
              (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) {
            ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s);
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) +
              ts->outgoing_hdrs_len;
            max_len = tcp_eff_mss(ts);
          }
          else {
            /* Best guess.  We can fix it up later.  Magic 12 leaves
             * space for time stamp option (common case) */
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) +
              sizeof(ci_tcp_hdr) + 12;
          }
        }
        else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) {
          oo_tx_pkt_layout_init(pkt);
          iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) +
            sizeof(ci_udp_hdr);
        }
        else
          iovecs[i].iov_base = PKT_START(pkt);
        iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE -
          ((char *)iovecs[i].iov_base - (char *)pkt);
        if( iovecs[i].iov_len > max_len )
          iovecs[i].iov_len = max_len;
      }
      ni->state->n_async_pkts += iovecs_len;
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    case CITP_PASSTHROUGH_FD:
      rc = -ESOCKTNOSUPPORT;
      break;
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__,
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  }
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

 out:
  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
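/* Illustrative only: a minimal sketch of the buffer ownership contract for
 * onload_zc_alloc_buffers()/onload_zc_release_buffers() as we understand it
 * from the code above -- the allocator hands back stack-owned packet buffers
 * (handles in iovec.buf, usable space at iov_base/iov_len), and any buffer
 * the application decides not to send must be returned on the same fd so
 * that n_async_pkts stays balanced.  The helper name is hypothetical and not
 * part of the library. */
#if 0
static int example_alloc_then_release(int fd)
{
  struct onload_zc_iovec iov[4];
  onload_zc_handle handles[4];
  int i, rc;

  rc = onload_zc_alloc_buffers(fd, iov, 4, ONLOAD_ZC_BUFFER_HDR_UDP);
  if( rc < 0 )
    return rc;

  /* ...normally the application fills iov[i].iov_base and hands the buffers
   * to onload_zc_send(); if it chooses not to send them, it must give them
   * back to the stack they were allocated from... */

  for( i = 0; i < 4; ++i )
    handles[i] = iov[i].buf;
  return onload_zc_release_buffers(fd, handles, 4);
}
#endif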
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        ci_ip_pkt_fmt* pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i )
          ci_netif_pkt_release_check_keep(ni, (ci_ip_pkt_fmt*)bufs[i]);
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__,
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  }
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}