static int citp_udp_shutdown(citp_fdinfo* fdinfo, int how)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(ci_log("%s("EF_FMT", %d)", __FUNCTION__,
               EF_PRI_ARGS(epi, fdinfo->fd), how));

  ci_netif_lock_fdi(epi);
  rc = ci_udp_shutdown(&epi->sock, fdinfo->fd, how);
  ci_netif_unlock_fdi(epi);
  Log_V(log(LPF "shutdown: fd=%d rc=%d", fdinfo->fd, rc));
  return rc;
}
static int citp_udp_ioctl(citp_fdinfo* fdinfo, int request, void* arg)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log("%s("EF_FMT", %d, 0x%lx)", __FUNCTION__,
            EF_PRI_ARGS(epi, fdinfo->fd), request, (long) arg));

  rc = ci_udp_ioctl(&epi->sock, fdinfo->fd, request, arg);
  Log_V(log(LPF "ioctl()=%d", rc));
  if( rc < 0 )
    CI_SET_ERROR(rc, -rc);
  return rc;
}
static int citp_udp_setsockopt(citp_fdinfo* fdinfo, int level, int optname,
                               const void* optval, socklen_t optlen)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  citp_socket* ep = &epi->sock;
  ci_sock_cmn* s = ep->s;
  int rc;

  Log_VSC(log("%s("EF_FMT", %d, %d)", __FUNCTION__,
              EF_PRI_ARGS(epi, fdinfo->fd), level, optname));

  rc = ci_udp_setsockopt(&epi->sock, fdinfo->fd,
                         level, optname, optval, optlen);
  Log_V(log(LPF "setsockopt: fd=%d rc=%d", fdinfo->fd, rc));

  if( rc == CI_SOCKET_HANDOVER ) {
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_setsockopt);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  if( ci_opt_is_setting_reuseport(level, optname, optval, optlen) != 0 &&
      ! CI_SOCK_NOT_BOUND(s) ) {
    ci_log("%s: setting SO_REUSEPORT after binding is not supported on UDP",
           __FUNCTION__);
    /* Drop our reference before the early return so the fdinfo is not
     * leaked. */
    citp_fdinfo_release_ref(fdinfo, 0);
    return -ENOSYS;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
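/* Notes on the fdtable protocol used by the functions below.
 *
 * Each citp_fdtable.table[fd].fdip encodes the state of one descriptor:
 * "normal" (points at a live citp_fdinfo), "passthru" (owned by the OS),
 * "unknown" (not probed yet), "busy" (transiently owned by one thread),
 * "closing" (a close is in flight) or "reserved" (e.g. a netif fd).
 * Transitions are made with compare-and-swap, so most paths avoid the
 * fdtable lock.  In rough outline, an owner does:
 *
 *   fdip = *p_fdip;
 *   if( fdip_cas_succeed(p_fdip, fdip, fdip_busy) ) {
 *     // ... we own the entry: probe it, install an fdinfo, close it ...
 *     citp_fdtable_busy_clear(fd, final_fdip, fdt_locked);
 *   }
 *
 * where final_fdip stands for whatever state the owner publishes; the
 * busy_clear() is also what wakes any waiters.  citp_fdtable_busy_wait()
 * below is the waiting side of that handshake: it links a stack-allocated
 * citp_fdtable_waiter into the busy slot (again by CAS) and sleeps until
 * the owner publishes a final state.
 */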
citp_fdinfo_p citp_fdtable_busy_wait(unsigned fd, int fdt_locked)
{
  volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
  citp_fdtable_waiter waiter;
  int saved_errno = errno;

  Log_V(ci_log("%s: fd=%u", __FUNCTION__, fd));

  ci_assert(ci_is_multithreaded());

  oo_rwlock_cond_init(&waiter.cond);
  oo_rwlock_cond_lock(&waiter.cond);
 again:
  waiter.next = *p_fdip;
  if( fdip_is_busy(waiter.next) ) {
    /* we can replace one "busy" fdip by another without fdtable lock */
    if( fdip_cas_succeed(p_fdip, waiter.next, waiter_to_fdip(&waiter)) )
      oo_rwlock_cond_wait(&waiter.cond);
    goto again;
  }
  oo_rwlock_cond_unlock(&waiter.cond);
  oo_rwlock_cond_destroy(&waiter.cond);

  errno = saved_errno;
  return waiter.next;
}
static citp_fdinfo_p citp_fdtable_closing_wait(unsigned fd, int fdt_locked)
{
  /* We're currently spinning in this case.  Not ideal, but implementing
  ** blocking here is slightly tricky.  (Can be done, but I want proof that
  ** it's needed first!)
  */
  volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
  citp_fdinfo_p fdip;

  Log_V(ci_log("%s: fd=%u", __FUNCTION__, fd));

 again:
  fdip = *p_fdip;
  if( fdip_is_busy(fdip) )
    fdip = citp_fdtable_busy_wait(fd, fdt_locked);
  if( fdip_is_closing(fdip) ) {
    if( fdt_locked ) {
      /* Need to drop the lock to avoid deadlock with the other thread
      ** trying to close this fd! */
      CITP_FDTABLE_UNLOCK();
      CITP_FDTABLE_LOCK();
    }
    goto again;
  }
  return fdip;
}
static int citp_udp_connect(citp_fdinfo* fdinfo,
                            const struct sockaddr* sa, socklen_t sa_len,
                            citp_lib_context_t* lib_context)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log(LPF "connect(%d, sa, %d)", fdinfo->fd, sa_len));

  if( (epi->sock.s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY) != 0 ) {
    log("ERROR: connect() on a socket with SO_REUSEPORT is not supported "
        "unless the OS supports SO_REUSEPORT.");
    /* Set an error code rather than leaving errno untouched, and drop our
     * reference to the fdinfo before the early return. */
    errno = ENOSYS;
    citp_fdinfo_release_ref(fdinfo, 0);
    return -1;
  }

  ci_netif_lock_fdi(epi);
  rc = ci_udp_connect(&epi->sock, fdinfo->fd, sa, sa_len);
  ci_netif_unlock_fdi(epi);

  if( rc == CI_SOCKET_HANDOVER ) {
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_connect);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
static int citp_udp_sendmmsg(citp_fdinfo* fdinfo, struct mmsghdr* mmsg,
                             unsigned vlen, int flags)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;
  int i, rc;

  Log_V(log(LPF "sendmmsg(%d, msg, %u, %#x)", fdinfo->fd, vlen,
            (unsigned) flags));

  if( vlen == 0 )
    return 0;

  a.ep = &epi->sock;
  a.fd = fdinfo->fd;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);

  i = 0;
  do {
    rc = ci_udp_sendmsg(&a, &mmsg[i].msg_hdr, flags);
    if(CI_LIKELY( rc >= 0 ))
      mmsg[i].msg_len = rc;
    ++i;
  } while( rc >= 0 && i < vlen );

  /* sendmmsg() reports an error only if no datagram was sent; if some
   * datagrams went out before the failure, report the number sent. */
  if( rc >= 0 )
    return i;
  return (i > 1) ? i - 1 : rc;
}
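/* Handover outline: citp_fdinfo_do_handover() converts an accelerated fd
 * into a plain OS-backed one.  Roughly:
 *   1. If the fd is a member of an epoll set, let the epoll code fix up
 *      its bookkeeping (epollb before the move, epoll afterwards).
 *   2. Ask the driver to swap the file under the fd (fdtable_fd_move());
 *      -EBUSY together with an epoll_fd means the socket has already moved
 *      away, so we retry with OO_IOC_FILE_MOVED.
 *   3. If the move failed, reprobe the slot, which should find a
 *      passthrough fd whose underlying OS socket we can still adjust.
 *   4. Restore the O_NONBLOCK setting recorded in
 *      on_rcz.handover_nonb_switch, publish fdip_passthru, and destroy the
 *      old fdinfo.
 */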
static void citp_fdinfo_do_handover(citp_fdinfo* fdi, int fdt_locked)
{
  int rc;
  citp_fdinfo* epoll_fdi = NULL;
  int os_fd = fdi->fd;
#ifndef NDEBUG
  /* Yuk: does for UDP too. */
  volatile citp_fdinfo_p* p_fdip;
  p_fdip = &citp_fdtable.table[fdi->fd].fdip;
  ci_assert(fdip_is_busy(*p_fdip));
#endif

  Log_V(ci_log("%s: fd=%d nonb_switch=%d", __FUNCTION__, fdi->fd,
               fdi->on_rcz.handover_nonb_switch));

  if( fdi->epoll_fd >= 0 ) {
    epoll_fdi = citp_epoll_fdi_from_member(fdi, fdt_locked);
    if( epoll_fdi->protocol->type == CITP_EPOLLB_FD )
      citp_epollb_on_handover(epoll_fdi, fdi);
  }
  rc = fdtable_fd_move(fdi->fd, OO_IOC_TCP_HANDOVER);
  if( rc == -EBUSY && fdi->epoll_fd >= 0 ) {
    ci_assert(fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags &
              CI_SB_AFLAG_MOVED_AWAY);
    /* If this is our epoll, we can do full handover: we manually add the os
     * fd into the epoll set.
     * Fixme: ensure we are not in _other_ epoll sets */
    ci_bit_clear(&fdi_to_sock_fdi(fdi)->sock.s->b.sb_aflags,
                 CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);
    rc = fdtable_fd_move(fdi->fd, OO_IOC_FILE_MOVED);
  }
  if( rc != 0 ) {
    citp_fdinfo* new_fdi;
    if( ! fdt_locked ) CITP_FDTABLE_LOCK();
    new_fdi = citp_fdtable_probe_locked(fdi->fd, CI_TRUE, CI_TRUE);
    citp_fdinfo_release_ref(new_fdi, 1);
    if( ! fdt_locked ) CITP_FDTABLE_UNLOCK();
    ci_assert_equal(citp_fdinfo_get_type(new_fdi), CITP_PASSTHROUGH_FD);
    os_fd = fdi_to_alien_fdi(new_fdi)->os_socket;
  }
  if( fdi->on_rcz.handover_nonb_switch >= 0 ) {
    int on_off = !! fdi->on_rcz.handover_nonb_switch;
    int ioctl_rc = ci_sys_ioctl(os_fd, FIONBIO, &on_off);
    if( ioctl_rc < 0 )
      Log_E(ci_log("%s: ioctl failed on_off=%d", __FUNCTION__, on_off));
  }
  if( rc != 0 )
    goto exit;
  citp_fdtable_busy_clear(fdi->fd, fdip_passthru, fdt_locked);
 exit:
  citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
  if( epoll_fdi != NULL && epoll_fdi->protocol->type == CITP_EPOLL_FD )
    citp_epoll_on_handover(epoll_fdi, fdi, fdt_locked);
  if( epoll_fdi != NULL )
    citp_fdinfo_release_ref(epoll_fdi, fdt_locked);
  citp_fdinfo_free(fdi);
}
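/* __citp_fdinfo_ref_count_zero() runs when the last reference to an fdinfo
 * goes away; fdi->on_ref_count_zero records why, and therefore what to do:
 *   FDI_ON_RCZ_CLOSE    - really close the fd (optionally caching it),
 *   FDI_ON_RCZ_DUP2     - complete a pending dup2/dup3 onto this fd,
 *   FDI_ON_RCZ_HANDOVER - hand the fd over to the OS,
 *   FDI_ON_RCZ_MOVED    - the socket moved stacks; just destroy the fdinfo.
 */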
void __citp_fdinfo_ref_count_zero(citp_fdinfo* fdi, int fdt_locked)
{
  Log_V(log("%s: fd=%d on_rcz=%d", __FUNCTION__, fdi->fd,
            fdi->on_ref_count_zero));

  citp_fdinfo_assert_valid(fdi);
  ci_assert(oo_atomic_read(&fdi->ref_count) == 0);
  ci_assert_ge(fdi->fd, 0);
  ci_assert_lt(fdi->fd, citp_fdtable.inited_count);
  ci_assert_nequal(fdi_to_fdip(fdi), citp_fdtable.table[fdi->fd].fdip);

  switch( fdi->on_ref_count_zero ) {
  case FDI_ON_RCZ_CLOSE:
#if CI_CFG_FD_CACHING
    if( citp_fdinfo_get_ops(fdi)->cache(fdi) == 1 ) {
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_LOCK();
      fdtable_swap(fdi->fd, fdip_closing, fdip_unknown,
                   fdt_locked | fdtable_strict());
      citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked | fdtable_strict());
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_UNLOCK();
      citp_fdinfo_free(fdi);
      break;
    }
    else
#endif
    {
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_LOCK();
      ci_tcp_helper_close_no_trampoline(fdi->fd);
      /* The swap must occur after the close, otherwise another thread could
       * cause a probe of the old endpoint info, which is about to be freed.
       */
      fdtable_swap(fdi->fd, fdip_closing, fdip_unknown,
                   fdt_locked | fdtable_strict());
      citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked | fdtable_strict());
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_UNLOCK();
      citp_fdinfo_free(fdi);
      break;
    }
  case FDI_ON_RCZ_DUP2:
    dup2_complete(fdi, fdi_to_fdip(fdi), fdt_locked);
    break;
  case FDI_ON_RCZ_HANDOVER:
    citp_fdinfo_do_handover(fdi, fdt_locked);
    break;
  case FDI_ON_RCZ_MOVED:
    citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
    citp_fdinfo_free(fdi);
    break;
  default:
    CI_DEBUG(ci_log("%s: fd=%d on_ref_count_zero=%d", __FUNCTION__,
                    fdi->fd, fdi->on_ref_count_zero));
    ci_assert(0);
  }
}
int citp_ep_ioctl(citp_fdinfo* fdinfo, unsigned long request, long arg)
{
  CITP_FDINFO_ASSERT_VALID(fdinfo);

  Log_V(log(LPF "ioctl(%d, %lu, %ld)", fdinfo->fd, request, arg));

  /*! \TODO see /usr/include/bits/ioctls.h for lots of socketey ones */
  ci_fail(("?? not yet implemented"));

  errno = ENOTSUP;
  return -1;
}
/* 2004/08/16 stg: added [fdt_locked] to allow the fd table to be
 * locked before this function.  [fdt_locked] = 0 for legacy operation */
void __citp_epinfo_ref_count_zero(citp_epinfo* epinfo,
                                  citp_fdinfo* last_fdinfo, int fdt_locked)
{
  Log_V(log(LPF "ref_count_zero(%p, %d)", epinfo, last_fdinfo->fd));

  ci_assert(epinfo);
  ci_assert(oo_atomic_read(&epinfo->ref_count) == 0);
  ci_assert(epinfo->protocol);
  ci_assert(last_fdinfo);
  ci_assert(last_fdinfo->ep == epinfo);

  epinfo->protocol->ops.dtor(epinfo, last_fdinfo, fdt_locked);
}
static int citp_udp_getpeername(citp_fdinfo* fdinfo, struct sockaddr* sa,
                                socklen_t* p_sa_len)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log("%s("EF_FMT")", __FUNCTION__, EF_PRI_ARGS(epi, fdinfo->fd)));

  ci_netif_lock_fdi(epi);
  rc = ci_udp_getpeername(&epi->sock, sa, p_sa_len);
  ci_netif_unlock_fdi(epi);
  return rc;
}
static int citp_udp_recv(citp_fdinfo* fdinfo, struct msghdr* msg, int flags)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;

  Log_V(log(LPF "recv(%d, msg, %#x)", fdinfo->fd, (unsigned) flags));

  a.fd = fdinfo->fd;
  a.ep = &epi->sock;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);
  return ci_udp_recvmsg(&a, msg, flags);
}
static int citp_udp_getsockopt(citp_fdinfo* fdinfo, int level, int optname,
                               void* optval, socklen_t* optlen)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  int rc;

  Log_V(log("%s("EF_FMT", %d, %d)", __FUNCTION__,
            EF_PRI_ARGS(epi, fdinfo->fd), level, optname));

  ci_netif_lock_fdi(epi);
  rc = ci_udp_getsockopt(&epi->sock, fdinfo->fd, level, optname,
                         optval, optlen);
  ci_netif_unlock_fdi(epi);
  return rc;
}
static int citp_udp_recvmmsg(citp_fdinfo* fdinfo, struct mmsghdr* msg,
                             unsigned vlen, int flags,
                             const struct timespec* timeout)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;

  Log_V(log(LPF "recvmmsg(%d, msg, %u, %#x)", fdinfo->fd, vlen,
            (unsigned) flags));

  a.fd = fdinfo->fd;
  a.ep = &epi->sock;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);
  return ci_udp_recvmmsg(&a, msg, vlen, flags, timeout);
}
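/* SO_REUSEPORT note for bind below: ci_udp_reuseport_bind() can move the
 * socket into a clustered stack.  When that happens the old fdinfo is
 * stale, so we look the fd up again and reprobe it (citp_reprobe_moved()),
 * which also maps the new stack into this process before the ordinary
 * ci_udp_bind() completes the job.
 */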
static int citp_udp_bind(citp_fdinfo* fdinfo, const struct sockaddr* sa,
                         socklen_t sa_len)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  citp_socket* ep = &epi->sock;
  ci_sock_cmn* s = ep->s;
  int rc;

  Log_V(log(LPF "bind(%d, sa, %d)", fdinfo->fd, sa_len));

  ci_udp_handle_force_reuseport(fdinfo->fd, ep, sa, sa_len);

  if( (s->s_flags & CI_SOCK_FLAG_REUSEPORT) != 0 ) {
    if( (rc = ci_udp_reuseport_bind(ep, fdinfo->fd, sa, sa_len)) == 0 ) {
      /* The socket has moved, so we need to reprobe the fd.  This will also
       * map the new stack into the address space of the executing process.
       */
      fdinfo = citp_fdtable_lookup(fdinfo->fd);
      fdinfo = citp_reprobe_moved(fdinfo, CI_FALSE);
      epi = fdi_to_sock_fdi(fdinfo);
      ep = &epi->sock;
      ci_netif_cluster_prefault(ep->netif);
    }
    else {
      goto done;
    }
  }

  ci_netif_lock_fdi(epi);
  rc = ci_udp_bind(ep, fdinfo->fd, sa, sa_len);
  ci_netif_unlock_fdi(epi);

 done:
  if( rc == CI_SOCKET_HANDOVER ) {
    ci_assert_equal(s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY, 0);
    CITP_STATS_NETIF(++epi->sock.netif->state->stats.udp_handover_bind);
    citp_fdinfo_handover(fdinfo, -1);
    return 0;
  }

  citp_fdinfo_release_ref(fdinfo, 0);
  return rc;
}
/* Find out what sort of thing [fd] is, and if it is a user-level socket
 * then map in the user-level state.
 */
static citp_fdinfo * citp_fdtable_probe_locked(unsigned fd, int print_banner,
                                               int fdip_is_already_busy)
{
  citp_fdinfo* fdi = NULL;
  struct stat64 st;
  ci_ep_info_t info;

  if( ! fdip_is_already_busy ) {
    volatile citp_fdinfo_p* p_fdip;
    citp_fdinfo_p fdip;
    /* ?? We're repeating some effort already expended in lookup() here, but
    ** this keeps it cleaner.  May optimise down the line when I understand
    ** what other code needs to call this.
    */
    p_fdip = &citp_fdtable.table[fd].fdip;
   again:
    fdip = *p_fdip;
    if( fdip_is_busy(fdip) )
      fdip = citp_fdtable_busy_wait(fd, 1);
    if( ! fdip_is_unknown(fdip) && ! fdip_is_normal(fdip) )
      goto exit;
    if( fdip_cas_fail(p_fdip, fdip, fdip_busy) )
      goto again;

    if( fdip_is_normal(fdip) ) {
      fdi = fdip_to_fdi(fdip);
      citp_fdinfo_ref(fdi);
      citp_fdtable_busy_clear(fd, fdip, 1);
      goto exit;
    }
  }

  if( ci_sys_fstat64(fd, &st) != 0 ) {
    /* fstat() failed.  Must be a bad (closed) file descriptor, so
    ** leave this entry as unknown.  Return citp_the_closed_fd to avoid the
    ** caller passing through to an fd that is created asynchronously.
    */
    citp_fdtable_busy_clear(fd, fdip_unknown, 1);
    fdi = &citp_the_closed_fd;
    citp_fdinfo_ref(fdi);
    goto exit;
  }

  /* oo_get_st_rdev() and oo_onloadfs_dev_t() open-and-close an fd, so the
   * fdtable should be locked if strict mode is requested. */
  if( fdtable_strict() ) { CITP_FDTABLE_ASSERT_LOCKED(1); }

  if( st.st_dev == oo_onloadfs_dev_t() ) {
    /* Retrieve user-level endpoint info */
    if( oo_ep_info(fd, &info) < 0 ) {
      Log_V(log("%s: fd=%d type=%d unknown", __FUNCTION__, fd, info.fd_type));
      citp_fdtable_busy_clear(fd, fdip_passthru, 1);
      goto exit;
    }

    switch( info.fd_type ) {
    case CI_PRIV_TYPE_TCP_EP:
    case CI_PRIV_TYPE_UDP_EP:
    case CI_PRIV_TYPE_PASSTHROUGH_EP:
    case CI_PRIV_TYPE_ALIEN_EP:
#if CI_CFG_USERSPACE_PIPE
    case CI_PRIV_TYPE_PIPE_READER:
    case CI_PRIV_TYPE_PIPE_WRITER:
#endif
    {
      citp_fdinfo_p fdip;

      Log_V(log("%s: fd=%d %s restore", __FUNCTION__, fd,
                info.fd_type == CI_PRIV_TYPE_TCP_EP ? "TCP":
#if CI_CFG_USERSPACE_PIPE
                info.fd_type != CI_PRIV_TYPE_UDP_EP ? "PIPE" :
#endif
                "UDP"));
      fdip = citp_fdtable_probe_restore(fd, &info, print_banner);
      if( fdip_is_normal(fdip) )
        fdi = fdip_to_fdi(fdip);
      else
        citp_fdtable_busy_clear(fd, fdip, 1);
      goto exit;
    }

    case CI_PRIV_TYPE_NETIF:
      /* This should never happen, because netif fds are close-on-exec.
      ** But let's leave this code here just in case my reasoning is bad.
      */
      Log_U(log("%s: fd=%d NETIF reserved", __FUNCTION__, fd));
      citp_fdtable_busy_clear(fd, fdip_reserved, 1);
      fdi = &citp_the_reserved_fd;
      citp_fdinfo_ref(fdi);
      goto exit;

    case CI_PRIV_TYPE_NONE:
      /* This happens if a thread gets at an onload driver fd that has just
       * been created, but not yet specialised.  On Linux I think this
       * means it will shortly be a new netif internal fd.  (fds associated
       * with sockets and pipes are never unspecialised).
       */
      Log_V(log("%s: fd=%d TYPE_NONE", __FUNCTION__, fd));
      citp_fdtable_busy_clear(fd, fdip_passthru, 1);
      goto exit;

    default:
      CI_TEST(0);
      break;
    }
  }
  else if( ci_major(st.st_rdev) == ci_major(oo_get_st_rdev(OO_EPOLL_DEV)) ) {
    citp_epollb_fdi* epi = CI_ALLOC_OBJ(citp_epollb_fdi);
    if( ! epi ) {
      Log_E(log("%s: out of memory (epoll_fdi)", __FUNCTION__));
      citp_fdtable_busy_clear(fd, fdip_passthru, 1);
      goto exit;
    }
    oo_epollb_ctor(epi);
    fdi = &epi->fdinfo;
    citp_fdinfo_init(fdi, &citp_epollb_protocol_impl);
    citp_fdinfo_ref(fdi);
    citp_fdtable_insert(fdi, fd, 1);
    goto exit;
  }

#ifndef NDEBUG
  /* /dev/onload may be netif only; they are closed on fork or exec */
  if( ci_major(st.st_rdev) == ci_major(oo_get_st_rdev(OO_STACK_DEV)) )
    Log_U(log("%s: %d is /dev/onload", __FUNCTION__, fd));
#endif

  /* Not one of ours, so pass through. */
  Log_V(log("%s: fd=%u non-efab", __FUNCTION__, fd));
  citp_fdtable_busy_clear(fd, fdip_passthru, 1);

 exit:
  return fdi;
}
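/* Close protocol: citp_ep_close() must not let another thread probe or
 * dup onto the fd while a close is in flight, so it swaps fdip_closing
 * into the slot first and only then performs the real close.  For
 * accelerated fds the close itself is deferred to the ref-count-zero hook
 * (FDI_ON_RCZ_CLOSE above); for passthru/unknown fds we close directly
 * and swap the slot back to unknown.
 */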
int citp_ep_close(unsigned fd)
{
  volatile citp_fdinfo_p* p_fdip;
  citp_fdinfo_p fdip;
  int rc, got_lock;
  citp_fdinfo* fdi;

  /* Do not touch the shared fdtable when in a vfork child. */
  if( oo_per_thread_get()->in_vfork_child )
    return ci_tcp_helper_close_no_trampoline(fd);

  /* Interlock against other closes, against the fdtable being extended,
  ** and against select and poll.
  */
  CITP_FDTABLE_LOCK();
  got_lock = 1;

  __citp_fdtable_extend(fd);

  if( fd >= citp_fdtable.inited_count ) {
    rc = ci_sys_close(fd);
    goto done;
  }

  p_fdip = &citp_fdtable.table[fd].fdip;
 again:
  fdip = *p_fdip;
  if( fdip_is_busy(fdip) )
    fdip = citp_fdtable_busy_wait(fd, 1);

  if( fdip_is_closing(fdip) | fdip_is_reserved(fdip) ) {
    /* Concurrent close, or an attempt to close a reserved fd. */
    Log_V(ci_log("%s: fd=%d closing=%d reserved=%d", __FUNCTION__, fd,
                 fdip_is_closing(fdip), fdip_is_reserved(fdip)));
    errno = EBADF;
    rc = -1;
    goto done;
  }

#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(fdip) ) {
    fdi = citp_fdtable_probe_locked(fd, CI_FALSE, CI_FALSE);
    if( fdi == &citp_the_closed_fd ) {
      citp_fdinfo_release_ref(fdi, CI_TRUE);
      errno = EBADF;
      rc = -1;
      goto done;
    }
    if( fdi )
      citp_fdinfo_release_ref(fdi, CI_TRUE);
  }
#endif

  ci_assert(fdip_is_normal(fdip) | fdip_is_passthru(fdip) |
            fdip_is_unknown(fdip));

  /* Swap in the "closing" pseudo-fdinfo.  This lets any other thread know
  ** that we're in the middle of closing this fd.
  */
  if( fdip_cas_fail(p_fdip, fdip, fdip_closing) )
    goto again;

  if( fdip_is_normal(fdip) ) {
    fdi = fdip_to_fdi(fdip);

    CITP_FDTABLE_UNLOCK();
    got_lock = 0;

    if( fdi->is_special ) {
      Log_V(ci_log("%s: fd=%d is_special, returning EBADF", __FUNCTION__,
                   fd));
      errno = EBADF;
      rc = -1;
      fdtable_swap(fd, fdip_closing, fdip, 0);
      goto done;
    }

    Log_V(ci_log("%s: fd=%d u/l socket", __FUNCTION__, fd));
    ci_assert_equal(fdi->fd, fd);
    ci_assert_equal(fdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    fdi->on_ref_count_zero = FDI_ON_RCZ_CLOSE;

    if( fdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(fdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, fdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }

    citp_fdinfo_release_ref(fdi, 0);
    rc = 0;
  }
  else {
    ci_assert(fdip_is_passthru(fdip) || fdip_is_unknown(fdip));
    if( ! fdtable_strict() ) {
      CITP_FDTABLE_UNLOCK();
      got_lock = 0;
    }
    Log_V(ci_log("%s: fd=%d passthru=%d unknown=%d", __FUNCTION__, fd,
                 fdip_is_passthru(fdip), fdip_is_unknown(fdip)));
    fdtable_swap(fd, fdip_closing, fdip_unknown, fdtable_strict());
    rc = ci_tcp_helper_close_no_trampoline(fd);
  }

 done:
  if( got_lock )  CITP_FDTABLE_UNLOCK();
  FDTABLE_ASSERT_VALID();
  return rc;
}
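/* dup3 onto an accelerated fd is completed from the ref-count-zero hook:
 * we mark the target fdinfo FDI_ON_RCZ_DUP2, drop our reference, and spin
 * until dup2_complete() (run by whichever thread drops the last reference)
 * reports the result in on_rcz.dup2_result.  The SIGONLOAD prod in the
 * spin loop below handles the case where another thread holds the last
 * reference inside a blocking call (see bug 28123).
 */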
int citp_ep_dup3(unsigned fromfd, unsigned tofd, int flags)
{
  volatile citp_fdinfo_p* p_tofdip;
  citp_fdinfo_p tofdip;
  unsigned max;

  Log_V(log("%s(%d, %d)", __FUNCTION__, fromfd, tofd));

  /* Must be checked by callers. */
  ci_assert(fromfd != tofd);

  /* Hack: if [tofd] is the fd we're using for logging, we'd better choose
  ** a different one!
  */
  if( tofd == citp.log_fd )  citp_log_change_fd();

  ci_assert(citp.init_level >= CITP_INIT_FDTABLE);

  max = CI_MAX(fromfd, tofd);
  if( max >= citp_fdtable.inited_count ) {
    ci_assert(max < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(max);
    CITP_FDTABLE_UNLOCK();
  }

  /* Bug1151: Concurrent threads doing dup2(x,y) and dup2(y,x) can deadlock
  ** against one another.  So we take out a fat lock to prevent concurrent
  ** dup2()s.
  */
  /* Lock tofd.  We need to interlock against select and poll etc, so we
  ** also grab the exclusive lock.  Also grab the bug1151 lock.
  */
  pthread_mutex_lock(&citp_dup_lock);
  CITP_FDTABLE_LOCK();
  p_tofdip = &citp_fdtable.table[tofd].fdip;
 lock_tofdip_again:
  tofdip = *p_tofdip;
  if( fdip_is_busy(tofdip) )
    tofdip = citp_fdtable_busy_wait(tofd, 1);
  if( fdip_is_closing(tofdip) )
    tofdip = citp_fdtable_closing_wait(tofd, 1);
  if( fdip_is_reserved(tofdip) ) {
    /* ?? FIXME: we can't cope with this at the moment */
    CITP_FDTABLE_UNLOCK();
    Log_U(log("%s(%d, %d): target is reserved", __FUNCTION__, fromfd, tofd));
    errno = EBUSY;
    tofd = -1;
    goto out;
  }
  if( fdip_cas_fail(p_tofdip, tofdip, fdip_busy) )
    goto lock_tofdip_again;
  CITP_FDTABLE_UNLOCK();
  ci_assert(fdip_is_normal(tofdip) | fdip_is_passthru(tofdip) |
            fdip_is_unknown(tofdip));

  if( fdip_is_normal(tofdip) ) {
    /* We're duping onto a user-level socket. */
    citp_fdinfo* tofdi = fdip_to_fdi(tofdip);
    if( tofdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(tofdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, tofdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }
    ci_assert_equal(tofdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    tofdi->on_ref_count_zero = FDI_ON_RCZ_DUP2;
    tofdi->on_rcz.dup3_args.fd = fromfd;
    tofdi->on_rcz.dup3_args.flags = flags;
    citp_fdinfo_release_ref(tofdi, 0);
    {
      int i = 0;
      /* We need to free this fdi.  If someone is using it right now,
       * we are in trouble.  So, we spin for a while and interrupt the
       * user.  See bug 28123. */
      while( tofdi->on_ref_count_zero != FDI_ON_RCZ_DONE ) {
        if( ci_is_multithreaded() && i % 10000 == 9999 ) {
          pthread_t pth = tofdi->thread_id;
          if( pth != pthread_self() && pth != PTHREAD_NULL ) {
            pthread_kill(pth, SIGONLOAD);
            sleep(1);
          }
        }
        ci_spinloop_pause();
        i++;
      }
      ci_rmb();
    }
    if( tofdi->on_rcz.dup2_result < 0 ) {
      errno = -tofdi->on_rcz.dup2_result;
      /* Need to re-insert [tofdi] into the table. */
      ci_assert_equal(oo_atomic_read(&tofdi->ref_count), 0);
      oo_atomic_set(&tofdi->ref_count, 1);
      CI_DEBUG(tofdi->on_ref_count_zero = FDI_ON_RCZ_NONE);
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else {
      ci_assert(tofdi->on_rcz.dup2_result == tofd);
      citp_fdinfo_get_ops(tofdi)->dtor(tofdi, 0);
      citp_fdinfo_free(tofdi);
    }
    goto out;
  }

  ci_assert(fdip_is_passthru(tofdip) | fdip_is_unknown(tofdip));

  {
    /* We're duping onto an O/S descriptor, or it may be closed.  Create a
    ** dummy [citp_fdinfo], just so we can share code with the case above.
    */
    citp_fdinfo fdi;
    fdi.fd = tofd;
    fdi.on_rcz.dup3_args.fd = fromfd;
    fdi.on_rcz.dup3_args.flags = flags;
    dup2_complete(&fdi, tofdip, 0);
    if( fdi.on_rcz.dup2_result < 0 ) {
      errno = -fdi.on_rcz.dup2_result;
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else
      ci_assert(fdi.on_rcz.dup2_result == tofd);
  }

 out:
  pthread_mutex_unlock(&citp_dup_lock);
  return tofd;
}
/*
** Why do these live here?  Because they need to hack into the low-level
** dirty nastiness of the fdtable.
*/
int citp_ep_dup(unsigned oldfd, int (*syscall)(int oldfd, long arg),
                long arg)
{
  /* This implements dup(oldfd) and fcntl(oldfd, F_DUPFD, arg). */
  volatile citp_fdinfo_p* p_oldfdip;
  citp_fdinfo_p oldfdip;
  citp_fdinfo* newfdi = 0;
  citp_fdinfo* oldfdi;
  int newfd;

  Log_V(log("%s(%d)", __FUNCTION__, oldfd));

  if(CI_UNLIKELY( citp.init_level < CITP_INIT_FDTABLE ||
                  oo_per_thread_get()->in_vfork_child ))
    /* Lib not initialised, so no U/L state, and therefore system dup()
    ** will do just fine.
    */
    return syscall(oldfd, arg);

  if( oldfd >= citp_fdtable.inited_count ) {
    /* NB. We can't just pass through in this case because we need to worry
    ** about other threads racing with us.  So we need to be able to lock
    ** this fd while we do the dup.
    */
    ci_assert(oldfd < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(oldfd);
    CITP_FDTABLE_UNLOCK();
  }

  p_oldfdip = &citp_fdtable.table[oldfd].fdip;
 again:
  oldfdip = *p_oldfdip;
  if( fdip_is_busy(oldfdip) )
    oldfdip = citp_fdtable_busy_wait(oldfd, 0);
  if( fdip_is_closing(oldfdip) | fdip_is_reserved(oldfdip) ) {
    errno = EBADF;
    return -1;
  }
#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(oldfdip) ) {
    CITP_FDTABLE_LOCK();
    oldfdi = citp_fdtable_probe_locked(oldfd, CI_FALSE, CI_FALSE);
    CITP_FDTABLE_UNLOCK();
    if( oldfdi == &citp_the_closed_fd ) {
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
      errno = EBADF;
      return -1;
    }
    if( oldfdi )
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
  }
#endif
  if( fdip_cas_fail(p_oldfdip, oldfdip, fdip_busy) )
    goto again;

#if CI_CFG_FD_CACHING
  /* May end up with multiple refs to this, so don't allow it to be
   * cached. */
  if( fdip_is_normal(oldfdip) )
    fdip_to_fdi(oldfdip)->can_cache = 0;
#endif

  if( fdip_is_normal(oldfdip) &&
      (((oldfdi = fdip_to_fdi(oldfdip))->protocol->type) ==
       CITP_EPOLL_FD) ) {
    newfdi = citp_fdinfo_get_ops(oldfdi)->dup(oldfdi);
    if( ! newfdi ) {
      citp_fdtable_busy_clear(oldfd, oldfdip, 0);
      errno = ENOMEM;
      return -1;
    }

    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 )
      citp_fdtable_new_fd_set(newfd, fdip_busy, fdtable_strict());
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
    if( newfd >= 0 ) {
      citp_fdtable_insert(newfdi, newfd, 0);
      newfdi = 0;
    }
  }
  else {
    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 && newfd < citp_fdtable.inited_count ) {
      /* Mark newfd as unknown.  When used, it'll get probed.
       *
       * We are not just being lazy here: setting to unknown rather than
       * installing a proper fdi (when oldfd is accelerated) is essential to
       * vfork()+dup()+exec() working properly.  The reason is that child
       * and parent share an address space, so the child is modifying the
       * parent's fdtable.  Setting an entry to unknown is safe.
       */
      citp_fdtable_new_fd_set(newfd, fdip_unknown, fdtable_strict());
    }
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
  }

  citp_fdtable_busy_clear(oldfd, oldfdip, 0);
  if( newfdi )
    citp_fdinfo_free(newfdi);
  return newfd;
}
/*! Initialise - called on startup to save away any relevant current
**  environment variables.
**  \return  0 for success, -1 for failure
*/
int citp_environ_init(void)
{
  char** env_ptr;
  size_t mem_needed = 0;
  unsigned int n;
  char *string_buf, *p;
  const char* lib_path = NULL;
  const char* ld_preload_value = NULL;

  env_ptr = __environ ? __environ : empty_env;
  saved_env_count = 0;
  while( *env_ptr != NULL ) {
    if( is_our_env_var(*env_ptr) ) {
      mem_needed += strlen(*env_ptr) + 1;
      saved_env_count++;
      if( env_is_ld_preload(*env_ptr) )
        ld_preload_value = *env_ptr + 11;   /* skip "LD_PRELOAD=" */
    }
    /* temporary hack for djr */
    if( strcmp(*env_ptr, "EF_NO_PRELOAD_RESTORE=1") == 0 ) {
      Log_V(log("Environment restore disabled"));
      saved_env_count = 0;
      return 0;
    }
    /* end temporary hack */
    env_ptr++;
  }
  if( saved_env_count == 0 ) {
    Log_V(log("Invoked without LD_PRELOAD?  Environment restore disabled."));
    return 0;
  }

  /* Add ourself to LD_PRELOAD if we've been asked to. */
  if( getenv("EF_LD_PRELOAD") ) {
    const char* full_path = citp_find_loaded_library();
    ci_assert(full_path);
    ci_assert(strrchr(full_path, '/'));
    lib_path = strrchr(full_path, '/') + 1;

    /* The correct LD_PRELOAD value should be the same as lib_path, or at
     * least start with lib_path + ':'. */
    if( !ld_preload_value ||
        strlen(ld_preload_value) < strlen(lib_path) ||
        strncmp(ld_preload_value, lib_path, strlen(lib_path)) != 0 ||
        ((ld_preload_value[strlen(lib_path)] != ':') &&
         (ld_preload_value[strlen(lib_path)] != '\0')) ) {
      mem_needed += strlen(lib_path);       /* Add our library */
      if( !ld_preload_value ) {
        mem_needed += 12;                   /* Add "LD_PRELOAD=" line */
        saved_env_count++;
      }
      if( ld_preload_value && ld_preload_value[0] == '\0' )
        ld_preload_value = NULL;            /* Do not set ":" at the end */
      if( ld_preload_value )
        mem_needed++;                       /* Add ':' separator */
      Log_V(log("%s: LD_PRELOAD=\"%s\", but we are loaded as %s",
                __FUNCTION__, ld_preload_value ? : "", full_path));
    }
    else
/* we don't register a protocol impl */
int citp_pipe_create(int fds[2], int flags)
{
  citp_pipe_fdi* epi_read;
  citp_pipe_fdi* epi_write;
  struct oo_pipe* p = NULL;             /* make compiler happy */
  ci_netif* ni;
  int rc = -1;
  ef_driver_handle fd = -1;

  Log_V(log(LPF "pipe()"));

  /* citp_netif_exists() does not need citp_ul_lock here */
  if( CITP_OPTS.ul_pipe == CI_UNIX_PIPE_ACCELERATE_IF_NETIF &&
      ! citp_netif_exists() ) {
    return CITP_NOT_HANDLED;
  }

  rc = citp_netif_alloc_and_init(&fd, &ni);
  if( rc != 0 ) {
    if( rc == CI_SOCKET_HANDOVER ) {
      /* This implies EF_DONT_ACCELERATE is set, so we handover
       * regardless of CITP_OPTS.no_fail */
      return CITP_NOT_HANDLED;
    }
    /* may be lib mismatch - errno will be ELIBACC */
    goto fail1;
  }
  rc = -1;

  CI_MAGIC_CHECK(ni, NETIF_MAGIC);

  /* add another reference as we have 2 fdis */
  citp_netif_add_ref(ni);

  epi_read = citp_pipe_epi_alloc(ni, O_RDONLY);
  if( epi_read == NULL )
    goto fail2;
  epi_write = citp_pipe_epi_alloc(ni, O_WRONLY);
  if( epi_write == NULL )
    goto fail3;

  /* oo_pipe init code */
  if( fdtable_strict() )  CITP_FDTABLE_LOCK();
  rc = oo_pipe_ctor(ni, &p, fds, flags);
  if( rc < 0 )
    goto fail4;
  citp_fdtable_new_fd_set(fds[0], fdip_busy, fdtable_strict());
  citp_fdtable_new_fd_set(fds[1], fdip_busy, fdtable_strict());
  if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();

  LOG_PIPE("%s: pipe=%p id=%d", __FUNCTION__, p, p->b.bufid);

  /* as the pipe is created it should be attached to the end-points */
  epi_read->pipe = p;
  epi_write->pipe = p;

  /* We're ready.  Unleash us onto the world!  Both fdis share the same
   * oo_pipe; clear NOT_READY via each end's own fdi. */
  ci_assert(epi_read->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_assert(epi_write->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi_read->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi_write->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);

  citp_fdtable_insert(&epi_read->fdinfo, fds[0], 0);
  citp_fdtable_insert(&epi_write->fdinfo, fds[1], 0);

  CI_MAGIC_CHECK(ni, NETIF_MAGIC);

  return 0;

fail4:
  if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
fail3:
  CI_FREE_OBJ(epi_write);
fail2:
  CI_FREE_OBJ(epi_read);
  /* drop both netif references taken above */
  citp_netif_release_ref(ni, 0);
  citp_netif_release_ref(ni, 0);
fail1:
  if( CITP_OPTS.no_fail && errno != ELIBACC ) {
    Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__,
                 errno));
    return CITP_NOT_HANDLED;
  }
  return rc;
}
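/* Socket creation pattern (see also citp_pipe_create() above): allocate
 * the fdinfo, construct the endpoint while the fdtable entry is held busy
 * so no other thread can probe a half-built socket, then clear
 * CI_SB_AFLAG_NOT_READY and insert the fdinfo to make it visible.  On
 * failure, when CITP_OPTS.no_fail is set and the problem is not a
 * driver/library mismatch (ELIBACC), we return CI_SOCKET_HANDOVER so the
 * caller falls back to an OS socket rather than failing the application's
 * socket() call.
 */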
static int citp_udp_socket(int domain, int type, int protocol)
{
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ef_driver_handle fd;
  int rc;
  ci_netif* ni;

  Log_V(log(LPF "socket(%d, %d, %d)", domain, type, protocol));

  epi = CI_ALLOC_OBJ(citp_sock_fdi);
  if( ! epi ) {
    Log_U(ci_log(LPF "socket: failed to allocate epi"));
    errno = ENOMEM;
    goto fail1;
  }
  fdi = &epi->fdinfo;
  citp_fdinfo_init(fdi, &citp_udp_protocol_impl);

  rc = citp_netif_alloc_and_init(&fd, &ni);
  if( rc != 0 ) {
    if( rc == CI_SOCKET_HANDOVER ) {
      /* This implies EF_DONT_ACCELERATE is set, so we handover
       * regardless of CITP_OPTS.no_fail */
      CI_FREE_OBJ(epi);
      return rc;
    }
    goto fail2;
  }

  /* Protect the fdtable entry until we're done initialising. */
  if( fdtable_strict() )  CITP_FDTABLE_LOCK();
  if( (fd = ci_udp_ep_ctor(&epi->sock, ni, domain, type)) < 0 ) {
    /*! ?? \TODO unpick the ci_udp_ep_ctor according to how failed */
    Log_U(ci_log(LPF "socket: udp_ep_ctor failed"));
    errno = -fd;
    /* Drop the fdtable lock on this error path too. */
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
    goto fail3;
  }

  citp_fdtable_new_fd_set(fd, fdip_busy, fdtable_strict());
  if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();

  CI_DEBUG(epi->sock.s->pid = getpid());

  /* We're ready.  Unleash us onto the world! */
  ci_assert(epi->sock.s->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi->sock.s->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  citp_fdtable_insert(fdi, fd, 0);

  Log_VSS(log(LPF "socket(%d, %d, %d) = "EF_FMT, domain, type, protocol,
              EF_PRI_ARGS(epi, fd)));
  return fd;

 fail3:
  if( CITP_OPTS.no_fail && errno != ELIBACC )
    CITP_STATS_NETIF(++ni->state->stats.udp_handover_socket);
  citp_netif_release_ref(ni, 0);
 fail2:
  CI_FREE_OBJ(epi);
 fail1:
  /* BUG1408: Graceful failure.  We'll only fail outright if there's a
   * driver/library mismatch */
  if( CITP_OPTS.no_fail && errno != ELIBACC ) {
    Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__,
                 errno));
    return CI_SOCKET_HANDOVER;
  }
  return -1;
}