void citp_passthrough_init(citp_alien_fdi* epi)
{
  int rc = oo_os_sock_get(epi->netif, epi->ep->bufid, &epi->os_socket);

  /* No sensible way to handle oo_os_sock_get() failure.  Just record it. */
  if( rc != 0 ) {
    Log_U(ci_log("%s: oo_os_sock_get([%d:%d]) returned %d", __func__,
                 NI_ID(epi->netif), epi->ep->bufid, rc));
    epi->os_socket = -1;
    return;
  }

  __citp_fdtable_reserve(epi->os_socket, 1);
  /* ci_tcp_helper_get_sock_fd() takes citp_dup2_lock: release it. */
  oo_rwlock_unlock_read(&citp_dup2_lock);
}
static citp_pipe_fdi* citp_pipe_epi_alloc(ci_netif* ni, int flags)
{
  citp_pipe_fdi* epi;

  epi = CI_ALLOC_OBJ(citp_pipe_fdi);
  if( ! epi ) {
    Log_U(ci_log(LPF "pipe: failed to allocate epi"));
    errno = ENOMEM;
    return NULL;
  }
  if( flags == O_WRONLY )
    citp_fdinfo_init(&epi->fdinfo, &citp_pipe_write_protocol_impl);
  else
    citp_fdinfo_init(&epi->fdinfo, &citp_pipe_read_protocol_impl);
  epi->ni = ni;

  return epi;
}
/* This function does some simple tests to ensure that the fdtable makes
 * sense.  There are many more tests we could do; feel free to add them at
 * your leisure!
 */
void citp_fdtable_assert_valid(void)
{
  int i;

  if( ! citp_fdtable.table )
    return;

  CITP_FDTABLE_LOCK_RD();

  for( i = 0; i < citp_fdtable.inited_count; i++ ) {
    citp_fdinfo_p fdip = citp_fdtable.table[i].fdip;

    if( fdip_is_normal(fdip) ) {
      citp_fdinfo* fdi = fdip_to_fdi(fdip);

      ci_assert(fdi);
      ci_assert(fdi->protocol);
      if( ( fdi->protocol->type == CITP_TCP_SOCKET ||
            fdi->protocol->type == CITP_UDP_SOCKET )
          && fdi_to_socket(fdi)->s )
        ci_assert(! (fdi_to_socket(fdi)->s->b.sb_aflags &
                     CI_SB_AFLAG_ORPHAN));

      if( ! fdi->is_special ) {
        /* Ensure the "back pointer" makes sense. */
        ci_assert(fdi->fd == i);
        /* Ensure that the reference count is in a vaguely sensible range. */
        ci_assert(oo_atomic_read(&fdi->ref_count) > 0 &&
                  oo_atomic_read(&fdi->ref_count) < 10000);

        /* 10,000 threads is a bit mad; warn if more than 20. */
        if( oo_atomic_read(&fdi->ref_count) > 20 ) {
          Log_U(log("Warning: fd %d's ref-count suspiciously large (%d)\n",
                    i, oo_atomic_read(&fdi->ref_count)));
        }
      }
    }
  }

  CITP_FDTABLE_UNLOCK_RD();
}
int citp_fdtable_ctor()
{
  struct rlimit rlim;
  int rc;

  Log_S(log("%s:", __FUNCTION__));

  /* How big should our fdtable be by default?  It's pretty arbitrary, but
   * we have seen a few apps that use setrlimit() to set the fdtable to 4096
   * entries on start-up (see bugs 3253 and 3373), so we choose that.
   * (Note: we can't grow the table if the app later does setrlimit(), and
   * unused entries consume virtual space only, so it's worth allocating a
   * table of reasonable size.)
   */
  citp_fdtable.size = 4096;
  if( getrlimit(RLIMIT_NOFILE, &rlim) == 0 ) {
    citp_fdtable.size = rlim.rlim_max;
    if( CITP_OPTS.fdtable_size != 0 &&
        CITP_OPTS.fdtable_size != rlim.rlim_max ) {
      Log_S(ci_log("Setting the limit on the number of open files "
                   "to the EF_FDTABLE_SIZE=%u value.",
                   CITP_OPTS.fdtable_size));
      rlim.rlim_max = CITP_OPTS.fdtable_size;
      if( rlim.rlim_cur > rlim.rlim_max )
        rlim.rlim_cur = rlim.rlim_max;
      if( ci_sys_setrlimit(RLIMIT_NOFILE, &rlim) == 0 )
        citp_fdtable.size = rlim.rlim_max;
      else {
        /* Most probably, we've got EPERM. */
        ci_assert_lt(citp_fdtable.size, CITP_OPTS.fdtable_size);
        ci_log("Can't set EF_FDTABLE_SIZE=%u; using %u",
               CITP_OPTS.fdtable_size, citp_fdtable.size);
        rlim.rlim_max = rlim.rlim_cur = citp_fdtable.size;
        CI_TRY(ci_sys_setrlimit(RLIMIT_NOFILE, &rlim));
      }
    }
  }
  else
    Log_S(ci_log("Assuming EF_FDTABLE_SIZE=%u", citp_fdtable.size));

  citp_fdtable.inited_count = 0;

  citp_fdtable.table =
    ci_libc_malloc(sizeof(citp_fdtable_entry) * citp_fdtable.size);
  if( ! citp_fdtable.table ) {
    Log_U(log("%s: failed to allocate fdtable (0x%x)", __FUNCTION__,
              citp_fdtable.size));
    return -1;
  }

  /* The whole table is not initialised at start-of-day, but is initialised
  ** on demand.  citp_fdtable.inited_count counts the number of initialised
  ** entries. */

  if( (rc = oo_rwlock_ctor(&citp_ul_lock)) != 0 ) {
    Log_E(log("%s: oo_rwlock_ctor %d", __FUNCTION__, rc));
    return -1;
  }

  /* Install the SIGONLOAD handler. */
  {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa));  /* sa_flags and sa_mask = 0 */
    sa.sa_handler = sighandler_do_nothing;
    sigaction(SIGONLOAD, &sa, NULL);
  }

  return 0;
}
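/* For reference, a minimal sketch of the SIGONLOAD handler installed above.
 * This is an assumption about its shape, not the definitive implementation:
 * all it needs to do is exist and be non-fatal, so that
 * pthread_kill(thread, SIGONLOAD) -- as used in citp_ep_dup3() below --
 * knocks the target thread out of a blocking syscall with EINTR. */
#if 0  /* illustrative only; the real handler is defined elsewhere */
static void sighandler_do_nothing(int sig)
{
  (void) sig;  /* deliberately empty: the interruption is the side-effect */
}
#endif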
/* Find out what sort of thing [fd] is, and if it is a user-level socket
 * then map in the user-level state.
 */
static citp_fdinfo* citp_fdtable_probe_locked(unsigned fd, int print_banner,
                                              int fdip_is_already_busy)
{
  citp_fdinfo* fdi = NULL;
  struct stat64 st;
  ci_ep_info_t info;

  if( ! fdip_is_already_busy ) {
    volatile citp_fdinfo_p* p_fdip;
    citp_fdinfo_p fdip;

    /* ?? We're repeating some effort already expended in lookup() here, but
    ** this keeps it cleaner.  May optimise down the line when I understand
    ** what other code needs to call this.
    */
    p_fdip = &citp_fdtable.table[fd].fdip;
  again:
    fdip = *p_fdip;
    if( fdip_is_busy(fdip) )
      fdip = citp_fdtable_busy_wait(fd, 1);
    if( ! fdip_is_unknown(fdip) && ! fdip_is_normal(fdip) )
      goto exit;
    if( fdip_cas_fail(p_fdip, fdip, fdip_busy) )
      goto again;

    if( fdip_is_normal(fdip) ) {
      fdi = fdip_to_fdi(fdip);
      citp_fdinfo_ref(fdi);
      citp_fdtable_busy_clear(fd, fdip, 1);
      goto exit;
    }
  }

  if( ci_sys_fstat64(fd, &st) != 0 ) {
    /* fstat() failed.  Must be a bad (closed) file descriptor, so
    ** leave this entry as unknown.  Return citp_the_closed_fd to avoid the
    ** caller passing through to an fd that is created asynchronously.
    */
    citp_fdtable_busy_clear(fd, fdip_unknown, 1);
    fdi = &citp_the_closed_fd;
    citp_fdinfo_ref(fdi);
    goto exit;
  }

  /* oo_get_st_rdev() and oo_onloadfs_dev_t() open-and-close an fd, so the
   * fdtable should be locked if strict mode is requested. */
  if( fdtable_strict() ) {
    CITP_FDTABLE_ASSERT_LOCKED(1);
  }

  if( st.st_dev == oo_onloadfs_dev_t() ) {
    /* Retrieve user-level endpoint info. */
    if( oo_ep_info(fd, &info) < 0 ) {
      Log_V(log("%s: fd=%d type=%d unknown", __FUNCTION__, fd,
                info.fd_type));
      citp_fdtable_busy_clear(fd, fdip_passthru, 1);
      goto exit;
    }

    switch( info.fd_type ) {
    case CI_PRIV_TYPE_TCP_EP:
    case CI_PRIV_TYPE_UDP_EP:
    case CI_PRIV_TYPE_PASSTHROUGH_EP:
    case CI_PRIV_TYPE_ALIEN_EP:
#if CI_CFG_USERSPACE_PIPE
    case CI_PRIV_TYPE_PIPE_READER:
    case CI_PRIV_TYPE_PIPE_WRITER:
#endif
    {
      citp_fdinfo_p fdip;

      Log_V(log("%s: fd=%d %s restore", __FUNCTION__, fd,
                info.fd_type == CI_PRIV_TYPE_TCP_EP ? "TCP" :
#if CI_CFG_USERSPACE_PIPE
                info.fd_type != CI_PRIV_TYPE_UDP_EP ? "PIPE" :
#endif
                "UDP"));
      fdip = citp_fdtable_probe_restore(fd, &info, print_banner);
      if( fdip_is_normal(fdip) )
        fdi = fdip_to_fdi(fdip);
      else
        citp_fdtable_busy_clear(fd, fdip, 1);
      goto exit;
    }

    case CI_PRIV_TYPE_NETIF:
      /* This should never happen, because netif fds are close-on-exec.
      ** But let's leave this code here just in case my reasoning is bad.
      */
      Log_U(log("%s: fd=%d NETIF reserved", __FUNCTION__, fd));
      citp_fdtable_busy_clear(fd, fdip_reserved, 1);
      fdi = &citp_the_reserved_fd;
      citp_fdinfo_ref(fdi);
      goto exit;

    case CI_PRIV_TYPE_NONE:
      /* This happens if a thread gets at an onload driver fd that has just
       * been created, but not yet specialised.  On Linux I think this means
       * it will shortly be a new netif internal fd.  (fds associated with
       * sockets and pipes are never unspecialised.)
       */
      Log_V(log("%s: fd=%d TYPE_NONE", __FUNCTION__, fd));
      citp_fdtable_busy_clear(fd, fdip_passthru, 1);
      goto exit;

    default:
      CI_TEST(0);
      break;
    }
  }
  else if( ci_major(st.st_rdev) == ci_major(oo_get_st_rdev(OO_EPOLL_DEV)) ) {
    citp_epollb_fdi* epi = CI_ALLOC_OBJ(citp_epollb_fdi);
    if( ! epi ) {
      Log_E(log("%s: out of memory (epoll_fdi)", __FUNCTION__));
      citp_fdtable_busy_clear(fd, fdip_passthru, 1);
      goto exit;
    }
    oo_epollb_ctor(epi);
    fdi = &epi->fdinfo;
    citp_fdinfo_init(fdi, &citp_epollb_protocol_impl);
    citp_fdinfo_ref(fdi);
    citp_fdtable_insert(fdi, fd, 1);
    goto exit;
  }

#ifndef NDEBUG
  /* /dev/onload may be a netif-only fd; those are closed on fork or exec. */
  if( ci_major(st.st_rdev) == ci_major(oo_get_st_rdev(OO_STACK_DEV)) )
    Log_U(log("%s: %d is /dev/onload", __FUNCTION__, fd));
#endif

  /* Not one of ours, so pass-through. */
  Log_V(log("%s: fd=%u non-efab", __FUNCTION__, fd));
  citp_fdtable_busy_clear(fd, fdip_passthru, 1);

 exit:
  return fdi;
}
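/* Aside: the probe above classifies an fd by fstat()ing it and comparing
 * st_dev/st_rdev against the onloadfs filesystem and the Onload character
 * devices.  A standalone sketch of the same technique, with a hypothetical
 * helper that is not part of this file: */
#if 0
#include <sys/stat.h>
#include <sys/sysmacros.h>

static int fd_is_on_device(int fd, dev_t chardev)
{
  struct stat st;
  if( fstat(fd, &st) != 0 )
    return 0;                              /* bad/closed fd: not ours */
  return major(st.st_rdev) == major(chardev);
}
#endif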
int citp_ep_dup3(unsigned fromfd, unsigned tofd, int flags)
{
  volatile citp_fdinfo_p* p_tofdip;
  citp_fdinfo_p tofdip;
  unsigned max;

  Log_V(log("%s(%d, %d)", __FUNCTION__, fromfd, tofd));

  /* Must be checked by callers. */
  ci_assert(fromfd != tofd);

  /* Hack: if [tofd] is the fd we're using for logging, we'd better choose
  ** a different one!
  */
  if( tofd == citp.log_fd )
    citp_log_change_fd();

  ci_assert(citp.init_level >= CITP_INIT_FDTABLE);

  max = CI_MAX(fromfd, tofd);
  if( max >= citp_fdtable.inited_count ) {
    ci_assert(max < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(max);
    CITP_FDTABLE_UNLOCK();
  }

  /* Bug1151: concurrent threads doing dup2(x,y) and dup2(y,x) can deadlock
  ** against one another.  So we take out a fat lock to prevent concurrent
  ** dup2()s.
  */
  /* Lock tofd.  We need to interlock against select and poll etc, so we
  ** also grab the exclusive lock.  Also grab the bug1151 lock.
  */
  pthread_mutex_lock(&citp_dup_lock);
  CITP_FDTABLE_LOCK();
  p_tofdip = &citp_fdtable.table[tofd].fdip;
 lock_tofdip_again:
  tofdip = *p_tofdip;
  if( fdip_is_busy(tofdip) )
    tofdip = citp_fdtable_busy_wait(tofd, 1);
  if( fdip_is_closing(tofdip) )
    tofdip = citp_fdtable_closing_wait(tofd, 1);
  if( fdip_is_reserved(tofdip) ) {
    /* ?? FIXME: we can't cope with this at the moment. */
    CITP_FDTABLE_UNLOCK();
    Log_U(log("%s(%d, %d): target is reserved", __FUNCTION__, fromfd, tofd));
    errno = EBUSY;
    tofd = -1;
    goto out;
  }
  if( fdip_cas_fail(p_tofdip, tofdip, fdip_busy) )
    goto lock_tofdip_again;
  CITP_FDTABLE_UNLOCK();
  ci_assert(fdip_is_normal(tofdip) | fdip_is_passthru(tofdip) |
            fdip_is_unknown(tofdip));

  if( fdip_is_normal(tofdip) ) {
    /* We're duping onto a user-level socket. */
    citp_fdinfo* tofdi = fdip_to_fdi(tofdip);

    if( tofdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(tofdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, tofdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }

    ci_assert_equal(tofdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    tofdi->on_ref_count_zero = FDI_ON_RCZ_DUP2;
    tofdi->on_rcz.dup3_args.fd = fromfd;
    tofdi->on_rcz.dup3_args.flags = flags;
    citp_fdinfo_release_ref(tofdi, 0);

    {
      int i = 0;

      /* We need to free this fdi.  If someone is using it right now,
       * we are in trouble.  So, we spin for a while and interrupt the
       * user.  See bug 28123. */
      while( tofdi->on_ref_count_zero != FDI_ON_RCZ_DONE ) {
        if( ci_is_multithreaded() && i % 10000 == 9999 ) {
          pthread_t pth = tofdi->thread_id;
          if( pth != pthread_self() && pth != PTHREAD_NULL ) {
            pthread_kill(pth, SIGONLOAD);
            sleep(1);
          }
        }
        ci_spinloop_pause();
        i++;
      }
      ci_rmb();
    }

    if( tofdi->on_rcz.dup2_result < 0 ) {
      errno = -tofdi->on_rcz.dup2_result;
      /* Need to re-insert [tofdi] into the table. */
      ci_assert_equal(oo_atomic_read(&tofdi->ref_count), 0);
      oo_atomic_set(&tofdi->ref_count, 1);
      CI_DEBUG(tofdi->on_ref_count_zero = FDI_ON_RCZ_NONE);
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else {
      ci_assert(tofdi->on_rcz.dup2_result == tofd);
      citp_fdinfo_get_ops(tofdi)->dtor(tofdi, 0);
      citp_fdinfo_free(tofdi);
    }
    goto out;
  }

  ci_assert(fdip_is_passthru(tofdip) | fdip_is_unknown(tofdip));

  {
    /* We're dupping onto an O/S descriptor, or it may be closed.  Create a
    ** dummy [citp_fdinfo], just so we can share code with the case above.
    */
    citp_fdinfo fdi;
    fdi.fd = tofd;
    fdi.on_rcz.dup3_args.fd = fromfd;
    fdi.on_rcz.dup3_args.flags = flags;
    dup2_complete(&fdi, tofdip, 0);
    if( fdi.on_rcz.dup2_result < 0 ) {
      errno = -fdi.on_rcz.dup2_result;
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else
      ci_assert(fdi.on_rcz.dup2_result == tofd);
  }

 out:
  pthread_mutex_unlock(&citp_dup_lock);
  return tofd;
}
/* We don't register a protocol impl. */
int citp_pipe_create(int fds[2], int flags)
{
  citp_pipe_fdi* epi_read;
  citp_pipe_fdi* epi_write;
  struct oo_pipe* p = NULL;            /* make compiler happy */
  ci_netif* ni;
  int rc = -1;
  ef_driver_handle fd = -1;

  Log_V(log(LPF "pipe()"));

  /* citp_netif_exists() does not need citp_ul_lock here. */
  if( CITP_OPTS.ul_pipe == CI_UNIX_PIPE_ACCELERATE_IF_NETIF &&
      ! citp_netif_exists() ) {
    return CITP_NOT_HANDLED;
  }

  rc = citp_netif_alloc_and_init(&fd, &ni);
  if( rc != 0 ) {
    if( rc == CI_SOCKET_HANDOVER ) {
      /* This implies EF_DONT_ACCELERATE is set, so we handover
       * regardless of CITP_OPTS.no_fail. */
      return CITP_NOT_HANDLED;
    }
    /* May be a lib mismatch - errno will be ELIBACC. */
    goto fail1;
  }

  rc = -1;
  CI_MAGIC_CHECK(ni, NETIF_MAGIC);

  /* Add another reference, as we have two fdis. */
  citp_netif_add_ref(ni);

  epi_read = citp_pipe_epi_alloc(ni, O_RDONLY);
  if( epi_read == NULL )
    goto fail2;
  epi_write = citp_pipe_epi_alloc(ni, O_WRONLY);
  if( epi_write == NULL )
    goto fail3;

  /* oo_pipe init code */
  if( fdtable_strict() )
    CITP_FDTABLE_LOCK();
  rc = oo_pipe_ctor(ni, &p, fds, flags);
  if( rc < 0 )
    goto fail4;
  citp_fdtable_new_fd_set(fds[0], fdip_busy, fdtable_strict());
  citp_fdtable_new_fd_set(fds[1], fdip_busy, fdtable_strict());
  if( fdtable_strict() )
    CITP_FDTABLE_UNLOCK();

  LOG_PIPE("%s: pipe=%p id=%d", __FUNCTION__, p, p->b.bufid);

  /* Now the pipe is created it should be attached to its end-points. */
  epi_read->pipe = p;
  epi_write->pipe = p;

  /* We're ready.  Unleash us onto the world! */
  ci_assert(epi_read->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_assert(epi_write->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi_read->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi_write->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  citp_fdtable_insert(&epi_read->fdinfo, fds[0], 0);
  citp_fdtable_insert(&epi_write->fdinfo, fds[1], 0);

  CI_MAGIC_CHECK(ni, NETIF_MAGIC);

  return 0;

 fail4:
  if( fdtable_strict() )
    CITP_FDTABLE_UNLOCK();
 fail3:
  CI_FREE_OBJ(epi_write);
 fail2:
  CI_FREE_OBJ(epi_read);
  citp_netif_release_ref(ni, 0);
  citp_netif_release_ref(ni, 0);
 fail1:
  if( CITP_OPTS.no_fail && errno != ELIBACC ) {
    Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__,
                 errno));
    return CITP_NOT_HANDLED;
  }
  return rc;
}
/* FIXME (kostik): this is partially copy-paste from citp_sock_fcntl(). */
static int citp_pipe_fcntl(citp_fdinfo* fdinfo, int cmd, long arg)
{
  int rc = 0;
  citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdinfo);
  struct oo_pipe* p = epi->pipe;

  switch( cmd ) {
  case F_GETFL: {
    ci_uint32 flag_nonb = CI_PFD_AFLAG_NONBLOCK;
    if( ! fdi_is_reader(fdinfo) ) {
      rc = O_WRONLY;
      flag_nonb <<= CI_PFD_AFLAG_WRITER_SHIFT;
    }
    else
      flag_nonb <<= CI_PFD_AFLAG_READER_SHIFT;
    if( p->aflags & flag_nonb )
      rc |= O_NONBLOCK;
    break;
  }
  case F_SETFL: {
    ci_uint32 bit;

    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    if( rc < 0 )
      break;

    bit = CI_PFD_AFLAG_NONBLOCK <<
          (fdi_is_reader(fdinfo) ? CI_PFD_AFLAG_READER_SHIFT :
                                   CI_PFD_AFLAG_WRITER_SHIFT);
    if( arg & (O_NONBLOCK | O_NDELAY) )
      ci_bit_mask_set(&p->aflags, bit);
    else
      ci_bit_mask_clear(&p->aflags, bit);
    break;
  }
  case F_DUPFD:
    rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup, arg);
    break;
#ifdef F_DUPFD_CLOEXEC
  case F_DUPFD_CLOEXEC:
    rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup_cloexec, arg);
    break;
#endif
  case F_GETFD:
  case F_SETFD:
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    break;
  case F_GETLK:
  case F_SETLK:
  case F_SETLKW:
    /* File locks are not supported on pipes. */
    Log_U(ci_log("%s: cmd %d not supported on pipes!", __FUNCTION__, cmd));
    errno = ENOTSUP;
    rc = CI_SOCKET_ERROR;
    break;
  case F_GETOWN:
  case F_SETOWN:
#ifdef F_GETOWN_EX
  case F_GETOWN_EX:
#endif
#ifdef F_SETOWN_EX
  case F_SETOWN_EX:
#endif
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    if( rc != 0 )
      break;
    p->b.sigown = arg;
    if( p->b.sigown && (p->b.sb_aflags & CI_SB_AFLAG_O_ASYNC) )
      ci_bit_set(&p->b.wake_request, CI_SB_FLAG_WAKE_RX_B);
    break;
#ifdef F_SETPIPE_SZ
  case F_SETPIPE_SZ:
    /* The system pipe buf size is rounded up to a power of two.  We
     * cannot replicate this. */
    rc = ci_pipe_set_size(epi->ni, p, arg);
    if( rc < 0 ) {
      errno = EINVAL;
      rc = CI_SOCKET_ERROR;
      break;
    }
    rc = 0;
    break;
#endif
#ifdef F_GETPIPE_SZ
  case F_GETPIPE_SZ:
    rc = (p->bufs_max - 1) * OO_PIPE_BUF_MAX_SIZE;
    break;
#endif
  default:
    /* FIXME (kostik): logging should include some pipe identification. */
    errno = ENOTSUP;
    rc = CI_SOCKET_ERROR;
  }

  Log_VSC(log("%s(%d, %d, %ld) = %d  (errno=%d)",
              __FUNCTION__, fdinfo->fd, cmd, arg, rc, errno));
  return rc;
}
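/* Usage sketch (hypothetical application code, shown for context only):
 * with Onload preloaded, toggling O_NONBLOCK on an accelerated pipe end is
 * routed through citp_pipe_fcntl() above, which mirrors the flag into
 * p->aflags as well as applying it to the kernel fd. */
#if 0
#include <fcntl.h>

static int set_pipe_nonblock(int pipefd)
{
  int flags = fcntl(pipefd, F_GETFL);   /* F_GETFL case above */
  if( flags < 0 )
    return -1;
  return fcntl(pipefd, F_SETFL, flags | O_NONBLOCK);  /* F_SETFL case */
}
#endif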
static int citp_udp_socket(int domain, int type, int protocol)
{
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ef_driver_handle fd;
  int rc;
  ci_netif* ni;

  Log_V(log(LPF "socket(%d, %d, %d)", domain, type, protocol));

  epi = CI_ALLOC_OBJ(citp_sock_fdi);
  if( ! epi ) {
    Log_U(ci_log(LPF "socket: failed to allocate epi"));
    errno = ENOMEM;
    goto fail1;
  }
  fdi = &epi->fdinfo;
  citp_fdinfo_init(fdi, &citp_udp_protocol_impl);

  rc = citp_netif_alloc_and_init(&fd, &ni);
  if( rc != 0 ) {
    if( rc == CI_SOCKET_HANDOVER ) {
      /* This implies EF_DONT_ACCELERATE is set, so we handover
       * regardless of CITP_OPTS.no_fail. */
      CI_FREE_OBJ(epi);
      return rc;
    }
    goto fail2;
  }

  /* Protect the fdtable entry until we're done initialising. */
  if( fdtable_strict() )
    CITP_FDTABLE_LOCK();
  if( (fd = ci_udp_ep_ctor(&epi->sock, ni, domain, type)) < 0 ) {
    /*! ?? \TODO unpick the ci_udp_ep_ctor according to how it failed */
    Log_U(ci_log(LPF "socket: udp_ep_ctor failed"));
    errno = -fd;
    goto fail3;
  }

  citp_fdtable_new_fd_set(fd, fdip_busy, fdtable_strict());
  if( fdtable_strict() )
    CITP_FDTABLE_UNLOCK();

  CI_DEBUG(epi->sock.s->pid = getpid());

  /* We're ready.  Unleash us onto the world! */
  ci_assert(epi->sock.s->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi->sock.s->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  citp_fdtable_insert(fdi, fd, 0);

  Log_VSS(log(LPF "socket(%d, %d, %d) = "EF_FMT, domain, type, protocol,
              EF_PRI_ARGS(epi, fd)));
  return fd;

 fail3:
  if( CITP_OPTS.no_fail && errno != ELIBACC )
    CITP_STATS_NETIF(++ni->state->stats.udp_handover_socket);
  citp_netif_release_ref(ni, 0);
 fail2:
  CI_FREE_OBJ(epi);
 fail1:
  /* BUG1408: Graceful failure.  We'll only fail outright if there's a
   * driver/library mismatch. */
  if( CITP_OPTS.no_fail && errno != ELIBACC ) {
    Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__,
                 errno));
    return CI_SOCKET_HANDOVER;
  }
  return -1;
}
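/* Caller-side sketch of how a socket() interposer might consume this
 * function's tri-state result (accelerated fd / CI_SOCKET_HANDOVER / -1).
 * This is an illustration of the handover contract, not the actual
 * interposer code; ci_sys_socket() is assumed to be the saved libc
 * socket() entry point. */
#if 0
static int intercepted_socket(int domain, int type, int protocol)
{
  int rc = citp_udp_socket(domain, type, protocol);
  if( rc == CI_SOCKET_HANDOVER )
    return ci_sys_socket(domain, type, protocol);  /* kernel fallback */
  return rc;  /* accelerated fd, or -1 with errno set */
}
#endif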