/* Common handler for IOCTL calls.
 *
 * NOTE: in the kernel version if [arg] is a pointer then it will point
 * into user space.  Use the CI_IOCTL_* macros in internal.h please.
 *
 * Parameters:
 *   netif             - stack owning the socket (asserted non-NULL)
 *   s                 - common socket state (asserted non-NULL)
 *   request           - ioctl request code
 *   arg               - ioctl argument; for SIOCGPGRP/SIOCSPGRP it is
 *                       accessed as an int via the CI_IOCTL_* macros
 *   os_rc             - result already obtained from the OS for this request
 *   os_socket_exists  - non-zero if there is a backing OS socket
 *
 * Returns 0 when the request is handled here.  Requests that are not
 * recognised return [os_rc] if an OS socket exists.  All other outcomes
 * leave through RET_WITH_ERRNO(): ENOTTY (unrecognised request, no OS
 * socket), ENOENT (SIOCGSTAMP/SIOCGSTAMPNS) or EFAULT (bad [arg]).
 */
int ci_cmn_ioctl(ci_netif* netif, ci_sock_cmn* s, int request,
                 void* arg, int os_rc, int os_socket_exists)
{
  ci_assert(netif);
  ci_assert(s);

  /* ioctl defines are listed in `man ioctl_list` and the CI equivalent
   * CI defines are in include/ci/net/ioctls.h */
  LOG_SV( ci_log("request = %u/%#x, arg = %lu/%#lx", request, request,
                 (long) arg, (long) arg));

  switch( request ) {
  case SIOCGPGRP:
    /* get the process ID/group that is receiving signals for this fd */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    CI_IOCTL_SETARG( ((int*)arg), s->b.sigown);
    break;

  case SIOCSPGRP:
    /* set the process ID/group that is receiving signals for this fd */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    s->b.sigown = CI_IOCTL_GETARG(int,arg);
    /* A signal owner combined with O_ASYNC means receive wake-ups must be
     * requested. */
    if( s->b.sigown && (s->b.sb_aflags & CI_SB_AFLAG_O_ASYNC) )
      ci_bit_set(&s->b.wake_request, CI_SB_FLAG_WAKE_RX_B);
    break;

  case SIOCGSTAMP:
  case SIOCGSTAMPNS:
    RET_WITH_ERRNO(ENOENT);

  default:
    if (!os_socket_exists)
      RET_WITH_ERRNO(ENOTTY);
    /* Assumes that errno is unchanged from the OS call, or that [os_rc] == 0 */
    return os_rc;
  }

  /* Successful conclusion */
  return 0;

 fail_fault:
  /* The log text names the errno that is actually reported (EFAULT). */
  LOG_SC( ci_log("%s: "NS_FMT" req %d/%#x arg %ld/%#lx unhandled (EFAULT)",
                 __FUNCTION__, NS_PRI_ARGS(netif, s),
                 request, request, (long)arg, (long)arg));
  RET_WITH_ERRNO(EFAULT);
}
/* Called once the last file descriptor referring to waitable [w_id] in
 * stack [ni] has gone away.  The stack lock must be held (asserted).
 *
 * Marks the object as an orphan, detaches it from the post-poll and ready
 * lists, and hands it to citp_waitable_cleanup().
 */
void citp_waitable_all_fds_gone(ci_netif* ni, oo_sp w_id)
{
  citp_waitable_obj* wobj;

  ci_assert(ni);
  ci_assert(IS_VALID_SOCK_P(ni, w_id));
  ci_assert(ci_netif_is_locked(ni));

  wobj = SP_TO_WAITABLE_OBJ(ni, w_id);
  ci_assert(wobj->waitable.state != CI_TCP_STATE_FREE);

  LOG_NC(ci_log("%s: %d:%d %s", __FUNCTION__, NI_ID(ni), OO_SP_FMT(w_id),
                ci_tcp_state_str(wobj->waitable.state)));

  /* A listening socket is closed in blocking context (see
   * efab_tcp_helper_close_endpoint()) and already carries
   * CI_SB_AFLAG_ORPHAN by the time we get here, so only complain about
   * non-listening sockets that are orphaned already.
   */
  CI_DEBUG(if( (wobj->waitable.sb_aflags & CI_SB_AFLAG_ORPHAN) &&
               wobj->waitable.state != CI_TCP_LISTEN )
             ci_log("%s: %d:%d already orphan", __FUNCTION__,
                    NI_ID(ni), OO_SP_FMT(w_id)));

  /* An ORPHANed socket must never sit on the deferred socket list: the
   * same link field doubles as the timewait list, free list etc.  So the
   * deferred list is purged first and only then is the orphan flag set.
   *
   * NB. The socket cannot be re-added to the deferred list afterwards,
   * because no-one holds a reference to it any more.
   */
  ci_netif_purge_deferred_socket_list(ni);
  ci_bit_set(&wobj->waitable.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  /* The socket may have been left on the post-poll list because the stack
   * believes a wakeup is needed; take it off that and the ready list.
   */
  ci_ni_dllist_remove_safe(ni, &wobj->waitable.post_poll_link);
  ci_ni_dllist_remove_safe(ni, &wobj->waitable.ready_link);
  wobj->waitable.ready_list_id = 0;

  citp_waitable_cleanup(ni, wobj, 1);
}
static int oo_epoll2_ctl(struct oo_epoll_private *priv, int op_kepfd, int op_op, int op_fd, struct epoll_event *op_event) { tcp_helper_resource_t *fd_thr; struct file *file; int rc; ci_uint32 fd_sock_id; citp_waitable *fd_w; /* We are interested in ADD only */ if( op_op != EPOLL_CTL_ADD ) return efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); /* system poll() and friends use fget_light(), which is cheap. * But they do not export fget_light to us, so we have to use fget(). */ file = fget(op_fd); if(unlikely( file == NULL )) return -EBADF; /* Check for the dead circle. * We should check that we are not adding ourself. */ if(unlikely( file->private_data == priv )) { fput(file); return -EINVAL; } /* Is op->fd ours and if yes, which netif it has? */ /* Fixme: epoll fd - do we want to accelerate something? */ if( file->f_op != &linux_tcp_helper_fops_udp && file->f_op != &linux_tcp_helper_fops_tcp ) { int rc; #ifdef OO_EPOLL_NEED_NEST_PROTECTION struct oo_epoll_busy_task t; t.task = current; spin_lock(&priv->lock); list_add(&t.link, &priv->p.p2.busy_tasks); spin_unlock(&priv->lock); #endif #if CI_CFG_USERSPACE_PIPE if( ( file->f_op == &linux_tcp_helper_fops_pipe_reader || file->f_op == &linux_tcp_helper_fops_pipe_writer ) ) priv->p.p2.do_spin = 1; #endif fput(file); rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); #ifdef OO_EPOLL_NEED_NEST_PROTECTION spin_lock(&priv->lock); list_del(&t.link); spin_unlock(&priv->lock); #endif return rc; } /* Onload socket here! */ fd_thr = ((ci_private_t *)file->private_data)->thr; fd_sock_id = ((ci_private_t *)file->private_data)->sock_id; priv->p.p2.do_spin = 1; if(unlikely( ! oo_epoll_add_stack(priv, fd_thr) )) { static int printed; if( !printed ) ci_log("Can't add stack %d to epoll set: consider " "increasing epoll_max_stacks module option", fd_thr->id); /* fall through to sys_epoll_ctl() without interrupt */ } /* Let kernel add fd to the epoll set, but ask endpoint to avoid enabling * interrupts. 
* And we keep file ref while using fd_w to avoid nasty things. */ fd_w = SP_TO_WAITABLE(&fd_thr->netif, fd_sock_id); ci_bit_set(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT); rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); ci_bit_clear(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT); fput(file); return rc; }
/* Move priv file to the alien_ni stack.
 *
 * Should be called with the locked priv stack and socket;
 * the function returns with this stack being unlocked.
 * If rc=0, it returns with alien_ni stack locked;
 * otherwise, both stacks are unlocked.
 * Socket is always unlocked on return.
 *
 * Parameters:
 *   priv      - private data of the file whose endpoint is to be moved
 *   alien_ni  - destination stack
 *
 * Returns 0 on success, or a negative errno:
 *   -EBUSY   the file is in an epoll set, the alien stack could not be
 *            locked after many attempts, or the new endpoint is already
 *            attached;
 *   -EINVAL  the socket is not in a movable state, or the filters could
 *            not be prepared;
 *   -ENOMEM  no socket buffer / temporary storage available;
 *   or whatever ci_netif_lock() / onload_alloc_file() report.
 */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));
  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in epoll list should not be moved, because waitq is already
   * in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack.  Each failed trylock drops and re-takes the old
   * stack lock before retrying; give up after a bounded number of
   * attempts. */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* do not copy old_s->b.bufid
   * and other fields in stack address space */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    /* Reset on mid_ts, not new_ts: the structure copy just below replaces
     * every field of new_ts, so a value stored in new_ts here would be
     * lost and the old stack's retransmit pointer carried over. */
    mid_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif

    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return  **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep, and registering the
   * the new.  However, the notifications shouldn't be in use for sockets
   * that are in a state that can be moved, so this shouldn't be a problem.
   */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);

  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    /* The function-level counter [i] is free again at this point. */
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0)
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }

  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert old filters because hw filters
   * are already here (in case of accepted socket) or not needed.
   * We have not removed old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
fail2:
  ci_netif_unlock(alien_ni);
fail1:
  ci_netif_unlock(&old_thr->netif);
fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
/* ioctl handler: re-point the file behind [priv] at a TCP endpoint that
 * lives in another tcp helper resource.
 *
 * [arg] is an oo_tcp_move_state_t naming the current endpoint (ep_id), the
 * destination resource (new_trs_id) and the destination endpoint
 * (new_ep_id).
 *
 * Returns 0 on success.  Fails with the error from efab_ioctl_get_ep() or
 * efab_thr_table_lookup(), or with -EINVAL when the two endpoints are not
 * in a state that allows the move.
 */
static int efab_tcp_helper_move_state(ci_private_t* priv, void *arg)
{
  oo_tcp_move_state_t *op = arg;
  tcp_helper_endpoint_t *new_ep;
  tcp_helper_resource_t *new_trs = NULL;
  tcp_helper_resource_t *prev_trs;
  ci_netif *old_ni, *new_ni;
  ci_tcp_state *old_ts, *new_ts;
  tcp_helper_endpoint_t *ep;
  int rc;

  rc = efab_ioctl_get_ep(priv, op->ep_id, &ep);
  if( rc != 0 )
    return rc;

  OO_DEBUG_TCPH(ci_log("%s: (trs=%p (%u), priv=%p, ep_id=%u, new_trs_id=%u, "
                       "new_ep_id=%u", __FUNCTION__, priv->thr, priv->thr->id,
                       priv, OO_SP_FMT(op->ep_id), op->new_trs_id,
                       OO_SP_FMT(op->new_ep_id)));

  /* Source endpoint. */
  old_ni = &priv->thr->netif;
  old_ts = SP_TO_TCP(old_ni, ep->id);

  /* TODO: check this endpoint belongs to the tcp helper resource of priv
   * and not somewhere else */

  /* Neither fd_type nor the fd ops are changed here, so a change of socket
   * type cannot be handled.  The move is only expected to make sense for
   * TCP, hence the assertions that this is a TCP endpoint. */
  ci_assert_equal(old_ts->s.pkt.ip.ip_protocol, IPPROTO_TCP);
  ci_assert_equal(priv->fd_type, CI_PRIV_TYPE_TCP_EP);

  /* Resolve the destination resource from its handle; this takes a
   * reference on it. */
  rc = efab_thr_table_lookup(NULL, op->new_trs_id,
                             EFAB_THR_TABLE_LOOKUP_CHECK_USER, &new_trs);
  if( rc < 0 ) {
    OO_DEBUG_ERR( ci_log("%s: invalid new resource handle", __FUNCTION__) );
    return rc;
  }
  ci_assert(new_trs != NULL);

  /* Destination endpoint. */
  new_ni = &new_trs->netif;
  new_ep = ci_netif_get_valid_ep(new_ni, op->new_ep_id);
  new_ts = SP_TO_TCP(new_ni, new_ep->id);

  /* Both endpoints must look sane before anything is touched. */
  if( (old_ts->s.pkt.ip.ip_protocol != new_ts->s.pkt.ip.ip_protocol) ||
      (old_ts->s.b.state != CI_TCP_CLOSED) ||
      (ep->oofilter.sf_local_port != NULL) ) {
    efab_thr_release(new_trs);
    rc = -EINVAL;
    OO_DEBUG_ERR(ci_log("%s: invalid endpoint states", __FUNCTION__));
    return rc;
  }

  /* should be fine to complete */
  ci_assert(new_trs);

  /* Swap the resource pointer with compare-and-swap, retrying until it
   * succeeds, then drop the reference on the previous resource. */
  do {
    prev_trs = priv->thr;
  } while( ci_cas_uintptr_fail((ci_uintptr_t *)&priv->thr,
                               (ci_uintptr_t)prev_trs,
                               (ci_uintptr_t)new_trs) );
  efab_thr_release(prev_trs);

  /* The file now describes the new resource; make it name the new
   * endpoint too. */
  ci_assert(OO_SP_EQ(priv->sock_id, op->ep_id));
  priv->sock_id = new_ep->id;

  OO_DEBUG_TCPH(ci_log("%s: set epid %u", __FUNCTION__,
                       OO_SP_FMT(priv->sock_id)));

  /* Hand the OS socket over to the new endpoint. */
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = ep->os_socket;
  ep->os_socket = NULL;

  /* The old TCP state no longer has an FD: mark it ORPHAN ... */
  ci_bit_set(&old_ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  /* ... and clear ORPHAN (and the accept-queue marker) on the new one. */
  ci_atomic32_and(&new_ts->s.b.sb_aflags,
                  ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ));

  return 0;
}
/* fcntl() implementation for accelerated pipe descriptors.
 *
 * fixme kostik: this is partially copy-paste from citp_sock_fcntl
 *
 * Parameters:
 *   fdinfo - fd info of one end of the pipe
 *   cmd    - fcntl command
 *   arg    - command argument (an integer or a pointer, depending on cmd)
 *
 * Returns the command result, or CI_SOCKET_ERROR with errno set for
 * commands that are rejected here (ENOTSUP, or EINVAL for a failed
 * F_SETPIPE_SZ).  Commands forwarded to ci_sys_fcntl() return its result.
 */
static int citp_pipe_fcntl(citp_fdinfo* fdinfo, int cmd, long arg)
{
  int rc = 0;
  citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdinfo);
  struct oo_pipe* p = epi->pipe;

  switch ( cmd ) {
  case F_GETFL: {
    /* Report the access mode of this end plus its own non-blocking bit. */
    ci_uint32 flag_nonb = CI_PFD_AFLAG_NONBLOCK;
    if( ! fdi_is_reader(fdinfo) ) {
      rc = O_WRONLY;
      flag_nonb <<= CI_PFD_AFLAG_WRITER_SHIFT;
    }
    else
      flag_nonb <<= CI_PFD_AFLAG_READER_SHIFT;
    if ( p->aflags & flag_nonb )
      rc |= O_NONBLOCK;
    break;
  }

  case F_SETFL: {
    ci_uint32 bit;

    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    if( rc < 0 )
      break;

    /* Mirror the non-blocking state of this end into the pipe flags. */
    bit = CI_PFD_AFLAG_NONBLOCK <<
                (fdi_is_reader(fdinfo) ? CI_PFD_AFLAG_READER_SHIFT :
                 CI_PFD_AFLAG_WRITER_SHIFT);
    if( arg & (O_NONBLOCK | O_NDELAY) )
      ci_bit_mask_set(&p->aflags, bit);
    else
      ci_bit_mask_clear(&p->aflags, bit);
    break;
  }

  case F_DUPFD:
    rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup, arg);
    break;
#ifdef F_DUPFD_CLOEXEC
  case F_DUPFD_CLOEXEC:
    rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup_cloexec, arg);
    break;
#endif

  case F_GETFD:
  case F_SETFD:
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    break;

  case F_GETLK:
  case F_SETLK:
  case F_SETLKW:
    /* File locks not supported on sockets */
    Log_U(ci_log("%s: cmd %d not supported on sockets!",__FUNCTION__,
                 cmd));
    errno = ENOTSUP;
    rc = CI_SOCKET_ERROR;
    break;

  case F_GETOWN:
#ifdef F_GETOWN_EX
  case F_GETOWN_EX:
#endif
    /* Pure queries.  [arg] is unused (F_GETOWN) or an output buffer
     * (F_GETOWN_EX), so it must never be stored as the signal owner. */
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    break;

  case F_SETOWN:
#ifdef F_SETOWN_EX
  case F_SETOWN_EX:
#endif
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    if( rc != 0 )
      break;
    {
      long new_owner = arg;
#ifdef F_SETOWN_EX
      if( cmd == F_SETOWN_EX ) {
        /* [arg] points at a struct f_owner_ex which the OS call above has
         * just accepted.  Express it the F_SETOWN way: a process group is
         * stored as a negative id. */
        const struct f_owner_ex* req = (const struct f_owner_ex*) arg;
        new_owner = (req->type == F_OWNER_PGRP) ? -(long) req->pid
                                                : (long) req->pid;
      }
#endif
      p->b.sigown = new_owner;
    }
    /* A signal owner combined with O_ASYNC means receive wake-ups must be
     * requested. */
    if( p->b.sigown && (p->b.sb_aflags & CI_SB_AFLAG_O_ASYNC) )
      ci_bit_set(&p->b.wake_request, CI_SB_FLAG_WAKE_RX_B);
    break;

#ifdef F_SETPIPE_SZ
  case F_SETPIPE_SZ:
    /* System pipe buf size is rounded up to power of two. We
     * cannot replicate this.
     */
    rc = ci_pipe_set_size(epi->ni, p, arg);
    if( rc < 0 ) {
      errno = EINVAL;
      rc = CI_SOCKET_ERROR;
      break;
    }
    rc = 0;
    break;
#endif

#ifdef F_GETPIPE_SZ
  case F_GETPIPE_SZ:
    rc = (p->bufs_max - 1) * OO_PIPE_BUF_MAX_SIZE;
    break;
#endif

  default:
    /* fixme kostik: logging should include some pipe identification */
    errno = ENOTSUP;
    rc = CI_SOCKET_ERROR;
  }

  Log_VSC(log("%s(%d, %d, %ld) = %d  (errno=%d)",
              __FUNCTION__, fdinfo->fd, cmd, arg, rc, errno));

  return rc;
}
/*
** promote a synrecv structure to an established socket
**
** Assumes that the caller will handle a fail if we can't allocate a new
** tcp_state structure due to memory pressure or the like
**
** Parameters:
**   netif   - stack owning the listening socket
**   tls     - listening socket (asserted to be in CI_TCP_LISTEN)
**   tsr     - synrecv entry being promoted; released on success
**   ipcache - cached IP headers used to fill in ts->s.pkt
**   ts_out  - receives the new TCP state on success
**
** While there is room on the accept queue: returns 0 with *ts_out set,
** -ENOMEM if no socket buffer could be obtained (the error is also
** recorded on [tls] and the listener is woken), or the negative result
** of the filter setup that failed.
**
** NOTE(review): in the text visible here the last closing brace pairs
** with the accept-queue "if" below, so the function's own closing brace
** and whatever runs when the accept queue is full are not visible --
** confirm against the complete file.
*/
int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls,
                               ci_tcp_state_synrecv* tsr,
                               ci_ip_cached_hdrs* ipcache,
                               ci_tcp_state** ts_out)
{
  int rc = 0;

  ci_assert(netif);
  ci_assert(tls);
  ci_assert(tls->s.b.state == CI_TCP_LISTEN);
  ci_assert(tsr);

  /* Only promote while the accept queue has room. */
  if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) {
    ci_tcp_state* ts;

    /* grab a tcp_state structure that will go onto the accept queue.  We take
     * from the cache of EPs if any are available
     */
    ts = get_ts_from_cache (netif, tsr, tls);
    if( !ts ) {
      /* None on cache; try allocating a new ts */
      ts = ci_tcp_get_state_buf(netif);
#if CI_CFG_FD_CACHING
      if( ts == NULL ) {
        /* We've reaped.  Did this result in any being cached */
        ts = get_ts_from_cache(netif, tsr, tls);
        if (ts == NULL ) {
          /* No -- try again to allocate. */
          ts = ci_tcp_get_state_buf(netif);
        }
        else {
          CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap);
        }
      }
#endif
      if( ts == NULL ) {
        /* Out of socket buffers: record ENOMEM on the listener and wake it. */
        LOG_TV(ci_log("%s: [%d] out of socket buffers",
                      __FUNCTION__, NI_ID(netif)));
        CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock);
        CI_SET_SO_ERROR(&tls->s, ENOMEM);
        citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX);
        return -ENOMEM;
      }

      ci_assert(ci_tcp_is_cached(ts) ||
                (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN));
    }

#ifdef ONLOAD_OFE
    ts->s.ofe_code_start = tls->ofe_promote;
#endif

    if( ! ci_tcp_is_cached(ts) ) {
      /* Need to initialise address information for use when setting filters */
      ci_tcp_set_addr_on_promote(netif, ts, tsr, tls);

      /* "borrow" filter from listening socket.  For loopback socket, we
       * do not need filters, but we have to take a reference of the OS
       * socket. */
      rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice,
                                 S_SP(tls));
      if( rc < 0 ) {
        LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc));
        /* Either put this back on the list (at the head) or free it */
        ci_tcp_state_free(netif, ts);
        return rc;
      }
    }
#if CI_CFG_FD_CACHING
    else {
      /* Now set the s/w filter.  We leave the hw filter in place for cached
       * EPS. This will probably not have the correct raddr and rport, but as
       * it's sharing the listening socket's filter that's not a problem.  It
       * will be updated if this is still around when the listener is closed.
       */
      rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr,
                                  sock_lport_be16(&tls->s), tsr->r_addr,
                                  tsr->r_port, tcp_protocol(ts));

      if (rc < 0) {
        /* Bung it back on the cache list */
        LOG_EP(ci_log("Unable to create s/w filter!"));
        ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link);
        return rc;
      }

      /* Need to initialise address information.  We do this after trying to
       * insert the sw filter, so we can push the tcp state back onto the
       * cache queue with as few changes as possible if we fail to add the
       * sw filter.
       */
      ci_tcp_set_addr_on_promote(netif, ts, tsr, tls);

      LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd));
      ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link);
    }
#endif

    ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts)));
    ci_assert(ts->s.b.state == CI_TCP_CLOSED);
    ts->s.domain = tls->s.domain;

    /* Fill in the IP cache and path-MTU state; the MTU is capped by the
     * segment size from the synrecv options plus the TCP and IPv4 header
     * sizes. */
    cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache);
    ci_pmtu_state_init(netif, &ts->s, &ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
    ci_pmtu_set(netif, &ts->pmtus,
                CI_MIN(ts->s.pkt.mtu,
                       tsr->tcpopts.smss + sizeof(ci_tcp_hdr)
                         + sizeof(ci_ip4_hdr)));

    /* If we've got SYN via local route, we can handle it */
    ci_assert_equiv(ts->s.pkt.status == retrrc_localroute,
                    OO_SP_NOT_NULL(tsr->local_peer));
    if( ts->s.pkt.status == retrrc_localroute )
      ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE;

    ts->amss = tsr->amss;

    /* options and flags */
    ts->tcpflags = 0;
    ts->tcpflags |= tsr->tcpopts.flags;
    ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED;
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr);
    if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) {
      ts->snd_wscl = tsr->tcpopts.wscl_shft;
      ts->rcv_wscl = tsr->rcv_wscl;
    } else {
      ts->snd_wscl = ts->rcv_wscl = 0u;
    }
    CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl);
    CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl);

    /* Send and receive sequence numbers */
    tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
      tsr->snd_isn + 1;
    ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0);
    ci_tcp_rx_set_isn(ts, tsr->rcv_nxt);
    tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1);

    if( ts->tcpflags & CI_TCPT_FLAG_TSO ) {
      /* Both header lengths grow by 12 bytes when this option flag is set. */
      ts->incoming_tcp_hdr_len += 12;
      ts->outgoing_hdrs_len += 12;
      ts->tspaws = ci_tcp_time_now(netif);
      ts->tsrecent = tsr->tspeer;
      ts->tslastack = tsr->rcv_nxt;
    }
    else {
      /* Must be after initialising snd_una. */
      ci_tcp_clear_rtt_timing(ts);
      ts->timed_ts = tsr->timest;
    }
    /* SACK has nothing to be done. */

    /* ?? ECN */
    ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr)));

    /* Sender MSS: the synrecv value, lowered to the listener's user_mss
     * when that is set and smaller. */
    ts->smss = tsr->tcpopts.smss;
    ts->c.user_mss = tls->c.user_mss;
    if (ts->c.user_mss && ts->c.user_mss < ts->smss)
      ts->smss = ts->c.user_mss;
#if CI_CFG_LIMIT_SMSS
    ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__);
#endif
    ci_assert(ts->smss>0);
    ci_tcp_set_eff_mss(netif, ts);
    ci_tcp_set_initialcwnd(netif, ts);

    /* Copy socket options & related fields that should be inherited.
     * Note: Windows does not inherit rcvbuf until the call to accept
     * completes. The assumption here is that all options can be
     * inherited at the same time (most won't have an effect until there
     * is a socket available for use by the app.).
     */
    ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)");

    /* NB. Must have already set peer (which we have). */
    ci_tcp_set_established_state(netif, ts);
    CITP_STATS_NETIF(++netif->state->stats.synrecv2established);

    ci_assert(ts->ka_probes == 0);
    ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
    ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);

    /* Remove the synrecv structure from the listen queue, and free the
    ** buffer.  An entry flagged CI_TCPT_FLAG_SYNCOOKIE is released with
    ** ci_free() and is not removed from the listen queue. */
    if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE )
      ci_free(tsr);
    else {
      ci_tcp_listenq_remove(netif, tls, tsr);
      ci_tcp_synrecv_free(netif, tsr);
    }

    /* Queue the new socket on the listener's accept queue. */
    ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT);
    ci_tcp_acceptq_put(netif, tls, &ts->s.b);

    LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x",
               LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags);
           log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x",
               LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts),
               tcp_snd_una(ts),
               tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts)));

    /* Wake anyone waiting on the listening socket. */
    citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX);

    *ts_out = ts;
    return 0;
  }
/*! Propagate the inheritable socket options from a parent socket into a
 * freshly created TCP state.
 *
 * \param ni    stack both sockets belong to
 * \param s     common socket state to inherit from
 * \param c     TCP-common state to inherit the keepalive settings from
 * \param ts    TCP state receiving the options; the caller MUST hold a
 *              lock on it
 * \param ctxt  context string passed on to ci_tcp_init_rcv_wnd()
 *
 * Options are inherited during EP promotion for unix, during accept
 * handler in Windows & as a result of
 * setsockopt:SOL_SOCKET:SO_UPDATE_ACCEPT_CONTEXT.
 */
static void ci_tcp_inherit_options(ci_netif* ni, ci_sock_cmn* s,
                                   ci_tcp_socket_cmn* c,
                                   ci_tcp_state* ts, const char* ctxt)
{
  unsigned sock_inherit_mask;
  unsigned sb_inherit_mask;

  ci_assert(ni);
  ci_assert(s);
  ci_assert(c);
  ci_assert(ts);

  /* Plain SOL_SOCKET state and bind-to-device details. */
  ts->s.so = s->so;
  ts->s.cp.so_bindtodevice = s->cp.so_bindtodevice;
  ts->s.cp.ip_ttl = s->cp.ip_ttl;
  ts->s.rx_bind2dev_ifindex = s->rx_bind2dev_ifindex;
  ts->s.rx_bind2dev_base_ifindex = s->rx_bind2dev_base_ifindex;
  ts->s.rx_bind2dev_vlan = s->rx_bind2dev_vlan;

  ci_tcp_set_sndbuf(ni, ts);      /* eff_mss must be valid */
  ci_tcp_set_rcvbuf(ni, ts);      /* and amss, and rcv_wscl */

  /* Flag inheritance.
  ** NB. We have exclusive access to [ts], so it is safe to manipulate
  ** s_aflags without using bit-ops. */
  sock_inherit_mask = CI_SOCK_AFLAG_TCP_INHERITED;
  sb_inherit_mask = 0;
  if( NI_OPTS(ni).accept_inherit_nonblock )
    sb_inherit_mask |= CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY;

  ci_assert((ts->s.s_aflags & sock_inherit_mask) == 0);
  ci_atomic32_or(&ts->s.s_aflags, s->s_aflags & sock_inherit_mask);

  /* The stack option can override the inherited NODELAY setting:
   * 1 turns it on, 2 turns it off, anything else leaves it alone. */
  switch( NI_OPTS(ni).tcp_force_nodelay ) {
  case 1:
    ci_bit_set(&ts->s.s_aflags, CI_SOCK_AFLAG_NODELAY_BIT);
    break;
  case 2:
    ci_bit_clear(&ts->s.s_aflags, CI_SOCK_AFLAG_NODELAY_BIT);
    break;
  default:
    break;
  }

  ci_assert((ts->s.b.sb_aflags & sb_inherit_mask) == 0);
  ci_atomic32_or(&ts->s.b.sb_aflags, s->b.sb_aflags & sb_inherit_mask);

  /* Of the inherited s_flags only PMTU_DO is expected to be set in [ts];
   * drop it, then take the parent's inherited bits. */
  ci_assert_equal((ts->s.s_flags & CI_SOCK_FLAG_TCP_INHERITED),
                  CI_SOCK_FLAG_PMTU_DO);
  ts->s.s_flags &= ~CI_SOCK_FLAG_PMTU_DO;
  ts->s.s_flags |= s->s_flags & CI_SOCK_FLAG_TCP_INHERITED;

  /* Bug1861: while not defined as such, various SOL_TCP/SOL_IP sockopts
   * are inherited in Linux.
   * Here: TCP_KEEPIDLE, TCP_KEEPINTVL, TCP_KEEPCNT. */
  ts->c.t_ka_time = c->t_ka_time;
  ts->c.t_ka_time_in_secs = c->t_ka_time_in_secs;
  ts->c.t_ka_intvl = c->t_ka_intvl;
  ts->c.t_ka_intvl_in_secs = c->t_ka_intvl_in_secs;
  ts->c.ka_probe_th = c->ka_probe_th;

  /* IP header template takes the parent's TTL and TOS. */
  ci_ip_hdr_init_fixed(&ts->s.pkt.ip, IPPROTO_TCP,
                       s->pkt.ip.ip_ttl, s->pkt.ip.ip_tos);

  ts->s.cmsg_flags = s->cmsg_flags;
  ts->s.timestamping_flags = s->timestamping_flags;

  /* Must have set up so.sndbuf */
  ci_tcp_init_rcv_wnd(ts, ctxt);
}
/* Loopback connect from socket [c_id] in stack [c_ni] to the listening
 * socket [l_id] in stack [l_ni].
 *
 * A temporary "shadow" listening socket is created in [c_ni], the
 * connecting socket is connected to it on the same stack, and the socket
 * accepted from the shadow listener is then published on the real
 * listener's accept queue through a placeholder object in [l_ni].
 *
 * c_ni is assumed to be locked on entrance and is always unlocked on
 * exit.
 *
 * [dst] is not used by this function.
 *
 * Returns the result of ci_tcp_connect_lo_samestack() once the accepted
 * socket has been queued on the real listener, or a negative errno:
 *   -EBUSY   accept queue full, nothing was accepted by the shadow
 *            listener, or the hand-over to [l_ni] failed;
 *   -ENOMEM  no socket buffer for the shadow listener;
 *   or the error from ci_tcp_listen_init().
 */
int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst,
                             ci_netif *l_ni, oo_sp l_id)
{
  ci_tcp_state *ts;
  ci_tcp_socket_listen *tls, *alien_tls;
  citp_waitable_obj *wo;
  citp_waitable *w;
  int rc;

  ci_assert(ci_netif_is_locked(c_ni));
  ci_assert(OO_SP_NOT_NULL(c_id));
  ci_assert(OO_SP_NOT_NULL(l_id));

  LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__,
             c_ni->state->stack_id, OO_SP_TO_INT(c_id),
             l_ni->state->stack_id, OO_SP_TO_INT(l_id)));

  alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id);
  if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) {
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }

  /* In c_ni, create shadow listening socket tls (copy l_id) */
  ts = ci_tcp_get_state_buf(c_ni);
  if( ts == NULL ) {
    ci_netif_unlock(c_ni);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(c_ni)));
    return -ENOMEM;
  }

  /* init common tcp fields */
  ts->s.so = alien_tls->s.so;
  ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl;
  S_TCP_HDR(&ts->s)->tcp_source_be16 =
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16;
  ts->s.domain = alien_tls->s.domain;
  ts->c = alien_tls->c;
  ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF;

  /* make sure nobody will ever connect to our "shadow" socket
   * except us */
  ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  /* no timer: */
  tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN;

  tls->acceptq_max = 1;
  rc = ci_tcp_listen_init(c_ni, tls);
  if( rc != 0 ) {
    citp_waitable_obj_free(c_ni, &tls->s.b);
    /* Honour the contract: c_ni is unlocked on every exit path. */
    ci_netif_unlock(c_ni);
    return rc;
  }

  /* Connect c_id to tls */
  ts = SP_TO_TCP(c_ni, c_id);
  rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid);

  /* Accept as from tls */
  if( !ci_tcp_acceptq_not_empty(tls) ) {
    /* it is possible, for example, if ci_tcp_listenq_try_promote() failed
     * because there are no endpoints */
    ci_tcp_listenq_drop_all(c_ni, tls);
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }
  w = ci_tcp_acceptq_get(c_ni, tls);
  ci_assert(w);
  LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d",
                __FUNCTION__,
                c_ni->state->stack_id, OO_SP_TO_INT(c_id),
                l_ni->state->stack_id, OO_SP_TO_INT(l_id),
                c_ni->state->stack_id, tls->s.b.bufid,
                c_ni->state->stack_id, w->bufid));

  ci_assert(w->state & CI_TCP_STATE_TCP);
  ci_assert(w->state != CI_TCP_LISTEN);

  /* Destroy tls.
   * NB: nobody could possibly connect to it, so no need to do proper
   * shutdown.
   */
  ci_assert_equal(ci_tcp_acceptq_n(tls), 0);
  ci_tcp_listenq_drop_all(c_ni, tls);
  citp_waitable_obj_free(c_ni, &tls->s.b);
  ci_netif_unlock(c_ni);

  /* Keep a port reference */
  {
    tcp_helper_endpoint_t *l_ep, *a_ep;
    struct oo_file_ref* os_sock_ref;
    ci_irqlock_state_t lock_flags;

    l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id);
    a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w));
    ci_irqlock_lock(&l_ep->thr->lock, &lock_flags);
    os_sock_ref = l_ep->os_socket;
    ci_assert_equal(a_ep->os_port_keeper, NULL);
    if( os_sock_ref != NULL ) {
      /* Take an extra reference on the listener's OS socket and park it
       * in the accepted endpoint; drop whatever was there before. */
      os_sock_ref = oo_file_ref_add(os_sock_ref);
      os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref);
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      if( os_sock_ref != NULL )
        oo_file_ref_drop(os_sock_ref);
    }
    else {
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      goto cleanup;
    }
  }

  /* lock l_ni: Check that l_id is the same socket it used to be */
  /* create ref-sock in l_ni, put it into acc q */
  if( ci_netif_lock(l_ni) != 0 )
    goto cleanup;
  if( alien_tls->s.b.state != CI_TCP_LISTEN ||
      (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) ||
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16 !=
      TS_TCP(ts)->tcp_dest_be16 ||
      (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY &&
       alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }

  ci_bit_mask_set(&w->sb_aflags,
                  CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);

  wo = citp_waitable_obj_alloc(l_ni);
  if( wo == NULL ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  /* The placeholder in l_ni records where the real socket lives. */
  wo->waitable.state = CI_TCP_CLOSED;
  wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY;
  wo->waitable.moved_to_stack_id = c_ni->state->stack_id;
  wo->waitable.moved_to_sock_id = W_SP(w);
  LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__,
             l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)),
             c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w))));

  ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable);
  citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX);
  ci_netif_unlock(l_ni);

  return rc;

cleanup:
  ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_bit_mask_clear(&w->sb_aflags,
                    CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid);
  /* we can not guarantee c_ni lock, so we can't call
   * ci_tcp_drop(c_ni, ts).  So, we return error; UL will handover
   * and close ts endpoint. */
  return -EBUSY;
}