static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts, ci_uint32 dst_be32, unsigned dport_be16, int* fail_rc) { ci_ip_pkt_fmt* pkt; int rc = 0; ci_assert(ts->s.pkt.mtu); /* Now that we know the outgoing route, set the MTU related values. * Note, even these values are speculative since the real MTU * could change between now and passing the packet to the lower layers */ ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr); #if CI_CFG_LIMIT_AMSS ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__); #endif /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */ ts->smss = CI_CFG_TCP_DEFAULT_MSS; /* set pmtu, eff_mss, snd_buf and adjust windows */ ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu); ci_tcp_set_eff_mss(ni, ts); ci_tcp_set_initialcwnd(ni, ts); /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay * zero until the connection is established. */ ts->so_sndbuf_pkts = 0; /* * 3. State and address are OK. It's address routed through our NIC. * Do connect(). */ ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY); if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) { ci_sock_cmn* s = &ts->s; ci_uint16 source_be16 = 0; if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND ) rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16); else rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16); if(CI_LIKELY( rc == 0 )) { TS_TCP(ts)->tcp_source_be16 = source_be16; ts->s.cp.lport_be16 = source_be16; LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u", LNT_PRI_ARGS(ni, ts), ip_addr_str(INADDR_ANY), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16))); } else { LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc), __FILE__, __LINE__)); *fail_rc = rc; return CI_CONNECT_UL_FAIL; } if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) { CI_SET_ERROR(*fail_rc, EINVAL); return CI_CONNECT_UL_FAIL; } } ci_tcp_set_peer(ts, dst_be32, dport_be16); /* Make sure we can get a buffer before we change state. */ pkt = ci_netif_pkt_tx_tcp_alloc(ni); if( CI_UNLIKELY(! pkt) ) { /* NB. We've already done a poll above. */ rc = ci_netif_pkt_wait(ni, &ts->s, CI_SLEEP_NETIF_LOCKED|CI_SLEEP_NETIF_RQ); if( ci_netif_pkt_wait_was_interrupted(rc) ) { CI_SET_ERROR(*fail_rc, -rc); return CI_CONNECT_UL_LOCK_DROPPED; } /* OK, there are (probably) packets available - go try again. Note we * jump back to the top of the function because someone may have * connected this socket in the mean-time, so we need to check the * state once more. */ return CI_CONNECT_UL_START_AGAIN; } #ifdef ONLOAD_OFE if( ni->ofe != NULL ) ts->s.ofe_code_start = ofe_socktbl_find( ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE, tcp_laddr_be32(ts), tcp_raddr_be32(ts), tcp_lport_be16(ts), tcp_rport_be16(ts)); #endif rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice, OO_SP_NULL); if( rc < 0 ) { /* Perhaps we've run out of filters? See if we can push a socket out * of timewait and steal its filter. */ ci_assert_nequal(rc, -EFILTERSSOME); if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) || (rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice, OO_SP_NULL)) < 0 ) { ci_assert_nequal(rc, -EFILTERSSOME); /* Either a different error, or our efforts to free a filter did not * work. */ if( ! 
(ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) { ts->s.pkt.ip.ip_saddr_be32 = 0; ts->s.cp.ip_laddr_be32 = 0; } ci_netif_pkt_release(ni, pkt); CI_SET_ERROR(*fail_rc, -rc); return CI_CONNECT_UL_FAIL; } } LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts), ip_addr_str(ts->s.pkt.ip.ip_saddr_be32), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16), ip_addr_str(ts->s.pkt.ip.ip_daddr_be32), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16))); /* We are going to send the SYN - set states appropriately */ tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) = ci_tcp_initial_seqno(ni); ts->snd_max = tcp_snd_nxt(ts) + 1; /* Must be after initialising snd_una. */ ci_tcp_clear_rtt_timing(ts); ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN); ts->tcpflags &=~ CI_TCPT_FLAG_OPT_MASK; ts->tcpflags |= NI_OPTS(ni).syn_opts; if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) { ts->rcv_wscl = ci_tcp_wscl_by_buff(ni, ci_tcp_rcvbuf_established(ni, &ts->s)); CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl); } else { ts->rcv_wscl = 0; CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0); } ci_tcp_set_rcvbuf(ni, ts); ci_tcp_init_rcv_wnd(ts, "CONNECT"); /* outgoing_hdrs_len is initialised to include timestamp option. */ if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) ) ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr)+sizeof(ci_tcp_hdr); if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32, ts->s.pkt.ip.ip_daddr_be32) ) ts->tcpflags |= CI_TCPT_FLAG_STRIPE; ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT); /* If the app tries to send data on a socket in SYN_SENT state ** then the data is queued for send until the SYN gets ACKed. ** (rfc793 p56) ** ** Receive calls on the socket should block until data arrives ** (rfc793 p58) ** ** Clearing tx_errno and rx_errno achieves this. The transmit window ** is set to 1 byte which ensures that only the SYN packet gets ** sent until the ACK is received with more window. */ ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1); ts->s.rx_errno = 0; ts->s.tx_errno = 0; ci_tcp_enqueue_no_data(ts, ni, pkt); ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK); if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) { ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT; LOG_TC(log( LNT_FMT "Non-blocking connect - return EINPROGRESS", LNT_PRI_ARGS(ni, ts))); CI_SET_ERROR(*fail_rc, EINPROGRESS); return CI_CONNECT_UL_FAIL; } return CI_CONNECT_UL_OK; }
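/* Illustrative sketch (not part of the original source): the amss value set
 * up above is simply the route MTU minus the fixed TCP and IPv4 header
 * sizes.  The constants below assume 20-byte headers with no options, so the
 * numbers are for illustration only. */
static unsigned example_amss_from_mtu(unsigned mtu)
{
  const unsigned tcp_hdr_len = 20;  /* basic TCP header, no options */
  const unsigned ip4_hdr_len = 20;  /* basic IPv4 header, no options */
  return mtu - tcp_hdr_len - ip4_hdr_len;
}
/* e.g. example_amss_from_mtu(1500) == 1460, example_amss_from_mtu(9000) == 8960 */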
void ci_ip_timer_state_assert_valid(ci_netif* ni, const char* file, int line) { ci_ip_timer_state* ipts; ci_ip_timer* ts; ci_ni_dllist_t* bucket; ci_ni_dllist_link* l; ci_iptime_t stime, wheel_base, max_time, min_time; int a1, a2, a3, w, b, bit_shift; /* shifting a 32 bit integer left or right 32 bits has undefined results * (i.e. not 0 which is required). Therefore I now use an array of mask * values */ unsigned wheel_mask[CI_IPTIME_WHEELS] = { WHEEL0_MASK, WHEEL1_MASK, WHEEL2_MASK, 0 }; ipts = IPTIMER_STATE(ni); stime = ipts->sched_ticks; /* for each wheel */ for(w=0; w < CI_IPTIME_WHEELS; w++) { /* base time of wheel */ wheel_base = stime & wheel_mask[w]; /* for each bucket in wheel */ for (b=0; b < CI_IPTIME_BUCKETS; b++) { /* max and min relative times for this bucket */ bit_shift = CI_IPTIME_BUCKETBITS*w; min_time = wheel_base + (b << bit_shift); max_time = min_time + (1 << bit_shift); bucket = &ipts->warray[w*CI_IPTIME_BUCKETS + b]; /* check list looks valid */ if ( ci_ni_dllist_start(ni, bucket) == ci_ni_dllist_end(ni, bucket) ) { ci_assert( ci_ni_dllist_is_empty(ni, bucket) ); } /* check buckets that should be empty are! */ a3 = TIME_GT(min_time, stime) || ci_ni_dllist_is_empty(ni, bucket); /* run through timers in bucket */ for (l = ci_ni_dllist_start(ni, bucket); l != ci_ni_dllist_end(ni, bucket); ci_ni_dllist_iter(ni, l) ) { ci_ni_dllist_link_assert_valid(ni, l); /* get timer */ ts = LINK2TIMER(l); /* must be in the future */ a1 = TIME_GT(ts->time, stime); /* must be within time range of bucket */ a2 = TIME_LT(ts->time, max_time) && TIME_GE(ts->time, min_time); /* if any of the checks fail then print out timer details */ if (!a1 || !a2 || !a3) { ci_log("%s: [w=0x%x/b=0x%x] stime=0x%x", __FUNCTION__, w, b, stime); ci_log(" --> t=0x%x, min=0x%x, max=0x%x", ts->time, min_time, max_time); ci_log(" [%s line=%d]", file, line); } /* stop if assertion failed */ ci_assert(a1 && a2 && a3); } } } }
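/* Illustrative sketch (not part of the original source): the validator above
 * recomputes each bucket's [min_time, max_time) window from the wheel base.
 * EXAMPLE_BUCKETBITS below is an assumption for illustration only, not
 * necessarily the real CI_IPTIME_BUCKETBITS value. */
#define EXAMPLE_BUCKETBITS 8

static void example_bucket_window(unsigned wheel_base, int wheel, int bucket,
                                  unsigned* min_time, unsigned* max_time)
{
  int bit_shift = EXAMPLE_BUCKETBITS * wheel;   /* coarser wheels shift more */
  *min_time = wheel_base + (bucket << bit_shift);
  *max_time = *min_time + (1 << bit_shift);
}
/* e.g. wheel 1, bucket 3, base 0x1000 gives the window [0x1300, 0x1400) */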
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len) { int rc = 0, i; citp_lib_context_t lib_context; citp_fdinfo* fdi; citp_sock_fdi* epi; ci_netif* ni; Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len)); citp_enter_lib(&lib_context); if( (fdi = citp_fdtable_lookup(fd)) != NULL ) { switch( citp_fdinfo_get_type(fdi) ) { case CITP_UDP_SOCKET: case CITP_TCP_SOCKET: epi = fdi_to_sock_fdi(fdi); ni = epi->sock.netif; ci_netif_lock(ni); for( i = 0; i < bufs_len; ++i ) { ci_ip_pkt_fmt* pkt = (ci_ip_pkt_fmt*)bufs[i]; if( pkt->stack_id != ni->state->stack_id ) { LOG_U(log("%s: attempt to free buffer from stack %d to stack %d", __FUNCTION__, pkt->stack_id, ni->state->stack_id)); rc = -EINVAL; break; } } if( rc == 0 ) { for( i = 0; i < bufs_len; ++i ) ci_netif_pkt_release_check_keep(ni, (ci_ip_pkt_fmt*)bufs[i]); } ci_netif_unlock(ni); break; #if CI_CFG_USERSPACE_EPOLL case CITP_EPOLL_FD: rc = -ENOTSOCK; break; #endif #if CI_CFG_USERSPACE_PIPE case CITP_PIPE_FD: rc = -ENOTSOCK; break; #endif default: LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, citp_fdinfo_get_type(fdi))); rc = -EINVAL; } citp_fdinfo_release_ref(fdi, 0); } else { /* Not onload socket */ rc = -ESOCKTNOSUPPORT; } citp_exit_lib(&lib_context, TRUE); Log_CALL_RESULT(rc); return rc; }
/* Move priv file to the alien_ni stack. * Should be called with the locked priv stack and socket; * the function returns with this stack being unlocked. * If rc=0, it returns with alien_ni stack locked; * otherwise, both stacks are unlocked. * Socket is always unlocked on return. */ int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni) { tcp_helper_resource_t *old_thr = priv->thr; tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni); ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id); ci_sock_cmn *new_s; ci_sock_cmn *mid_s; tcp_helper_endpoint_t *old_ep, *new_ep; int rc, i; int pollwait_register = 0; #if CI_CFG_FD_CACHING oo_p sp; #endif OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__, old_thr->id, priv->sock_id, new_thr->id)); /* Poll the old stack - deliver all data to our socket */ ci_netif_poll(&old_thr->netif); /* Endpoints in epoll list should not be moved, because waitq is already * in the epoll internal structures (bug 41152). */ if( !list_empty(&priv->_filp->f_ep_links) ) { rc = -EBUSY; goto fail1; } if( !efab_file_move_supported(&old_thr->netif, old_s) ) { rc = -EINVAL; goto fail1; } /* Lock the second stack */ i = 0; while( ! ci_netif_trylock(alien_ni) ) { ci_netif_unlock(&old_thr->netif); if( i++ >= 1000 ) { rc = -EBUSY; goto fail1_ni_unlocked; } rc = ci_netif_lock(&old_thr->netif); if( rc != 0 ) goto fail1_ni_unlocked; } /* Allocate a new socket in the alien_ni stack */ rc = -ENOMEM; if( old_s->b.state == CI_TCP_STATE_UDP ) { ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni); if( new_us == NULL ) goto fail2; new_s = &new_us->s; } else { ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni); if( new_ts == NULL ) goto fail2; new_s = &new_ts->s; } /* Allocate an intermediate "socket" outside of everything */ mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state))); if( mid_s == NULL ) goto fail3; OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__, old_thr->id, priv->sock_id, new_thr->id, new_s->b.bufid)); /* Copy TCP/UDP state */ memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state))); /* do not copy old_s->b.bufid * and other fields in stack adress space */ mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN; mid_s->b.bufid = new_s->b.bufid; mid_s->b.post_poll_link = new_s->b.post_poll_link; mid_s->b.ready_link = new_s->b.ready_link; mid_s->reap_link = new_s->reap_link; if( old_s->b.state & CI_TCP_STATE_TCP ) { ci_tcp_state *new_ts = SOCK_TO_TCP(new_s); ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s); mid_ts->timeout_q_link = new_ts->timeout_q_link; mid_ts->tx_ready_link = new_ts->tx_ready_link; mid_ts->rto_tid = new_ts->rto_tid; mid_ts->delack_tid = new_ts->delack_tid; mid_ts->zwin_tid = new_ts->zwin_tid; mid_ts->kalive_tid = new_ts->kalive_tid; mid_ts->cork_tid = new_ts->cork_tid; ci_ip_queue_init(&mid_ts->recv1); ci_ip_queue_init(&mid_ts->recv2); ci_ip_queue_init(&mid_ts->send); ci_ip_queue_init(&mid_ts->retrans); mid_ts->send_prequeue = OO_PP_ID_NULL; new_ts->retrans_ptr = OO_PP_NULL; mid_ts->tmpl_head = OO_PP_NULL; oo_atomic_set(&mid_ts->send_prequeue_in, 0); *new_ts = *mid_ts; ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus, CI_IP_TIMER_PMTU_DISCOVER); #if CI_CFG_FD_CACHING sp = TS_OFF(alien_ni, new_ts); OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link)); ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch"); ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link); sp = TS_OFF(alien_ni, new_ts); OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link)); 
ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd"); ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link); #endif /* free temporary mid_ts storage */ CI_FREE_OBJ(mid_ts); } else { ci_udp_state *mid_us = SOCK_TO_UDP(mid_s); *SOCK_TO_UDP(new_s) = *mid_us; CI_FREE_OBJ(mid_us); } /* Move the filter */ old_ep = ci_trs_ep_get(old_thr, priv->sock_id); new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid); rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep); if( rc != 0 ) { rc = -EINVAL; goto fail3; } /* Allocate a new file for the new endpoint */ rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags, priv->fd_type, &old_ep->alien_ref); if( rc != 0 ) goto fail4; ci_assert(old_ep->alien_ref); /* Copy F_SETOWN_EX, F_SETSIG to the new file */ #ifdef F_SETOWN_EX rcu_read_lock(); __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid, priv->_filp->f_owner.pid_type, 1); rcu_read_unlock(); #endif old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum; old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK; /* Move os_socket from one ep to another */ if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) & OO_THR_EP_AFLAG_ATTACHED ) { fput(old_ep->alien_ref->_filp); rc = -EBUSY; goto fail2; /* state & filters are cleared by fput() */ } /********* Point of no return **********/ ci_wmb(); priv->fd_type = CI_PRIV_TYPE_ALIEN_EP; priv->_filp->f_op = &linux_tcp_helper_fops_alien; ci_wmb(); oo_file_moved(priv); /* Read all already-arrived packets after the filters move but before * copying of the receive queue. */ ci_netif_poll(&old_thr->netif); tcp_helper_endpoint_move_filters_post(old_ep, new_ep); ci_assert( efab_file_move_supported(&old_thr->netif, old_s)); /* There's a gap between un-registering the old ep, and registering the * the new. However, the notifications shouldn't be in use for sockets * that are in a state that can be moved, so this shouldn't be a problem. */ if( old_ep->os_sock_pt.whead ) { pollwait_register = 1; efab_tcp_helper_os_pollwait_unregister(old_ep); } ci_assert_equal(new_ep->os_socket, NULL); new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL); ci_assert_equal(old_ep->os_socket, NULL); if( pollwait_register ) efab_tcp_helper_os_pollwait_register(new_ep); ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); if( new_s->b.state == CI_TCP_ESTABLISHED ) CI_TCP_STATS_INC_CURR_ESTAB(alien_ni); /* Copy recv queue */ if( new_s->b.state & CI_TCP_STATE_TCP ) { ci_tcp_state *new_ts = SOCK_TO_TCP(new_s); ci_tcp_state *old_ts = SOCK_TO_TCP(old_s); int i; /* Stop timers */ ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid); ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid); efab_ip_queue_copy(alien_ni, &new_ts->recv1, &old_thr->netif, &old_ts->recv1); efab_ip_queue_copy(alien_ni, &new_ts->recv2, &old_thr->netif, &old_ts->recv2); new_ts->recv1_extract = new_ts->recv1.head; /* Drop reorder buffer */ ci_ip_queue_init(&new_ts->rob); new_ts->dsack_block = OO_PP_INVALID; new_ts->dsack_start = new_ts->dsack_end = 0; for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ ) new_ts->last_sack[i] = OO_PP_NULL; } else { /* There should not be any recv q, but drop it to be sure */ ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q); } /* Old stack can be unlocked */ old_s->b.sb_flags |= CI_SB_FLAG_MOVED; ci_netif_unlock(&old_thr->netif); ci_assert( efab_file_move_supported(alien_ni, new_s) ); /* Move done: poll for any new data. 
*/ ci_netif_poll(alien_ni); if( new_s->b.state & CI_TCP_STATE_TCP ) { ci_tcp_state *new_ts = SOCK_TO_TCP(new_s); /* Timers setup: delack, keepalive */ if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0) ci_tcp_timeout_delack(alien_ni, new_ts); ci_tcp_kalive_reset(alien_ni, new_ts); } /* Old ep: we are done. */ ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT); old_s->b.moved_to_stack_id = alien_ni->state->stack_id; old_s->b.moved_to_sock_id = new_s->b.bufid; if( ! list_empty(&priv->_filp->f_ep_links) ) ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT); ci_sock_unlock(&old_thr->netif, &old_s->b); ci_sock_unlock(alien_ni, &new_s->b); ci_assert(ci_netif_is_locked(alien_ni)); OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__, new_thr->id, new_s->b.bufid, ci_tcp_state_str(new_s->b.state))); return 0; fail4: /* We clear the filters from the new ep. * For now, we do not need to re-insert old filters because hw filters * are already here (in the case of an accepted socket) or not needed. * We have not removed old sw filters yet. */ tcp_helper_endpoint_move_filters_undo(old_ep, new_ep); fail3: if( new_s->b.state & CI_TCP_STATE_TCP ) ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s)); else ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s)); fail2: ci_netif_unlock(alien_ni); fail1: ci_netif_unlock(&old_thr->netif); fail1_ni_unlocked: ci_sock_unlock(&old_thr->netif, &old_s->b); OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc)); return rc; }
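/* Illustrative sketch (not part of the original source): the two-stack
 * locking above uses a bounded trylock/back-off loop so that two threads
 * moving sockets between the same pair of stacks cannot deadlock.  The same
 * idea expressed with plain pthread mutexes; all names are hypothetical. */
#include <pthread.h>
#include <errno.h>

static int example_lock_both(pthread_mutex_t* held, pthread_mutex_t* other)
{
  int attempts = 0;
  while( pthread_mutex_trylock(other) != 0 ) {
    pthread_mutex_unlock(held);       /* drop the lock we hold ...          */
    if( ++attempts >= 1000 )
      return -EBUSY;                  /* ... give up (first lock dropped)   */
    pthread_mutex_lock(held);         /* re-take the first lock and retry   */
  }
  return 0;                           /* both locks are now held            */
}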
static int oo_epoll2_ctl(struct oo_epoll_private *priv, int op_kepfd, int op_op, int op_fd, struct epoll_event *op_event) { tcp_helper_resource_t *fd_thr; struct file *file; int rc; ci_uint32 fd_sock_id; citp_waitable *fd_w; /* We are interested in ADD only */ if( op_op != EPOLL_CTL_ADD ) return efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); /* System poll() and friends use fget_light(), which is cheap, but it is * not exported to us, so we have to use fget(). */ file = fget(op_fd); if(unlikely( file == NULL )) return -EBADF; /* Check for a cycle: make sure we are not adding this epoll fd to itself. */ if(unlikely( file->private_data == priv )) { fput(file); return -EINVAL; } /* Is op->fd ours, and if so, which netif does it belong to? */ /* Fixme: epoll fd - do we want to accelerate something? */ if( file->f_op != &linux_tcp_helper_fops_udp && file->f_op != &linux_tcp_helper_fops_tcp ) { int rc; #ifdef OO_EPOLL_NEED_NEST_PROTECTION struct oo_epoll_busy_task t; t.task = current; spin_lock(&priv->lock); list_add(&t.link, &priv->p.p2.busy_tasks); spin_unlock(&priv->lock); #endif #if CI_CFG_USERSPACE_PIPE if( ( file->f_op == &linux_tcp_helper_fops_pipe_reader || file->f_op == &linux_tcp_helper_fops_pipe_writer ) ) priv->p.p2.do_spin = 1; #endif fput(file); rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); #ifdef OO_EPOLL_NEED_NEST_PROTECTION spin_lock(&priv->lock); list_del(&t.link); spin_unlock(&priv->lock); #endif return rc; } /* Onload socket here! */ fd_thr = ((ci_private_t *)file->private_data)->thr; fd_sock_id = ((ci_private_t *)file->private_data)->sock_id; priv->p.p2.do_spin = 1; if(unlikely( ! oo_epoll_add_stack(priv, fd_thr) )) { static int printed; if( !printed ) ci_log("Can't add stack %d to epoll set: consider " "increasing epoll_max_stacks module option", fd_thr->id); /* fall through to sys_epoll_ctl() without interrupt */ } /* Let the kernel add the fd to the epoll set, but ask the endpoint to * avoid enabling interrupts. We keep the file reference while using fd_w * so that it cannot go away under us. */ fd_w = SP_TO_WAITABLE(&fd_thr->netif, fd_sock_id); ci_bit_set(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT); rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); ci_bit_clear(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT); fput(file); return rc; }
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len) { int rc = 0, i, rx_pkt, released; citp_lib_context_t lib_context; citp_fdinfo* fdi; citp_sock_fdi* epi; ci_netif* ni; ci_ip_pkt_fmt* pkt; Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len)); citp_enter_lib(&lib_context); if( (fdi = citp_fdtable_lookup(fd)) != NULL ) { switch( citp_fdinfo_get_type(fdi) ) { case CITP_UDP_SOCKET: case CITP_TCP_SOCKET: epi = fdi_to_sock_fdi(fdi); ni = epi->sock.netif; ci_netif_lock(ni); for( i = 0; i < bufs_len; ++i ) { pkt = (ci_ip_pkt_fmt*)bufs[i]; if( pkt->stack_id != ni->state->stack_id ) { LOG_U(log("%s: attempt to free buffer from stack %d to stack %d", __FUNCTION__, pkt->stack_id, ni->state->stack_id)); rc = -EINVAL; break; } } if( rc == 0 ) { for( i = 0; i < bufs_len; ++i ) { pkt = (ci_ip_pkt_fmt*)bufs[i]; /* If we are releasing a packet without the RX flag then the user * allocated and then freed the packet (without using it). * We detect this so that we can decrement n_async_pkts. * RX packets (kept via ONLOAD_ZC_KEEP) are counted differently, * so don't decrement for them here (they may still be released). */ rx_pkt = pkt->flags & CI_PKT_FLAG_RX; released = ci_netif_pkt_release_check_keep(ni, pkt); if ( ! rx_pkt ) { ci_assert(released == 1); (void) released; --ni->state->n_async_pkts; } } } ci_netif_unlock(ni); break; #if CI_CFG_USERSPACE_EPOLL case CITP_EPOLL_FD: rc = -ENOTSOCK; break; #endif #if CI_CFG_USERSPACE_PIPE case CITP_PIPE_FD: rc = -ENOTSOCK; break; #endif default: LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, citp_fdinfo_get_type(fdi))); rc = -EINVAL; } citp_fdinfo_release_ref(fdi, 0); } else { /* Not an Onload socket */ rc = -ESOCKTNOSUPPORT; } citp_exit_lib(&lib_context, TRUE); Log_CALL_RESULT(rc); return rc; }
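/* Illustrative sketch (not part of the original source): application-level
 * use of the call above.  A buffer kept from a zero-copy receive (e.g. by
 * returning ONLOAD_ZC_KEEP from the receive callback) is handed back on the
 * same Onload socket.  The header name is an assumption for illustration. */
#include <onload/extensions_zc.h>

static void example_release_kept_buffer(int onload_fd, onload_zc_handle kept)
{
  /* rc is 0 on success; -EINVAL if the buffer belongs to a different stack,
   * -ESOCKTNOSUPPORT if onload_fd is not an accelerated socket. */
  int rc = onload_zc_release_buffers(onload_fd, &kept, 1);
  (void) rc;
}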
static int efab_tcp_helper_sock_attach(ci_private_t* priv, void *arg) { oo_sock_attach_t* op = arg; tcp_helper_resource_t* trs = priv->thr; tcp_helper_endpoint_t* ep = NULL; citp_waitable_obj *wo; int rc, flags, type = op->type; /* SOCK_CLOEXEC and SOCK_NONBLOCK exist from 2.6.27 both */ #ifdef SOCK_TYPE_MASK BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC); flags = type & (SOCK_CLOEXEC | SOCK_NONBLOCK); type &= SOCK_TYPE_MASK; # ifdef SOCK_NONBLOCK if( SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK) ) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; # endif #else flags = 0; #endif OO_DEBUG_TCPH(ci_log("%s: ep_id=%d", __FUNCTION__, op->ep_id)); if( trs == NULL ) { LOG_E(ci_log("%s: ERROR: not attached to a stack", __FUNCTION__)); return -EINVAL; } /* Validate and find the endpoint. */ if( ! IS_VALID_SOCK_P(&trs->netif, op->ep_id) ) return -EINVAL; ep = ci_trs_get_valid_ep(trs, op->ep_id); if( tcp_helper_endpoint_set_aflags(ep, OO_THR_EP_AFLAG_ATTACHED) & OO_THR_EP_AFLAG_ATTACHED ) return -EBUSY; wo = SP_TO_WAITABLE_OBJ(&trs->netif, ep->id); /* create OS socket */ if( op->domain != AF_UNSPEC ) { struct socket *sock; struct file *os_file; rc = sock_create(op->domain, type, 0, &sock); if( rc < 0 ) { LOG_E(ci_log("%s: ERROR: sock_create(%d, %d, 0) failed (%d)", __FUNCTION__, op->domain, type, rc)); tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED); return rc; } os_file = sock_alloc_file(sock, flags, NULL); if( IS_ERR(os_file) ) { LOG_E(ci_log("%s: ERROR: sock_alloc_file failed (%ld)", __FUNCTION__, PTR_ERR(os_file))); sock_release(sock); tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED); return PTR_ERR(os_file); } rc = efab_attach_os_socket(ep, os_file); if( rc < 0 ) { LOG_E(ci_log("%s: ERROR: efab_attach_os_socket failed (%d)", __FUNCTION__, rc)); /* NB. efab_attach_os_socket() consumes [os_file] even on error. */ tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED); return rc; } wo->sock.domain = op->domain; wo->sock.ino = ep->os_socket->file->f_dentry->d_inode->i_ino; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) wo->sock.uid = ep->os_socket->file->f_dentry->d_inode->i_uid; #else wo->sock.uid = __kuid_val(ep->os_socket->file->f_dentry->d_inode->i_uid); #endif } /* Create a new file descriptor to attach the stack to. */ ci_assert((wo->waitable.state & CI_TCP_STATE_TCP) || wo->waitable.state == CI_TCP_STATE_UDP); rc = oo_create_fd(ep, flags, (wo->waitable.state & CI_TCP_STATE_TCP) ? CI_PRIV_TYPE_TCP_EP : CI_PRIV_TYPE_UDP_EP); if( rc < 0 ) { ci_irqlock_state_t lock_flags; struct oo_file_ref* os_socket; ci_irqlock_lock(&ep->thr->lock, &lock_flags); os_socket = ep->os_socket; ep->os_socket = NULL; ci_irqlock_unlock(&ep->thr->lock, &lock_flags); if( os_socket != NULL ) oo_file_ref_drop(os_socket); tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED); return rc; } op->fd = rc; #ifdef SOCK_NONBLOCK if( op->type & SOCK_NONBLOCK ) ci_bit_mask_set(&wo->waitable.sb_aflags, CI_SB_AFLAG_O_NONBLOCK); #endif /* Re-read the OS socket buffer size settings. This ensures we'll use * up-to-date values for this new socket. */ efab_get_os_settings(&NI_OPTS_TRS(trs)); return 0; }
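/* Illustrative sketch (not part of the original source): the type/flags
 * split above mirrors what an application passes to socket(), where
 * SOCK_NONBLOCK and SOCK_CLOEXEC ride in the same argument as the socket
 * type and are masked off with SOCK_TYPE_MASK. */
#define _GNU_SOURCE
#include <sys/socket.h>

static int example_nonblocking_cloexec_socket(void)
{
  return socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
}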
static int citp_udp_socket(int domain, int type, int protocol) { citp_fdinfo* fdi; citp_sock_fdi* epi; ef_driver_handle fd; int rc; ci_netif* ni; Log_V(log(LPF "socket(%d, %d, %d)", domain, type, protocol)); epi = CI_ALLOC_OBJ(citp_sock_fdi); if( ! epi ) { Log_U(ci_log(LPF "socket: failed to allocate epi")); errno = ENOMEM; goto fail1; } fdi = &epi->fdinfo; citp_fdinfo_init(fdi, &citp_udp_protocol_impl); rc = citp_netif_alloc_and_init(&fd, &ni); if( rc != 0 ) { if( rc == CI_SOCKET_HANDOVER ) { /* This implies EF_DONT_ACCELERATE is set, so we handover * regardless of CITP_OPTS.no_fail */ CI_FREE_OBJ(epi); return rc; } goto fail2; } /* Protect the fdtable entry until we're done initialising. */ if( fdtable_strict() ) CITP_FDTABLE_LOCK(); if((fd = ci_udp_ep_ctor(&epi->sock, ni, domain, type)) < 0) { /*! ?? \TODO unpick the ci_udp_ep_ctor according to how failed */ Log_U(ci_log(LPF "socket: udp_ep_ctor failed")); errno = -fd; goto fail3; } citp_fdtable_new_fd_set(fd, fdip_busy, fdtable_strict()); if( fdtable_strict() ) CITP_FDTABLE_UNLOCK(); CI_DEBUG(epi->sock.s->pid = getpid()); /* We're ready. Unleash us onto the world! */ ci_assert(epi->sock.s->b.sb_aflags & CI_SB_AFLAG_NOT_READY); ci_atomic32_and(&epi->sock.s->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY); citp_fdtable_insert(fdi, fd, 0); Log_VSS(log(LPF "socket(%d, %d, %d) = "EF_FMT, domain, type, protocol, EF_PRI_ARGS(epi,fd))); return fd; fail3: if( CITP_OPTS.no_fail && errno != ELIBACC ) CITP_STATS_NETIF(++ni->state->stats.udp_handover_socket); citp_netif_release_ref(ni, 0); fail2: CI_FREE_OBJ(epi); fail1: /* BUG1408: Graceful failure. We'll only fail outright if there's a * driver/library mismatch */ if( CITP_OPTS.no_fail && errno != ELIBACC ) { Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__, errno)); return CI_SOCKET_HANDOVER; } return -1; }
/* fixme kostik: this is partially copy-paste from citp_sock_fcntl */ static int citp_pipe_fcntl(citp_fdinfo* fdinfo, int cmd, long arg) { int rc = 0; citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdinfo); struct oo_pipe* p = epi->pipe; switch ( cmd ) { case F_GETFL: { ci_uint32 flag_nonb = CI_PFD_AFLAG_NONBLOCK; if( ! fdi_is_reader(fdinfo) ) { rc = O_WRONLY; flag_nonb <<= CI_PFD_AFLAG_WRITER_SHIFT; } else flag_nonb <<= CI_PFD_AFLAG_READER_SHIFT; if ( p->aflags & flag_nonb ) rc |= O_NONBLOCK; break; } case F_SETFL: { ci_uint32 bit; rc = ci_sys_fcntl(fdinfo->fd, cmd, arg); if( rc < 0 ) break; bit = CI_PFD_AFLAG_NONBLOCK << (fdi_is_reader(fdinfo) ? CI_PFD_AFLAG_READER_SHIFT : CI_PFD_AFLAG_WRITER_SHIFT); if( arg & (O_NONBLOCK | O_NDELAY) ) ci_bit_mask_set(&p->aflags, bit); else ci_bit_mask_clear(&p->aflags, bit); break; } case F_DUPFD: rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup, arg); break; #ifdef F_DUPFD_CLOEXEC case F_DUPFD_CLOEXEC: rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup_cloexec, arg); break; #endif case F_GETFD: case F_SETFD: rc = ci_sys_fcntl(fdinfo->fd, cmd, arg); break; case F_GETLK: case F_SETLK: case F_SETLKW: /* File locks not supported on sockets */ Log_U(ci_log("%s: cmd %d not supported on sockets!",__FUNCTION__, cmd)); errno = ENOTSUP; rc = CI_SOCKET_ERROR; break; case F_GETOWN: case F_SETOWN: #ifdef F_GETOWN_EX case F_GETOWN_EX: #endif #ifdef F_SETOWN_EX case F_SETOWN_EX: #endif rc = ci_sys_fcntl(fdinfo->fd, cmd, arg); if( rc != 0 ) break; p->b.sigown = arg; if( p->b.sigown && (p->b.sb_aflags & CI_SB_AFLAG_O_ASYNC) ) ci_bit_set(&p->b.wake_request, CI_SB_FLAG_WAKE_RX_B); break; #ifdef F_SETPIPE_SZ case F_SETPIPE_SZ: /* System pipe buf size is rounded up to power of two. We * cannot replicate this. */ rc = ci_pipe_set_size(epi->ni, p, arg); if( rc < 0 ) { errno = EINVAL; rc = CI_SOCKET_ERROR; break; } rc = 0; break; #endif #ifdef F_GETPIPE_SZ case F_GETPIPE_SZ: rc = (p->bufs_max - 1) * OO_PIPE_BUF_MAX_SIZE; break; #endif default: /* fixme kostik: logging should include some pipe identification */ errno = ENOTSUP; rc = CI_SOCKET_ERROR; } Log_VSC(log("%s(%d, %d, %ld) = %d (errno=%d)", __FUNCTION__, fdinfo->fd, cmd, arg, rc, errno)); return rc; }
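/* Illustrative sketch (not part of the original source): standard Linux use
 * of the pipe-size fcntls intercepted above.  For kernel pipes the requested
 * size is rounded up to a power of two, which (per the comment above) the
 * Onload pipe does not replicate. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int example_resize_pipe(void)
{
  int fds[2];
  if( pipe(fds) < 0 )
    return -1;
  if( fcntl(fds[1], F_SETPIPE_SZ, 128 * 1024) < 0 )
    return -1;
  return fcntl(fds[1], F_GETPIPE_SZ);   /* at least 128 KiB on success */
}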
/* ** promote a synrecv structure to an established socket ** ** Assumes that the caller will handle a fail if we can't allocate a new ** tcp_state structure due to memory pressure or the like */ int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls, ci_tcp_state_synrecv* tsr, ci_ip_cached_hdrs* ipcache, ci_tcp_state** ts_out) { int rc = 0; ci_assert(netif); ci_assert(tls); ci_assert(tls->s.b.state == CI_TCP_LISTEN); ci_assert(tsr); if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) { ci_tcp_state* ts; /* grab a tcp_state structure that will go onto the accept queue. We take * from the cache of EPs if any are available */ ts = get_ts_from_cache (netif, tsr, tls); if( !ts ) { /* None on cache; try allocating a new ts */ ts = ci_tcp_get_state_buf(netif); #if CI_CFG_FD_CACHING if( ts == NULL ) { /* We've reaped. Did this result in any being cached */ ts = get_ts_from_cache(netif, tsr, tls); if (ts == NULL ) { /* No -- try again to allocate. */ ts = ci_tcp_get_state_buf(netif); } else { CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap); } } #endif if( ts == NULL ) { LOG_TV(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(netif))); CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock); CI_SET_SO_ERROR(&tls->s, ENOMEM); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); return -ENOMEM; } ci_assert(ci_tcp_is_cached(ts) || (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN)); } #ifdef ONLOAD_OFE ts->s.ofe_code_start = tls->ofe_promote; #endif if( ! ci_tcp_is_cached(ts) ) { /* Need to initialise address information for use when setting filters */ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); /* "borrow" filter from listening socket. For loopback socket, we * do not need filters, but we have to take a reference of the OS * socket. */ rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice, S_SP(tls)); if( rc < 0 ) { LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc)); /* Either put this back on the list (at the head) or free it */ ci_tcp_state_free(netif, ts); return rc; } } #if CI_CFG_FD_CACHING else { /* Now set the s/w filter. We leave the hw filter in place for cached * EPS. This will probably not have the correct raddr and rport, but as * it's sharing the listening socket's filter that's not a problem. It * will be updated if this is still around when the listener is closed. */ rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr, sock_lport_be16(&tls->s), tsr->r_addr, tsr->r_port, tcp_protocol(ts)); if (rc < 0) { /* Bung it back on the cache list */ LOG_EP(ci_log("Unable to create s/w filter!")); ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link); return rc; } /* Need to initialise address information. We do this after trying to * insert the sw filter, so we can push the tcp state back onto the * cache queue with as few changes as possible if we fail to add the * sw filter. 
*/ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd)); ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link); } #endif ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts))); ci_assert(ts->s.b.state == CI_TCP_CLOSED); ts->s.domain = tls->s.domain; cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache); ci_pmtu_state_init(netif, &ts->s, &ts->pmtus, CI_IP_TIMER_PMTU_DISCOVER); ci_pmtu_set(netif, &ts->pmtus, CI_MIN(ts->s.pkt.mtu, tsr->tcpopts.smss + sizeof(ci_tcp_hdr) + sizeof(ci_ip4_hdr))); /* If we've got SYN via local route, we can handle it */ ci_assert_equiv(ts->s.pkt.status == retrrc_localroute, OO_SP_NOT_NULL(tsr->local_peer)); if( ts->s.pkt.status == retrrc_localroute ) ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE; ts->amss = tsr->amss; /* options and flags */ ts->tcpflags = 0; ts->tcpflags |= tsr->tcpopts.flags; ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED; ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr); if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) { ts->snd_wscl = tsr->tcpopts.wscl_shft; ts->rcv_wscl = tsr->rcv_wscl; } else { ts->snd_wscl = ts->rcv_wscl = 0u; } CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl); CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl); /* Send and receive sequence numbers */ tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) = tsr->snd_isn + 1; ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0); ci_tcp_rx_set_isn(ts, tsr->rcv_nxt); tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1); if( ts->tcpflags & CI_TCPT_FLAG_TSO ) { ts->incoming_tcp_hdr_len += 12; ts->outgoing_hdrs_len += 12; ts->tspaws = ci_tcp_time_now(netif); ts->tsrecent = tsr->tspeer; ts->tslastack = tsr->rcv_nxt; } else { /* Must be after initialising snd_una. */ ci_tcp_clear_rtt_timing(ts); ts->timed_ts = tsr->timest; } /* SACK has nothing to be done. */ /* ?? ECN */ ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr))); ts->smss = tsr->tcpopts.smss; ts->c.user_mss = tls->c.user_mss; if (ts->c.user_mss && ts->c.user_mss < ts->smss) ts->smss = ts->c.user_mss; #if CI_CFG_LIMIT_SMSS ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__); #endif ci_assert(ts->smss>0); ci_tcp_set_eff_mss(netif, ts); ci_tcp_set_initialcwnd(netif, ts); /* Copy socket options & related fields that should be inherited. * Note: Windows does not inherit rcvbuf until the call to accept * completes. The assumption here is that all options can be * inherited at the same time (most won't have an effect until there * is a socket available for use by the app.). */ ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)"); /* NB. Must have already set peer (which we have). */ ci_tcp_set_established_state(netif, ts); CITP_STATS_NETIF(++netif->state->stats.synrecv2established); ci_assert(ts->ka_probes == 0); ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts)); ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK); /* Remove the synrecv structure from the listen queue, and free the ** buffer. 
*/ if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE ) ci_free(tsr); else { ci_tcp_listenq_remove(netif, tls, tsr); ci_tcp_synrecv_free(netif, tsr); } ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT); ci_tcp_acceptq_put(netif, tls, &ts->s.b); LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x", LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags); log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x", LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts), tcp_snd_una(ts), tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts))); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); *ts_out = ts; return 0; }
static int parse_cfg_opt(int argc, char** argv, const char* context) { const ci_cfg_desc* a; const char* val = NULL; int result = 1; /* is it "-" ? */ if( argv[0][1] == 0 ) bad_cla(context, argv[0], "- is not allowed"); /* find the config descriptor */ a = 0; if( cfg_opts ) a = find_cfg_desc(argv[0], cfg_opts, n_cfg_opts, &val); if( (!a) && ci_app_standard_opts) { a = find_cfg_desc(argv[0], std_opts, N_STD_OPTS, &val); } if( !a ) bad_cla(context, argv[0], "unknown option"); /* the option value (if required) may be part of this arg or the next */ if( !val || *val == 0 ) { if( a->type == CI_CFG_FLAG || a->type == CI_CFG_USAGE || argc == 1 ) { val = 0; } else { val = argv[1]; result = 2; } } switch( a->type ) { case CI_CFG_FLAG: if( val ) { if( sscanf(val, "%d", (int*) a->value) != 1 ) bad_cla(context, argv[0], "expected integer or nothing"); } else ++(*(int*) a->value); break; case CI_CFG_INT: if( !val || sscanf(val, "%i", (int*) a->value) != 1 ) bad_cla(context, argv[0], "expected integer"); break; case CI_CFG_UINT: if( !val || sscanf(val, "%i", (int*) a->value) != 1 ) bad_cla(context, argv[0], "expected unsigned integer"); break; case CI_CFG_INT64: if( !val || sscanf(val, "%lli", (long long int*) a->value) != 1 ) bad_cla(context, argv[0], "expected 64bit integer"); break; case CI_CFG_UINT64: if( !val || sscanf(val, "%lli", (long long int*) a->value) != 1 ) bad_cla(context, argv[0], "expected unsigned 64bit integer"); break; case CI_CFG_STR: *(const char**) a->value = val ? val : ""; break; case CI_CFG_USAGE: ci_app_usage(0); break; case CI_CFG_FN: ci_assert(a->fn); a->fn(val, a); break; case CI_CFG_IRANGE: { int *v; v = (int*) a->value; if( sscanf(val, " %i - %i", v, v + 1) != 2 ) { if( sscanf(val, " %i", v) == 1 ) v[1] = v[0]; else bad_cla(context, argv[0], "expected integer or range"); } } break; default: ci_log("ci_app: unknown config option type %u", a->type); break; } return result; }
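/* Illustrative sketch (not part of the original source): the CI_CFG_IRANGE
 * case above accepts either "lo-hi" or a single value, duplicating the value
 * into both ends of the range in the latter case.  The same parse in
 * isolation: */
#include <stdio.h>

static int example_parse_irange(const char* val, int v[2])
{
  if( sscanf(val, " %i - %i", &v[0], &v[1]) == 2 )
    return 0;                 /* "2-5" -> {2, 5} */
  if( sscanf(val, " %i", &v[0]) == 1 ) {
    v[1] = v[0];              /* "7"   -> {7, 7} */
    return 0;
  }
  return -1;                  /* neither an integer nor a range */
}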
/* Return 1 if the bucket is empty now */ static int ci_tcp_listenq_bucket_remove(ci_netif* ni, ci_tcp_socket_listen* tls, ci_tcp_listen_bucket* bucket, ci_tcp_state_synrecv* tsr, int level) { ci_ni_aux_mem* aux; int idx = ci_tcp_listenq_hash2idx(tsr->hash, level); oo_p tsr_p = ci_tcp_synrecv2p(ni, tsr); /* Fixme: we remove empty buckets only. In theory, it may be useful to * remove a bucket with one non-empty list, but it would make the code more * complicated. */ int empty = 0; #ifdef __KERNEL__ int i = 0; if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) ) { ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__); return 0; } #endif LOG_TV(ci_log("%s([%d] level=%d "TSR_FMT")", __func__, NI_ID(ni), level, TSR_ARGS(tsr))); ci_assert( OO_P_NOT_NULL(bucket->bucket[idx]) ); #ifdef __KERNEL__ if( OO_P_IS_NULL(bucket->bucket[idx]) ) { ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__); return 0; } #endif level++; aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]); if( aux->type == CI_TCP_AUX_TYPE_BUCKET ) { empty = ci_tcp_listenq_bucket_remove(ni, tls, &aux->u.bucket, tsr, level); if( empty ) { bucket->bucket[idx] = OO_P_NULL; ci_ni_aux_free(ni, aux); tls->n_buckets--; } } else { if( bucket->bucket[idx] == tsr_p ) { bucket->bucket[idx] = tsr->bucket_link; empty = OO_P_IS_NULL(bucket->bucket[idx]); } else { ci_tcp_state_synrecv* prev = &aux->u.synrecv; while( prev->bucket_link != tsr_p ) { aux = ci_ni_aux_p2aux(ni, prev->bucket_link); prev = &aux->u.synrecv; #ifdef __KERNEL__ if( i++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) { ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__); return 0; } #endif } prev->bucket_link = tsr->bucket_link; } } if( empty ) { int i; for( i = 0; i < CI_TCP_LISTEN_BUCKET_SIZE; i++ ) if( OO_P_NOT_NULL(bucket->bucket[i]) ) return 0; return 1; } return 0; }
/* c_ni is assumed to be locked on entry and is always unlocked on * exit. */ int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst, ci_netif *l_ni, oo_sp l_id) { ci_tcp_state *ts; ci_tcp_socket_listen *tls, *alien_tls; citp_waitable_obj *wo; citp_waitable *w; int rc; ci_assert(ci_netif_is_locked(c_ni)); ci_assert(OO_SP_NOT_NULL(c_id)); ci_assert(OO_SP_NOT_NULL(l_id)); LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__, c_ni->state->stack_id, OO_SP_TO_INT(c_id), l_ni->state->stack_id, OO_SP_TO_INT(l_id))); alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id); if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) { ci_netif_unlock(c_ni); return -EBUSY; } /* In c_ni, create shadow listening socket tls (copy l_id) */ ts = ci_tcp_get_state_buf(c_ni); if( ts == NULL ) { ci_netif_unlock(c_ni); LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(c_ni))); return -ENOMEM; } /* init common tcp fields */ ts->s.so = alien_tls->s.so; ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl; S_TCP_HDR(&ts->s)->tcp_source_be16 = S_TCP_HDR(&alien_tls->s)->tcp_source_be16; ts->s.domain = alien_tls->s.domain; ts->c = alien_tls->c; ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF; /* make sure nobody will ever connect to our "shadow" socket * except us */ ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN); tls = SOCK_TO_TCP_LISTEN(&ts->s); /* no timer: */ tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN; tls->acceptq_max = 1; rc = ci_tcp_listen_init(c_ni, tls); if( rc != 0 ) { citp_waitable_obj_free(c_ni, &tls->s.b); return rc; } /* Connect c_id to tls */ ts = SP_TO_TCP(c_ni, c_id); rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid); /* Accept as from tls */ if( !ci_tcp_acceptq_not_empty(tls) ) { /* it is possible, for example, if ci_tcp_listenq_try_promote() failed * because there are no endpoints */ ci_tcp_listenq_drop_all(c_ni, tls); citp_waitable_obj_free(c_ni, &tls->s.b); ci_netif_unlock(c_ni); return -EBUSY; } w = ci_tcp_acceptq_get(c_ni, tls); ci_assert(w); LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d", __FUNCTION__, c_ni->state->stack_id, OO_SP_TO_INT(c_id), l_ni->state->stack_id, OO_SP_TO_INT(l_id), c_ni->state->stack_id, tls->s.b.bufid, c_ni->state->stack_id, w->bufid)); ci_assert(w->state & CI_TCP_STATE_TCP); ci_assert(w->state != CI_TCP_LISTEN); /* Destroy tls. * NB: nobody could possibly connect to it, so no need to do proper * shutdown.
*/ ci_assert_equal(ci_tcp_acceptq_n(tls), 0); ci_tcp_listenq_drop_all(c_ni, tls); citp_waitable_obj_free(c_ni, &tls->s.b); ci_netif_unlock(c_ni); /* Keep a port reference */ { tcp_helper_endpoint_t *l_ep, *a_ep; struct oo_file_ref* os_sock_ref; ci_irqlock_state_t lock_flags; l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id); a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w)); ci_irqlock_lock(&l_ep->thr->lock, &lock_flags); os_sock_ref = l_ep->os_socket; ci_assert_equal(a_ep->os_port_keeper, NULL); if( os_sock_ref != NULL ) { os_sock_ref = oo_file_ref_add(os_sock_ref); os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref); ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags); if( os_sock_ref != NULL ) oo_file_ref_drop(os_sock_ref); } else { ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags); goto cleanup; } } /* lock l_ni: check that l_id is the same socket it used to be */ /* create ref-sock in l_ni, put it into the accept q */ if( ci_netif_lock(l_ni) != 0 ) goto cleanup; if( alien_tls->s.b.state != CI_TCP_LISTEN || (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) || S_TCP_HDR(&alien_tls->s)->tcp_source_be16 != TS_TCP(ts)->tcp_dest_be16 || (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY && alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) { ci_netif_unlock(l_ni); goto cleanup; } ci_bit_mask_set(&w->sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN); wo = citp_waitable_obj_alloc(l_ni); if( wo == NULL ) { ci_netif_unlock(l_ni); goto cleanup; } wo->waitable.state = CI_TCP_CLOSED; wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY; wo->waitable.moved_to_stack_id = c_ni->state->stack_id; wo->waitable.moved_to_sock_id = W_SP(w); LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__, l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)), c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w)))); ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable); citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX); ci_netif_unlock(l_ni); return rc; cleanup: ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN); ci_bit_mask_clear(&w->sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN); efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid); /* We cannot guarantee the c_ni lock here, so we can't call * ci_tcp_drop(c_ni, ts). Return an error instead; UL will hand the socket * over and close the ts endpoint. */ return -EBUSY; }
/* Returns: * 0 on success * * CI_SOCKET_ERROR (and errno set) * this is a normal error that is returned to the application * * CI_SOCKET_HANDOVER we tell the upper layers to hand over; no need * to set errno since it isn't a real error */ int ci_tcp_connect(citp_socket* ep, const struct sockaddr* serv_addr, socklen_t addrlen, ci_fd_t fd, int *p_moved) { /* Address family is validated earlier. */ struct sockaddr_in* inaddr = (struct sockaddr_in*) serv_addr; ci_sock_cmn* s = ep->s; ci_tcp_state* ts = &SOCK_TO_WAITABLE_OBJ(s)->tcp; int rc = 0, crc; ci_uint32 dst_be32; if( NI_OPTS(ep->netif).tcp_connect_handover ) return CI_SOCKET_HANDOVER; /* Make sure we're up-to-date. */ ci_netif_lock(ep->netif); CHECK_TEP(ep); ci_netif_poll(ep->netif); /* * 1. Check if the state of the socket is OK for a connect operation. */ start_again: if( (rc = ci_tcp_connect_handle_so_error(s)) != 0) { CI_SET_ERROR(rc, rc); goto unlock_out; } if( s->b.state != CI_TCP_CLOSED ) { /* see if progress can be made on this socket before ** determining status (e.g. non-blocking connect and connect poll) */ if( s->b.state & CI_TCP_STATE_SYNCHRONISED ) { if( ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT ) { ts->tcpflags &= ~CI_TCPT_FLAG_NONBLOCK_CONNECT; rc = 0; goto unlock_out; } if( serv_addr->sa_family == AF_UNSPEC ) LOG_E(ci_log("Onload does not support TCP disconnect via " "connect(addr->sa_family==AF_UNSPEC)")); CI_SET_ERROR(rc, EISCONN); } else if( s->b.state == CI_TCP_LISTEN ) { #if CI_CFG_POSIX_CONNECT_AFTER_LISTEN CI_SET_ERROR(rc, EOPNOTSUPP); #else if( ci_tcp_validate_sa(s->domain, serv_addr, addrlen) ) { /* Request should be forwarded to OS */ rc = CI_SOCKET_HANDOVER; goto unlock_out; } if( serv_addr->sa_family == AF_UNSPEC ) { /* Linux does listen shutdown on disconnect (AF_UNSPEC) */ ci_netif_unlock(ep->netif); rc = ci_tcp_shutdown(ep, SHUT_RD, fd); goto out; } else { /* Linux has curious error reporting in this case */ CI_SET_ERROR(rc, EISCONN); } #endif } else { /* Socket is in SYN-SENT state. Let's block for receiving SYN-ACK */ ci_assert_equal(s->b.state, CI_TCP_SYN_SENT); if( s->b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) CI_SET_ERROR(rc, EALREADY); else goto syn_sent; } goto unlock_out; } /* Check if we've ever been connected. */ if( ts->tcpflags & CI_TCPT_FLAG_WAS_ESTAB ) { CI_SET_ERROR(rc, EISCONN); goto unlock_out; } /* * 2. Check the address parameter. If it's inappropriate for the handover * decision, or handover should be done, try to call the OS and * hand over on success. */ if ( /* First, check that the address family and length are OK. */ ci_tcp_validate_sa(s->domain, serv_addr, addrlen) /* rfc793 p54 if the foreign socket is unspecified return */ /* "error: foreign socket unspecified" (EINVAL), but keep it to OS */ || (dst_be32 = ci_get_ip4_addr(inaddr->sin_family, serv_addr)) == 0 /* Zero destination port is tricky as well, keep it to OS */ || inaddr->sin_port == 0 ) { rc = CI_SOCKET_HANDOVER; goto unlock_out; } /* is this a socket that we can handle?
*/ rc = ci_tcp_connect_check_dest(ep, dst_be32, inaddr->sin_port); if( rc ) goto unlock_out; if( (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) && OO_SP_IS_NULL(ts->local_peer) ) { /* Try to connect to another stack; handover if can't */ struct oo_op_loopback_connect op; op.dst_port = inaddr->sin_port; op.dst_addr = dst_be32; /* this operation unlocks netif */ rc = oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op); if( rc < 0) return CI_SOCKET_HANDOVER; if( op.out_moved ) *p_moved = 1; if( op.out_rc == -EINPROGRESS ) RET_WITH_ERRNO( EINPROGRESS ); else if( op.out_rc == -EAGAIN ) return -EAGAIN; else if( op.out_rc != 0 ) return CI_SOCKET_HANDOVER; return 0; } /* filters can't handle alien source address */ if( (s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN) && ! (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) ) { rc = CI_SOCKET_HANDOVER; goto unlock_out; } crc = ci_tcp_connect_ul_start(ep->netif, ts, dst_be32, inaddr->sin_port, &rc); if( crc != CI_CONNECT_UL_OK ) { switch( crc ) { case CI_CONNECT_UL_FAIL: goto unlock_out; case CI_CONNECT_UL_LOCK_DROPPED: goto out; case CI_CONNECT_UL_START_AGAIN: goto start_again; } } CI_TCP_STATS_INC_ACTIVE_OPENS( ep->netif ); syn_sent: rc = ci_tcp_connect_ul_syn_sent(ep->netif, ts); unlock_out: ci_netif_unlock(ep->netif); out: return rc; }
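/* Illustrative sketch (not part of the original source): the EINPROGRESS
 * path set up above follows the standard non-blocking connect() contract,
 * so an application completes the connection in the usual way: wait for
 * writability, then read SO_ERROR. */
#include <sys/socket.h>
#include <poll.h>
#include <errno.h>

static int example_finish_nonblocking_connect(int fd, int timeout_ms)
{
  struct pollfd pfd = { .fd = fd, .events = POLLOUT };
  int err = 0;
  socklen_t len = sizeof(err);

  if( poll(&pfd, 1, timeout_ms) != 1 )
    return -ETIMEDOUT;                       /* still connecting */
  if( getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0 )
    return -errno;
  return err ? -err : 0;                     /* 0: handshake completed */
}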
int ef_onload_driver_open(ef_driver_handle* pfd, enum oo_device_type dev_type, int do_cloexec) { int rc; int flags = 0; int saved_errno = errno; #ifdef O_CLOEXEC if( do_cloexec ) flags = O_CLOEXEC; #endif ci_assert(pfd); rc = oo_open(pfd, dev_type, flags); if( rc != 0 && errno != EMFILE && fd_is_saved[dev_type] >= 0 ) { ci_clone_fd_t op; op.do_cloexec = do_cloexec; LOG_NV(ci_log("%s: open failed, but cloning from saved fd", __func__)); rc = ci_sys_ioctl((ci_fd_t) saved_fd[dev_type], clone_ioctl[dev_type], &op); if( rc < 0 ) return rc; errno = saved_errno; *pfd = op.fd; } if( rc != 0 ) return rc; /* Our internal driver handles are not visible to the application. It may * make assumptions about the fd space available to it, and try to dup2/3 * onto one of our driver fds. To try and minimise this we allow the user * to specify a minimum value for us to use, to try and keep out of their * way. * * We have to be able to cope with them coming along and trying to dup onto * one of these fds anyway, as they may not have set the option up. As such * we treat failure to shift the fd as acceptable, and just retain the old * one. */ if( *pfd < CITP_OPTS.fd_base ) if( ef_onload_handle_move_and_do_cloexec(pfd, do_cloexec) == 0 ) return 0; if( do_cloexec ) { #if defined(O_CLOEXEC) static int o_cloexec_fails = -1; if( o_cloexec_fails < 0 ) { int arg; rc = ci_sys_fcntl(*(int *)pfd, F_GETFD, &arg); if( rc == 0 && (arg & FD_CLOEXEC) ) o_cloexec_fails = 0; else o_cloexec_fails = 1; } #else static const int o_cloexec_fails = 1; #endif if( o_cloexec_fails ) CI_DEBUG_TRY(ci_sys_fcntl(*(int *)pfd, F_SETFD, FD_CLOEXEC)); } return 0; }
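/* Illustrative sketch (not part of the original source): moving an fd above
 * a configured floor, as attempted for the driver handle above, can be done
 * with F_DUPFD/F_DUPFD_CLOEXEC.  As in the code above, failure is tolerated
 * and the original fd is simply kept. */
#include <fcntl.h>
#include <unistd.h>

static int example_move_fd_above(int fd, int floor, int do_cloexec)
{
  int cmd = do_cloexec ? F_DUPFD_CLOEXEC : F_DUPFD;
  int moved = fcntl(fd, cmd, floor);   /* lowest free fd >= floor */
  if( moved < 0 )
    return fd;                         /* keep the original fd on failure */
  close(fd);
  return moved;
}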
/* we don't register protocol impl */ int citp_pipe_create(int fds[2], int flags) { citp_pipe_fdi* epi_read; citp_pipe_fdi* epi_write; struct oo_pipe* p = NULL; /* make compiler happy */ ci_netif* ni; int rc = -1; ef_driver_handle fd = -1; Log_V(log(LPF "pipe()")); /* citp_netif_exists() does not need citp_ul_lock here */ if( CITP_OPTS.ul_pipe == CI_UNIX_PIPE_ACCELERATE_IF_NETIF && ! citp_netif_exists() ) { return CITP_NOT_HANDLED; } rc = citp_netif_alloc_and_init(&fd, &ni); if( rc != 0 ) { if( rc == CI_SOCKET_HANDOVER ) { /* This implies EF_DONT_ACCELERATE is set, so we handover * regardless of CITP_OPTS.no_fail */ return CITP_NOT_HANDLED; } /* may be lib mismatch - errno will be ELIBACC */ goto fail1; } rc = -1; CI_MAGIC_CHECK(ni, NETIF_MAGIC); /* add another reference as we have 2 fdis */ citp_netif_add_ref(ni); epi_read = citp_pipe_epi_alloc(ni, O_RDONLY); if( epi_read == NULL ) goto fail2; epi_write = citp_pipe_epi_alloc(ni, O_WRONLY); if( epi_write == NULL ) goto fail3; /* oo_pipe init code */ if( fdtable_strict() ) CITP_FDTABLE_LOCK(); rc = oo_pipe_ctor(ni, &p, fds, flags); if( rc < 0 ) goto fail4; citp_fdtable_new_fd_set(fds[0], fdip_busy, fdtable_strict()); citp_fdtable_new_fd_set(fds[1], fdip_busy, fdtable_strict()); if( fdtable_strict() ) CITP_FDTABLE_UNLOCK(); LOG_PIPE("%s: pipe=%p id=%d", __FUNCTION__, p, p->b.bufid); /* as pipe is created it should be attached to the end-points */ epi_read->pipe = p; epi_write->pipe = p; /* We're ready. Unleash us onto the world! */ ci_assert(epi_read->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY); ci_assert(epi_write->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY); ci_atomic32_and(&epi_read->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY); ci_atomic32_and(&epi_read->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY); citp_fdtable_insert(&epi_read->fdinfo, fds[0], 0); citp_fdtable_insert(&epi_write->fdinfo, fds[1], 0); CI_MAGIC_CHECK(ni, NETIF_MAGIC); return 0; fail4: if( fdtable_strict() ) CITP_FDTABLE_UNLOCK(); fail3: CI_FREE_OBJ(epi_write); fail2: CI_FREE_OBJ(epi_read); citp_netif_release_ref(ni, 0); citp_netif_release_ref(ni, 0); fail1: if( CITP_OPTS.no_fail && errno != ELIBACC ) { Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__, errno)); return CITP_NOT_HANDLED; } return rc; }
int citp_epoll_create(int size, int flags) { citp_fdinfo *fdi; citp_epoll_fdi *epi; struct citp_epoll_fd* ep; int fd; if( (epi = CI_ALLOC_OBJ(citp_epoll_fdi)) == NULL ) goto fail0; if( (ep = CI_ALLOC_OBJ(struct citp_epoll_fd)) == NULL ) goto fail1; fdi = &epi->fdinfo; citp_fdinfo_init(fdi, &citp_epoll_protocol_impl); /* Create the epoll fd. */ CITP_FDTABLE_LOCK(); if( (fd = ci_sys_epoll_create_compat(size, flags, 0)) < 0 ) goto fail2; citp_fdtable_new_fd_set(fd, fdip_busy, TRUE); /* Init epfd_os */ #ifdef O_CLOEXEC ep->epfd_os = ci_sys_open(OO_EPOLL_DEV, O_RDWR | O_CLOEXEC); #else ep->epfd_os = ci_sys_open(OO_EPOLL_DEV, O_RDWR); if( ep->epfd_os >= 0 ) ci_sys_fcntl(ep->epfd_os, F_SETFD, FD_CLOEXEC); #endif if( ep->epfd_os < 0 ) { Log_E(ci_log("%s: ERROR: failed to open(%s) errno=%d", __FUNCTION__, OO_EPOLL_DEV, errno)); goto fail3; } __citp_fdtable_reserve(ep->epfd_os, 1); ep->shared = mmap(NULL, sizeof(*ep->shared), PROT_READ, MAP_SHARED, ep->epfd_os, 0); if( ep->shared == MAP_FAILED ) { Log_E(ci_log("%s: ERROR: failed to mmap shared segment errno=%d", __FUNCTION__, errno)); goto fail4; } __citp_fdtable_reserve(ep->shared->epfd, 1); CITP_FDTABLE_UNLOCK(); epi->epoll = ep; ep->size = size; oo_wqlock_init(&ep->lock); ep->not_mt_safe = ! CITP_OPTS.ul_epoll_mt_safe; ci_dllist_init(&ep->oo_sockets); ep->oo_sockets_n = 0; ci_dllist_init(&ep->dead_sockets); oo_atomic_set(&ep->refcount, 1); ep->epfd_syncs_needed = 0; ep->blocking = 0; citp_fdtable_insert(fdi, fd, 0); Log_POLL(ci_log("%s: fd=%d driver_fd=%d epfd=%d", __FUNCTION__, fd, ep->epfd_os, (int) ep->shared->epfd)); return fd; fail4: __citp_fdtable_reserve(ep->epfd_os, 0); ci_sys_close(ep->epfd_os); fail3: ci_sys_close(fd); citp_fdtable_busy_clear(fd, fdip_unknown, 1); fail2: CITP_FDTABLE_UNLOCK(); CI_FREE_OBJ(ep); fail1: CI_FREE_OBJ(epi); fail0: return -1; }
static void citp_dump_opts(citp_opts_t *o) { /* ?? TODO: should be using opts_cittp_def.h here */ # define DUMP_OPT_INT(envstr, name) \ ci_log("%s=%d", (envstr), (int) o->name) # define DUMP_OPT_HEX(envstr, name) \ ci_log("%s=%x", (envstr), (unsigned) o->name) DUMP_OPT_HEX("EF_UNIX_LOG", log_level); DUMP_OPT_INT("EF_PROBE", probe); DUMP_OPT_INT("EF_TCP", ul_tcp); DUMP_OPT_INT("EF_UDP", ul_udp); DUMP_OPT_INT("EF_UL_SELECT", ul_select); DUMP_OPT_INT("EF_SELECT_SPIN", ul_select_spin); DUMP_OPT_INT("EF_SELECT_FAST", ul_select_fast); DUMP_OPT_INT("EF_UL_POLL", ul_poll); DUMP_OPT_INT("EF_POLL_SPIN", ul_poll_spin); DUMP_OPT_INT("EF_POLL_FAST", ul_poll_fast); DUMP_OPT_INT("EF_POLL_FAST_USEC", ul_poll_fast_usec); DUMP_OPT_INT("EF_POLL_NONBLOCK_FAST_USEC", ul_poll_nonblock_fast_usec); DUMP_OPT_INT("EF_SELECT_FAST_USEC", ul_select_fast_usec); DUMP_OPT_INT("EF_SELECT_NONBLOCK_FAST_USEC", ul_select_nonblock_fast_usec); #if CI_CFG_UDP DUMP_OPT_INT("EF_UDP_RECV_SPIN", udp_recv_spin); DUMP_OPT_INT("EF_UDP_SEND_SPIN", udp_send_spin); #endif DUMP_OPT_INT("EF_TCP_RECV_SPIN", tcp_recv_spin); DUMP_OPT_INT("EF_TCP_SEND_SPIN", tcp_send_spin); DUMP_OPT_INT("EF_TCP_ACCEPT_SPIN", tcp_accept_spin); DUMP_OPT_INT("EF_TCP_CONNECT_SPIN", tcp_connect_spin); DUMP_OPT_INT("EF_PKT_WAIT_SPIN", pkt_wait_spin); #if CI_CFG_USERSPACE_PIPE DUMP_OPT_INT("EF_PIPE_RECV_SPIN", pipe_recv_spin); DUMP_OPT_INT("EF_PIPE_SEND_SPIN", pipe_send_spin); DUMP_OPT_INT("EF_PIPE_SIZE", pipe_size); #endif DUMP_OPT_INT("EF_SOCK_LOCK_BUZZ", sock_lock_buzz); DUMP_OPT_INT("EF_STACK_LOCK_BUZZ", stack_lock_buzz); DUMP_OPT_INT("EF_SO_BUSY_POLL_SPIN", so_busy_poll_spin); #if CI_CFG_USERSPACE_EPOLL DUMP_OPT_INT("EF_UL_EPOLL", ul_epoll); DUMP_OPT_INT("EF_EPOLL_SPIN", ul_epoll_spin); DUMP_OPT_INT("EF_EPOLL_CTL_FAST", ul_epoll_ctl_fast); DUMP_OPT_INT("EF_EPOLL_CTL_HANDOFF", ul_epoll_ctl_handoff); DUMP_OPT_INT("EF_EPOLL_MT_SAFE", ul_epoll_mt_safe); #endif DUMP_OPT_INT("EF_FDTABLE_SIZE", fdtable_size); DUMP_OPT_INT("EF_SPIN_USEC", ul_spin_usec); DUMP_OPT_INT("EF_STACK_PER_THREAD", stack_per_thread); DUMP_OPT_INT("EF_DONT_ACCELERATE", dont_accelerate); DUMP_OPT_INT("EF_FDTABLE_STRICT", fdtable_strict); DUMP_OPT_INT("EF_FDS_MT_SAFE", fds_mt_safe); DUMP_OPT_INT("EF_FORK_NETIF", fork_netif); DUMP_OPT_INT("EF_NETIF_DTOR", netif_dtor); DUMP_OPT_INT("EF_NO_FAIL", no_fail); DUMP_OPT_INT("EF_SA_ONSTACK_INTERCEPT", sa_onstack_intercept); DUMP_OPT_INT("EF_ACCEPT_INHERIT_NONBLOCK", accept_force_inherit_nonblock); #if CI_CFG_USERSPACE_PIPE DUMP_OPT_INT("EF_PIPE", ul_pipe); #endif DUMP_OPT_HEX("EF_SIGNALS_NOPOSTPONE", signals_no_postpone); DUMP_OPT_INT("EF_CLUSTER_SIZE", cluster_size); DUMP_OPT_INT("EF_CLUSTER_RESTART", cluster_restart_opt); ci_log("EF_CLUSTER_NAME=%s", o->cluster_name); if( o->tcp_reuseports == 0 ) { DUMP_OPT_INT("EF_TCP_FORCE_REUSEPORT", tcp_reuseports); } else { struct ci_port_list *force_reuseport; CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link, (ci_dllist*)(ci_uintptr_t)o->tcp_reuseports) ci_log("%s=%d", "EF_TCP_FORCE_REUSEPORT", ntohs(force_reuseport->port)); } if( o->udp_reuseports == 0 ) { DUMP_OPT_INT("EF_UDP_FORCE_REUSEPORT", udp_reuseports); } else { struct ci_port_list *force_reuseport; CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link, (ci_dllist*)(ci_uintptr_t)o->udp_reuseports) ci_log("%s=%d", "EF_UDP_FORCE_REUSEPORT", ntohs(force_reuseport->port)); } }
static int onload_alloc_file(tcp_helper_resource_t *thr, oo_sp ep_id, int flags, int fd_type) { struct qstr name = { .name = "" }; #ifdef EFX_HAVE_STRUCT_PATH struct path path; #define my_dentry path.dentry #else struct dentry *dentry; #define my_dentry dentry #endif struct file *file; int fd; struct inode *inode; ci_private_t *priv; struct file_operations *fops; fops = oo_fops_by_type(fd_type); if( fops == NULL ) return -EINVAL; ci_assert_equal(fops->owner, THIS_MODULE); inode = new_inode(onload_mnt->mnt_sb); if( inode == NULL ) return -ENOMEM; #ifdef EFX_FSTYPE_HAS_MOUNT inode->i_ino = get_next_ino(); #endif if( fd_type == CI_PRIV_TYPE_NETIF ) inode->i_mode = S_IRWXUGO; if( fd_type == CI_PRIV_TYPE_TCP_EP || fd_type == CI_PRIV_TYPE_UDP_EP ) inode->i_mode = #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) /* in 2.6.18 this flag makes us "socket" and sendmsg crashes; * see sock_from_file() */ S_IFSOCK | #endif S_IRWXUGO; else inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); priv = &container_of(inode, struct onload_inode, vfs_inode)->priv; priv->thr = thr; priv->sock_id = ep_id; priv->fd_type = fd_type; fd = get_unused_fd(); if( fd < 0 ) { iput(inode); return fd; } /*ci_log("[%d]%s(%d:%d) return %d priv=%p", current->pid, __func__, thr->id, ep_id, fd, priv);*/ #ifdef EFX_FSTYPE_HAS_MOUNT #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,37) path.dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name); if( path.dentry != NULL ) path.dentry->d_op = &onloadfs_dentry_operations; #else path.dentry = d_alloc_pseudo(onload_mnt->mnt_sb, &name); #endif #else /* EFX_FSTYPE_HAS_MOUNT */ #ifdef EFX_HAVE_D_DNAME my_dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name); #else { char str[32]; name.len = onloadfs_name(&container_of(inode, struct onload_inode, vfs_inode)->priv, str, sizeof(str)); name.name = str; name.hash = inode->i_ino; my_dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name); } #endif #endif /* EFX_FSTYPE_HAS_MOUNT */ if( my_dentry == NULL ) { put_unused_fd(fd); iput(inode); return -ENOMEM; } #if !defined(EFX_FSTYPE_HAS_MOUNT) || defined(EFX_OLD_MOUNT_PSEUDO) my_dentry->d_op = &onloadfs_dentry_operations; #if !defined(EFX_HAVE_STRUCT_PATH) && defined(EFX_HAVE_D_DNAME) my_dentry->d_flags &= ~DCACHE_UNHASHED; #endif #endif d_instantiate(my_dentry, inode); #ifndef EFX_HAVE_D_DNAME d_rehash(my_dentry); #endif inode->i_fop = fops; #ifdef EFX_HAVE_STRUCT_PATH path.mnt = mntget(onload_mnt); file = alloc_file(&path, FMODE_READ | FMODE_WRITE, fops); #else file = alloc_file(onload_mnt, dentry, FMODE_READ | FMODE_WRITE, fops); #endif if( file == NULL) { #ifdef EFX_HAVE_STRUCT_PATH path_put(&path); #else dput(dentry); iput(inode); #endif put_unused_fd(fd); return -ENFILE; } priv->_filp = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_pos = 0; file->private_data = priv; if( flags & O_CLOEXEC ) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); rcu_assign_pointer(fdt->fd[fd], file); efx_set_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } else fd_install(fd, file); try_module_get(THIS_MODULE); ci_assert_equal(file->f_op, fops); return fd; } void onload_priv_free(ci_private_t *priv) { if( priv->_filp->f_vfsmnt != onload_mnt) ci_free(priv); /* inode will free the priv automatically */ } int oo_create_fd(tcp_helper_endpoint_t* ep, int flags, int fd_type) { int fd; tcp_helper_resource_t *trs = ep->thr; citp_waitable_obj *wo = SP_TO_WAITABLE_OBJ(&trs->netif, 
ep->id); efab_thr_ref(trs); fd = onload_alloc_file(trs, ep->id, flags, fd_type); if( fd < 0 ) { efab_thr_release(trs); OO_DEBUG_ERR(ci_log("%s: onload_alloc_file failed (%d)", __FUNCTION__, fd)); return fd; } ci_atomic32_and(&wo->waitable.sb_aflags, ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ)); return fd; }
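/* Hedged aside: on modern kernels a driver that only needs "private state exposed to userspace as an fd" can often use anon_inode_getfd() instead of the hand-rolled inode/dentry/file plumbing in onload_alloc_file().  This is an illustrative sketch only -- Onload deliberately keeps its own filesystem so it controls the fd's name, mode (S_IFSOCK) and inode lifetime.  my_fops and my_state are hypothetical names. */
#include <linux/anon_inodes.h>
#include <linux/fs.h>

static int my_create_fd(const struct file_operations* my_fops,
                        void* my_state, int flags)
{
  /* On success returns an installed fd whose file->private_data is my_state;
   * on failure returns a negative errno. */
  return anon_inode_getfd("[my-driver]", my_fops, my_state,
                          O_RDWR | (flags & O_CLOEXEC));
}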
static void citp_opts_getenv(citp_opts_t* opts) { /* ?? TODO: would like to use opts_citp_def.h here */ const char* s; unsigned v; opts->log_via_ioctl = 3; /* TODO: Old name. Keeping reading 'til 2011, then purge. */ GET_ENV_OPT_HEX("EF_Log_VIA_IOCTL", log_via_ioctl); GET_ENV_OPT_INT("EF_LOG_VIA_IOCTL", log_via_ioctl); if( (s = getenv("EF_LOG_FILE")) && opts->log_via_ioctl == 3) { opts->log_via_ioctl = 0; citp_log_to_file(s); } else if( opts->log_via_ioctl == 3 ) { /* citp_setup_logging_early() have already detected stderr as * tty/non-tty, so just trust it. */ if( ci_log_fn == citp_log_fn_drv ) opts->log_via_ioctl = 1; else opts->log_via_ioctl = 0; } if( opts->log_via_ioctl ) { ci_log_options &=~ CI_LOG_PID; citp_setup_logging_change(citp_log_fn_drv); } else { if( getenv("EF_LOG_TIMESTAMPS") ) ci_log_options |= CI_LOG_TIME; citp_setup_logging_change(citp_log_fn_ul); } if( getenv("EF_POLL_NONBLOCK_FAST_LOOPS") && ! getenv("EF_POLL_NONBLOCK_FAST_USEC") ) log("ERROR: EF_POLL_NONBLOCK_FAST_LOOPS is deprecated, use" " EF_POLL_NONBLOCK_FAST_USEC instead"); if( getenv("EF_POLL_FAST_LOOPS") && ! getenv("EF_POLL_FAST_USEC") ) log("ERROR: EF_POLL_FAST_LOOPS is deprecated, use" " EF_POLL_FAST_USEC instead"); if( (s = getenv("EF_POLL_USEC")) && atoi(s) ) { GET_ENV_OPT_INT("EF_POLL_USEC", ul_spin_usec); opts->ul_select_spin = 1; opts->ul_poll_spin = 1; #if CI_CFG_USERSPACE_EPOLL opts->ul_epoll_spin = 1; #endif #if CI_CFG_UDP opts->udp_recv_spin = 1; opts->udp_send_spin = 1; #endif opts->tcp_recv_spin = 1; opts->tcp_send_spin = 1; opts->pkt_wait_spin = 1; opts->sock_lock_buzz = 1; opts->stack_lock_buzz = 1; } if( (s = getenv("EF_BUZZ_USEC")) && atoi(s) ) { opts->sock_lock_buzz = 1; opts->stack_lock_buzz = 1; } GET_ENV_OPT_HEX("EF_UNIX_LOG", log_level); GET_ENV_OPT_INT("EF_PROBE", probe); GET_ENV_OPT_INT("EF_TCP", ul_tcp); GET_ENV_OPT_INT("EF_UDP", ul_udp); GET_ENV_OPT_INT("EF_UL_SELECT", ul_select); GET_ENV_OPT_INT("EF_SELECT_SPIN", ul_select_spin); GET_ENV_OPT_INT("EF_SELECT_FAST", ul_select_fast); GET_ENV_OPT_INT("EF_UL_POLL", ul_poll); GET_ENV_OPT_INT("EF_POLL_SPIN", ul_poll_spin); GET_ENV_OPT_INT("EF_POLL_FAST", ul_poll_fast); GET_ENV_OPT_INT("EF_POLL_FAST_USEC", ul_poll_fast_usec); GET_ENV_OPT_INT("EF_POLL_NONBLOCK_FAST_USEC", ul_poll_nonblock_fast_usec); GET_ENV_OPT_INT("EF_SELECT_FAST_USEC", ul_select_fast_usec); GET_ENV_OPT_INT("EF_SELECT_NONBLOCK_FAST_USEC", ul_select_nonblock_fast_usec); #if CI_CFG_UDP GET_ENV_OPT_INT("EF_UDP_RECV_SPIN", udp_recv_spin); GET_ENV_OPT_INT("EF_UDP_SEND_SPIN", udp_send_spin); #endif GET_ENV_OPT_INT("EF_TCP_RECV_SPIN", tcp_recv_spin); GET_ENV_OPT_INT("EF_TCP_SEND_SPIN", tcp_send_spin); GET_ENV_OPT_INT("EF_TCP_ACCEPT_SPIN", tcp_accept_spin); GET_ENV_OPT_INT("EF_TCP_CONNECT_SPIN",tcp_connect_spin); GET_ENV_OPT_INT("EF_PKT_WAIT_SPIN", pkt_wait_spin); #if CI_CFG_USERSPACE_PIPE GET_ENV_OPT_INT("EF_PIPE_RECV_SPIN", pipe_recv_spin); GET_ENV_OPT_INT("EF_PIPE_SEND_SPIN", pipe_send_spin); GET_ENV_OPT_INT("EF_PIPE_SIZE", pipe_size); #endif GET_ENV_OPT_INT("EF_SOCK_LOCK_BUZZ", sock_lock_buzz); GET_ENV_OPT_INT("EF_STACK_LOCK_BUZZ", stack_lock_buzz); GET_ENV_OPT_INT("EF_SO_BUSY_POLL_SPIN", so_busy_poll_spin); #if CI_CFG_USERSPACE_EPOLL GET_ENV_OPT_INT("EF_UL_EPOLL", ul_epoll); if( opts->ul_epoll == 0 && ci_cfg_opts.netif_opts.int_driven == 0 ) { ci_log("EF_INT_DRIVEN=0 and EF_UL_EPOLL=0 are not compatible. " "EF_INT_DRIVEN can be set to 0 implicitly, because of non-zero " "EF_POLL_USEC. 
If you need both spinning and EF_UL_EPOLL=0, " "please set EF_INT_DRIVEN=1 explicitly."); } GET_ENV_OPT_INT("EF_EPOLL_SPIN", ul_epoll_spin); GET_ENV_OPT_INT("EF_EPOLL_CTL_FAST", ul_epoll_ctl_fast); GET_ENV_OPT_INT("EF_EPOLL_CTL_HANDOFF",ul_epoll_ctl_handoff); GET_ENV_OPT_INT("EF_EPOLL_MT_SAFE", ul_epoll_mt_safe); #endif GET_ENV_OPT_INT("EF_FDTABLE_SIZE", fdtable_size); GET_ENV_OPT_INT("EF_SPIN_USEC", ul_spin_usec); GET_ENV_OPT_INT("EF_STACK_PER_THREAD",stack_per_thread); GET_ENV_OPT_INT("EF_DONT_ACCELERATE", dont_accelerate); GET_ENV_OPT_INT("EF_FDTABLE_STRICT", fdtable_strict); GET_ENV_OPT_INT("EF_FDS_MT_SAFE", fds_mt_safe); GET_ENV_OPT_INT("EF_NO_FAIL", no_fail); GET_ENV_OPT_INT("EF_SA_ONSTACK_INTERCEPT", sa_onstack_intercept); GET_ENV_OPT_INT("EF_ACCEPT_INHERIT_NONBLOCK", accept_force_inherit_nonblock); GET_ENV_OPT_INT("EF_VFORK_MODE", vfork_mode); #if CI_CFG_USERSPACE_PIPE GET_ENV_OPT_INT("EF_PIPE", ul_pipe); #endif if( (s = getenv("EF_FORK_NETIF")) && sscanf(s, "%x", &v) == 1 ) { opts->fork_netif = CI_MIN(v, CI_UNIX_FORK_NETIF_BOTH); } if( (s = getenv("EF_NETIF_DTOR")) && sscanf(s, "%x", &v) == 1 ) { opts->netif_dtor = CI_MIN(v, CITP_NETIF_DTOR_ALL); } if( (s = getenv("EF_SIGNALS_NOPOSTPONE")) ) { opts->signals_no_postpone = 0; while( sscanf(s, "%u", &v) == 1 ) { opts->signals_no_postpone |= (1 << (v-1)); s = strchr(s, ','); if( s == NULL ) break; s++; } } if( (s = getenv("EF_CLUSTER_NAME")) ) { strncpy(opts->cluster_name, s, CI_CFG_CLUSTER_NAME_LEN); opts->cluster_name[CI_CFG_CLUSTER_NAME_LEN] = '\0'; } else { opts->cluster_name[0] = '\0'; } GET_ENV_OPT_INT("EF_CLUSTER_SIZE", cluster_size); if( opts->cluster_size < 2 ) log("ERROR: cluster_size < 2 are not supported"); GET_ENV_OPT_INT("EF_CLUSTER_RESTART", cluster_restart_opt); get_env_opt_port_list(&opts->tcp_reuseports, "EF_TCP_FORCE_REUSEPORT"); get_env_opt_port_list(&opts->udp_reuseports, "EF_UDP_FORCE_REUSEPORT"); #if CI_CFG_FD_CACHING get_env_opt_port_list(&opts->sock_cache_ports, "EF_SOCKET_CACHE_PORTS"); #endif }
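/* Minimal userspace sketch (hypothetical helper names) of the two parsing patterns used in citp_opts_getenv() above: a GET_ENV_OPT_INT-style helper that only overwrites an option when the environment variable is present, and the "1,3,7" -> bitmask loop used for EF_SIGNALS_NOPOSTPONE. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void get_env_opt_int_sketch(const char* name, int* opt)
{
  const char* s = getenv(name);
  if( s != NULL )
    *opt = atoi(s);              /* the real macros also accept hex forms */
}

static unsigned parse_signal_mask(const char* s)
{
  unsigned mask = 0, v;
  while( s != NULL && sscanf(s, "%u", &v) == 1 ) {
    mask |= 1u << (v - 1);       /* signal numbers are 1-based */
    s = strchr(s, ',');
    if( s != NULL )
      ++s;
  }
  return mask;
}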
static int efab_tcp_helper_move_state(ci_private_t* priv, void *arg) { oo_tcp_move_state_t *op = arg; tcp_helper_endpoint_t *new_ep; tcp_helper_resource_t * new_trs = NULL; ci_netif* ni, *new_ni; ci_tcp_state * ts, *new_ts; tcp_helper_endpoint_t* ep; int rc = efab_ioctl_get_ep(priv, op->ep_id, &ep); if (rc != 0) return rc; OO_DEBUG_TCPH(ci_log("%s: (trs=%p (%u), priv=%p, ep_id=%u, new_trs_id=%u, " "new_ep_id=%u", __FUNCTION__, priv->thr, priv->thr->id, priv, OO_SP_FMT(op->ep_id), op->new_trs_id, OO_SP_FMT(op->new_ep_id))); do { /* check that the existing id is valid */ ni = &priv->thr->netif; ts = SP_TO_TCP(ni, ep->id); /* TODO: check this endpoint belongs to the tcp helper resource of priv and not * somewhere else */ /* this function does not change fd_type or fd ops, so it is not able * to cope with changing the socket type. We think this only makes sense * for TCP, so assert we are taking a TCP endpoint. */ ci_assert_equal(ts->s.pkt.ip.ip_protocol, IPPROTO_TCP); ci_assert_equal(priv->fd_type, CI_PRIV_TYPE_TCP_EP); /* get pointer to resource from handle - increments ref count */ rc = efab_thr_table_lookup(NULL, op->new_trs_id, EFAB_THR_TABLE_LOOKUP_CHECK_USER, &new_trs); if (rc < 0) { OO_DEBUG_ERR( ci_log("%s: invalid new resource handle", __FUNCTION__) ); break; } ci_assert(new_trs != NULL); /* check valid endpoint in new netif */ new_ni = &new_trs->netif; new_ep = ci_netif_get_valid_ep(new_ni, op->new_ep_id); new_ts = SP_TO_TCP(new_ni, new_ep->id); /* check the two endpoint states look valid */ if( (ts->s.pkt.ip.ip_protocol != new_ts->s.pkt.ip.ip_protocol) || (ts->s.b.state != CI_TCP_CLOSED) || (ep->oofilter.sf_local_port != NULL) ) { efab_thr_release(new_trs); rc = -EINVAL; OO_DEBUG_ERR(ci_log("%s: invalid endpoint states", __FUNCTION__)); break; } /* should be fine to complete */ ci_assert(new_trs); { tcp_helper_resource_t *old_trs; again: old_trs = priv->thr; if (ci_cas_uintptr_fail((ci_uintptr_t *)&priv->thr, (ci_uintptr_t)old_trs, (ci_uintptr_t)new_trs)) goto again; efab_thr_release(old_trs); } /* move file to hold details of new resource, new endpoint */ ci_assert(OO_SP_EQ(priv->sock_id, op->ep_id)); priv->sock_id = new_ep->id; OO_DEBUG_TCPH(ci_log("%s: set epid %u", __FUNCTION__, OO_SP_FMT(priv->sock_id))); /* copy across any necessary state */ ci_assert_equal(new_ep->os_socket, NULL); new_ep->os_socket = ep->os_socket; ep->os_socket = NULL; /* set ORPHAN flag in current as not attached to an FD */ ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); /* remove ORPHAN flag in new TCP state */ ci_atomic32_and(&new_ts->s.b.sb_aflags, ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ)); return 0; } while (0); return rc; }
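/* Standalone sketch of the compare-and-swap retry loop used above to swap priv->thr: re-read the current owner and retry the CAS until it succeeds, then drop the reference that was displaced.  GCC/Clang __atomic builtins stand in for Onload's ci_cas_uintptr_fail()/efab_thr_release(); the names below are illustrative only. */
struct resource { int refcount; };

static void resource_release(struct resource* r)
{
  /* stand-in for efab_thr_release(): drop one reference */
  __atomic_sub_fetch(&r->refcount, 1, __ATOMIC_ACQ_REL);
}

static void swap_resource(struct resource** slot, struct resource* new_res)
{
  struct resource* old_res;
  do {
    old_res = __atomic_load_n(slot, __ATOMIC_ACQUIRE);
  } while( ! __atomic_compare_exchange_n(slot, &old_res, new_res,
                                         0 /* strong */, __ATOMIC_ACQ_REL,
                                         __ATOMIC_ACQUIRE) );
  resource_release(old_res);
}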
static int setup_trampoline(struct pt_regs *regs, int opcode, int arg, int bits) { struct mm_hash *p; ci_uintptr_t trampoline_entry = 0, trampoline_exclude = 0, trampoline_toc = 0, trampoline_fixup = 0; int rc = -EBADF; read_lock(&oo_mm_tbl_lock); p = oo_mm_tbl_lookup(current->mm); if (p) { trampoline_entry = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_entry); trampoline_exclude = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_exclude); trampoline_toc = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_toc); trampoline_fixup = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_user_fixup); } read_unlock(&oo_mm_tbl_lock); TRAMP_DEBUG("%s: trampoline_entry = %p \n", __func__, (void *)trampoline_entry); /* OK. We have the entry - set up a trampoline to user space */ if (trampoline_entry) { if (!access_ok(VERIFY_READ, trampoline_entry, 1)) { /* Can't read this address. Fail! */ ci_log("Pid %d (mm=%p) has bad trampoline entry: %p", current->tgid, current->mm, (void *)trampoline_entry); return -EBADF; } /* Check for the excluded address */ if (regs->nip == trampoline_exclude) { TRAMP_DEBUG("Ignoring call from excluded address 0x%08lx", (unsigned long)trampoline_exclude); return -EBUSY; } TRAMP_DEBUG("%s: bits = %d; set up trampoline. \n", __func__, bits); if (bits == TRAMPOLINE_BITS_64) { setup_trampoline64(regs, opcode, arg, (void *)trampoline_entry, (void *)trampoline_toc, (void *)trampoline_fixup); } #ifdef CONFIG_COMPAT else { setup_trampoline32(regs, opcode, arg, (void *)trampoline_entry, (void *)trampoline_toc, (void *)trampoline_fixup); } #endif rc = 0; } else { OO_DEBUG_VERB(ci_log("Error -- attempt to trampoline for unknown process")); rc = -ENOENT; } return rc; }
int efab_file_move_to_alien_stack_rsop(ci_private_t *stack_priv, void *arg) { ci_fixed_descriptor_t sock_fd = *(ci_fixed_descriptor_t *)arg; struct file *sock_file = fget(sock_fd); ci_private_t *sock_priv; tcp_helper_resource_t *old_thr; tcp_helper_resource_t *new_thr; citp_waitable *w; int rc; if( sock_file == NULL ) return -EINVAL; if( !FILE_IS_ENDPOINT_SOCK(sock_file) || stack_priv->fd_type != CI_PRIV_TYPE_NETIF ) { fput(sock_file); return -EINVAL; } sock_priv = sock_file->private_data; ci_assert(sock_priv->fd_type == CI_PRIV_TYPE_TCP_EP || sock_priv->fd_type == CI_PRIV_TYPE_UDP_EP); old_thr = sock_priv->thr; new_thr = stack_priv->thr; ci_assert(old_thr); ci_assert(new_thr); if( old_thr == new_thr ) { fput(sock_file); return 0; } if( tcp_helper_cluster_from_cluster(old_thr) != 0 ) { LOG_S(ci_log("%s: move_fd() not permitted on clustered stacks", __func__)); fput(sock_file); return -EINVAL; } w = SP_TO_WAITABLE(&old_thr->netif, sock_priv->sock_id); rc = ci_sock_lock(&old_thr->netif, w); if( rc != 0 ) { fput(sock_file); return rc; } rc = ci_netif_lock(&old_thr->netif); if( rc != 0 ) { ci_sock_unlock(&old_thr->netif, w); fput(sock_file); return rc; } efab_thr_ref(new_thr); rc = efab_file_move_to_alien_stack(sock_priv, &stack_priv->thr->netif); fput(sock_file); if( rc != 0 ) efab_thr_release(new_thr); else ci_netif_unlock(&new_thr->netif); return rc; }
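/* Generic sketch of the lock-ordering discipline above: take the per-socket lock first, then the stack lock, and unwind in reverse order if the second acquisition fails.  Plain pthread mutexes stand in for ci_sock_lock()/ci_netif_lock(); unlike the real code, this sketch also releases both locks on success. */
#include <pthread.h>

static int with_both_locks(pthread_mutex_t* sock_lock,
                           pthread_mutex_t* stack_lock)
{
  if( pthread_mutex_lock(sock_lock) != 0 )
    return -1;
  if( pthread_mutex_lock(stack_lock) != 0 ) {
    pthread_mutex_unlock(sock_lock);   /* unwind in reverse order */
    return -1;
  }
  /* ... perform the move while holding both locks ... */
  pthread_mutex_unlock(stack_lock);
  pthread_mutex_unlock(sock_lock);
  return 0;
}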
/* Maps the shared memory regions that are used as the interface between the * control plane and its clients. On failure, this function will clean up any * partially-initialised state. */ int oo_cp_create(int fd, struct oo_cplane_handle* cp, enum cp_sync_mode mode) { struct cp_mibs* mibs = cp->mib; int rc; void* mem; ci_uint32 op = mode; /* Check user-kernel interface version. */ rc = cp_ioctl(fd, OO_IOC_CP_CHECK_VERSION, &oo_cplane_api_version); if( rc != 0 ) return rc; /* Wait for the control plane server to start if necessary. This ioctl does * an interruptible sleep while waiting. If a non-fatal signal is received * while we're asleep, the ioctl will fail with EINTR, and we want to try * again. */ do { rc = cp_ioctl(fd, OO_IOC_CP_WAIT_FOR_SERVER, &op); } while( rc == -EINTR ); if( rc != 0 ) return rc; /* Find out the MIB size */ rc = cp_ioctl(fd, OO_IOC_CP_MIB_SIZE, &cp->bytes); if( rc != 0 ) return rc; ci_assert(cp->bytes); ci_assert_equal(cp->bytes & (CI_PAGE_SIZE - 1), 0); /* Mmap MIBs */ mem = mmap(NULL, cp->bytes, PROT_READ , MAP_SHARED, fd, OO_MMAP_TYPE_CPLANE << OO_MMAP_TYPE_SHIFT); if( mem == MAP_FAILED ) { ci_log("ERROR: failed to mmap cplane MIBs: %s", strerror(errno)); return -errno; } /* Build MIBs */ mibs[1].dim = mibs[0].dim = mem; cp_init_mibs(mem, mibs); /* Mmap rw memory */ mibs[1].fwd_rw = mibs[0].fwd_rw = mmap( NULL, CI_ROUND_UP((mibs[0].dim->fwd_mask + 1) * sizeof(mibs[0].fwd_rw[0]), CI_PAGE_SIZE), PROT_READ | PROT_WRITE, MAP_SHARED, fd, #ifdef CP_SYSUNIT /* see server.c init_memory() */ CI_ROUND_UP(cp->bytes, CI_PAGE_SIZE) + #endif ((OO_MMAP_TYPE_CPLANE << OO_MMAP_TYPE_SHIFT) | (OO_MMAP_CPLANE_ID_FWD_RW << OO_MMAP_ID_SHIFT))); if( mibs[0].fwd_rw == MAP_FAILED ) { ci_log("ERROR: failed to mmap rw part of Control Plane memory: %s", strerror(errno)); rc = -errno; munmap(mem, cp->bytes); return rc; } cp->fd = fd; return 0; }
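/* Minimal userspace sketch of the mapping step above: map `bytes` of a device fd read-only and check for MAP_FAILED.  my_mmap_offset stands in for the (OO_MMAP_TYPE_CPLANE << OO_MMAP_TYPE_SHIFT) | (id << OO_MMAP_ID_SHIFT) encoding; the constant names come from the code above and are not re-derived here. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

static void* map_shared_ro(int fd, size_t bytes, off_t my_mmap_offset)
{
  void* mem = mmap(NULL, bytes, PROT_READ, MAP_SHARED, fd, my_mmap_offset);
  if( mem == MAP_FAILED ) {
    fprintf(stderr, "mmap failed: %s\n", strerror(errno));
    return NULL;
  }
  return mem;
}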
/* Insert for either TCP or UDP */ int ci_netif_filter_insert(ci_netif* netif, oo_sp tcp_id, unsigned laddr, unsigned lport, unsigned raddr, unsigned rport, unsigned protocol) { ci_netif_filter_table_entry* entry; unsigned hash1, hash2; ci_netif_filter_table* tbl; #if !defined(NDEBUG) || CI_CFG_STATS_NETIF unsigned hops = 1; #endif unsigned first; ci_assert(netif); ci_assert(ci_netif_is_locked(netif)); ci_assert(netif->filter_table); tbl = netif->filter_table; hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol); hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol); first = hash1; /* Find a free slot. */ while( 1 ) { entry = &tbl->table[hash1]; if( entry->id < 0 ) break; ++entry->route_count; #if !defined(NDEBUG) || CI_CFG_STATS_NETIF ++hops; #endif /* A socket can only have multiple entries in the filter table if each * entry has a different [laddr]. */ ci_assert( !((entry->id == OO_SP_TO_INT(tcp_id)) && (laddr == entry->laddr)) ); hash1 = (hash1 + hash2) & tbl->table_size_mask; if( hash1 == first ) { ci_sock_cmn *s = SP_TO_SOCK_CMN(netif, tcp_id); if( ! (s->s_flags & CI_SOCK_FLAG_SW_FILTER_FULL) ) { LOG_E(ci_log(FN_FMT "%d FULL %s %s:%u->%s:%u hops=%u", FN_PRI_ARGS(netif), OO_SP_FMT(tcp_id), CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport), hops)); s->s_flags |= CI_SOCK_FLAG_SW_FILTER_FULL; } CITP_STATS_NETIF_INC(netif, sw_filter_insert_table_full); return -ENOBUFS; } } /* Now insert the new entry. */ LOG_TC(ci_log(FN_FMT "%d INSERT %s %s:%u->%s:%u hash=%u:%u at=%u " "over=%d hops=%u", FN_PRI_ARGS(netif), OO_SP_FMT(tcp_id), CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport), first, hash2, hash1, entry->id, hops)); #if CI_CFG_STATS_NETIF if( hops > netif->state->stats.table_max_hops ) netif->state->stats.table_max_hops = hops; /* Keep a rolling average of the number of hops per entry. */ if( netif->state->stats.table_mean_hops == 0 ) netif->state->stats.table_mean_hops = 1; netif->state->stats.table_mean_hops = (netif->state->stats.table_mean_hops * 9 + hops) / 10; if( entry->id == EMPTY ) ++netif->state->stats.table_n_slots; ++netif->state->stats.table_n_entries; #endif entry->id = OO_SP_TO_INT(tcp_id); entry->laddr = laddr; return 0; }
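/* Self-contained toy version of the probe loop above: open addressing with double hashing.  The table size is a power of two, the secondary hash is forced odd so the probe sequence visits every slot, id >= 0 marks an occupied slot and -1 an empty one (mirroring entry->id).  This illustrates the algorithm only; it is not Onload's filter table. */
#define TOY_TABLE_SIZE 64                /* must be a power of two */

struct toy_entry { int id; unsigned key; };

static int toy_insert(struct toy_entry* table, unsigned hash1, unsigned hash2,
                      int id, unsigned key)
{
  unsigned mask = TOY_TABLE_SIZE - 1;
  unsigned idx = hash1 & mask;
  unsigned first = idx;
  unsigned step = hash2 | 1u;            /* odd step => full cycle */

  while( table[idx].id >= 0 ) {
    idx = (idx + step) & mask;
    if( idx == first )
      return -1;                         /* wrapped around: table is full */
  }
  table[idx].id = id;
  table[idx].key = key;
  return 0;
}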
/* Change substituted sigaction to the structure really meant by user. * If sa is provided, copy user sigaction data here to pass to user. * If sa==NULL, substitute in-place. */ static int efab_signal_report_sigaction(int sig, struct sigaction *sa, struct mm_signal_data *tramp_data) { struct oo_sigaction *signal_data = &(tramp_data->signal_data[sig - 1]); ci_int32 type; #define MAX_TRIES_BUSY 1000 int tried_busy = 0; int tried_changed = 0; int sa_provided = (sa != NULL); re_read_data: do { tried_busy++; type = signal_data->type; } while( (type & OO_SIGHANGLER_TYPE_MASK) == OO_SIGHANGLER_BUSY && tried_busy <= MAX_TRIES_BUSY ); if( tried_busy > MAX_TRIES_BUSY ) { ci_log("%s(%d): pid %d signal() or sigaction() runs for too long", __func__, sig, current->pid); return -EBUSY; } report: spin_lock_irq(&current->sighand->siglock); if( sa_provided ) *sa = current->sighand->action[sig - 1].sa; else sa = &current->sighand->action[sig - 1].sa; if( sa->sa_handler != tramp_data->handler_postpone ) { spin_unlock_irq(&current->sighand->siglock); return 0; } OO_DEBUG_SIGNAL(ci_log("%s: %d process sig=%d type %d handler %p " "flags %lx restorer %p", __func__, current->pid, sig, type & OO_SIGHANGLER_TYPE_MASK, sa->sa_handler, sa->sa_flags, sa->sa_restorer)); if( (signal_data->type & OO_SIGHANGLER_TYPE_MASK) == OO_SIGHANGLER_USER) { sa->sa_handler = CI_USER_PTR_GET(signal_data->handler); if( ! (signal_data->flags & SA_SIGINFO) ) sa->sa_flags &= ~SA_SIGINFO; } else if( ! (signal_data->type & OO_SIGHANGLER_IGN_BIT) ) { sa->sa_handler = SIG_DFL; sa->sa_flags &= ~SA_RESTORER; if( ! (signal_data->flags & SA_SIGINFO) ) sa->sa_flags &= ~SA_SIGINFO; sa->sa_restorer = NULL; } OO_DEBUG_SIGNAL(ci_log("%s: %d to user sig=%d handler %p flags %lx " "restorer %p", __func__, current->pid, sig, sa->sa_handler, sa->sa_flags, sa->sa_restorer)); spin_unlock_irq(&current->sighand->siglock); /* Re-check that UL has not changed signal_data. */ if( type != signal_data->type ) { tried_changed++; if( tried_changed > MAX_TRIES_BUSY ) { ci_log("%s: signal() or sigaction() called too fast", __func__); return -EBUSY; } if( (signal_data->type & OO_SIGHANGLER_TYPE_MASK) == OO_SIGHANGLER_BUSY ) { tried_busy = 0; goto re_read_data; } else goto report; } return 0; }
/* NOTE: in the kernel version [fd] is unused and, if it's a ptr, [arg] will * be in user-space and may need to be fetched into kernel memory. */ static int ci_tcp_ioctl_lk(citp_socket* ep, ci_fd_t fd, int request, void* arg) { ci_netif* netif = ep->netif; ci_sock_cmn* s = ep->s; ci_tcp_state* ts = NULL; int rc = 0; int os_socket_exists = s->b.sb_aflags & CI_SB_AFLAG_OS_BACKED; if( s->b.state != CI_TCP_LISTEN ) ts = SOCK_TO_TCP(s); /* Keep the os socket in sync. If this is a "get" request then the * return will be based on our support, not the os's (except for EFAULT * handling which we get for free). * Exceptions: * - FIONBIO is applied just in time on handover if needed (listening * sockets always have a non-blocking OS socket) * - FIONREAD, TIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are useless on OS * socket, let's avoid syscall. */ if( os_socket_exists && request != FIONREAD && request != SIOCATMARK && request != FIOASYNC && request != TIOCOUTQ && request != SIOCOUTQNSD && request != (int) FIONBIO ) { rc = oo_os_sock_ioctl(netif, s->b.bufid, request, arg, NULL); if( rc < 0 ) return rc; } /* ioctl defines are listed in `man ioctl_list` and the CI equivalent * CI defines are in include/ci/net/ioctls.h */ LOG_TV( ci_log("%s: request = %d, arg = %ld", __FUNCTION__, request, (long)arg)); switch( request ) { case FIONBIO: if( CI_IOCTL_ARG_OK(int, arg) ) { CI_CMN_IOCTL_FIONBIO(ep->s, arg); rc = 0; break; } goto fail_fault; case FIONREAD: /* synonym of SIOCINQ */ if( !CI_IOCTL_ARG_OK(int, arg) ) goto fail_fault; if( s->b.state == CI_TCP_LISTEN ) goto fail_inval; if( s->b.state == CI_TCP_SYN_SENT ) { CI_IOCTL_SETARG((int*)arg, 0); } else { /* In inline mode, return the total number of bytes in the receive queue. If SO_OOBINLINE isn't set then return the number of bytes up to the mark but without counting the mark */ int bytes_in_rxq = tcp_rcv_usr(ts); if (bytes_in_rxq && ! (ts->s.s_flags & CI_SOCK_FLAG_OOBINLINE)) { if (tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID) { /*! \TODO: what if FIN has been received? */ unsigned int readnxt = tcp_rcv_nxt(ts) - bytes_in_rxq; if (SEQ_LT(readnxt, tcp_rcv_up(ts))) { bytes_in_rxq = tcp_rcv_up(ts) - readnxt; } else if (SEQ_EQ(readnxt, tcp_rcv_up(ts))) { bytes_in_rxq--; } } } CI_IOCTL_SETARG((int*)arg, bytes_in_rxq); } break; case TIOCOUTQ: /* synonym of SIOCOUTQ */ case SIOCOUTQNSD: { CI_BUILD_ASSERT(TIOCOUTQ == SIOCOUTQ); int outq_bytes = 0; if( !CI_IOCTL_ARG_OK(int, arg) ) goto fail_fault; if( s->b.state == CI_TCP_LISTEN ) goto fail_inval; if( s->b.state != CI_TCP_SYN_SENT ) { /* TIOCOUTQ counts all unacknowledged data, so includes retrans queue. 
*/ if( request == TIOCOUTQ ) outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_una(ts)); else outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_nxt(ts)); } CI_IOCTL_SETARG((int*)arg, outq_bytes); } break; case SIOCATMARK: { if( !CI_IOCTL_ARG_OK(int, arg) ) goto fail_fault; /* return true, if we are at the out-of-band byte */ CI_IOCTL_SETARG((int*)arg, 0); if( s->b.state != CI_TCP_LISTEN ) { int readnxt; readnxt = SEQ_SUB(tcp_rcv_nxt(ts), tcp_rcv_usr(ts)); if( ~ts->s.b.state & CI_TCP_STATE_ACCEPT_DATA ) readnxt = SEQ_SUB(readnxt, 1); if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID ) CI_IOCTL_SETARG((int*)arg, readnxt == tcp_rcv_up(ts)); LOG_URG(log(NTS_FMT "SIOCATMARK atmark=%d readnxt=%u rcv_up=%u%s", NTS_PRI_ARGS(ep->netif, ts), readnxt == tcp_rcv_up(ts), readnxt, tcp_rcv_up(SOCK_TO_TCP(ep->s)), (tcp_urg_data(ts)&CI_TCP_URG_PTR_VALID)?"":" (invalid)")); } break; } #ifndef __KERNEL__ case FIOASYNC: /* Need to apply this to [fd] so that our fasync file-op will be * invoked. */ rc = ci_sys_ioctl(fd, request, arg); break; case SIOCSPGRP: if( !CI_IOCTL_ARG_OK(int, arg) ) goto fail_fault; /* Need to apply this to [fd] to get signal delivery to work. However, * SIOCSPGRP is only supported on sockets, so we need to convert to * fcntl(). */ rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg)); if( rc == 0 ) { rc = ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists); } else { CI_SET_ERROR(rc, -rc); } break; #endif default: return ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists); }
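/* Application-side view of two of the requests handled above, issued with plain ioctl() on a connected TCP socket fd: FIONREAD (bytes waiting in the receive queue) and SIOCOUTQ (bytes sent but not yet acknowledged). */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>     /* SIOCOUTQ */

static void dump_queue_depths(int sock)
{
  int inq = 0, outq = 0;
  if( ioctl(sock, FIONREAD, &inq) == 0 )
    printf("receive queue: %d bytes\n", inq);
  if( ioctl(sock, SIOCOUTQ, &outq) == 0 )
    printf("send queue (unacked): %d bytes\n", outq);
}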
/* Substitute signal handler by our variant. */ static int efab_signal_substitute(int sig, struct sigaction *new_act, struct mm_signal_data *tramp_data) { int rc; __sighandler_t handler; struct k_sigaction *k; int type; __user struct oo_sigaction *user_data; struct oo_sigaction *signal_data = &(tramp_data->signal_data[sig - 1]); ci_int32 old_type; ci_int32 seq; user_data = &(((struct oo_sigaction *) (CI_USER_PTR_GET(tramp_data->user_data)))[sig - 1]); if( !access_ok(VERIFY_WRITE, user_data, sizeof(struct oo_sigaction) ) ) return -EFAULT; do { old_type = signal_data->type; seq = (old_type & OO_SIGHANGLER_SEQ_MASK) + (1 << OO_SIGHANGLER_SEQ_SHIFT); } while( ci_cas32_fail(&signal_data->type, old_type, OO_SIGHANGLER_BUSY | seq) ); /* We are going to change signal handler: UL should wait until we've * finished */ rc = __put_user(signal_data->type, &user_data->type); if( rc != 0 ) { signal_data->type = old_type; return -EFAULT; } spin_lock_irq(&current->sighand->siglock); k = &current->sighand->action[sig - 1]; if( new_act ) k->sa = *new_act; type = efab_signal_handler_type(sig, k->sa.sa_handler); handler = type <= OO_SIGHANGLER_DFL_MAX ? tramp_data->handlers[type] : NULL; BUILD_BUG_ON(SIG_DFL != NULL); /* We do not handle this signal: */ if( type != OO_SIGHANGLER_USER && handler == NULL ) { spin_unlock_irq(&current->sighand->siglock); signal_data->type = old_type | OO_SIGHANGLER_IGN_BIT | seq; ci_verify(__put_user(signal_data->type, &user_data->type) == 0); return 0; } OO_DEBUG_SIGNAL(ci_log("%s: %d change sig=%d handler %p flags %lx " "restorer %p type %d", __func__, current->pid, sig, k->sa.sa_handler, k->sa.sa_flags, k->sa.sa_restorer, type)); signal_data->flags = k->sa.sa_flags; k->sa.sa_flags |= SA_SIGINFO; if( type == OO_SIGHANGLER_USER ) CI_USER_PTR_SET(signal_data->handler, k->sa.sa_handler); else { CI_USER_PTR_SET(signal_data->handler, handler); if( tramp_data->sarestorer ) { k->sa.sa_flags |= SA_RESTORER; k->sa.sa_restorer = tramp_data->sarestorer; } } k->sa.sa_handler = tramp_data->handler_postpone; spin_unlock_irq(&current->sighand->siglock); OO_DEBUG_SIGNAL(ci_log("%s: %d set sig=%d handler %p flags %lx restorer %p", __func__, current->pid, sig, k->sa.sa_handler, k->sa.sa_flags, k->sa.sa_restorer)); /* Copy signal_data to UL; type BUSY */ rc = __copy_to_user(user_data, signal_data, sizeof(*signal_data)); signal_data->type = type | seq; if( rc != 0 ) return -EFAULT; /* Fill in the real type */ ci_verify(__put_user(signal_data->type, &user_data->type) == 0); return 0; }
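/* Userspace side of the substitution above: the handlers being intercepted were installed by the application with ordinary sigaction().  Minimal example of registering an SA_SIGINFO handler for SIGUSR1. */
#include <signal.h>
#include <string.h>

static void on_usr1(int sig, siginfo_t* info, void* uctx)
{
  (void) sig; (void) info; (void) uctx;
  /* async-signal-safe work only */
}

static int install_usr1_handler(void)
{
  struct sigaction sa;
  memset(&sa, 0, sizeof(sa));
  sa.sa_sigaction = on_usr1;
  sa.sa_flags = SA_SIGINFO | SA_RESTART;
  sigemptyset(&sa.sa_mask);
  return sigaction(SIGUSR1, &sa, NULL);
}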
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs, int iovecs_len, enum onload_zc_buffer_type_flags flags) { int rc = 0, i; citp_lib_context_t lib_context; citp_fdinfo* fdi; citp_sock_fdi* epi; ci_netif* ni; ci_ip_pkt_fmt *pkt; unsigned max_len; Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs, iovecs_len, flags)); citp_enter_lib(&lib_context); if( (fdi = citp_fdtable_lookup(fd)) != NULL ) { switch( citp_fdinfo_get_type(fdi) ) { case CITP_UDP_SOCKET: case CITP_TCP_SOCKET: epi = fdi_to_sock_fdi(fdi); ni = epi->sock.netif; ci_netif_lock(ni); for( i = 0; i < iovecs_len; ++i ) { max_len = CI_CFG_PKT_BUF_SIZE; pkt = ci_netif_pkt_tx_tcp_alloc(ni); if( pkt == NULL ) { while( --i >= 0 ) ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf); rc = -ENOMEM; ci_netif_unlock(ni); goto out; } /* Make sure this is clear as it affects behaviour when freeing */ pkt->pf.udp.rx_flags = 0; iovecs[i].buf = (struct oo_zc_buf *)pkt; if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) { if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) && (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) { ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s); oo_tx_pkt_layout_init(pkt); iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) + ts->outgoing_hdrs_len; max_len = tcp_eff_mss(ts); } else { /* Best guess. We can fix it up later. Magic 12 leaves * space for time stamp option (common case) */ oo_tx_pkt_layout_init(pkt); iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_tcp_hdr) + 12; } } else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) { oo_tx_pkt_layout_init(pkt); iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_udp_hdr); } else iovecs[i].iov_base = PKT_START(pkt); iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE - ((char *)iovecs[i].iov_base - (char *)pkt); if( iovecs[i].iov_len > max_len ) iovecs[i].iov_len = max_len; } ni->state->n_async_pkts += iovecs_len; ci_netif_unlock(ni); break; #if CI_CFG_USERSPACE_EPOLL case CITP_EPOLL_FD: rc = -ENOTSOCK; break; #endif #if CI_CFG_USERSPACE_PIPE case CITP_PIPE_FD: rc = -ENOTSOCK; break; #endif case CITP_PASSTHROUGH_FD: rc = -ESOCKTNOSUPPORT; break; default: LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, citp_fdinfo_get_type(fdi))); rc = -EINVAL; } citp_fdinfo_release_ref(fdi, 0); } else { /* Not onload socket */ rc = -ESOCKTNOSUPPORT; } out: citp_exit_lib(&lib_context, TRUE); Log_CALL_RESULT(rc); return rc; }
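/* Hedged usage sketch of the extension call defined above.  The header name and the suggestion to pass the buffers on to onload_zc_send() are assumptions based on the public Onload extensions API; the signature and the ONLOAD_ZC_BUFFER_HDR_TCP flag come from the code above. */
#include <onload/extensions_zc.h>

static int alloc_two_tcp_buffers(int sock)
{
  struct onload_zc_iovec iov[2];
  int rc = onload_zc_alloc_buffers(sock, iov, 2, ONLOAD_ZC_BUFFER_HDR_TCP);
  if( rc != 0 )
    return rc;                 /* e.g. -ENOMEM or -ESOCKTNOSUPPORT */
  /* iov[i].iov_base / iov[i].iov_len now describe writable payload space;
   * fill them and hand them to onload_zc_send(), or release them if unused. */
  return 0;
}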
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog) { /* ** ?? error handling on possible fails not handled robustly... ** ?? Need to check port number is valid TODO */ /*! \todo If not bound then we have to be listening on all interfaces. * It's likely that we won't be coming through here as we have to * listen on the OS socket too! */ ci_tcp_state* ts; ci_tcp_socket_listen* tls; ci_netif* netif = ep->netif; ci_sock_cmn* s = ep->s; unsigned ul_backlog = backlog; int rc; oo_p sp; LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__, SK_PRI_ARGS(ep), backlog)); CHECK_TEP(ep); if( NI_OPTS(netif).tcp_listen_handover ) return CI_SOCKET_HANDOVER; if( !NI_OPTS(netif).tcp_server_loopback) { /* We should handover if the socket is bound to alien address. */ if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) return CI_SOCKET_HANDOVER; } if( ul_backlog < 0 ) ul_backlog = NI_OPTS(netif).max_ep_bufs; else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog ) ul_backlog = NI_OPTS(netif).acceptq_min_backlog; if( s->b.state == CI_TCP_LISTEN ) { tls = SOCK_TO_TCP_LISTEN(s); tls->acceptq_max = ul_backlog; ci_tcp_helper_listen_os_sock(fd, ul_backlog); return 0; } if( s->b.state != CI_TCP_CLOSED ) { CI_SET_ERROR(rc, EINVAL); return rc; } ts = SOCK_TO_TCP(s); /* Bug 3376: if socket used for a previous, failed, connect then the error * numbers will not be as expected. Only seen when not using listening * netifs (as moving the EP to the new netif resets them). */ ts->s.tx_errno = EPIPE; ts->s.rx_errno = ENOTCONN; /* fill in address/ports and all TCP state */ if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) { ci_uint16 source_be16; /* They haven't previously done a bind, so we need to choose * a port. As we haven't been given a hint we let the OS choose. */ source_be16 = 0; rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16); if (CI_LIKELY( rc==0 )) { TS_TCP(ts)->tcp_source_be16 = source_be16; ts->s.cp.lport_be16 = source_be16; LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u", LNT_PRI_ARGS(ep->netif, ts), ip_addr_str(ts->s.pkt.ip.ip_saddr_be32), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16))); } else { LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc), __FILE__, __LINE__)); return rc; } } ci_sock_lock(netif, &ts->s.b); ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN); tls = SOCK_TO_TCP_LISTEN(&ts->s); tcp_raddr_be32(tls) = 0u; tcp_rport_be16(tls) = 0u; ci_assert_equal(tls->s.tx_errno, EPIPE); ci_assert_equal(tls->s.rx_errno, ENOTCONN); /* setup listen timer - do it before the first return statement, * because __ci_tcp_listen_to_normal() will be called on error path. 
*/ if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) { sp = TS_OFF(netif, tls); OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid)); ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq"); tls->listenq_tid.param1 = S_SP(tls); tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN; } rc = ci_tcp_listen_init(netif, tls); ci_sock_unlock(netif, &ts->s.b); if( rc != 0 ) { CI_SET_ERROR(rc, -rc); goto listen_fail; } tls->acceptq_max = ul_backlog; CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats)); /* install all the filters needed for this connection * - tcp_laddr_be32(ts) = 0 for IPADDR_ANY * * TODO: handle BINDTODEVICE by setting phys_port paramter to correct * physical L5 port index * TODO: handle REUSEADDR by setting last paramter to TRUE */ if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) { #ifdef ONLOAD_OFE if( netif->ofe != NULL ) { tls->s.ofe_code_start = ofe_socktbl_find( netif->ofe, OFE_SOCKTYPE_TCP_LISTEN, tcp_laddr_be32(tls), INADDR_ANY, tcp_lport_be16(ts), 0); tls->ofe_promote = ofe_socktbl_find( netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE, tcp_laddr_be32(tls), INADDR_ANY, tcp_lport_be16(ts), 0); } #endif rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice, OO_SP_NULL); if( rc == -EFILTERSSOME ) { if( CITP_OPTS.no_fail ) rc = 0; else { ci_tcp_ep_clear_filters(netif, S_SP(tls), 0); rc = -ENOBUFS; } } ci_assert_nequal(rc, -EFILTERSSOME); VERB(ci_log("%s: set_filters returned %d", __FUNCTION__, rc)); if (rc < 0) { CI_SET_ERROR(rc, -rc); goto post_listen_fail; } } /* * Call of system listen() is required for listen any, local host * communications server and multi-homed server (to accept connections * to L5 assigned address(es), but incoming from other interfaces). */ #ifdef __ci_driver__ { rc = efab_tcp_helper_listen_os_sock( netif2tcp_helper_resource(netif), S_SP(tls), backlog); } #else rc = ci_tcp_helper_listen_os_sock(fd, backlog); #endif if ( rc < 0 ) { /* clear the filter we've just set */ ci_tcp_ep_clear_filters(netif, S_SP(tls), 0); goto post_listen_fail; } return 0; post_listen_fail: ci_tcp_listenq_drop_all(netif, tls); listen_fail: /* revert TCP state to a non-listening socket format */ __ci_tcp_listen_to_normal(netif, tls); /* Above function sets orphan flag but we are attached to an FD. */ ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); #ifdef __ci_driver__ return rc; #else return CI_SOCKET_ERROR; #endif }
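/* For reference, the standard-sockets sequence that ci_tcp_listen() accelerates: create a TCP socket, bind it (port 0 lets the kernel pick, much as the __ci_bind() call above does when the socket was never bound), then listen with the application-supplied backlog. */
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int make_listener(unsigned short port, int backlog)
{
  struct sockaddr_in addr;
  int fd = socket(AF_INET, SOCK_STREAM, 0);
  if( fd < 0 )
    return -1;
  memset(&addr, 0, sizeof(addr));
  addr.sin_family = AF_INET;
  addr.sin_addr.s_addr = htonl(INADDR_ANY);
  addr.sin_port = htons(port);          /* 0 => kernel picks a port */
  if( bind(fd, (struct sockaddr*) &addr, sizeof(addr)) < 0 ||
      listen(fd, backlog) < 0 ) {
    close(fd);
    return -1;
  }
  return fd;
}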