/* Shut down one or both directions of a TCP socket.
 *
 *   ep   socket being shut down
 *   how  SHUT_RD, SHUT_WR or SHUT_RDWR
 *   fd   descriptor; only forwarded to ci_tcp_shutdown_listen()
 *
 * Listening sockets are delegated to ci_tcp_shutdown_listen().  For other
 * sockets the stack lock is needed: if it cannot be taken immediately the
 * request is recorded in s_aflags and handed to the current lock holder,
 * in which case 0 is returned straight away.  Otherwise the result of
 * __ci_tcp_shutdown() is returned, passed through CI_SET_ERROR() when it
 * is negative.
 */
int ci_tcp_shutdown(citp_socket* ep, int how, ci_fd_t fd)
{
  ci_sock_cmn* sock = ep->s;
  int rc;

  if( sock->b.state == CI_TCP_LISTEN )
    return ci_tcp_shutdown_listen(ep, how, fd);

  if( SOCK_TO_TCP(sock)->snd_delegated ) {
    /* A delegated send is outstanding, so the sequence number to use is
     * unknown.  The caller has to call onload_delegated_send_cancel()
     * first. */
    CI_SET_ERROR(rc, EBUSY);
    return rc;
  }

  if( ! ci_netif_trylock(ep->netif) ) {
    /* Lock is contended: try to pass the shutdown to whoever holds it. */
    unsigned pending = 0;

    switch( sock->b.state ) {
    case CI_TCP_CLOSED:
    case CI_TCP_TIME_WAIT:
      CI_SET_ERROR(rc, ENOTCONN);
      return rc;
    default:
      break;
    }

    if( how == SHUT_RDWR )
      pending = CI_SOCK_AFLAG_NEED_SHUT_RD | CI_SOCK_AFLAG_NEED_SHUT_WR;
    else if( how == SHUT_RD )
      pending = CI_SOCK_AFLAG_NEED_SHUT_RD;
    else if( how == SHUT_WR )
      pending = CI_SOCK_AFLAG_NEED_SHUT_WR;

    ci_atomic32_or(&sock->s_aflags, pending);
    if( ! ci_netif_lock_or_defer_work(ep->netif, &sock->b) )
      return 0;  /* deferred to the lock holder */

    /* We got the lock after all: withdraw the request and do it here. */
    ci_atomic32_and(&sock->s_aflags, ~pending);
  }

  if( 0 ) {
    /* Deliberately disabled.  Polling here would give an ordered response
     * to all packets preceding this FIN (e.g. ANVL tcp_core 9.18), but it
     * can hurt performance for high-connection-rate applications.  It may
     * come back as an option if needed. */
    ci_netif_poll(ep->netif);
  }

  rc = __ci_tcp_shutdown(ep->netif, SOCK_TO_TCP(sock), how);
  if( rc < 0 )
    CI_SET_ERROR(rc, -rc);
  ci_netif_unlock(ep->netif);
  return rc;
}
/* Decide whether a connect() to dst_be32:dport_be16 can be accelerated.
 *
 * The destination is written into the socket's cached headers and the
 * control plane is queried through cicp_user_retrieve().
 *
 * Returns 0 when the destination can be handled here (the cached headers
 * are then prepared for it), or CI_SOCKET_HANDOVER when the socket should
 * be handed over.
 */
static int ci_tcp_connect_check_dest(citp_socket* ep, ci_ip_addr_t dst_be32,
                                     int dport_be16)
{
  ci_ip_cached_hdrs* hdrs = &ep->s->pkt;
  ci_tcp_state* ts;

  hdrs->ip.ip_daddr_be32 = dst_be32;
  hdrs->dport_be16 = dport_be16;
  cicp_user_retrieve(ep->netif, hdrs, &ep->s->cp);

  if(CI_LIKELY( hdrs->status == retrrc_success ||
                hdrs->status == retrrc_nomac ||
                hdrs->status < 0 )) {
    /* Onloadable. */
    if( hdrs->encap.type & CICP_LLAP_TYPE_XMIT_HASH_LAYER4 ) {
      /* The local port is not known yet, so the lookup result may be
       * wrong: throw it away. */
      ci_ip_cache_invalidate(hdrs);
    }
    if( hdrs->ip.ip_saddr_be32 == 0 ) {
      /* Remember the source address picked by the control plane. */
      hdrs->ip.ip_saddr_be32 = hdrs->ip_saddr_be32;
      ep->s->cp.ip_laddr_be32 = hdrs->ip_saddr_be32;
    }
    return 0;
  }

  if( hdrs->status != retrrc_localroute )
    return CI_SOCKET_HANDOVER;

  /* Destination is a local address. */
  ts = SOCK_TO_TCP(ep->s);

  if( NI_OPTS(ep->netif).tcp_client_loopback == CITP_TCP_LOOPBACK_OFF)
    return CI_SOCKET_HANDOVER;

  ep->s->s_flags |= CI_SOCK_FLAG_BOUND_ALIEN;

  if( NI_OPTS(ep->netif).tcp_server_loopback == CITP_TCP_LOOPBACK_OFF )
    ts->local_peer = OO_SP_NULL;
  else
    ts->local_peer = ci_tcp_connect_find_local_peer(ep->netif, dst_be32,
                                                    dport_be16);

  /* Without a local peer we may only continue if the client loopback mode
   * is something other than "same stack". */
  if( ! OO_SP_NOT_NULL(ts->local_peer) &&
      NI_OPTS(ep->netif).tcp_client_loopback == CITP_TCP_LOOPBACK_SAMESTACK )
    return CI_SOCKET_HANDOVER;

  hdrs->flags |= CI_IP_CACHE_IS_LOCALROUTE;
  if( hdrs->ip.ip_saddr_be32 == 0 ) {
    hdrs->ip.ip_saddr_be32 = dst_be32;
    ep->s->cp.ip_laddr_be32 = dst_be32;
  }
  hdrs->ether_offset = 4; /* lo is non-VLAN */
  hdrs->ip_saddr_be32 = dst_be32;
  hdrs->dport_be16 = dport_be16;
  return 0;
}
/* Tells whether socket [s] of stack [ni] may be moved to another stack.
 *
 * Returns false for sockets with timestamping flags set (the TX
 * timestamping queue is not copied yet), for anything that is neither UDP
 * nor TCP, and for listening TCP sockets.  For the remaining sockets the
 * answer comes from the protocol-specific helper.
 */
static int efab_file_move_supported(ci_netif *ni, ci_sock_cmn *s)
{
  if( s->timestamping_flags != 0 )
    return false;

  if( s->b.state == CI_TCP_STATE_UDP )
    return efab_file_move_supported_udp(ni, SOCK_TO_UDP(s));

  /* From here on only non-listening TCP sockets qualify. */
  if( (s->b.state & CI_TCP_STATE_TCP) == 0 || s->b.state == CI_TCP_LISTEN )
    return false;

  return efab_file_move_supported_tcp(ni, SOCK_TO_TCP(s));
}
/* sendpage file operation for TCP endpoints.
 *
 *   filp    file whose private_data is the ci_private_t of the endpoint
 *   page    page holding the data
 *   offset  start of the data inside the page
 *   size    number of bytes to send
 *   ppos    not used here
 *   flags   MSG_* flags, or a plain "more" boolean when
 *           MSG_SENDPAGE_NOTLAST is not defined
 *
 * Returns the result of sendpage_copy() for sockets in a connected TCP
 * state, and -tx_errno otherwise.
 */
ssize_t linux_tcp_helper_fop_sendpage(struct file* filp, struct page* page,
                                      int offset, size_t size,
                                      loff_t* ppos, int flags)
{
  ci_private_t* priv = filp->private_data;
  tcp_helper_resource_t* trs = efab_priv_to_thr(priv);
  ci_sock_cmn* sock;

  OO_DEBUG_VERB(ci_log("%s: %d:%d offset=%d size=%d flags=%x", __FUNCTION__,
                       NI_ID(&trs->netif), OO_SP_FMT(priv->sock_id), offset,
                       (int) size, flags));

  ci_assert(page);
  ci_assert_ge(offset, 0);
  ci_assert_gt(size, 0);
  ci_assert_le(offset + size, CI_PAGE_SIZE);

#ifndef MSG_SENDPAGE_NOTLAST
  /* Here "flags" is really a "more" boolean.  It is sometimes set even for
  ** the last page (seen on 2.6.18, not on 2.6.26 or later), so it is only
  ** trusted when the data runs to the end of the page. */
  if( flags && !(offset + size < CI_PAGE_SIZE) )
    flags = MSG_MORE;
  else
    flags = 0;
#endif

  sock = SP_TO_SOCK(&trs->netif, priv->sock_id);
  if(CI_LIKELY( sock->b.state & CI_TCP_STATE_TCP_CONN ))
    return sendpage_copy(&trs->netif, SOCK_TO_TCP(sock), page, offset, size,
                         flags);

  /* Closed or listening: report the stored transmit error.  SIGPIPE is not
  ** raised here because Linux does that for us. */
  return -sock->tx_errno;
}
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs, int iovecs_len, enum onload_zc_buffer_type_flags flags) { int rc = 0, i; citp_lib_context_t lib_context; citp_fdinfo* fdi; citp_sock_fdi* epi; ci_netif* ni; ci_ip_pkt_fmt *pkt; unsigned max_len; Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs, iovecs_len, flags)); citp_enter_lib(&lib_context); if( (fdi = citp_fdtable_lookup(fd)) != NULL ) { switch( citp_fdinfo_get_type(fdi) ) { case CITP_UDP_SOCKET: case CITP_TCP_SOCKET: epi = fdi_to_sock_fdi(fdi); ni = epi->sock.netif; ci_netif_lock(ni); for( i = 0; i < iovecs_len; ++i ) { max_len = CI_CFG_PKT_BUF_SIZE; pkt = ci_netif_pkt_tx_tcp_alloc(ni); if( pkt == NULL ) { while( --i >= 0 ) ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf); rc = -ENOMEM; ci_netif_unlock(ni); goto out; } /* Make sure this is clear as it affects behaviour when freeing */ pkt->pf.udp.rx_flags = 0; iovecs[i].buf = (struct oo_zc_buf *)pkt; if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) { if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) && (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) { ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s); oo_tx_pkt_layout_init(pkt); iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) + ts->outgoing_hdrs_len; max_len = tcp_eff_mss(ts); } else { /* Best guess. We can fix it up later. 
Magic 12 leaves * space for time stamp option (common case) */ oo_tx_pkt_layout_init(pkt); iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_tcp_hdr) + 12; } } else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) { oo_tx_pkt_layout_init(pkt); iovecs[i].iov_base = (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_udp_hdr); } else iovecs[i].iov_base = PKT_START(pkt); iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE - ((char *)iovecs[i].iov_base - (char *)pkt); if( iovecs[i].iov_len > max_len ) iovecs[i].iov_len = max_len; } ni->state->n_async_pkts += iovecs_len; ci_netif_unlock(ni); break; #if CI_CFG_USERSPACE_EPOLL case CITP_EPOLL_FD: rc = -ENOTSOCK; break; #endif #if CI_CFG_USERSPACE_PIPE case CITP_PIPE_FD: rc = -ENOTSOCK; break; #endif case CITP_PASSTHROUGH_FD: rc = -ESOCKTNOSUPPORT; break; default: LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, citp_fdinfo_get_type(fdi))); rc = -EINVAL; } citp_fdinfo_release_ref(fdi, 0); } else { /* Not onload socket */ rc = -ESOCKTNOSUPPORT; } out: citp_exit_lib(&lib_context, TRUE); Log_CALL_RESULT(rc); return rc; }
/* Handle an ioctl() on a TCP socket.
 *
 *   ep       socket the request is for
 *   fd       descriptor of the socket
 *   request  ioctl request code
 *   arg      request argument
 *
 * Unless the request is one of the exceptions listed below, it is first
 * applied to the OS socket (when one exists) and a negative result from
 * that call is returned immediately.  Requests not handled by the switch
 * are passed on to ci_cmn_ioctl().
 *
 * NOTE: in the kernel version [fd] is unused and, if it's a ptr, [arg] will
 * be in user-space and may need to be fetched into kernel memory.
 */
static int ci_tcp_ioctl_lk(citp_socket* ep, ci_fd_t fd, int request,
                           void* arg)
{
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  /* Stays NULL for listening sockets; every use of [ts] below is on a path
   * that has excluded CI_TCP_LISTEN. */
  ci_tcp_state* ts = NULL;
  int rc = 0;
  int os_socket_exists = s->b.sb_aflags & CI_SB_AFLAG_OS_BACKED;

  if( s->b.state != CI_TCP_LISTEN )
    ts = SOCK_TO_TCP(s);

  /* Keep the os socket in sync.  If this is a "get" request then the
   * return will be based on our support, not the os's (except for EFAULT
   * handling which we get for free).
   * Exceptions:
   *  - FIONBIO is applied just in time on handover if needed (listening
   *    sockets always have a non-blocking OS socket)
   *  - FIONREAD, TIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are useless on OS
   *    socket, let's avoid syscall.
   *  - FIOASYNC is also excluded by the condition below; it is applied to
   *    [fd] in the switch instead.
   */
  if( os_socket_exists && request != FIONREAD && request != SIOCATMARK &&
      request != FIOASYNC && request != TIOCOUTQ && request != SIOCOUTQNSD &&
      request != (int) FIONBIO ) {
    rc = oo_os_sock_ioctl(netif, s->b.bufid, request, arg, NULL);
    if( rc < 0 )
      return rc;
  }

  /* ioctl defines are listed in `man ioctl_list` and the CI equivalent
   * CI defines are in include/ci/net/ioctls.h  */
  LOG_TV( ci_log("%s: request = %d, arg = %ld", __FUNCTION__, request,
                 (long)arg));

  /* The fail_fault and fail_inval labels jumped to below are defined
   * further down in this function, after the switch. */
  switch( request ) {
  case FIONBIO:
    if( CI_IOCTL_ARG_OK(int, arg) ) {
      CI_CMN_IOCTL_FIONBIO(ep->s, arg);
      rc = 0;
      break;
    }
    goto fail_fault;
  case FIONREAD: /* synonym of SIOCINQ */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state == CI_TCP_SYN_SENT ) {
      /* Reports zero readable bytes in SYN_SENT. */
      CI_IOCTL_SETARG((int*)arg, 0);
    } else {
      /* In inline mode, return the total number of bytes in the receive
       * queue.  If SO_OOBINLINE isn't set then return the number of bytes
       * up to the mark but without counting the mark. */
      int bytes_in_rxq = tcp_rcv_usr(ts);
      if (bytes_in_rxq && ! (ts->s.s_flags & CI_SOCK_FLAG_OOBINLINE)) {
        if (tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID) {
          /*! \TODO: what if FIN has been received? */
          /* Sequence number of the next byte the user will read. */
          unsigned int readnxt = tcp_rcv_nxt(ts) - bytes_in_rxq;
          if (SEQ_LT(readnxt, tcp_rcv_up(ts))) {
            /* Mark is ahead of us: count only the bytes before it. */
            bytes_in_rxq = tcp_rcv_up(ts) - readnxt;
          } else if (SEQ_EQ(readnxt, tcp_rcv_up(ts))) {
            /* We are at the mark: do not count the urgent byte. */
            bytes_in_rxq--;
          }
        }
      }
      CI_IOCTL_SETARG((int*)arg, bytes_in_rxq);
    }
    break;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
  case SIOCOUTQNSD:
    {
      CI_BUILD_ASSERT(TIOCOUTQ == SIOCOUTQ);
      int outq_bytes = 0;

      if( !CI_IOCTL_ARG_OK(int, arg) )
        goto fail_fault;
      if( s->b.state == CI_TCP_LISTEN )
        goto fail_inval;

      /* In SYN_SENT the answer stays 0. */
      if( s->b.state != CI_TCP_SYN_SENT ) {

        /* TIOCOUTQ counts all unacknowledged data, so includes retrans
         * queue.  SIOCOUTQNSD counts from snd_nxt instead of snd_una. */
        if( request == TIOCOUTQ )
          outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_una(ts));
        else
          outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_nxt(ts));
      }
      CI_IOCTL_SETARG((int*)arg, outq_bytes);
    }
    break;

  case SIOCATMARK:
    {
      if( !CI_IOCTL_ARG_OK(int, arg) )
        goto fail_fault;

      /* return true, if we are at the out-of-band byte; the default
       * answer, used for listening sockets and when the urgent pointer is
       * not valid, is 0 */
      CI_IOCTL_SETARG((int*)arg, 0);
      if( s->b.state != CI_TCP_LISTEN ) {
        int readnxt;

        readnxt = SEQ_SUB(tcp_rcv_nxt(ts), tcp_rcv_usr(ts));
        /* NOTE(review): presumably this steps back over a received FIN
         * when the state no longer accepts data -- confirm. */
        if( ~ts->s.b.state & CI_TCP_STATE_ACCEPT_DATA )
          readnxt = SEQ_SUB(readnxt, 1);
        if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID )
          CI_IOCTL_SETARG((int*)arg, readnxt == tcp_rcv_up(ts));
        LOG_URG(log(NTS_FMT "SIOCATMARK atmark=%d readnxt=%u rcv_up=%u%s",
                    NTS_PRI_ARGS(ep->netif, ts), readnxt == tcp_rcv_up(ts),
                    readnxt, tcp_rcv_up(SOCK_TO_TCP(ep->s)),
                    (tcp_urg_data(ts)&CI_TCP_URG_PTR_VALID)?"":" (invalid)"));
      }
      break;
    }

#ifndef __KERNEL__
  case FIOASYNC:
    /* Need to apply this to [fd] so that our fasync file-op will be
     * invoked.
     */
    rc = ci_sys_ioctl(fd, request, arg);
    break;

  case SIOCSPGRP:
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    /* Need to apply this to [fd] to get signal delivery to work.  However,
     * SIOCSPGRP is only supported on sockets, so we need to convert to
     * fcntl().
     */
    rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg));
    if( rc == 0 ) {
      /* fcntl() succeeded: let the common handler process the request
       * too. */
      rc = ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
    }
    else {
      CI_SET_ERROR(rc, -rc);
    }
    break;
#endif

  default:
    /* Everything else is handled by the protocol-independent code. */
    return ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
  }
/* Move priv file to the alien_ni stack.
 * Should be called with the locked priv stack and socket;
 * the function returns with this stack being unlocked.
 * If rc=0, it returns with alien_ni stack locked;
 * otherwise, both stacks are unlocked.
 * Socket is always unlocked on return.
 *
 *   priv      private data of the file whose endpoint is moved
 *   alien_ni  destination stack
 *
 * Returns 0 on success or a negative error:
 *   -EBUSY   the file is in an epoll list, the destination stack lock
 *            could not be obtained, or the new endpoint is already
 *            attached
 *   -EINVAL  the socket is not in a movable state, or the filters could
 *            not be prepared for the move
 *   -ENOMEM  no socket buffer or temporary memory available
 * Other values come from ci_netif_lock() or onload_alloc_file().
 */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));
  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in epoll list should not be moved, because waitq is already
   * in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack.  Both stack locks are never waited for at the
   * same time: while the second one is unavailable the first one is
   * dropped and re-taken, for a bounded number of attempts. */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* do not copy old_s->b.bufid
   * and other fields in stack address space */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    /* Links and timers keep the values of the freshly allocated socket. */
    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;

    /* Packet queues and packet pointers refer to the old stack and must
     * start out empty in the new one. */
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    /* This has to be reset in mid_ts: anything written to new_ts here is
     * overwritten by the structure copy below. */
    mid_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif

    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return  **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep, and registering the
   * the new.  However, the notifications shouldn't be in use for sockets
   * that are in a state that can be moved, so this shouldn't be a problem.
   */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);

  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0)
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }

  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

  /* Error unwinding: each label undoes one more step and falls through to
   * the next one. */
fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert old filters because hw filters
   * are already here (in case of accepted socket) or not needed.
   * We have not removed old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
fail2:
  ci_netif_unlock(alien_ni);
fail1:
  ci_netif_unlock(&old_thr->netif);
fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
/* Fill in a tcp_info-style report for a TCP socket.
 *
 *   netif  stack the socket belongs to (used for time conversions)
 *   s      socket to report on
 *   info   output; zeroed first, then filled in
 *
 * For a listening socket only the state, ato and rcv_mss fields are set.
 * Fields that are left at zero have no source of data here.
 * Always returns 0.
 */
static int ci_tcp_info_get(ci_netif* netif, ci_sock_cmn* s,
                           struct ci_tcp_info* info)
{
  ci_iptime_t t_now = ci_ip_time_now(netif);
  ci_tcp_state* tcp;

  memset(info, 0, sizeof(*info));

  info->tcpi_state = ci_sock_states_linux_map[CI_TCP_STATE_NUM(s->b.state)];
  info->tcpi_ato =
    ci_ip_time_ticks2ms(netif, netif->state->conf.tconst_delack) * 1000;
  info->tcpi_rcv_mss = 536; /* no way to get the actual value */
  /* There is no way to get any of these, so they stay zero: tcpi_backoff,
   * tcpi_sacked, tcpi_lost, tcpi_fackets, tcpi_reordering,
   * tcpi_last_ack_sent, tcpi_last_ack_recv. */

  if( s->b.state == CI_TCP_LISTEN )
    return 0;

  tcp = SOCK_TO_TCP(s);

  info->tcpi_pmtu = tcp->pmtus.pmtu;
  info->tcpi_ca_state = sock_congstate_linux_map[tcp->congstate];
  info->tcpi_retransmits = tcp->retransmits;
  info->tcpi_probes = tcp->ka_probes;

  /* Negotiated options; tcpi_options starts at zero from the memset. */
  if( tcp->tcpflags & CI_TCPT_FLAG_TSO )
    info->tcpi_options |= CI_TCPI_OPT_TIMESTAMPS;
  if( tcp->tcpflags & CI_TCPT_FLAG_ECN )
    info->tcpi_options |= CI_TCPI_OPT_ECN;
  if( tcp->tcpflags & CI_TCPT_FLAG_SACK )
    info->tcpi_options |= CI_TCPI_OPT_SACK;
  if( tcp->tcpflags & CI_TCPT_FLAG_WSCL ) {
    info->tcpi_options |= CI_TCPI_OPT_WSCALE;
    info->tcpi_snd_wscale = tcp->snd_wscl;
    info->tcpi_rcv_wscale = tcp->rcv_wscl;
  }

  info->tcpi_rto = ci_ip_time_ticks2ms(netif, tcp->rto) * 1000;
  info->tcpi_snd_mss = tcp->eff_mss;
  info->tcpi_unacked = tcp->acks_pending & CI_TCP_ACKS_PENDING_MASK;
#if CI_CFG_TCP_SOCK_STATS
  info->tcpi_retrans = tcp->stats_cumulative.count.tx_retrans_pkt;
#endif
#if CI_CFG_CONGESTION_WINDOW_VALIDATION
  info->tcpi_last_data_sent =
    ci_ip_time_ticks2ms(netif, t_now - tcp->t_last_sent);
#else
  info->tcpi_last_data_sent = 0;
#endif
  info->tcpi_last_data_recv = ci_ip_time_ticks2ms(netif, t_now - tcp->tspaws);

  info->tcpi_rtt = ci_ip_time_ticks2ms(netif, tcp->sa) * 1000 / 8;
  info->tcpi_rttvar = ci_ip_time_ticks2ms(netif, tcp->sv) * 1000 / 4;
  info->tcpi_rcv_ssthresh = tcp->ssthresh;

  /* ssthresh and cwnd are reported as multiples of the effective MSS. */
  if( tcp_eff_mss(tcp) == 0 ) {
    /* non-initialised connection */
    info->tcpi_snd_ssthresh = 0;
    info->tcpi_snd_cwnd = 0;
  }
  else {
    info->tcpi_snd_ssthresh = tcp->ssthresh / tcp_eff_mss(tcp);
    info->tcpi_snd_cwnd = tcp->cwnd / tcp_eff_mss(tcp);
  }
  info->tcpi_advmss = tcp->amss;

  return 0;
}
/* Apply a socket option to a TCP socket.
 *
 *   ep       socket the option is set on
 *   fd       descriptor of the socket
 *   level    option level (SOL_SOCKET, IPPROTO_IP, ...)
 *   optname  option name within [level]
 *   optval   option value
 *   optlen   length of [optval]; 0 is treated as an int value of 0
 *
 * At SOL_SOCKET level only SO_KEEPALIVE is handled here; every other
 * option is passed to ci_set_sol_socket().
 */
static int ci_tcp_setsockopt_lk(citp_socket* ep, ci_fd_t fd, int level,
                                int optname, const void* optval,
                                socklen_t optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  ci_tcp_socket_cmn* c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  int zeroval = 0;
  int rc;

  /* ?? what to do about optval and optlen checking
  ** Kernel can raise EFAULT, here we are a little in the dark.
  ** Note: If the OS sock is sync'd then we get this checking for free.
  */

  if (optlen == 0) {
    /* Match kernel behaviour: if length is 0, it treats the value as 0;
     * and some applications rely on this.
     */
    optval = &zeroval;
    optlen = sizeof(zeroval);
  }

  /* If you're adding to this please remember to look in common_sockopts.c
   * and decide if the option is common to all protocols. */

  if(level == SOL_SOCKET) {
    switch(optname) {
    case SO_KEEPALIVE:
      /* Over-ride the default common handler.
       * Enable sending of keep-alive messages */
      /* The fail_inval label is defined further down in this function. */
      if( (rc = opt_not_ok(optval, optlen, unsigned)) )
        goto fail_inval;

      if( *(unsigned*) optval ) {
        /* Remember whether keepalive was already on, so that the timer is
         * only started on an off -> on transition. */
        unsigned prev_flags = s->s_flags;
        s->s_flags |= CI_SOCK_FLAG_KALIVE;
        /* Set KEEPALIVE timer only if we are not in
        ** CLOSE or LISTENING state. */
        if( s->b.state != CI_TCP_CLOSED && s->b.state != CI_TCP_LISTEN &&
            !(prev_flags & CI_SOCK_FLAG_KALIVE) ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          LOG_TV(log("%s: "NSS_FMT" run KEEPALIVE timer from setsockopt()",
                     __FUNCTION__, NSS_PRI_ARGS(netif, s)));
          ci_assert(ts->ka_probes == 0);
          ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
        }
      }
      else {
        /* Switching keepalive off: clear the flag and, except for
         * listening sockets, the timer and the probe counter as well. */
        s->s_flags &=~ CI_SOCK_FLAG_KALIVE;
        if( s->b.state != CI_TCP_LISTEN ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          ci_tcp_kalive_check_and_clear(netif, ts);
          ts->ka_probes = 0;
        }
      }
      break;

    default:
      {
        /* Common socket level options */
        return ci_set_sol_socket(netif, s, optname, optval, optlen);
      }
    }
  }
  else if( level == IPPROTO_IP ) {
/* Read a socket option from a TCP socket.
 *
 *   ep       socket being queried
 *   fd       descriptor of the socket ([fd] is unused in the kernel
 *            version)
 *   level    option level
 *   optname  option name within [level]
 *   optval   buffer receiving the value
 *   optlen   in: size of [optval]; out: number of bytes stored
 *
 * SOL_SOCKET, IPPROTO_IP and (when enabled) IPPROTO_IPV6 requests are
 * delegated to the respective handlers.  IPPROTO_TCP options are answered
 * here; unknown ones fail with ENOPROTOOPT through RET_WITH_ERRNO().
 * For TCP_INFO at most *optlen bytes are stored and *optlen is updated to
 * the number of bytes actually copied.
 */
int ci_tcp_getsockopt(citp_socket* ep, ci_fd_t fd, int level,
                      int optname, void *optval, socklen_t *optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  ci_tcp_socket_cmn *c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  unsigned u = 0;

  /* NOTE: The setsockopt() call is reflected into the os socket to
   * keep the two in sync - it's assumed that we know everything
   * to allow us to give good answers here - and therefore we don't
   * bother the os with the get call */

  /* ?? what to do about optval and optlen checking
   * Kernel can raise EFAULT, here we are a little in the dark.
   *  - sockcall_intercept.c checks that optlen is non-NULL and if *optlen
   *    is non-zero that optval is non-NULL, returning EFAULT if false
   */

  if(level == SOL_SOCKET) {
    /* Common SOL_SOCKET handler */
    return ci_get_sol_socket(netif, s, optname, optval, optlen);

  } else if (level == IPPROTO_IP) {
    /* IP level options valid for TCP */
    return ci_get_sol_ip(ep, s, fd, optname, optval, optlen);

#if CI_CFG_FAKE_IPV6
  } else if (level == IPPROTO_IPV6 && s->domain == AF_INET6) {
    /* IP6 level options valid for TCP */
    return ci_get_sol_ip6(ep, s, fd, optname, optval, optlen);
#endif

  } else if (level == IPPROTO_TCP) {
    /* TCP level options valid for TCP */
    switch(optname){
    case TCP_NODELAY:
      /* gets status of TCP Nagle algorithm */
      u = ((s->s_aflags & CI_SOCK_AFLAG_NODELAY) != 0);
      goto u_out;
    case TCP_MAXSEG:
      /* gets the MSS size for this connection */
      if ((s->b.state & CI_TCP_STATE_TCP_CONN)) {
        u = tcp_eff_mss(SOCK_TO_TCP(s));
      } else {
        u = 536;
      }
      goto u_out;
# ifdef TCP_CORK
    case TCP_CORK:
      /* don't send partial frames, all partial frames sent
      ** when the option is cleared */
      u = ((s->s_aflags & CI_SOCK_AFLAG_CORK) != 0);
      goto u_out;
# endif

    case TCP_KEEPIDLE:
      {
        /* idle time for keepalives */
        u = (unsigned) c->t_ka_time_in_secs;
      }
      goto u_out;
    case TCP_KEEPINTVL:
      {
        /* time between keepalives */
        u = (unsigned) c->t_ka_intvl_in_secs;
      }
      goto u_out;
    case TCP_KEEPCNT:
      {
        /* number of keepalives before giving up */
        u = c->ka_probe_th;
      }
      goto u_out;
    case TCP_INFO:
      {
        /* struct tcp_info to be filled.  The report is built in a local
         * object and then copied out, so that no more than *optlen bytes
         * are ever written to the caller's buffer. */
        struct ci_tcp_info info;
        socklen_t copy_len;
        int info_rc = ci_tcp_info_get(netif, s, &info);

        if( info_rc != 0 )
          return info_rc;
        copy_len = *optlen < sizeof(info) ? *optlen
                                          : (socklen_t) sizeof(info);
        if( copy_len > 0 )
          memcpy(optval, &info, copy_len);
        *optlen = copy_len;
        return 0;
      }
    case TCP_DEFER_ACCEPT:
      {
        u = 0;
        if( c->tcp_defer_accept != OO_TCP_DEFER_ACCEPT_OFF ) {
          /* Initial RTO rounded to whole seconds, doubled once per
           * configured retry. */
          u = ci_ip_time_ticks2ms(netif, NI_CONF(netif).tconst_rto_initial);
          u = ((u + 500) / 1000) << c->tcp_defer_accept;
        }
        goto u_out;
      }
    case TCP_QUICKACK:
      {
        u = 0;
        if( s->b.state & CI_TCP_STATE_TCP_CONN ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          u = ci_tcp_is_in_faststart(ts);
        }
        goto u_out;
      }
    default:
      LOG_TC( log(LPF "getsockopt: unimplemented or bad option: %i",
                  optname));
      RET_WITH_ERRNO(ENOPROTOOPT);
    }
  } else {
    SOCKOPT_RET_INVALID_LEVEL(s);
  }

  return 0;

 u_out:
  /* Common exit for options whose value is the unsigned [u]. */
  return ci_getsockopt_final(optval, optlen, level, &u, sizeof(u));
}
/* Put a TCP socket into the listening state.
 *
 *   ep       socket to listen on
 *   fd       descriptor of the socket (used for the OS socket in the
 *            user-level build)
 *   backlog  requested accept-queue length; a negative value selects the
 *            stack's max_ep_bufs, and values below acceptq_min_backlog are
 *            raised to that minimum
 *
 * Returns 0 on success and CI_SOCKET_HANDOVER when the socket has to be
 * handed over.  On failure the socket is reverted to a non-listening
 * state; the driver build then returns rc and the user-level build returns
 * CI_SOCKET_ERROR.  A socket that is neither closed nor listening fails
 * with EINVAL.
 *
 * ?? error handling on possible fails not handled robustly...
 * ?? Need to check port number is valid TODO
 *
 * \todo If not bound then we have to be listening on all interfaces.
 *       It's likely that we won't be coming through here as we have to
 *       listen on the OS socket too!
 */
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog)
{
  ci_tcp_state* ts;
  ci_tcp_socket_listen* tls;
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  unsigned ul_backlog = backlog;
  int rc;
  oo_p sp;

  LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__, SK_PRI_ARGS(ep),
             backlog));
  CHECK_TEP(ep);

  if( NI_OPTS(netif).tcp_listen_handover )
    return CI_SOCKET_HANDOVER;
  if( !NI_OPTS(netif).tcp_server_loopback) {
    /* We should handover if the socket is bound to alien address. */
    if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
      return CI_SOCKET_HANDOVER;
  }

  /* The sign has to be tested on the signed argument: the unsigned copy
   * can never compare below zero. */
  if( backlog < 0 )
    ul_backlog = NI_OPTS(netif).max_ep_bufs;
  else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog )
    ul_backlog = NI_OPTS(netif).acceptq_min_backlog;

  if( s->b.state == CI_TCP_LISTEN ) {
    /* Already listening: only the backlog changes. */
    tls = SOCK_TO_TCP_LISTEN(s);
    tls->acceptq_max = ul_backlog;
    ci_tcp_helper_listen_os_sock(fd, ul_backlog);
    return 0;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    CI_SET_ERROR(rc, EINVAL);
    return rc;
  }

  ts = SOCK_TO_TCP(s);

  /* Bug 3376: if socket used for a previous, failed, connect then the error
   * numbers will not be as expected.  Only seen when not using listening
   * netifs (as moving the EP to the new netif resets them).
   */
  ts->s.tx_errno = EPIPE;
  ts->s.rx_errno = ENOTCONN;

  /* fill in address/ports and all TCP state */
  if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) {
    ci_uint16 source_be16;

    /* They haven't previously done a bind, so we need to choose
     * a port.  As we haven't been given a hint we let the OS choose. */
    source_be16 = 0;
    rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32,
                   &source_be16);
    if (CI_LIKELY( rc==0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u",
                 LNT_PRI_ARGS(ep->netif, ts),
                 ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    } else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      return rc;
    }
  }

  ci_sock_lock(netif, &ts->s.b);
  ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);

  tcp_raddr_be32(tls) = 0u;
  tcp_rport_be16(tls) = 0u;

  ci_assert_equal(tls->s.tx_errno, EPIPE);
  ci_assert_equal(tls->s.rx_errno, ENOTCONN);

  /* setup listen timer - do it before the first return statement,
   * because __ci_tcp_listen_to_normal() will be called on error path. */
  if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
    sp = TS_OFF(netif, tls);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid));
    ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq");
    tls->listenq_tid.param1 = S_SP(tls);
    tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN;
  }

  rc = ci_tcp_listen_init(netif, tls);
  ci_sock_unlock(netif, &ts->s.b);
  if( rc != 0 ) {
    CI_SET_ERROR(rc, -rc);
    goto listen_fail;
  }
  tls->acceptq_max = ul_backlog;

  CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats));

  /* install all the filters needed for this connection
   *    - tcp_laddr_be32(ts) = 0 for IPADDR_ANY
   *
   *  TODO: handle BINDTODEVICE by setting phys_port parameter to correct
   *        physical L5 port index
   *  TODO: handle REUSEADDR by setting last parameter to TRUE
   */
  if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
#ifdef ONLOAD_OFE
    if( netif->ofe != NULL ) {
      tls->s.ofe_code_start = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_LISTEN,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
      tls->ofe_promote = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
    }
#endif
    rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice,
                               OO_SP_NULL);
    if( rc == -EFILTERSSOME ) {
      /* Only some of the filters were installed: either carry on anyway
       * or give them all back and fail. */
      if( CITP_OPTS.no_fail )
        rc = 0;
      else {
        ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
        rc = -ENOBUFS;
      }
    }
    ci_assert_nequal(rc, -EFILTERSSOME);
    VERB(ci_log("%s: set_filters  returned %d", __FUNCTION__, rc));
    if (rc < 0) {
      CI_SET_ERROR(rc, -rc);
      goto post_listen_fail;
    }
  }

  /*
   * Call of system listen() is required for listen any, local host
   * communications server and multi-homed server (to accept connections
   * to L5 assigned address(es), but incoming from other interfaces).
   */
#ifdef __ci_driver__
  {
    rc = efab_tcp_helper_listen_os_sock( netif2tcp_helper_resource(netif),
                                         S_SP(tls), backlog);
  }
#else
  rc = ci_tcp_helper_listen_os_sock(fd, backlog);
#endif
  if ( rc < 0 ) {
    /* clear the filter we've just set */
    ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
    goto post_listen_fail;
  }
  return 0;

 post_listen_fail:
  ci_tcp_listenq_drop_all(netif, tls);

 listen_fail:
  /* revert TCP state to a non-listening socket format */
  __ci_tcp_listen_to_normal(netif, tls);
  /* Above function sets orphan flag but we are attached to an FD. */
  ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
#ifdef __ci_driver__
  return rc;
#else
  return CI_SOCKET_ERROR;
#endif
}