static void ci_udp_pkt_to_zc_msg(ci_netif* ni, ci_ip_pkt_fmt* pkt,
                                 struct onload_zc_msg* zc_msg)
{
  int i, bytes_left = pkt->pf.udp.pay_len;
  ci_ip_pkt_fmt* frag;
  ci_ip_pkt_fmt* handle_frag;

  handle_frag = frag = pkt;
  i = 0;
  ci_assert_nequal(zc_msg->iov, NULL);

  /* Ignore first frag if zero length and there is another frag, but
   * still pass the zero-length buffer as the onload_zc_handle so it
   * will get freed correctly
   */
  if( oo_offbuf_left(&frag->buf) == 0 && OO_PP_NOT_NULL(frag->frag_next) )
    frag = PKT_CHK_NNL(ni, frag->frag_next);

  do {
    zc_msg->iov[i].iov_len = CI_MIN(oo_offbuf_left(&frag->buf), bytes_left);
    zc_msg->iov[i].iov_base = oo_offbuf_ptr(&frag->buf);
    zc_msg->iov[i].buf = (onload_zc_handle)handle_frag;
    zc_msg->iov[i].iov_flags = 0;

    bytes_left -= zc_msg->iov[i].iov_len;
    ++i;

    if( OO_PP_IS_NULL(frag->frag_next) ||
        (i == CI_UDP_ZC_IOVEC_MAX) ||
        (bytes_left == 0) )
      break;

    frag = PKT_CHK_NNL(ni, frag->frag_next);
    handle_frag = frag;
  } while( 1 );

  zc_msg->msghdr.msg_iovlen = i;
}
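
/* Illustrative only (not part of the original source): a minimal sketch of
 * how a zero-copy receive callback might walk the iovec array that
 * ci_udp_pkt_to_zc_msg() fills in, here just totalling the payload bytes of
 * one datagram.  The helper name is hypothetical; wrapped in #if 0 so it
 * does not affect the build.
 */
#if 0
static size_t example_zc_msg_payload_bytes(const struct onload_zc_msg* zc_msg)
{
  size_t total = 0;
  int i;
  for( i = 0; i < zc_msg->msghdr.msg_iovlen; ++i )
    total += zc_msg->iov[i].iov_len;
  return total;
}
#endif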
void ci_sock_cmn_timestamp_q_enqueue(ci_netif* ni, ci_sock_cmn* s,
                                     ci_ip_pkt_fmt* pkt)
{
  ci_ip_pkt_queue* qu = &s->timestamp_q;
  oo_pkt_p prev_head = qu->head;

  /* This part is effectively ci_ip_queue_enqueue(ni, &s->timestamp_q, p),
   * but inlined to allow using the tsq_next field.
   */
  pkt->tsq_next = OO_PP_NULL;
  if( ci_ip_queue_is_empty(qu) ) {
    ci_assert(OO_PP_IS_NULL(qu->head));
    qu->head = OO_PKT_P(pkt);
  }
  else {
    ci_assert(OO_PP_NOT_NULL(qu->head));
    /* This assumes the netif lock is held, so use ci_ip_queue_enqueue_nnl()
     * if it's not. */
    PKT(ni, qu->tail)->tsq_next = OO_PKT_P(pkt);
  }
  qu->tail = OO_PKT_P(pkt);
  qu->num++;

  if( OO_PP_IS_NULL(prev_head) ) {
    ci_assert(OO_PP_IS_NULL(s->timestamp_q_extract));
    s->timestamp_q_extract = qu->head;
  }
  else {
    ci_sock_cmn_timestamp_q_reap(ni, s);
  }

  /* Tells post-poll loop to put socket on the [reap_list]. */
  s->b.sb_flags |= CI_SB_FLAG_RX_DELIVERED;
}
static int oo_copy_pkt_to_iovec_no_adv(ci_netif* ni, const ci_ip_pkt_fmt* pkt,
                                       ci_iovec_ptr* piov, int bytes_to_copy)
{
  /* Copy data from [pkt] to [piov], following [pkt->frag_next] as
   * necessary.  Does not modify [pkt].  May or may not advance [piov].
   * The packet must contain at least [bytes_to_copy] of data in the
   * [pkt->buf].  [piov] may contain an arbitrary amount of space.
   *
   * Returns number of bytes copied on success, or -EFAULT otherwise.
   */
  int n, pkt_left, pkt_off = 0;
  int bytes_copied = 0;

  while( 1 ) {
    pkt_left = oo_offbuf_left(&pkt->buf) - pkt_off;
    n = CI_MIN(pkt_left, CI_IOVEC_LEN(&piov->io));
    n = CI_MIN(n, bytes_to_copy);
    if(CI_UNLIKELY( do_copy(CI_IOVEC_BASE(&piov->io),
                            oo_offbuf_ptr(&pkt->buf) + pkt_off, n) != 0 ))
      return -EFAULT;

    bytes_copied += n;
    pkt_off += n;
    if( n == bytes_to_copy )
      return bytes_copied;

    bytes_to_copy -= n;
    if( n == pkt_left ) {
      /* Caller guarantees that packet contains at least [bytes_to_copy]. */
      ci_assert(OO_PP_NOT_NULL(pkt->frag_next));
      ci_iovec_ptr_advance(piov, n);
      pkt = PKT_CHK_NNL(ni, pkt->frag_next);
      pkt_off = 0;
      /* We're unlikely to hit end-of-pkt-buf and end-of-iovec at the same
       * time, and if we do, just go round the loop again.
       */
      continue;
    }

    ci_assert_equal(n, CI_IOVEC_LEN(&piov->io));
    if( piov->iovlen == 0 )
      return bytes_copied;
    piov->io = *piov->iov++;
    --piov->iovlen;
  }
}
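
/* Illustrative only (not part of the original source): because
 * oo_copy_pkt_to_iovec_no_adv() leaves [pkt] untouched, it suits
 * MSG_PEEK-style reads where the datagram must stay on the receive queue.
 * A minimal sketch of such a caller, assuming the datagram length is
 * available in pkt->pf.udp.pay_len (as used elsewhere in this file); the
 * helper name is hypothetical and the block is #if 0'd out of the build.
 */
#if 0
static int example_peek_datagram(ci_netif* ni, const ci_ip_pkt_fmt* pkt,
                                 ci_iovec_ptr* piov)
{
  int rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, pkt->pf.udp.pay_len);
  /* rc is the number of bytes copied, or -EFAULT if the destination buffer
   * could not be written; either way the packet itself remains queued. */
  return rc;
}
#endif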
static int efab_file_move_supported_tcp(ci_netif *ni, ci_tcp_state *ts)
{
#if CI_CFG_FD_CACHING
  /* Don't support moving cached sockets for now */
  if( ci_tcp_is_cached(ts) ||
      !ci_ni_dllist_is_self_linked(ni, &ts->epcache_link) )
    return false;
#endif

  /* TCP closed: supported */
  if( ts->s.b.state == CI_TCP_CLOSED )
    return true;

  /* Everything else except connected TCP states is unsupported */
  if( !(ts->s.b.state & CI_TCP_STATE_TCP_CONN) )
    return false;
  if( ts->local_peer != OO_SP_NULL )
    return false;
  if( !(ts->tcpflags & CI_TCPT_FLAG_PASSIVE_OPENED) )
    return false;

  /* Queued send/retransmit data or pending timers are not supported.
   * NB: retrans_ptr is uninitialised when retrans has not been used yet,
   * so do not check for !OO_PP_IS_NULL(ts->retrans_ptr).
   */
  if( !ci_ip_queue_is_empty(&ts->send) ||
      ts->send_prequeue != OO_PP_ID_NULL ||
      oo_atomic_read(&ts->send_prequeue_in) != 0 ||
      !ci_ip_queue_is_empty(&ts->retrans) ||
      ci_ip_timer_pending(ni, &ts->rto_tid) ||
      ci_ip_timer_pending(ni, &ts->zwin_tid) ||
#if CI_CFG_TAIL_DROP_PROBE
      ci_ip_timer_pending(ni, &ts->taildrop_tid) ||
#endif
      ci_ip_timer_pending(ni, &ts->cork_tid) )
    return false;

  /* Sockets with allocated templates are not supported */
  if( OO_PP_NOT_NULL(ts->tmpl_head) )
    return false;

  return true;
}
void ci_udp_all_fds_gone(ci_netif* netif, oo_sp sock_id, int do_free)
{
  /* All process references to this socket have gone.  So we should
   * shutdown() if necessary, and arrange for all resources to eventually
   * get cleaned up.
   *
   * This is called by the driver only.  [sock_id] is trusted.
   */
  ci_udp_state* us = SP_TO_UDP(netif, sock_id);

  ci_assert(ci_netif_is_locked(netif));
  ci_assert(us->s.b.state == CI_TCP_STATE_UDP);

  LOG_UC(ci_log("ci_udp_all_fds_gone: "NTS_FMT, NTS_PRI_ARGS(netif, us)));

  if( UDP_GET_FLAG(us, CI_UDPF_FILTERED) ) {
    UDP_CLR_FLAG(us, CI_UDPF_FILTERED);
    ci_tcp_ep_clear_filters(netif, S_SP(us), 0);
  }
  ci_udp_recv_q_drop(netif, &us->recv_q);
  ci_ni_dllist_remove(netif, &us->s.reap_link);

  if( OO_PP_NOT_NULL(us->zc_kernel_datagram) ) {
    ci_netif_pkt_release_rx(netif, PKT_CHK(netif, us->zc_kernel_datagram));
    us->zc_kernel_datagram = OO_PP_NULL;
    us->zc_kernel_datagram_count = 0;
  }

  /* Only free state if no outstanding tx packets: otherwise it'll get
   * freed by the tx completion event.
   */
  if( do_free ) {
    if( us->tx_count == 0 )
      ci_udp_state_free(netif, us);
    else
      CITP_STATS_NETIF_INC(netif, udp_free_with_tx_active);
  }
}
static void efab_ip_queue_copy(ci_netif *ni_to, ci_ip_pkt_queue *q_to,
                               ci_netif *ni_from, ci_ip_pkt_queue *q_from)
{
  ci_ip_pkt_fmt *pkt_to, *pkt_from;
  oo_pkt_p pp;

  ci_ip_queue_init(q_to);
  if( q_from->num == 0 )
    return;

  ci_assert( OO_PP_NOT_NULL(q_from->head) );
  pp = q_from->head;
  do {
    pkt_from = PKT_CHK(ni_from, pp);
    pkt_to = ci_netif_pkt_alloc(ni_to);
    /* Copy everything from [pay_len] to the end of the packet buffer;
     * fields before [pay_len], which identify the destination buffer
     * itself, are left untouched. */
    memcpy(&pkt_to->pay_len, &pkt_from->pay_len,
           CI_CFG_PKT_BUF_SIZE - CI_MEMBER_OFFSET(ci_ip_pkt_fmt, pay_len));
    ci_ip_queue_enqueue(ni_to, q_to, pkt_to);
    if( pp == q_from->tail )
      break;
    pp = pkt_from->next;
  } while(1);
}
static int ci_udp_ioctl_locked(ci_netif* ni, ci_udp_state* us,
                               ci_fd_t fd, int request, void* arg)
{
  int rc;

  switch( request ) {
  case FIONREAD: /* synonym of SIOCINQ */
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      return -EFAULT;
    rc = 1;
    if( rc ) {
      /* Return the size of the datagram at the head of the receive queue.
       *
       * Careful: extract side of receive queue is owned by sock lock,
       * which we don't have.  However, freeing of bufs is owned by netif
       * lock, which we do have.  So we're safe so long as we only read
       * [extract] once.
       */
      oo_pkt_p extract = us->recv_q.extract;
      if( OO_PP_NOT_NULL(extract) ) {
        ci_ip_pkt_fmt* pkt = PKT_CHK(ni, extract);
        if( (pkt->rx_flags & CI_PKT_RX_FLAG_RECV_Q_CONSUMED) &&
            OO_PP_NOT_NULL(pkt->udp_rx_next) )
          pkt = PKT_CHK(ni, pkt->udp_rx_next);
        if( !(pkt->rx_flags & CI_PKT_RX_FLAG_RECV_Q_CONSUMED) ) {
          *(int*) arg = pkt->pf.udp.pay_len;
          return 0;
        }
      }
    }
    /* Nothing in the userlevel receive queue, so take the value returned
     * by the O/S socket.
     */
    if( !(us->s.os_sock_status & OO_OS_STATUS_RX) ) {
      *(int*)arg = 0;
      return 0;
    }
    goto sys_ioctl;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      return -EFAULT;
    *(int*)arg = us->tx_count + oo_atomic_read(&us->tx_async_q_level);
    return 0;

  case SIOCGSTAMP:
#if defined( __linux__) && defined(__KERNEL__)
    /* The following code assumes the width of the timespec and timeval
     * fields */
# error "Need to consider 32-on-64 bit setting of timeval arg"
#endif
    if( ! (us->udpflags & CI_UDPF_LAST_RECV_ON) )
      return oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL);
    return ci_udp_ioctl_siocgstamp(ni, us, arg, 1);

  case SIOCGSTAMPNS:
    if( ! (us->udpflags & CI_UDPF_LAST_RECV_ON) )
      return oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL);
    return ci_udp_ioctl_siocgstamp(ni, us, arg, 0);
  }

  return ci_udp_ioctl_slow(ni, us, fd, request, arg);

 sys_ioctl:
  return oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL);
}
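
/* Illustrative only (not part of the original source): from the
 * application's point of view the FIONREAD/SIOCINQ and TIOCOUTQ/SIOCOUTQ
 * cases above are reached through the standard ioctl() interface
 * (requires <sys/ioctl.h>), e.g.:
 */
#if 0
  int queued_rx, queued_tx;
  ioctl(fd, FIONREAD, &queued_rx);  /* size of datagram at head of rx queue */
  ioctl(fd, TIOCOUTQ, &queued_tx);  /* current transmit queue level */
#endif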
static int ci_udp_recvmsg_socklocked_slowpath(ci_udp_iomsg_args* a,
                                              ci_msghdr* msg,
                                              ci_iovec_ptr *piov, int flags)
{
  int rc = 0;
  ci_netif* ni = a->ni;
  ci_udp_state* us = a->us;

  if(CI_UNLIKELY( ni->state->rxq_low ))
    ci_netif_rxq_low_on_recv(ni, &us->s,
                             1 /* assume at least one pkt freed */);

  /* In the kernel recv() with flags is not called, only read(), so flags
   * may only contain MSG_DONTWAIT. */
#ifdef __KERNEL__
  ci_assert_equal(flags, 0);
#endif

#ifndef __KERNEL__
  if( flags & MSG_ERRQUEUE_CHK ) {
    if( OO_PP_NOT_NULL(us->timestamp_q.extract) ) {
      ci_ip_pkt_fmt* pkt;
      struct timespec ts[3];
      struct cmsg_state cmsg_state;
      ci_udp_hdr* udp;
      int paylen;

      /* TODO is this necessary? - mirroring ci_udp_recvmsg_get() */
      ci_rmb();

      pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
      if( pkt->tx_hw_stamp.tv_sec == CI_PKT_TX_HW_STAMP_CONSUMED ) {
        if( OO_PP_IS_NULL(pkt->tsq_next) )
          goto errqueue_empty;
        us->timestamp_q.extract = pkt->tsq_next;
        pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
        ci_assert(pkt->tx_hw_stamp.tv_sec != CI_PKT_TX_HW_STAMP_CONSUMED);
      }

      udp = oo_ip_data(pkt);
      paylen = CI_BSWAP_BE16(oo_ip_hdr(pkt)->ip_tot_len_be16) -
               sizeof(ci_ip4_hdr) - sizeof(*udp);

      msg->msg_flags = 0;
      cmsg_state.msg = msg;
      cmsg_state.cm = msg->msg_control;
      cmsg_state.cmsg_bytes_used = 0;
      ci_iovec_ptr_init_nz(piov, msg->msg_iov, msg->msg_iovlen);

      memset(ts, 0, sizeof(ts));
      if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_RAW_HARDWARE ) {
        ts[2].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[2].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      if( (us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_SYS_HARDWARE) &&
          (pkt->tx_hw_stamp.tv_nsec & CI_IP_PKT_HW_STAMP_FLAG_IN_SYNC) ) {
        ts[1].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[1].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      ci_put_cmsg(&cmsg_state, SOL_SOCKET, ONLOAD_SCM_TIMESTAMPING,
                  sizeof(ts), &ts);

      oo_offbuf_set_start(&pkt->buf, udp + 1);
      oo_offbuf_set_len(&pkt->buf, paylen);
      rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, paylen);

      /* Mark this packet/timestamp as consumed */
      pkt->tx_hw_stamp.tv_sec = CI_PKT_TX_HW_STAMP_CONSUMED;

      ci_ip_cmsg_finish(&cmsg_state);
      msg->msg_flags |= MSG_ERRQUEUE_CHK;
      return rc;
    }
  errqueue_empty:
    /* ICMP is handled via OS, so get OS error */
    rc = oo_os_sock_recvmsg(ni, SC_SP(&us->s), msg, flags);
    if( rc < 0 ) {
      ci_assert(-rc == errno);
      return -1;
    }
    else
      return rc;
  }
#endif

  if( (rc = ci_get_so_error(&us->s)) != 0 ) {
    CI_SET_ERROR(rc, rc);
    return rc;
  }
  if( msg->msg_iovlen > 0 && msg->msg_iov == NULL ) {
    CI_SET_ERROR(rc, EFAULT);
    return rc;
  }
#if MSG_OOB_CHK
  if( flags & MSG_OOB_CHK ) {
    CI_SET_ERROR(rc, EOPNOTSUPP);
    return rc;
  }
#endif
#if CI_CFG_POSIX_RECV
  if( ! udp_lport_be16(us)) {
    LOG_UV(log("%s: -1 (ENOTCONN)", __FUNCTION__));
    CI_SET_ERROR(rc, ENOTCONN);
    return rc;
  }
#endif
  if( msg->msg_iovlen == 0 ) {
    /* We have a difference in behaviour from the Linux stack here.  When
    ** msg_iovlen is 0 Linux 2.4.21-15.EL does not set MSG_TRUNC when a
    ** datagram has non-zero length.  We do. */
    CI_IOVEC_LEN(&piov->io) = piov->iovlen = 0;
    return IOVLEN_WORKAROUND_RC_VALUE;
  }
  return 0;
}
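
/* Illustrative only (not part of the original source): a minimal sketch of
 * the user-level counterpart to the MSG_ERRQUEUE branch above, using the
 * standard Linux SO_TIMESTAMPING interface.  The cmsg payload is the usual
 * array of three struct timespec (software, transformed hardware, raw
 * hardware), matching the ts[3] filled in above.  Wrapped in #if 0 so it
 * does not affect the build.
 */
#if 0
#include <sys/socket.h>
#include <linux/errqueue.h>

static void example_read_tx_timestamp(int fd)
{
  char data[2048], control[512];
  struct iovec iov = { data, sizeof(data) };
  struct msghdr msg = { 0 };
  struct cmsghdr* cm;

  msg.msg_iov = &iov;
  msg.msg_iovlen = 1;
  msg.msg_control = control;
  msg.msg_controllen = sizeof(control);

  if( recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0 )
    return;  /* nothing on the error queue */

  for( cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm) )
    if( cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMPING ) {
      struct timespec* ts = (struct timespec*)CMSG_DATA(cm);
      /* ts[2] holds the raw hardware timestamp when
       * SOF_TIMESTAMPING_RAW_HARDWARE was requested. */
      (void)ts;
    }
}
#endif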
static int ci_zc_msg_to_udp_pkt(ci_netif* ni,
                                struct onload_zc_msg* zc_msg,
                                ci_ip_pkt_fmt* pkt)
{
  int i, n_buffers = pkt->n_buffers, dropped_bytes = 0;
  ci_ip_pkt_fmt* frag;
  ci_ip_pkt_fmt* prev_frag = NULL;

  frag = pkt;
  i = 0;
  ci_assert_nequal(zc_msg->iov, NULL);

  /* Ignore first frag if zero length and there is another frag */
  if( oo_offbuf_left(&frag->buf) == 0 && OO_PP_NOT_NULL(frag->frag_next) ) {
    frag = PKT_CHK_NNL(ni, frag->frag_next);
    --n_buffers;
  }

  CI_TEST(zc_msg->msghdr.msg_iovlen <= n_buffers);
  CI_TEST(zc_msg->msghdr.msg_iovlen > 0);

  do {
    CI_TEST(zc_msg->iov[i].buf == (onload_zc_handle)frag);
    CI_TEST(zc_msg->iov[i].iov_len != 0);
    if( i < zc_msg->msghdr.msg_iovlen ) {
      if( zc_msg->iov[i].iov_base != oo_offbuf_ptr(&frag->buf) ) {
        ci_assert_gt((char*)zc_msg->iov[i].iov_base,
                     oo_offbuf_ptr(&frag->buf));
        dropped_bytes += ((char*)zc_msg->iov[i].iov_base -
                          oo_offbuf_ptr(&frag->buf));
        oo_offbuf_set_start(&frag->buf, (char*)zc_msg->iov[i].iov_base);
      }
      if( zc_msg->iov[i].iov_len != oo_offbuf_left(&frag->buf) ) {
        ci_assert_lt(zc_msg->iov[i].iov_len, oo_offbuf_left(&frag->buf));
        dropped_bytes += (oo_offbuf_left(&frag->buf) -
                          zc_msg->iov[i].iov_len);
        oo_offbuf_set_len(&frag->buf, zc_msg->iov[i].iov_len);
      }
    }
    else {
      /* All remaining fragments should be discarded.  Should not get
       * here on first frag as msg_iovlen > 0
       */
      ci_assert(prev_frag != NULL);
      prev_frag->frag_next = OO_PP_NULL;
      /* remember frag so we can release it after counting dropped bytes */
      prev_frag = frag;
      do {
        dropped_bytes += oo_offbuf_left(&frag->buf);
        if( ++i == n_buffers )
          break;
        frag = PKT_CHK_NNL(ni, frag->frag_next);
      } while( 1 );
      ci_netif_pkt_release(ni, prev_frag);
      pkt->n_buffers -= (n_buffers - zc_msg->msghdr.msg_iovlen);
      return dropped_bytes;
    }

    ci_assert_lt(oo_offbuf_offset(&frag->buf) + oo_offbuf_left(&frag->buf),
                 CI_CFG_PKT_BUF_SIZE);

    if( ++i == n_buffers )
      break;

    prev_frag = frag;
    frag = PKT_CHK_NNL(ni, frag->frag_next);
  } while( 1 );

  return dropped_bytes;
}