/* Undo a previous connect() on a UDP socket.  Called from
 * ci_udp_connect_conclude() when the supplied address indicates a
 * disconnect (IS_DISCONNECTING()); the sys-level disconnect on [os_sock]
 * has already been done by the caller.
 *
 * Returns 0 on success, or the (negative) rc from
 * ci_udp_sys_getsockname() if re-reading the local address fails.
 */
static int ci_udp_disconnect(citp_socket* ep, ci_udp_state* us,
                             ci_fd_t os_sock)
{
  int rc;

  /* The OS-level disconnect may have changed the socket's local address;
   * re-sync our cached local address from the OS socket. */
  if( (rc = ci_udp_sys_getsockname(os_sock, ep)) != 0 ) {
    LOG_E(log(FNS_FMT "ERROR: sys_getsockname failed (%d)",
              FNS_PRI_ARGS(ep->netif, ep->s), errno));
    return rc;
  }

  /* Clear the remote address: the socket is unconnected again. */
  ci_udp_set_raddr(us, 0, 0);

  /* TODO: We shouldn't really clear then set here; instead we should
   * insert wildcard filters before removing the full-match ones.  ie. The
   * reverse of what we do in connect().  But probably not worth worrying
   * about in this case.
   */
  ci_udp_clr_filters(ep);

#ifdef ONLOAD_OFE
  /* Re-evaluate the Onload filter-engine program for the new 4-tuple. */
  if( ep->netif->ofe != NULL )
    us->s.ofe_code_start = ofe_socktbl_find(
                        ep->netif->ofe, OFE_SOCKTYPE_UDP,
                        udp_laddr_be32(us), udp_raddr_be32(us),
                        udp_lport_be16(us), udp_rport_be16(us));
#endif

  if( (rc = ci_udp_set_filters(ep, us)) != 0 )
    /* Not too bad -- should still get packets via OS socket. */
    LOG_U(log(FNS_FMT "ERROR: ci_udp_set_filters failed (%d)",
              FNS_PRI_ARGS(ep->netif, ep->s), errno));

  /* Socket is no longer connected as far as the control plane goes. */
  us->s.cp.sock_cp_flags &= ~OO_SCP_CONNECTED;
  return 0;
}
/* Install h/w filters matching this UDP socket's current addressing.
 *
 * Does nothing (and succeeds) when no local port is bound yet.  On
 * success sets the CI_UDPF_FILTERED flag and returns 0.  On failure the
 * error is reported via CI_SET_ERROR() and the (transformed) rc is
 * returned.
 */
static int ci_udp_set_filters(citp_socket* ep, ci_udp_state* us)
{
  int ret;

  ci_assert(ep);
  ci_assert(us);

  /* Nothing to filter on until a local port exists. */
  if( udp_lport_be16(us) == 0 )
    return 0;

  ret = ci_tcp_ep_set_filters(ep->netif, S_SP(us),
                              us->s.cp.so_bindtodevice, OO_SP_NULL);

  /* Partial filter installation: either tolerate it (no_fail) or back
   * out completely and report no buffer space. */
  if( ret == -EFILTERSSOME && CITP_OPTS.no_fail )
    ret = 0;
  else if( ret == -EFILTERSSOME ) {
    ci_tcp_ep_clear_filters(ep->netif, S_SP(us), 0);
    ret = -ENOBUFS;
  }

  if( ret >= 0 ) {
    UDP_SET_FLAG(us, CI_UDPF_FILTERED);
    return 0;
  }

  LOG_UC(log(FNS_FMT "ci_tcp_ep_set_filters failed (%d)",
             FNS_PRI_ARGS(ep->netif, ep->s), -ret));
  CI_SET_ERROR(ret, -ret);
  return ret;
}
/* Record a new local address/port on the UDP socket, updating both the
 * packet-header cache fields and the control-plane copy.
 *
 * The control-plane local address is recorded as 0 (wildcard) when the
 * given address is multicast; otherwise it mirrors [laddr_be32].
 */
void ci_udp_set_laddr(citp_socket* ep, unsigned laddr_be32, int lport_be16)
{
  ci_udp_state* udp = SOCK_TO_UDP(ep->s);

  udp_laddr_be32(udp) = laddr_be32;
  udp_lport_be16(udp) = (ci_uint16) lport_be16;

  /* A multicast laddr is stored as 0 in the control-plane copy. */
  udp->s.cp.ip_laddr_be32 =
      CI_IP_IS_MULTICAST(laddr_be32) ? 0 : laddr_be32;
  udp->s.cp.lport_be16 = lport_be16;
}
/* Render this socket's local and remote addr:port pairs as a string for
 * logging, e.g. "L[a.b.c.d:p] R[e.f.g.h:q]".
 *
 * Returns a pointer to a static buffer: the result is only valid until
 * the next call and the function is not thread-safe.
 */
static char * ci_udp_addr_str( ci_udp_state* us )
{
  static char buf[128];
  ci_assert(us);
  /* snprintf rather than sprintf: guarantee we can never overrun the
   * static buffer, whatever the address strings expand to. */
  snprintf( buf, sizeof(buf), "L[%s:%d] R[%s:%d]",
            ip_addr_str( udp_laddr_be32(us)),
            CI_BSWAP_BE16(udp_lport_be16(us)),
            ip_addr_str( udp_raddr_be32(us)),
            CI_BSWAP_BE16(udp_rport_be16(us)) );
  /* NOTE(review): both ip_addr_str() results must be live at the same
   * time here -- this relies on ip_addr_str() rotating through multiple
   * internal buffers; confirm that is (still) the case. */
  return buf;
}
/* initialise all the fields that we can in the UDP state structure.
** There are no IP options, no destination addresses, no ports */
static void ci_udp_state_init(ci_netif* netif, ci_udp_state* us)
{
  /* Common socket state first; everything below layers on top of it. */
  ci_sock_cmn_init(netif, &us->s, 1);

  /* IP_MULTICAST_LOOP is 1 by default, so we should not send multicast
   * unless specially permitted */
  if( ! NI_OPTS(netif).force_send_multicast )
    us->s.cp.sock_cp_flags |= OO_SCP_NO_MULTICAST;

  /* Poison.  (Debug builds only: fill the UDP-specific portion -- the
   * bytes after the common socket state -- with 0xf0 so that reads of
   * fields we forget to initialise below are noticeable.) */
  CI_DEBUG(memset(&us->s + 1, 0xf0, (char*) (us + 1)
                  - (char*) (&us->s + 1)));

  /*! \todo This should be part of sock_cmn reinit, but the comment to that
   * function suggests that it's possibly not a good plan to move it there */
#if CI_CFG_TIMESTAMPING
  ci_udp_recv_q_init(&us->timestamp_q);
#endif

  /*! \todo These two should really be handled in ci_sock_cmn_init() */

  /* Make sure we don't hit any state assertions. Can use
   * UDP_STATE_FROM_SOCKET_EPINFO() after this. */
  us->s.b.state = CI_TCP_STATE_UDP;

  /* Default send/receive buffer sizes come from the netif options. */
  us->s.so.sndbuf = NI_OPTS(netif).udp_sndbuf_def;
  us->s.so.rcvbuf = NI_OPTS(netif).udp_rcvbuf_def;

  /* Init the ip-caches (packet header templates). */
  ci_udp_hdrs_init(&us->s.pkt);
  ci_ip_cache_init(&us->ephemeral_pkt);
  ci_udp_hdrs_init(&us->ephemeral_pkt);

  /* No ports bound or connected yet. */
  udp_lport_be16(us) = 0;
  udp_rport_be16(us) = 0;

#if CI_CFG_ZC_RECV_FILTER
  /* No zero-copy receive filter registered. */
  us->recv_q_filter = 0;
  us->recv_q_filter_arg = 0;
#endif
  ci_udp_recv_q_init(&us->recv_q);

  /* Zero-copy kernel-datagram chain is empty. */
  us->zc_kernel_datagram = OO_PP_NULL;
  us->zc_kernel_datagram_count = 0;

  /* Async transmit queue starts empty. */
  us->tx_async_q = CI_ILL_END;
  oo_atomic_set(&us->tx_async_q_level, 0);
  us->tx_count = 0;

  /* Multicast loopback is on by default (mirrors IP_MULTICAST_LOOP). */
  us->udpflags = CI_UDPF_MCAST_LOOP;
  us->ip_pktinfo_cache.intf_i = -1;
  us->stamp = 0;

  memset(&us->stats, 0, sizeof(us->stats));
}
/* Conclude the EP's binding.  This function is abstracted from the
 * main bind code to allow implicit binds that occur when sendto() is
 * called on an OS socket.  [lport] and CI_SIN(addr)->sin_port do not
 * have to be the same value.
 *
 * Returns 0 on success, CI_SOCKET_HANDOVER if the socket should be
 * handed over to the OS, or an error rc from ci_udp_set_filters().
 */
static int ci_udp_bind_conclude(citp_socket* ep, const struct sockaddr* addr,
                                ci_uint16 lport )
{
  ci_udp_state* us;
  ci_uint32 addr_be32;
  int rc;

  CHECK_UEP(ep);
  ci_assert(addr != NULL);

  /* Addresses we cannot accelerate get handed over to the OS. */
  if( ci_udp_should_handover(ep, addr, lport) )
    goto handover;

  addr_be32 = ci_get_ip4_addr(ep->s->domain, addr);

  ci_udp_set_laddr(ep, addr_be32, lport);
  us = SOCK_TO_UDP(ep->s);
  /* Remember that an explicit (non-wildcard) local address was bound. */
  if( addr_be32 != 0 )
    us->s.cp.sock_cp_flags |= OO_SCP_LADDR_BOUND;

  /* reset any rx/tx that have taken place already */
  UDP_CLR_FLAG(us, CI_UDPF_EF_SEND);

#ifdef ONLOAD_OFE
  /* Look up the Onload filter-engine program for the new 4-tuple. */
  if( ep->netif->ofe != NULL )
    us->s.ofe_code_start = ofe_socktbl_find(
                        ep->netif->ofe, OFE_SOCKTYPE_UDP,
                        udp_laddr_be32(us), udp_raddr_be32(us),
                        udp_lport_be16(us), udp_rport_be16(us));
#endif

  /* OS source addrs have already been handed-over, so this must be one of
   * our src addresses.
   */
  rc = ci_udp_set_filters( ep, us);
  ci_assert( !UDP_GET_FLAG(us, CI_UDPF_EF_BIND) );
  /*! \todo FIXME isn't the port the thing to be testing here? */
  if( udp_laddr_be32(us) != INADDR_ANY_BE32 )
    UDP_SET_FLAG(us, CI_UDPF_EF_BIND);
  CI_UDPSTATE_SHOW_EP( ep );
  /* If filters could not be installed and no_fail is set, fall back to
   * the OS socket rather than failing the bind. */
  if( rc == CI_SOCKET_ERROR && CITP_OPTS.no_fail) {
    CITP_STATS_NETIF(++ep->netif->state->stats.udp_bind_no_filter);
    goto handover;
  }
  return rc;

 handover:
  LOG_UV(log("%s: "SK_FMT" HANDOVER", __FUNCTION__, SK_PRI_ARGS(ep)));
  return CI_SOCKET_HANDOVER;
}
/* Dump a human-readable summary of a UDP socket's state and statistics
 * via [logger]/[log_arg], each line prefixed with [pf].  Diagnostic only:
 * reads a snapshot of us->stats and writes nothing back to the socket.
 */
void ci_udp_state_dump(ci_netif* ni, ci_udp_state* us, const char* pf,
                       oo_dump_log_fn_t logger, void* log_arg)
{
  /* Snapshot the stats, then derive the totals used for percentages. */
  ci_udp_socket_stats uss = us->stats;
  unsigned rx_added = us->recv_q.pkts_added;
  unsigned rx_os = uss.n_rx_os + uss.n_rx_os_slow;
  unsigned rx_total = rx_added + uss.n_rx_mem_drop + uss.n_rx_overflow
      + rx_os;
  unsigned n_tx_onload = uss.n_tx_onload_uc + uss.n_tx_onload_c;
  unsigned tx_total = n_tx_onload + uss.n_tx_os;
  ci_ip_cached_hdrs* ipcache;

  (void) rx_total;  /* unused on 32-bit builds in kernel */
  (void) tx_total;

#if CI_CFG_TIMESTAMPING
  /* TX timestamp queue is only interesting when TX h/w stamping is on. */
  if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_TX_HARDWARE )
    ci_udp_recvq_dump(ni, &us->timestamp_q, pf, " TX timestamping queue:",
                      logger, log_arg);
#endif

  /* General. */
  logger(log_arg, "%s udpflags: "CI_UDP_STATE_FLAGS_FMT, pf,
         CI_UDP_STATE_FLAGS_PRI_ARG(us));

  /* Receive path. */
  ci_udp_recvq_dump(ni, &us->recv_q, pf, " rcv:", logger, log_arg);
  logger(log_arg,
         "%s rcv: oflow_drop=%u(%u%%) mem_drop=%u eagain=%u pktinfo=%u "
         "q_max_pkts=%u",
         pf, uss.n_rx_overflow, percent(uss.n_rx_overflow, rx_total),
         uss.n_rx_mem_drop, uss.n_rx_eagain, uss.n_rx_pktinfo,
         uss.max_recvq_pkts);
  logger(log_arg, "%s rcv: os=%u(%u%%) os_slow=%u os_error=%u",
         pf, rx_os, percent(rx_os, rx_total), uss.n_rx_os_slow,
         uss.n_rx_os_error);

  /* Send path. */
  logger(log_arg, "%s snd: q=%u+%u ul=%u os=%u(%u%%)",
         pf, us->tx_count, oo_atomic_read(&us->tx_async_q_level),
         n_tx_onload, uss.n_tx_os, percent(uss.n_tx_os, tx_total));
  logger(log_arg,
         "%s snd: LOCK cp=%u(%u%%) pkt=%u(%u%%) snd=%u(%u%%) poll=%u(%u%%) "
         "defer=%u(%u%%)",
         pf,
         uss.n_tx_lock_cp, percent(uss.n_tx_lock_cp, n_tx_onload),
         uss.n_tx_lock_pkt, percent(uss.n_tx_lock_pkt, n_tx_onload),
         uss.n_tx_lock_snd, percent(uss.n_tx_lock_snd, n_tx_onload),
         uss.n_tx_lock_poll, percent(uss.n_tx_lock_poll, n_tx_onload),
         uss.n_tx_lock_defer, percent(uss.n_tx_lock_defer, n_tx_onload));
  logger(log_arg, "%s snd: MCAST if=%d src="OOF_IP4" ttl=%d",
         pf, us->s.cp.ip_multicast_if,
         OOFA_IP4(us->s.cp.ip_multicast_if_laddr_be32),
         (int) us->s.cp.ip_mcast_ttl);

  /* State relating to unconnected sends. */
  ipcache = &us->ephemeral_pkt;
  logger(log_arg, "%s snd: TO n=%u match=%u(%u%%) "
         "lookup=%u+%u(%u%%) "OOF_IPCACHE_STATE,
         pf, uss.n_tx_onload_uc,
         uss.n_tx_cp_match, percent(uss.n_tx_cp_match, uss.n_tx_onload_uc),
         uss.n_tx_cp_uc_lookup, uss.n_tx_cp_a_lookup,
         percent(uss.n_tx_cp_uc_lookup + uss.n_tx_cp_a_lookup,
                 uss.n_tx_onload_uc),
         OOFA_IPCACHE_STATE(ni, ipcache));
  logger(log_arg, "%s snd: TO "OOF_IPCACHE_DETAIL,
         pf, OOFA_IPCACHE_DETAIL(ipcache));
  logger(log_arg, "%s snd: TO "OOF_IP4PORT" => "OOF_IP4PORT,
         pf, OOFA_IP4PORT(ipcache->ip_saddr.ip4, udp_lport_be16(us)),
         OOFA_IP4PORT(ipcache->ip.ip_daddr_be32, ipcache->dport_be16));

  /* State relating to connected sends. */
  ipcache = &us->s.pkt;
  logger(log_arg, "%s snd: CON n=%d lookup=%d "OOF_IPCACHE_STATE,
         pf, uss.n_tx_onload_c, uss.n_tx_cp_c_lookup,
         OOFA_IPCACHE_STATE(ni,ipcache));
  logger(log_arg, "%s snd: CON "OOF_IPCACHE_DETAIL,
         pf, OOFA_IPCACHE_DETAIL(ipcache));
  logger(log_arg, "%s snd: eagain=%d spin=%d block=%d",
         pf, uss.n_tx_eagain, uss.n_tx_spin, uss.n_tx_block);
  logger(log_arg, "%s snd: poll_avoids_full=%d fragments=%d confirm=%d",
         pf, uss.n_tx_poll_avoids_full, uss.n_tx_fragments,
         uss.n_tx_msg_confirm);
  logger(log_arg,
         "%s snd: os_slow=%d os_late=%d unconnect_late=%d nomac=%u(%u%%)",
         pf, uss.n_tx_os_slow, uss.n_tx_os_late, uss.n_tx_unconnect_late,
         uss.n_tx_cp_no_mac, percent(uss.n_tx_cp_no_mac, tx_total));
}
/* Complete a UDP U/L connect.  The sys connect() call must have been made
 * (and succeeded) before calling this function.  So if anything goes wrong
 * in here, then it can be consider an internal error or failing of onload.
 *
 * Returns 0 on success, CI_SOCKET_HANDOVER to pass the socket to the OS,
 * or a negative error code.
 */
int ci_udp_connect_conclude(citp_socket* ep, ci_fd_t fd,
                            const struct sockaddr* serv_addr,
                            socklen_t addrlen, ci_fd_t os_sock)
{
  const struct sockaddr_in* serv_sin = (const struct sockaddr_in*) serv_addr;
  ci_uint32 dst_be32;
  ci_udp_state* us = SOCK_TO_UDP(ep->s);
  int onloadable;
  int rc = 0;

  CHECK_UEP(ep);

  /* New connect attempt: forget prior send state and clear sticky errors. */
  UDP_CLR_FLAG(us, CI_UDPF_EF_SEND);
  us->s.rx_errno = 0;
  us->s.tx_errno = 0;

  /* Connect to a "disconnecting" address undoes a previous connect. */
  if( IS_DISCONNECTING(serv_sin) ) {
    rc = ci_udp_disconnect(ep, us, os_sock);
    goto out;
  }
#if CI_CFG_FAKE_IPV6
  /* Only IPv4-mapped addresses are accelerated on PF_INET6 sockets. */
  if( us->s.domain == PF_INET6 && !ci_tcp_ipv6_is_ipv4(serv_addr) ) {
    LOG_UC(log(FNT_FMT "HANDOVER not IPv4", FNT_PRI_ARGS(ep->netif, us)));
    goto handover;
  }
#endif

  dst_be32 = ci_get_ip4_addr(serv_sin->sin_family, serv_addr);

  /* The sys connect() may have assigned a local address/port; re-sync
   * our cached local address from the OS socket. */
  if( (rc = ci_udp_sys_getsockname(os_sock, ep)) != 0 ) {
    LOG_E(log(FNT_FMT "ERROR: (%s:%d) sys_getsockname failed (%d)",
              FNT_PRI_ARGS(ep->netif, us), ip_addr_str(dst_be32),
              CI_BSWAP_BE16(serv_sin->sin_port), errno));
    goto out;
  }

  us->s.cp.sock_cp_flags |= OO_SCP_CONNECTED;
  ci_udp_set_raddr(us, dst_be32, serv_sin->sin_port);

  /* Route lookup for the destination decides whether we can accelerate. */
  cicp_user_retrieve(ep->netif, &us->s.pkt, &us->s.cp);

  switch( us->s.pkt.status ) {
  case retrrc_success:
  case retrrc_nomac:
    /* Route resolved (MAC may still be pending) -- acceleratable. */
    onloadable = 1;
    break;
  default:
    onloadable = 0;
    if( NI_OPTS(ep->netif).udp_connect_handover ) {
      LOG_UC(log(FNT_FMT "HANDOVER %s:%d", FNT_PRI_ARGS(ep->netif, us),
                 ip_addr_str(dst_be32), CI_BSWAP_BE16(serv_sin->sin_port)));
      goto handover;
    }
    break;
  }

  /* Wildcard destination: keep the socket but route traffic via the OS. */
  if( dst_be32 == INADDR_ANY_BE32 || serv_sin->sin_port == 0 ) {
    LOG_UC(log(FNT_FMT "%s:%d - route via OS socket",
               FNT_PRI_ARGS(ep->netif, us), ip_addr_str(dst_be32),
               CI_BSWAP_BE16(serv_sin->sin_port)));
    ci_udp_clr_filters(ep);
    return 0;
  }

  if( CI_IP_IS_LOOPBACK(dst_be32) ) {
    /* After connecting via loopback it is not possible to connect anywhere
     * else.
     */
    LOG_UC(log(FNT_FMT "HANDOVER %s:%d", FNT_PRI_ARGS(ep->netif, us),
               ip_addr_str(dst_be32), CI_BSWAP_BE16(serv_sin->sin_port)));
    goto handover;
  }

  if( onloadable ) {
#ifdef ONLOAD_OFE
    /* Re-evaluate the Onload filter-engine program for the new 4-tuple. */
    if( ep->netif->ofe != NULL )
      us->s.ofe_code_start = ofe_socktbl_find(
                        ep->netif->ofe, OFE_SOCKTYPE_UDP,
                        udp_laddr_be32(us), udp_raddr_be32(us),
                        udp_lport_be16(us), udp_rport_be16(us));
#endif

    if( (rc = ci_udp_set_filters(ep, us)) != 0 ) {
      /* Failed to set filters.  Most likely we've run out of h/w filters.
       * Handover to O/S to avoid breaking the app.
       *
       * TODO: Actually we probably won't break the app if we don't
       * handover, as packets will still get delivered via the kernel
       * stack.  Might be worth having a runtime option to choose whether
       * or not to handover in such cases.
       */
      LOG_U(log(FNT_FMT "ERROR: (%s:%d) ci_udp_set_filters failed (%d)",
                FNT_PRI_ARGS(ep->netif, us), ip_addr_str(dst_be32),
                CI_BSWAP_BE16(serv_sin->sin_port), rc));
      CITP_STATS_NETIF(++ep->netif->state->stats.udp_connect_no_filter);
      goto out;
    }
  }
  else {
    ci_udp_clr_filters(ep);
  }

  LOG_UC(log(LPF "connect: "SF_FMT" %sCONNECTED L:%s:%u R:%s:%u (err:%d)",
             SF_PRI_ARGS(ep,fd), udp_raddr_be32(us) ? "" : "DIS",
             ip_addr_str(udp_laddr_be32(us)),
             (unsigned) CI_BSWAP_BE16(udp_lport_be16(us)),
             ip_addr_str(udp_raddr_be32(us)),
             (unsigned) CI_BSWAP_BE16(udp_rport_be16(us)), errno));
  return 0;

 out:
  /* With no_fail set, any failure degrades to a handover rather than an
   * application-visible error. */
  if( rc < 0 && CITP_OPTS.no_fail )
    goto handover;
  return rc;

 handover:
  ci_udp_clr_filters(ep);
  return CI_SOCKET_HANDOVER;
}
/* Slow-path handling for recvmsg() on a UDP socket (socket lock held):
 * error-queue reads (TX timestamps / OS errors), pending SO_ERROR,
 * bad iovec arguments, and the zero-length-iovec workaround.
 *
 * Returns 0 when the caller should continue on the fast path, a positive
 * byte count or IOVLEN_WORKAROUND_RC_VALUE when the receive is complete,
 * or -1 with errno set on error.
 */
static int ci_udp_recvmsg_socklocked_slowpath(ci_udp_iomsg_args* a,
                                              ci_msghdr* msg,
                                              ci_iovec_ptr *piov, int flags)
{
  int rc = 0;
  ci_netif* ni = a->ni;
  ci_udp_state* us = a->us;

  if(CI_UNLIKELY( ni->state->rxq_low ))
    ci_netif_rxq_low_on_recv(ni, &us->s,
                             1 /* assume at least one pkt freed */);
  /* In the kernel recv() with flags is not called.
   * only read(). So flags may only contain MSG_DONTWAIT */
#ifdef __KERNEL__
  ci_assert_equal(flags, 0);
#endif

#ifndef __KERNEL__
  if( flags & MSG_ERRQUEUE_CHK ) {
    if( OO_PP_NOT_NULL(us->timestamp_q.extract) ) {
      ci_ip_pkt_fmt* pkt;
      struct timespec ts[3];
      struct cmsg_state cmsg_state;
      ci_udp_hdr* udp;
      int paylen;

      /* TODO is this necessary? - mirroring ci_udp_recvmsg_get() */
      ci_rmb();

      pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
      if( pkt->tx_hw_stamp.tv_sec == CI_PKT_TX_HW_STAMP_CONSUMED ) {
        /* Already-consumed entry at the head: advance, or report empty. */
        if( OO_PP_IS_NULL(pkt->tsq_next) )
          goto errqueue_empty;
        us->timestamp_q.extract = pkt->tsq_next;
        pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
        ci_assert(pkt->tx_hw_stamp.tv_sec != CI_PKT_TX_HW_STAMP_CONSUMED);
      }

      udp = oo_ip_data(pkt);
      /* Fix: subtract the UDP header size, not the size of the pointer
       * variable [udp].  sizeof(udp) happened to equal 8 on 64-bit
       * builds, masking the bug, but gave 4 on 32-bit builds. */
      paylen = CI_BSWAP_BE16(oo_ip_hdr(pkt)->ip_tot_len_be16) -
                        sizeof(ci_ip4_hdr) - sizeof(ci_udp_hdr);

      msg->msg_flags = 0;
      cmsg_state.msg = msg;
      cmsg_state.cm = msg->msg_control;
      cmsg_state.cmsg_bytes_used = 0;
      ci_iovec_ptr_init_nz(piov, msg->msg_iov, msg->msg_iovlen);
      memset(ts, 0, sizeof(ts));

      /* ts[0]=software, ts[1]=system h/w, ts[2]=raw h/w -- matching the
       * layout of SCM_TIMESTAMPING. */
      if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_RAW_HARDWARE ) {
        ts[2].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[2].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      if( (us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_SYS_HARDWARE) &&
          (pkt->tx_hw_stamp.tv_nsec & CI_IP_PKT_HW_STAMP_FLAG_IN_SYNC) ) {
        ts[1].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[1].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      ci_put_cmsg(&cmsg_state, SOL_SOCKET, ONLOAD_SCM_TIMESTAMPING,
                  sizeof(ts), &ts);

      /* Return the original datagram payload alongside the timestamp. */
      oo_offbuf_set_start(&pkt->buf, udp + 1);
      oo_offbuf_set_len(&pkt->buf, paylen);
      rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, paylen);

      /* Mark this packet/timestamp as consumed */
      pkt->tx_hw_stamp.tv_sec = CI_PKT_TX_HW_STAMP_CONSUMED;

      ci_ip_cmsg_finish(&cmsg_state);
      msg->msg_flags |= MSG_ERRQUEUE_CHK;
      return rc;
    }
   errqueue_empty:
    /* ICMP is handled via OS, so get OS error */
    rc = oo_os_sock_recvmsg(ni, SC_SP(&us->s), msg, flags);
    if( rc < 0 ) {
      ci_assert(-rc == errno);
      return -1;
    }
    else
      return rc;
  }
#endif
  /* Pending SO_ERROR takes precedence over any data. */
  if( (rc = ci_get_so_error(&us->s)) != 0 ) {
    CI_SET_ERROR(rc, rc);
    return rc;
  }
  if( msg->msg_iovlen > 0 && msg->msg_iov == NULL ) {
    CI_SET_ERROR(rc, EFAULT);
    return rc;
  }
#if MSG_OOB_CHK
  /* Out-of-band data is meaningless for UDP. */
  if( flags & MSG_OOB_CHK ) {
    CI_SET_ERROR(rc, EOPNOTSUPP);
    return rc;
  }
#endif
#if CI_CFG_POSIX_RECV
  /* POSIX: recv on an unbound UDP socket fails with ENOTCONN. */
  if( ! udp_lport_be16(us)) {
    LOG_UV(log("%s: -1 (ENOTCONN)", __FUNCTION__));
    CI_SET_ERROR(rc, ENOTCONN);
    return rc;
  }
#endif
  if( msg->msg_iovlen == 0 ) {
    /* We have a difference in behaviour from the Linux stack here.  When
    ** msg_iovlen is 0 Linux 2.4.21-15.EL does not set MSG_TRUNC when a
    ** datagram has non-zero length.  We do. */
    CI_IOVEC_LEN(&piov->io) = piov->iovlen = 0;
    return IOVLEN_WORKAROUND_RC_VALUE;
  }
  return 0;
}