/* Encapsulation of sys_getsockname for UDP EPs */
static int ci_udp_sys_getsockname( ci_fd_t sock, citp_socket* ep )
{
  socklen_t salen;
  int rc;
  union ci_sockaddr_u sa_u;

  ci_assert(ep);
#if CI_CFG_FAKE_IPV6
  ci_assert(ep->s->domain == AF_INET || ep->s->domain == AF_INET6);
#else
  ci_assert(ep->s->domain == AF_INET);
#endif

  salen = sizeof(sa_u);
  rc = ci_sys_getsockname( sock, &sa_u.sa, &salen );
  if( rc )
    return rc;

  if( sa_u.sa.sa_family != ep->s->domain ||
      salen < sizeof(struct sockaddr_in)
#if CI_CFG_FAKE_IPV6
      || (ep->s->domain == AF_INET6 && salen < sizeof(struct sockaddr_in6))
#endif
      ) {
    LOG_UV(log("%s: OS sock domain %d != expected domain %d or "
               "sys_getsockname struct small (%d exp %d)",
               __FUNCTION__, sa_u.sa.sa_family, ep->s->domain, salen,
               (int)(ep->s->domain == AF_INET ?
                     sizeof(struct sockaddr_in) :
                     sizeof(struct sockaddr_in6))));
    return -1;
  }

#if CI_CFG_FAKE_IPV6
  if( ep->s->domain == AF_INET ) {
    ci_udp_set_laddr( ep, ci_get_ip4_addr(sa_u.sa.sa_family, &sa_u.sa),
                      sa_u.sin.sin_port );
  }
  else {
    ci_udp_set_laddr( ep, ci_get_ip4_addr(sa_u.sa.sa_family, &sa_u.sa),
                      sa_u.sin6.sin6_port );
  }
#else
  ci_udp_set_laddr( ep, ci_get_ip4_addr(sa_u.sa.sa_family, &sa_u.sa),
                    sa_u.sin.sin_port );
#endif
  return 0;
}
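/* A minimal illustrative sketch (kept out of the build with "#if 0"): how a
 * hypothetical caller might use ci_udp_sys_getsockname() to pick up the
 * local address/port that the kernel assigned to the backing OS socket.
 * The helper name and its arguments are assumptions for illustration only.
 */
#if 0
static int example_refresh_local_addr(ci_fd_t os_sock, citp_socket* ep)
{
  /* ci_udp_sys_getsockname() validates the returned sockaddr against the
   * endpoint's domain and updates the endpoint's local addr/port. */
  int rc = ci_udp_sys_getsockname(os_sock, ep);
  if( rc != 0 )
    /* Non-zero: either the underlying getsockname failed, or the returned
     * address did not match the expected domain (-1). */
    return rc;
  return 0;
}
#endif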
/* Perform a reuseport bind on a socket. */
int ci_udp_reuseport_bind(citp_socket* ep, ci_fd_t fd,
                          const struct sockaddr* sa, socklen_t sa_len)
{
  int rc;
  ci_uint32 laddr_be32 = ci_get_ip4_addr(ep->s->domain, sa);
  int lport_be16 = ((struct sockaddr_in*)sa)->sin_port;

  ci_assert_nequal(ep->s->s_flags & CI_SOCK_FLAG_REUSEPORT, 0);

  /* We cannot support binding to port 0 with reuseport set: the kernel
   * would only assign a port number when the OS socket is bound, but we
   * must move the socket into a cluster before binding the OS socket, and
   * without a port number we cannot look up the cluster. */
  if( lport_be16 == 0 ) {
    LOG_UC(ci_log("%s: Binding to port 0 with reuseport set not supported",
                  __FUNCTION__));
    RET_WITH_ERRNO(ENOSYS);
  }

  if( (rc = ci_tcp_ep_reuseport_bind(fd, CITP_OPTS.cluster_name,
                                     CITP_OPTS.cluster_size,
                                     CITP_OPTS.cluster_restart_opt,
                                     laddr_be32, lport_be16)) != 0 ) {
    errno = -rc;
    return -1;
  }
  return rc;
}
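/* A minimal illustrative sketch (not compiled): how a hypothetical bind
 * path might call ci_udp_reuseport_bind() when the socket has
 * CI_SOCK_FLAG_REUSEPORT set.  [ep], [fd], [sa] and [sa_len] are assumed to
 * be the intercepted bind() arguments; this is not the actual caller. */
#if 0
if( ep->s->s_flags & CI_SOCK_FLAG_REUSEPORT ) {
  if( ci_udp_reuseport_bind(ep, fd, sa, sa_len) != 0 )
    /* The helper has already converted the failure into errno. */
    return -1;
}
#endif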
/* Conclude the EP's binding.  This function is abstracted from the main
 * bind code to allow implicit binds that occur when sendto() is called on
 * an OS socket.  [lport] and CI_SIN(addr)->sin_port do not have to be the
 * same value. */
static int ci_udp_bind_conclude(citp_socket* ep, const struct sockaddr* addr,
                                ci_uint16 lport )
{
  ci_udp_state* us;
  ci_uint32 addr_be32;
  int rc;

  CHECK_UEP(ep);
  ci_assert(addr != NULL);

  if( ci_udp_should_handover(ep, addr, lport) )
    goto handover;

  addr_be32 = ci_get_ip4_addr(ep->s->domain, addr);

  ci_udp_set_laddr(ep, addr_be32, lport);
  us = SOCK_TO_UDP(ep->s);
  if( addr_be32 != 0 )
    us->s.cp.sock_cp_flags |= OO_SCP_LADDR_BOUND;

  /* reset any rx/tx that have taken place already */
  UDP_CLR_FLAG(us, CI_UDPF_EF_SEND);

#ifdef ONLOAD_OFE
  if( ep->netif->ofe != NULL )
    us->s.ofe_code_start = ofe_socktbl_find(
                        ep->netif->ofe, OFE_SOCKTYPE_UDP,
                        udp_laddr_be32(us), udp_raddr_be32(us),
                        udp_lport_be16(us), udp_rport_be16(us));
#endif

  /* OS source addrs have already been handed-over, so this must be one of
   * our src addresses. */
  rc = ci_udp_set_filters( ep, us);
  ci_assert( !UDP_GET_FLAG(us, CI_UDPF_EF_BIND) );

  /*! \todo FIXME isn't the port the thing to be testing here? */
  if( udp_laddr_be32(us) != INADDR_ANY_BE32 )
    UDP_SET_FLAG(us, CI_UDPF_EF_BIND);

  CI_UDPSTATE_SHOW_EP( ep );
  if( rc == CI_SOCKET_ERROR && CITP_OPTS.no_fail ) {
    CITP_STATS_NETIF(++ep->netif->state->stats.udp_bind_no_filter);
    goto handover;
  }
  return rc;

 handover:
  LOG_UV(log("%s: "SK_FMT" HANDOVER", __FUNCTION__, SK_PRI_ARGS(ep)));
  return CI_SOCKET_HANDOVER;
}
int ci_udp_should_handover(citp_socket* ep, const struct sockaddr* addr,
                           ci_uint16 lport)
{
  ci_uint32 addr_be32;

#if CI_CFG_FAKE_IPV6
  if( ep->s->domain == AF_INET6 && ! ci_tcp_ipv6_is_ipv4(addr) )
    goto handover;
#endif

  if( (CI_BSWAP_BE16(lport) >= NI_OPTS(ep->netif).udp_port_handover_min &&
       CI_BSWAP_BE16(lport) <= NI_OPTS(ep->netif).udp_port_handover_max) ||
      (CI_BSWAP_BE16(lport) >= NI_OPTS(ep->netif).udp_port_handover2_min &&
       CI_BSWAP_BE16(lport) <= NI_OPTS(ep->netif).udp_port_handover2_max) ||
      (CI_BSWAP_BE16(lport) >= NI_OPTS(ep->netif).udp_port_handover3_min &&
       CI_BSWAP_BE16(lport) <= NI_OPTS(ep->netif).udp_port_handover3_max) ) {
    LOG_UC(log(FNS_FMT "HANDOVER (%d <= %d <= %d)",
               FNS_PRI_ARGS(ep->netif, ep->s),
               NI_OPTS(ep->netif).udp_port_handover_min,
               CI_BSWAP_BE16(lport),
               NI_OPTS(ep->netif).udp_port_handover_max));
    goto handover;
  }

  addr_be32 = ci_get_ip4_addr(ep->s->domain, addr);
  if( addr_be32 != CI_BSWAPC_BE32(INADDR_ANY) &&
      ! cicp_user_addr_is_local_efab(CICP_HANDLE(ep->netif), &addr_be32) &&
      ! CI_IP_IS_MULTICAST(addr_be32) ) {
    /* Either the bind/getsockname indicated that we need to let the OS
     * take this or the local address is not one of ours - so we can safely
     * hand-over as bind to a non-ANY addr cannot be revoked.
     * The filters (if any) have already been removed, so we just get out. */
    goto handover;
  }
  return 0;

 handover:
  return 1;
}
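/* A minimal illustrative sketch (not compiled): ci_udp_should_handover()
 * expects [lport] in network byte order, so a caller starting from a
 * host-order port would convert it first.  The port value 12345 and the
 * surrounding variables are assumptions for illustration. */
#if 0
ci_uint16 lport_be16 = CI_BSWAP_BE16(12345);
if( ci_udp_should_handover(ep, addr, lport_be16) )
  return CI_SOCKET_HANDOVER;  /* let the OS socket take this bind */
#endif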
/* Complete a UDP U/L connect.  The sys connect() call must have been made
 * (and succeeded) before calling this function.  So if anything goes wrong
 * in here, it can be considered an internal error or a failing of onload.
 */
int ci_udp_connect_conclude(citp_socket* ep, ci_fd_t fd,
                            const struct sockaddr* serv_addr,
                            socklen_t addrlen, ci_fd_t os_sock)
{
  const struct sockaddr_in* serv_sin = (const struct sockaddr_in*) serv_addr;
  ci_uint32 dst_be32;
  ci_udp_state* us = SOCK_TO_UDP(ep->s);
  int onloadable;
  int rc = 0;

  CHECK_UEP(ep);

  UDP_CLR_FLAG(us, CI_UDPF_EF_SEND);
  us->s.rx_errno = 0;
  us->s.tx_errno = 0;

  if( IS_DISCONNECTING(serv_sin) ) {
    rc = ci_udp_disconnect(ep, us, os_sock);
    goto out;
  }
#if CI_CFG_FAKE_IPV6
  if( us->s.domain == PF_INET6 && ! ci_tcp_ipv6_is_ipv4(serv_addr) ) {
    LOG_UC(log(FNT_FMT "HANDOVER not IPv4", FNT_PRI_ARGS(ep->netif, us)));
    goto handover;
  }
#endif

  dst_be32 = ci_get_ip4_addr(serv_sin->sin_family, serv_addr);

  if( (rc = ci_udp_sys_getsockname(os_sock, ep)) != 0 ) {
    LOG_E(log(FNT_FMT "ERROR: (%s:%d) sys_getsockname failed (%d)",
              FNT_PRI_ARGS(ep->netif, us), ip_addr_str(dst_be32),
              CI_BSWAP_BE16(serv_sin->sin_port), errno));
    goto out;
  }

  us->s.cp.sock_cp_flags |= OO_SCP_CONNECTED;
  ci_udp_set_raddr(us, dst_be32, serv_sin->sin_port);
  cicp_user_retrieve(ep->netif, &us->s.pkt, &us->s.cp);

  switch( us->s.pkt.status ) {
  case retrrc_success:
  case retrrc_nomac:
    onloadable = 1;
    break;
  default:
    onloadable = 0;
    if( NI_OPTS(ep->netif).udp_connect_handover ) {
      LOG_UC(log(FNT_FMT "HANDOVER %s:%d", FNT_PRI_ARGS(ep->netif, us),
                 ip_addr_str(dst_be32), CI_BSWAP_BE16(serv_sin->sin_port)));
      goto handover;
    }
    break;
  }

  if( dst_be32 == INADDR_ANY_BE32 || serv_sin->sin_port == 0 ) {
    LOG_UC(log(FNT_FMT "%s:%d - route via OS socket",
               FNT_PRI_ARGS(ep->netif, us), ip_addr_str(dst_be32),
               CI_BSWAP_BE16(serv_sin->sin_port)));
    ci_udp_clr_filters(ep);
    return 0;
  }

  if( CI_IP_IS_LOOPBACK(dst_be32) ) {
    /* After connecting via loopback it is not possible to connect anywhere
     * else. */
    LOG_UC(log(FNT_FMT "HANDOVER %s:%d", FNT_PRI_ARGS(ep->netif, us),
               ip_addr_str(dst_be32), CI_BSWAP_BE16(serv_sin->sin_port)));
    goto handover;
  }

  if( onloadable ) {
#ifdef ONLOAD_OFE
    if( ep->netif->ofe != NULL )
      us->s.ofe_code_start = ofe_socktbl_find(
                        ep->netif->ofe, OFE_SOCKTYPE_UDP,
                        udp_laddr_be32(us), udp_raddr_be32(us),
                        udp_lport_be16(us), udp_rport_be16(us));
#endif

    if( (rc = ci_udp_set_filters(ep, us)) != 0 ) {
      /* Failed to set filters.  Most likely we've run out of h/w filters.
       * Handover to O/S to avoid breaking the app.
       *
       * TODO: Actually we probably won't break the app if we don't
       * handover, as packets will still get delivered via the kernel
       * stack.  Might be worth having a runtime option to choose whether
       * or not to handover in such cases.
       */
      LOG_U(log(FNT_FMT "ERROR: (%s:%d) ci_udp_set_filters failed (%d)",
                FNT_PRI_ARGS(ep->netif, us), ip_addr_str(dst_be32),
                CI_BSWAP_BE16(serv_sin->sin_port), rc));
      CITP_STATS_NETIF(++ep->netif->state->stats.udp_connect_no_filter);
      goto out;
    }
  }
  else {
    ci_udp_clr_filters(ep);
  }

  LOG_UC(log(LPF "connect: "SF_FMT" %sCONNECTED L:%s:%u R:%s:%u (err:%d)",
             SF_PRI_ARGS(ep,fd), udp_raddr_be32(us) ? "" : "DIS",
             ip_addr_str(udp_laddr_be32(us)),
             (unsigned) CI_BSWAP_BE16(udp_lport_be16(us)),
             ip_addr_str(udp_raddr_be32(us)),
             (unsigned) CI_BSWAP_BE16(udp_rport_be16(us)), errno));
  return 0;

 out:
  if( rc < 0 && CITP_OPTS.no_fail )
    goto handover;
  return rc;

 handover:
  ci_udp_clr_filters(ep);
  return CI_SOCKET_HANDOVER;
}
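/* A minimal illustrative sketch (not compiled) of the calling order the
 * comment above requires: the OS-level connect() must already have
 * succeeded before ci_udp_connect_conclude() runs.  It assumes a
 * ci_sys_connect() wrapper analogous to the ci_sys_getsockname() used
 * earlier in this file; the handover helper is hypothetical. */
#if 0
if( ci_sys_connect(os_sock, serv_addr, addrlen) != 0 )
  return -1;                                  /* report the OS error as-is */
rc = ci_udp_connect_conclude(ep, fd, serv_addr, addrlen, os_sock);
if( rc == CI_SOCKET_HANDOVER )
  return handover_to_os(fd);                  /* hypothetical helper */
#endif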
/* In this bind handler we just check that the address to which we are
 * binding is either "any" or one of ours.
 * In the Linux kernel version [fd] is unused. */
int ci_tcp_bind(citp_socket* ep, const struct sockaddr* my_addr,
                socklen_t addrlen, ci_fd_t fd )
{
  struct sockaddr_in* my_addr_in;
  ci_uint16 new_port;
  ci_uint32 addr_be32;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* c = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc;

  CHECK_TEP(ep);

  my_addr_in = (struct sockaddr_in*) my_addr;

  /* Check if the state of the socket is OK for a bind operation. */
  /* \todo Earlier, (TS_TCP( epi->tcpep.state )->tcp_source_be16) was used.
   * Which is better? */
  if (my_addr == NULL)
    RET_WITH_ERRNO( EINVAL );
  if (s->b.state != CI_TCP_CLOSED)
    RET_WITH_ERRNO( EINVAL );
  if (c->tcpflags & CI_TCPT_FLAG_WAS_ESTAB)
    RET_WITH_ERRNO( EINVAL );

  if( my_addr->sa_family != s->domain )
    RET_WITH_ERRNO( s->domain == PF_INET ? EAFNOSUPPORT : EINVAL );

  /* Bug 4884: Windows regularly uses addrlen > sizeof(struct sockaddr_in).
   * Linux is also relaxed about overlength data areas. */
  if (s->domain == PF_INET && addrlen < sizeof(struct sockaddr_in))
    RET_WITH_ERRNO( EINVAL );

#if CI_CFG_FAKE_IPV6
  if (s->domain == PF_INET6 && addrlen < SIN6_LEN_RFC2133)
    RET_WITH_ERRNO( EINVAL );

  if( s->domain == PF_INET6 && !ci_tcp_ipv6_is_ipv4(my_addr) )
    return CI_SOCKET_HANDOVER;
#endif
  addr_be32 = ci_get_ip4_addr(s->domain, my_addr);

  /* Using the port number provided, see if we can do this bind. */
  new_port = my_addr_in->sin_port;

  if( CITP_OPTS.tcp_reuseports != 0 && new_port != 0 ) {
    struct ci_port_list *force_reuseport;
    CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link,
                        (ci_dllist*)(ci_uintptr_t)CITP_OPTS.tcp_reuseports) {
      if( force_reuseport->port == new_port ) {
        int one = 1;
        ci_fd_t os_sock = ci_get_os_sock_fd(ep, fd);
        ci_assert(CI_IS_VALID_SOCKET(os_sock));
        rc = ci_sys_setsockopt(os_sock, SOL_SOCKET, SO_REUSEPORT, &one,
                               sizeof(one));
        ci_rel_os_sock_fd(os_sock);
        if( rc != 0 && errno == ENOPROTOOPT )
          ep->s->s_flags |= CI_SOCK_FLAG_REUSEPORT_LEGACY;
        ep->s->s_flags |= CI_SOCK_FLAG_REUSEPORT;
        LOG_TC(log("%s "SF_FMT", applied legacy SO_REUSEPORT flag for port %u",
                   __FUNCTION__, SF_PRI_ARGS(ep, fd), new_port));
      }
    }
  }

  if( !(ep->s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY) )
    CI_LOGLEVEL_TRY_RET(LOG_TV,
                        __ci_bind(ep->netif, ep->s, addr_be32, &new_port));
  ep->s->s_flags |= CI_SOCK_FLAG_BOUND;
  sock_lport_be16(s) = new_port;
  sock_laddr_be32(s) = addr_be32;
  if( CI_IP_IS_MULTICAST(addr_be32) )
    s->cp.ip_laddr_be32 = 0;
  else
    s->cp.ip_laddr_be32 = addr_be32;
  s->cp.lport_be16 = new_port;
  sock_rport_be16(s) = sock_raddr_be32(s) = 0;

  LOG_TC(log(LPF "bind to %s:%u n_p:%u lp:%u", ip_addr_str(addr_be32),
             (unsigned) CI_BSWAP_BE16(my_addr_in->sin_port),
             CI_BSWAP_BE16(new_port), CI_BSWAP_BE16(sock_lport_be16(s))));

  return 0;
}
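/* A minimal illustrative sketch (not compiled): a hypothetical intercepted
 * bind() handler dispatching on ci_tcp_bind()'s return value.  [ep],
 * [addr], [addrlen], [fd] and handover_to_os() are assumptions. */
#if 0
int rc = ci_tcp_bind(ep, addr, addrlen, fd);
if( rc == CI_SOCKET_HANDOVER )
  /* e.g. a genuine (non-IPv4-mapped) IPv6 address: use the OS socket */
  return handover_to_os(fd);
return rc;  /* 0 on success, or -1 with errno set */
#endif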
/* Returns:
 *          0                  on success
 *
 *          CI_SOCKET_ERROR    (and errno set) - a normal error that is
 *                             returned to the application
 *
 *          CI_SOCKET_HANDOVER we tell the upper layers to handover; no
 *                             need to set errno since it isn't a real
 *                             error
 */
int ci_tcp_connect(citp_socket* ep, const struct sockaddr* serv_addr,
                   socklen_t addrlen, ci_fd_t fd, int *p_moved)
{
  /* Address family is validated earlier. */
  struct sockaddr_in* inaddr = (struct sockaddr_in*) serv_addr;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc = 0, crc;
  ci_uint32 dst_be32;

  if( NI_OPTS(ep->netif).tcp_connect_handover )
    return CI_SOCKET_HANDOVER;

  /* Make sure we're up-to-date. */
  ci_netif_lock(ep->netif);
  CHECK_TEP(ep);
  ci_netif_poll(ep->netif);

  /*
   * 1. Check if the state of the socket is OK for a connect operation.
   */

 start_again:

  if( (rc = ci_tcp_connect_handle_so_error(s)) != 0) {
    CI_SET_ERROR(rc, rc);
    goto unlock_out;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    /* See if progress can be made on this socket before determining
     * status (e.g. non-blocking connect and connect poll). */
    if( s->b.state & CI_TCP_STATE_SYNCHRONISED ) {
      if( ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT ) {
        ts->tcpflags &= ~CI_TCPT_FLAG_NONBLOCK_CONNECT;
        rc = 0;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC )
        LOG_E(ci_log("Onload does not support TCP disconnect via "
                     "connect(addr->sa_family==AF_UNSPEC)"));
      CI_SET_ERROR(rc, EISCONN);
    }
    else if( s->b.state == CI_TCP_LISTEN ) {
#if CI_CFG_POSIX_CONNECT_AFTER_LISTEN
      CI_SET_ERROR(rc, EOPNOTSUPP);
#else
      if( ci_tcp_validate_sa(s->domain, serv_addr, addrlen) ) {
        /* Request should be forwarded to OS */
        rc = CI_SOCKET_HANDOVER;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC ) {
        /* Linux does listen shutdown on disconnect (AF_UNSPEC) */
        ci_netif_unlock(ep->netif);
        rc = ci_tcp_shutdown(ep, SHUT_RD, fd);
        goto out;
      }
      else {
        /* Linux has curious error reporting in this case */
        CI_SET_ERROR(rc, EISCONN);
      }
#endif
    }
    else {
      /* Socket is in SYN-SENT state.  Let's block for receiving SYN-ACK. */
      ci_assert_equal(s->b.state, CI_TCP_SYN_SENT);
      if( s->b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) )
        CI_SET_ERROR(rc, EALREADY);
      else
        goto syn_sent;
    }
    goto unlock_out;
  }

  /* Check if we've ever been connected. */
  if( ts->tcpflags & CI_TCPT_FLAG_WAS_ESTAB ) {
    CI_SET_ERROR(rc, EISCONN);
    goto unlock_out;
  }

  /*
   * 2. Check the address parameter; if it is inappropriate for the
   * handover decision, or handover should be done, try to call the OS and
   * do handover on success.
   */

  if (
    /* First, check that the address family and length are OK. */
    ci_tcp_validate_sa(s->domain, serv_addr, addrlen)
    /* rfc793 p54: if the foreign socket is unspecified, return
     * "error: foreign socket unspecified" (EINVAL) - but keep it to OS */
    || (dst_be32 = ci_get_ip4_addr(inaddr->sin_family, serv_addr)) == 0
    /* Zero destination port is tricky as well, keep it to OS */
    || inaddr->sin_port == 0 ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  /* Is this a socket that we can handle? */
  rc = ci_tcp_connect_check_dest(ep, dst_be32, inaddr->sin_port);
  if( rc )
    goto unlock_out;

  if( (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) &&
      OO_SP_IS_NULL(ts->local_peer) ) {
    /* Try to connect to another stack; handover if we can't. */
    struct oo_op_loopback_connect op;
    op.dst_port = inaddr->sin_port;
    op.dst_addr = dst_be32;
    /* this operation unlocks netif */
    rc = oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op);
    if( rc < 0)
      return CI_SOCKET_HANDOVER;
    if( op.out_moved )
      *p_moved = 1;
    if( op.out_rc == -EINPROGRESS )
      RET_WITH_ERRNO( EINPROGRESS );
    else if( op.out_rc == -EAGAIN )
      return -EAGAIN;
    else if( op.out_rc != 0 )
      return CI_SOCKET_HANDOVER;
    return 0;
  }

  /* Filters can't handle an alien source address. */
  if( (s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN) &&
      ! (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  crc = ci_tcp_connect_ul_start(ep->netif, ts, dst_be32, inaddr->sin_port,
                                &rc);
  if( crc != CI_CONNECT_UL_OK ) {
    switch( crc ) {
    case CI_CONNECT_UL_FAIL:
      goto unlock_out;
    case CI_CONNECT_UL_LOCK_DROPPED:
      goto out;
    case CI_CONNECT_UL_START_AGAIN:
      goto start_again;
    }
  }
  CI_TCP_STATS_INC_ACTIVE_OPENS( ep->netif );

 syn_sent:
  rc = ci_tcp_connect_ul_syn_sent(ep->netif, ts);

 unlock_out:
  ci_netif_unlock(ep->netif);
 out:
  return rc;
}
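/* A minimal illustrative sketch (not compiled): a hypothetical caller of
 * ci_tcp_connect() handling the three documented outcomes and the
 * [p_moved] out-parameter set by the loopback fast path.  The helper names
 * are assumptions, not part of this file. */
#if 0
int moved = 0;
int rc = ci_tcp_connect(ep, serv_addr, addrlen, fd, &moved);
if( moved )
  reattach_fd_to_new_stack(fd);   /* hypothetical: fd now lives in another stack */
if( rc == CI_SOCKET_HANDOVER )
  return handover_to_os(fd);      /* hypothetical helper */
return rc;  /* 0, or CI_SOCKET_ERROR with errno set */
#endif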