/* Common handler for IOCTL calls.
 *
 * Handles the ioctls Onload implements natively (SIOCGPGRP/SIOCSPGRP,
 * timestamp queries) and defers everything else to the result of the
 * OS socket call.
 *
 * NOTE: in the kernel version if [arg] is a pointer then it will point
 * into user space. Use the CI_IOCTL_* macros in internal.h please.
 *
 * Parameters:
 *   netif            - stack the socket lives in (asserted non-NULL)
 *   s                - common socket state (asserted non-NULL)
 *   request          - ioctl request code
 *   arg              - ioctl argument; pointer into user space in-kernel
 *   os_rc            - return code from the preceding OS ioctl call
 *   os_socket_exists - non-zero if a backing OS socket exists
 *
 * Returns 0 on success, or -1 with errno set (via RET_WITH_ERRNO), or
 * [os_rc] for requests passed through to the OS.
 */
int ci_cmn_ioctl(ci_netif* netif, ci_sock_cmn* s, int request,
                 void* arg, int os_rc, int os_socket_exists)
{
  ci_assert(netif);
  ci_assert(s);

  /* ioctl defines are listed in `man ioctl_list` and the CI equivalent
   * CI defines are in include/ci/net/ioctls.h */
  LOG_SV( ci_log("request = %u/%#x, arg = %lu/%#lx",
                 request, request, (long) arg, (long) arg));

  switch( request ) {
  case SIOCGPGRP:
    /* get the process ID/group that is receiving signals for this fd */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    CI_IOCTL_SETARG( ((int*)arg), s->b.sigown);
    break;

  case SIOCSPGRP:
    /* set the process ID/group that is receiving signals for this fd */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    s->b.sigown = CI_IOCTL_GETARG(int,arg);
    /* If O_ASYNC is set, request an RX wakeup so the new owner gets
     * its signal promptly. */
    if( s->b.sigown && (s->b.sb_aflags & CI_SB_AFLAG_O_ASYNC) )
      ci_bit_set(&s->b.wake_request, CI_SB_FLAG_WAKE_RX_B);
    break;

  case SIOCGSTAMP:
  case SIOCGSTAMPNS:
    /* Packet timestamps are not recorded here, so report "no such
     * timestamp available". */
    RET_WITH_ERRNO(ENOENT);

  default:
    /* Unknown request: without a backing OS socket there is nothing to
     * defer to, so fail with ENOTTY as a char-special ioctl would. */
    if (!os_socket_exists)
      RET_WITH_ERRNO(ENOTTY);
    /* Assumes that errno is unchanged from the OS call, or that
     * [os_rc] == 0 */
    return os_rc;
  }

  /* Successful conclusion */
  return 0;

 fail_fault:
  /* Bad user pointer for the ioctl argument.  (The log message used to
   * say EINVAL, which disagreed with the EFAULT actually returned.) */
  LOG_SC( ci_log("%s: "NS_FMT" req %d/%#x arg %ld/%#lx unhandled (EFAULT)",
                 __FUNCTION__, NS_PRI_ARGS(netif, s), request, request,
                 (long)arg, (long)arg));
  RET_WITH_ERRNO(EFAULT);
}
/* Set a reuseport bind on a socket.
 *
 * Requires that CI_SOCK_FLAG_REUSEPORT is already set on the socket.
 * Extracts the IPv4 address and port from [sa] and asks the cluster
 * machinery to place the socket.
 *
 * Returns 0 on success, or -1 with errno set.
 */
int ci_udp_reuseport_bind(citp_socket* ep, ci_fd_t fd,
                          const struct sockaddr* sa, socklen_t sa_len)
{
  ci_uint32 laddr_be32 = ci_get_ip4_addr(ep->s->domain, sa);
  int lport_be16 = ((struct sockaddr_in*)sa)->sin_port;
  int rc;

  ci_assert_nequal(ep->s->s_flags & CI_SOCK_FLAG_REUSEPORT, 0);

  /* We cannot support binding to port 0 as the kernel would assign
   * the socket a port number.  We must move the socket before binding
   * the OS socket and we don't have a port number to look up
   * clusters. */
  if( lport_be16 == 0 ) {
    LOG_UC(ci_log("%s: Binding to port 0 with reuseport set not supported",
                  __FUNCTION__));
    RET_WITH_ERRNO(ENOSYS);
  }

  rc = ci_tcp_ep_reuseport_bind(fd, CITP_OPTS.cluster_name,
                                CITP_OPTS.cluster_size,
                                CITP_OPTS.cluster_restart_opt,
                                laddr_be32, lport_be16);
  if( rc != 0 ) {
    /* Cluster bind failed: convert the negative error code to errno. */
    errno = -rc;
    return -1;
  }

  return rc;
}
/*!
 * Tests for valid sockaddr & sockaddr length & AF_INET or AF_INET6.
 */
static int ci_tcp_validate_sa( sa_family_t domain,
                               const struct sockaddr* sa, socklen_t sa_len )
{
  /*
   * Linux deviates from documented behaviour here;
   * with a NULL [sa] it returns EINVAL when sa_len == 0, but EFAULT
   * when sa_len != 0.
   */
  if( sa == NULL ) {
    LOG_U(ci_log(LPF "invalid sockaddr : sa = %lx, sa_len = %d",
                 (long) sa, sa_len));
    if( sa_len == 0 )
      RET_WITH_ERRNO( EINVAL );
    RET_WITH_ERRNO( EFAULT );
  }

  /* Length must cover at least a sockaddr_in (or the RFC2133 minimum
   * for a faked-up IPv6 address). */
  if( sa_len < sizeof(struct sockaddr_in)
#if CI_CFG_FAKE_IPV6
      || (domain == AF_INET6 && sa_len < SIN6_LEN_RFC2133)
#endif
      ) {
    LOG_U( ci_log(LPF "struct too short to be sockaddr_in(6)" ));
    RET_WITH_ERRNO( EINVAL );
  }

  /* It should be sa->sa_family, but MS wdm does not understand it,
   * so let's use CI_SIN(sa)->sin_family. */
  {
    sa_family_t fam = CI_SIN(sa)->sin_family;
    if( fam != domain && fam != AF_UNSPEC ) {
      LOG_U(ci_log(LPF "address family %d does not match "
                   "with socket domain %d", CI_SIN(sa)->sin_family, domain));
      RET_WITH_ERRNO(EAFNOSUPPORT);
    }
  }

#if CI_CFG_FAKE_IPV6
  /* Only IPv4-mapped IPv6 addresses are supported. */
  if( sa->sa_family == AF_INET6 && ! ci_tcp_ipv6_is_ipv4(sa) ) {
    LOG_TC(ci_log(LPF "Pure IPv6 address is not supported"));
    RET_WITH_ERRNO(EAFNOSUPPORT);
  }
#endif

  return 0;
}
/*! \todo we can simplify this a lot by letting the kernel have it! */
int ci_udp_getpeername(citp_socket*ep, struct sockaddr* name,
                       socklen_t* namelen)
{
  ci_udp_state* us;

  CHECK_UEP(ep);
  us = SOCK_TO_UDP(ep->s);

  /*
   * The connectedness test comes first deliberately: ENOTCONN must be
   * reported even when name and/or namelen are invalid.
   */
  if( udp_raddr_be32(us) == 0 )
    RET_WITH_ERRNO(ENOTCONN);

  if( name == NULL || namelen == NULL )
    RET_WITH_ERRNO(EFAULT);

  /* Copy the remote endpoint out to the caller's buffer. */
  ci_addr_to_user(name, namelen, ep->s->domain,
                  udp_rport_be16(us), udp_raddr_be32(us));
  return 0;
}
/* [fd] is unused in the kernel version */
/* Get a socket option for a TCP socket.
 *
 * Dispatches on [level]: SOL_SOCKET and IPPROTO_IP(V6) go to the common
 * helpers; IPPROTO_TCP options are answered directly from user-level
 * state.  Returns 0 on success or -1 with errno set (via the helpers /
 * RET_WITH_ERRNO).
 */
int ci_tcp_getsockopt(citp_socket* ep, ci_fd_t fd, int level,
                      int optname, void *optval, socklen_t *optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  /* [c] is only needed for the keepalive/defer-accept options, which are
   * only compiled on these platforms. */
  ci_tcp_socket_cmn *c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  unsigned u = 0;   /* scratch value returned through u_out */

  /* NOTE: The setsockopt() call is reflected into the os socket to
   * keep the two in sync - it's assumed that we know everything
   * to allow us to give good answers here - and therefore we don't
   * bother the os with the get call */

  /* ?? what to do about optval and optlen checking
   * Kernel can raise EFAULT, here we are a little in the dark.
   * - sockcall_intercept.c checks that optlen is non-NULL and if *optlen
   *   is non-zero that optval is non-NULL, returning EFAULT if false
   */

  if(level == SOL_SOCKET) {
    /* Common SOL_SOCKET handler */
    return ci_get_sol_socket(netif, s, optname, optval, optlen);

  } else if (level == IPPROTO_IP) {
    /* IP level options valid for TCP */
    return ci_get_sol_ip(ep, s, fd, optname, optval, optlen);

#if CI_CFG_FAKE_IPV6
  } else if (level == IPPROTO_IPV6 && s->domain == AF_INET6) {
    /* IP6 level options valid for TCP */
    return ci_get_sol_ip6(ep, s, fd, optname, optval, optlen);
#endif

  } else if (level == IPPROTO_TCP) {
    /* TCP level options valid for TCP */
    switch(optname){
    case TCP_NODELAY:
      /* gets status of TCP Nagle algorithm */
      u = ((s->s_aflags & CI_SOCK_AFLAG_NODELAY) != 0);
      goto u_out;

    case TCP_MAXSEG:
      /* gets the MSS size for this connection; unconnected sockets
       * report the classic default of 536. */
      if ((s->b.state & CI_TCP_STATE_TCP_CONN)) {
        u = tcp_eff_mss(SOCK_TO_TCP(s));
      } else {
        u = 536;
      }
      goto u_out;

# ifdef TCP_CORK
    case TCP_CORK:
      /* don't send partial framses, all partial frames sent
      ** when the option is cleared */
      u = ((s->s_aflags & CI_SOCK_AFLAG_CORK) != 0);
      goto u_out;
# endif

    case TCP_KEEPIDLE:
      {
        /* idle time for keepalives */
        u = (unsigned) c->t_ka_time_in_secs;
      }
      goto u_out;

    case TCP_KEEPINTVL:
      {
        /* time between keepalives */
        u = (unsigned) c->t_ka_intvl_in_secs;
      }
      goto u_out;

    case TCP_KEEPCNT:
      {
        /* number of keepalives before giving up */
        u = c->ka_probe_th;
      }
      goto u_out;

    case TCP_INFO:
      /* struct tcp_info to be filled */
      return ci_tcp_info_get(netif, s, (struct ci_tcp_info*) optval);

    case TCP_DEFER_ACCEPT:
      {
        /* Report the defer-accept timeout in seconds, derived from the
         * initial RTO scaled by the configured retry shift. */
        u = 0;
        if( c->tcp_defer_accept != OO_TCP_DEFER_ACCEPT_OFF ) {
          u = ci_ip_time_ticks2ms(netif, NI_CONF(netif).tconst_rto_initial);
          u = ((u + 500) / 1000) << c->tcp_defer_accept;
        }
        goto u_out;
      }

    case TCP_QUICKACK:
      {
        /* Non-zero while the connection is in faststart (quick ACKs). */
        u = 0;
        if( s->b.state & CI_TCP_STATE_TCP_CONN ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          u = ci_tcp_is_in_faststart(ts);
        }
        goto u_out;
      }

    default:
      LOG_TC( log(LPF "getsockopt: unimplemented or bad option: %i",
                  optname));
      RET_WITH_ERRNO(ENOPROTOOPT);
    }
  } else {
    SOCKOPT_RET_INVALID_LEVEL(s);
  }

  return 0;

 u_out:
  /* Common exit: copy the unsigned scratch value out to the caller,
   * respecting *optlen. */
  return ci_getsockopt_final(optval, optlen, level, &u, sizeof(u));
}
/* In this bind handler we just check that the address to which
 * are binding is either "any" or one of ours.
 * In the Linux kernel version [fd] is unused.
 *
 * Returns 0 on success, CI_SOCKET_HANDOVER for addresses we cannot
 * accelerate (pure IPv6), or -1 with errno set.
 */
int ci_tcp_bind(citp_socket* ep, const struct sockaddr* my_addr,
                socklen_t addrlen, ci_fd_t fd )
{
  struct sockaddr_in* my_addr_in;
  ci_uint16 new_port;
  ci_uint32 addr_be32;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* c = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc;

  CHECK_TEP(ep);

  my_addr_in = (struct sockaddr_in*) my_addr;

  /* Check if state of the socket is OK for bind operation. */
  /* \todo Earlier (TS_TCP( epi->tcpep.state )->tcp_source_be16) is used.
   * What is better? */
  if (my_addr == NULL)
    RET_WITH_ERRNO( EINVAL );
  if (s->b.state != CI_TCP_CLOSED)
    RET_WITH_ERRNO( EINVAL );
  /* A socket that has already been through ESTABLISHED cannot re-bind. */
  if (c->tcpflags & CI_TCPT_FLAG_WAS_ESTAB)
    RET_WITH_ERRNO( EINVAL );

  if( my_addr->sa_family != s->domain )
    RET_WITH_ERRNO( s->domain == PF_INET ? EAFNOSUPPORT : EINVAL );

  /* Bug 4884: Windows regularly uses addrlen > sizeof(struct sockaddr_in)
   * Linux is also relaxed about overlength data areas. */
  if (s->domain == PF_INET && addrlen < sizeof(struct sockaddr_in))
    RET_WITH_ERRNO( EINVAL );

#if CI_CFG_FAKE_IPV6
  if (s->domain == PF_INET6 && addrlen < SIN6_LEN_RFC2133)
    RET_WITH_ERRNO( EINVAL );

  /* Pure (non IPv4-mapped) IPv6 addresses are handed over to the OS. */
  if( s->domain == PF_INET6 && !ci_tcp_ipv6_is_ipv4(my_addr) )
    return CI_SOCKET_HANDOVER;
#endif
  addr_be32 = ci_get_ip4_addr(s->domain, my_addr);

  /* Using the port number provided, see if we can do this bind */
  new_port = my_addr_in->sin_port;

  /* If this port is on the forced-reuseport list, apply SO_REUSEPORT to
   * the backing OS socket; fall back to the legacy flag when the kernel
   * lacks SO_REUSEPORT (ENOPROTOOPT). */
  if( CITP_OPTS.tcp_reuseports != 0 && new_port != 0 ) {
    struct ci_port_list *force_reuseport;
    CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link,
                        (ci_dllist*)(ci_uintptr_t)CITP_OPTS.tcp_reuseports) {
      if( force_reuseport->port == new_port ) {
        int one = 1;
        ci_fd_t os_sock = ci_get_os_sock_fd(ep, fd);
        ci_assert(CI_IS_VALID_SOCKET(os_sock));
        rc = ci_sys_setsockopt(os_sock, SOL_SOCKET, SO_REUSEPORT, &one,
                               sizeof(one));
        ci_rel_os_sock_fd(os_sock);
        if( rc != 0 && errno == ENOPROTOOPT )
          ep->s->s_flags |= CI_SOCK_FLAG_REUSEPORT_LEGACY;
        ep->s->s_flags |= CI_SOCK_FLAG_REUSEPORT;
        /* NOTE(review): this message mentions the legacy flag but is
         * logged even when the non-legacy path succeeded - confirm
         * whether it should be inside the rc != 0 branch. */
        LOG_TC(log("%s "SF_FMT", applied legacy SO_REUSEPORT flag for port %u",
                   __FUNCTION__, SF_PRI_ARGS(ep, fd), new_port));
      }
    }
  }

  /* Legacy-reuseport sockets skip the stack-level bind here. */
  if( !(ep->s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY) )
    CI_LOGLEVEL_TRY_RET(LOG_TV,
                        __ci_bind(ep->netif, ep->s, addr_be32, &new_port));
  ep->s->s_flags |= CI_SOCK_FLAG_BOUND;

  /* Record the bound endpoint; __ci_bind may have chosen [new_port]. */
  sock_lport_be16(s) = new_port;
  sock_laddr_be32(s) = addr_be32;
  /* Multicast local addresses are not used for the control-plane laddr. */
  if( CI_IP_IS_MULTICAST(addr_be32) )
    s->cp.ip_laddr_be32 = 0;
  else
    s->cp.ip_laddr_be32 = addr_be32;
  s->cp.lport_be16 = new_port;
  sock_rport_be16(s) = sock_raddr_be32(s) = 0;

  LOG_TC(log(LPF "bind to %s:%u n_p:%u lp:%u",
             ip_addr_str(addr_be32),
             (unsigned) CI_BSWAP_BE16(my_addr_in->sin_port),
             CI_BSWAP_BE16(new_port), CI_BSWAP_BE16(sock_lport_be16(s))));

  return 0;
}
/* Returns:
 *          0                  on success
 *
 *          CI_SOCKET_ERROR (and errno set)
 *                             this is a normal error that is returned to the
 *                             the application
 *
 *          CI_SOCKET_HANDOVER we tell the upper layers to handover, no need
 *                             to set errno since it isn't a real error
 *
 * Locking: takes the netif lock; all exits go through unlock_out except
 * paths where the lock has already been dropped (loopback connect,
 * ci_tcp_shutdown, CI_CONNECT_UL_LOCK_DROPPED), which use out.
 */
int ci_tcp_connect(citp_socket* ep, const struct sockaddr* serv_addr,
                   socklen_t addrlen, ci_fd_t fd, int *p_moved)
{
  /* Address family is validated earlier. */
  struct sockaddr_in* inaddr = (struct sockaddr_in*) serv_addr;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc = 0, crc;
  ci_uint32 dst_be32;

  /* Global option: let the OS handle all TCP connects. */
  if( NI_OPTS(ep->netif).tcp_connect_handover )
    return CI_SOCKET_HANDOVER;

  /* Make sure we're up-to-date. */
  ci_netif_lock(ep->netif);
  CHECK_TEP(ep);
  ci_netif_poll(ep->netif);

  /*
   * 1. Check if state of the socket is OK for connect operation.
   */

 start_again:

  /* A pending SO_ERROR is reported (and consumed) before anything else. */
  if( (rc = ci_tcp_connect_handle_so_error(s)) != 0) {
    CI_SET_ERROR(rc, rc);
    goto unlock_out;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    /* see if progress can be made on this socket before
    ** determining status (e.g. non-blocking connect and connect poll)*/
    if( s->b.state & CI_TCP_STATE_SYNCHRONISED ) {
      /* A completed non-blocking connect reports success exactly once. */
      if( ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT ) {
        ts->tcpflags &= ~CI_TCPT_FLAG_NONBLOCK_CONNECT;
        rc = 0;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC )
        LOG_E(ci_log("Onload does not support TCP disconnect via "
                     "connect(addr->sa_family==AF_UNSPEC)"));
      CI_SET_ERROR(rc, EISCONN);
    }
    else if( s->b.state == CI_TCP_LISTEN ) {
#if CI_CFG_POSIX_CONNECT_AFTER_LISTEN
      CI_SET_ERROR(rc, EOPNOTSUPP);
#else
      if( ci_tcp_validate_sa(s->domain, serv_addr, addrlen) ) {
        /* Request should be forwarded to OS */
        rc = CI_SOCKET_HANDOVER;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC ) {
        /* Linux does listen shutdown on disconnect (AF_UNSPEC) */
        ci_netif_unlock(ep->netif);
        rc = ci_tcp_shutdown(ep, SHUT_RD, fd);
        goto out;
      } else {
        /* Linux has curious error reporting in this case */
        CI_SET_ERROR(rc, EISCONN);
      }
#endif
    }
    else {
      /* Socket is in SYN-SENT state. Let's block for receiving SYN-ACK */
      ci_assert_equal(s->b.state, CI_TCP_SYN_SENT);
      if( s->b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) )
        CI_SET_ERROR(rc, EALREADY);
      else
        goto syn_sent;
    }
    goto unlock_out;
  }

  /* Check if we've ever been connected. */
  if( ts->tcpflags & CI_TCPT_FLAG_WAS_ESTAB ) {
    CI_SET_ERROR(rc, EISCONN);
    goto unlock_out;
  }

  /*
   * 2. Check address parameter, if it's inappropriate for handover
   *    decision or handover should be done, try to to call OS and
   *    do handover on success.
   */

  if (
      /* Af first, check that address family and length is OK. */
      ci_tcp_validate_sa(s->domain, serv_addr, addrlen)
      /* rfc793 p54 if the foreign socket is unspecified return          */
      /* "error: foreign socket unspecified" (EINVAL), but keep it to OS */
      || (dst_be32 = ci_get_ip4_addr(inaddr->sin_family, serv_addr)) == 0
      /* Zero destination port is tricky as well, keep it to OS */
      || inaddr->sin_port == 0 ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  /* is this a socket that we can handle? */
  rc = ci_tcp_connect_check_dest(ep, dst_be32, inaddr->sin_port);
  if( rc ) goto unlock_out;

  if( (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) &&
      OO_SP_IS_NULL(ts->local_peer) ) {
    /* Try to connect to another stack; handover if can't */
    struct oo_op_loopback_connect op;
    op.dst_port = inaddr->sin_port;
    op.dst_addr = dst_be32;
    /* this operation unlocks netif */
    rc = oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op);
    /* NB: the netif lock is already released on every path below. */
    if( rc < 0)
      return CI_SOCKET_HANDOVER;
    if( op.out_moved )
      *p_moved = 1;
    if( op.out_rc == -EINPROGRESS )
      RET_WITH_ERRNO( EINPROGRESS );
    else if( op.out_rc == -EAGAIN )
      return -EAGAIN;
    else if( op.out_rc != 0 )
      return CI_SOCKET_HANDOVER;
    return 0;
  }

  /* filters can't handle alien source address */
  if( (s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN) &&
      ! (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  /* Kick off the user-level connect state machine (sends the SYN). */
  crc = ci_tcp_connect_ul_start(ep->netif, ts, dst_be32, inaddr->sin_port,
                                &rc);
  if( crc != CI_CONNECT_UL_OK ) {
    switch( crc ) {
    case CI_CONNECT_UL_FAIL:
      goto unlock_out;
    case CI_CONNECT_UL_LOCK_DROPPED:
      goto out;
    case CI_CONNECT_UL_START_AGAIN:
      goto start_again;
    }
  }
  CI_TCP_STATS_INC_ACTIVE_OPENS( ep->netif );

 syn_sent:
  /* Wait (or not, if non-blocking) for the SYN-ACK. */
  rc = ci_tcp_connect_ul_syn_sent(ep->netif, ts);

 unlock_out:
  ci_netif_unlock(ep->netif);
 out:
  return rc;
}
/* Wait for a socket in SYN-SENT to leave that state (SYN-ACK received,
 * reset, or timeout), honouring SO_SNDTIMEO.
 *
 * Returns 0 on success; otherwise -1 with errno set:
 *   EINPROGRESS/EALREADY - send timeout expired (first/subsequent call);
 *   EINTR                - interrupted by a signal;
 *   EBUSY                - loopback peer dropped our SYN;
 *   connection errors    - SO_ERROR / RX errno / ECONNABORTED when the
 *                          socket ended up CLOSED.
 */
static int ci_tcp_connect_ul_syn_sent(ci_netif *ni, ci_tcp_state *ts)
{
  int rc = 0;

  if( ts->s.b.state == CI_TCP_SYN_SENT ) {
    ci_netif_poll(ni);
    if( OO_SP_NOT_NULL(ts->local_peer) ) {
      /* No reason to sleep. Obviously, listener have dropped our syn
       * because of some reason. Go away! */
      ci_tcp_drop(ni, ts, EBUSY);
      RET_WITH_ERRNO(EBUSY);
    }
    /* Block until the state changes, a signal arrives, or the send
     * timeout (sndtimeo_msec) expires. */
    CI_TCP_SLEEP_WHILE(ni, ts, CI_SB_FLAG_WAKE_RX,
                       ts->s.so.sndtimeo_msec,
                       ts->s.b.state == CI_TCP_SYN_SENT, &rc);
  }

  if( rc == -EAGAIN ) {
    /* Timed out: the first timeout reports EINPROGRESS and marks the
     * socket as a non-blocking connect; later calls report EALREADY. */
    LOG_TC(log( LNT_FMT "timeout on sleep: %d",
                LNT_PRI_ARGS(ni, ts), -rc));
    if( ! (ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT) ) {
      ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
      CI_SET_ERROR(rc, EINPROGRESS);
    } else
      CI_SET_ERROR(rc, EALREADY);
    return rc;
  }
  else if( rc == -EINTR ) {
    /* Interrupted: the connect continues in the background, so mark it
     * as a non-blocking connect before reporting EINTR. */
    LOG_TC(log(LNT_FMT "connect() was interrupted by a signal",
               LNT_PRI_ARGS(ni, ts)));
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    CI_SET_ERROR(rc, EINTR);
    return rc;
  }

  /*! \TODO propagate the correct error code: CONNREFUSED, NOROUTE, etc. */

  if( ts->s.b.state == CI_TCP_CLOSED ) {
    /* Bug 3558:
     * Set OS socket state to allow/disallow next bind().
     * It is Linux hack. */
#ifdef __ci_driver__
    CI_TRY(efab_tcp_helper_set_tcp_close_os_sock(
                                       netif2tcp_helper_resource(ni),
                                       S_SP(ts)));
#else
    CI_TRY(ci_tcp_helper_set_tcp_close_os_sock(ni, S_SP(ts)));
#endif

    /* We should re-bind socket on the next use if the port was determined by
     * OS. */
    if( ! (ts->s.s_flags & CI_SOCK_FLAG_PORT_BOUND) )
      ts->s.s_flags |= CI_SOCK_FLAG_CONNECT_MUST_BIND;

    /* - if SO_ERROR is set, handle it and return this value;
     * - else if rx_errno is set, return it;
     * - else (TCP_RX_ERRNO==0, socket is CI_SHUT_RD) return ECONNABORTED */
    if( (rc = ci_tcp_connect_handle_so_error(&ts->s)) == 0)
      rc = TCP_RX_ERRNO(ts) ? TCP_RX_ERRNO(ts) : ECONNABORTED;
    CI_SET_ERROR(rc, rc);

    /* Forget the local address unless it was explicitly bound. */
    if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
      ts->s.pkt.ip.ip_saddr_be32 = 0;
      ts->s.cp.ip_laddr_be32 = 0;
    }
    return rc;
  }

  return 0;
}