/* Handle setsockopt() for a TCP socket with the stack lock already held
 * (the "_lk" suffix).  This view shows only the SOL_SOCKET handling; the
 * IPPROTO_IP branch (and the fail_inval label used below) continue beyond
 * the end of this block.
 *
 * Parameters: [ep]/[fd] identify the intercepted socket; [level]/[optname]/
 * [optval]/[optlen] are the raw setsockopt() arguments.
 * Returns 0 on success, or delegates to ci_set_sol_socket() for options
 * not special-cased here. */
static int ci_tcp_setsockopt_lk(citp_socket* ep, ci_fd_t fd, int level,
                                int optname, const void* optval,
                                socklen_t optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  /* NOTE(review): [c] is not referenced in the portion of the function
   * visible here; presumably used by platform-specific option handlers
   * further down -- confirm against the full file. */
  ci_tcp_socket_cmn* c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  int zeroval = 0;
  int rc;

  /* ?? what to do about optval and optlen checking
  ** Kernel can raise EFAULT, here we are a little in the dark.
  ** Note: If the OS sock is sync'd then we get this checking for free.
  */

  if (optlen == 0) {
    /* Match kernel behaviour: if length is 0, it treats the value as 0; and
     * some applications rely on this. */
    optval = &zeroval;
    optlen = sizeof(zeroval);
  }

  /* If you're adding to this please remember to look in common_sockopts.c
   * and decide if the option is common to all protocols. */

  if(level == SOL_SOCKET) {
    switch(optname) {
    case SO_KEEPALIVE:
      /* Over-ride the default common handler.
       * Enable sending of keep-alive messages */
      if( (rc = opt_not_ok(optval, optlen, unsigned)) )
        goto fail_inval;

      if( *(unsigned*) optval ) {
        /* Enabling: remember the old flags so the timer is only started on
         * the off->on transition, not when keepalive was already set. */
        unsigned prev_flags = s->s_flags;
        s->s_flags |= CI_SOCK_FLAG_KALIVE;
        /* Set KEEPALIVE timer only if we are not in
        ** CLOSE or LISTENING state. */
        if( s->b.state != CI_TCP_CLOSED && s->b.state != CI_TCP_LISTEN &&
            !(prev_flags & CI_SOCK_FLAG_KALIVE) ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          LOG_TV(log("%s: "NSS_FMT" run KEEPALIVE timer from setsockopt()",
                     __FUNCTION__, NSS_PRI_ARGS(netif, s)));
          /* Off->on transition, so no probes can be outstanding. */
          ci_assert(ts->ka_probes == 0);
          ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
        }
      }
      else {
        /* Disabling: clear the flag; for non-listening sockets also stop
         * any running keepalive timer and reset the probe counter. */
        s->s_flags &=~ CI_SOCK_FLAG_KALIVE;
        if( s->b.state != CI_TCP_LISTEN ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          ci_tcp_kalive_check_and_clear(netif, ts);
          ts->ka_probes = 0;
        }
      }
      break;

    default:
      {
        /* Common socket level options */
        return ci_set_sol_socket(netif, s, optname, optval, optlen);
      }
    }
  }
  else if( level == IPPROTO_IP ) {
/* In this bind handler we just check that the address to which
 * are binding is either "any" or one of ours.
 * In the Linux kernel version [fd] is unused.
 *
 * Returns 0 on success, CI_SOCKET_HANDOVER to push the socket to the OS,
 * or sets errno and returns via RET_WITH_ERRNO on validation failure. */
int ci_tcp_bind(citp_socket* ep, const struct sockaddr* my_addr,
                socklen_t addrlen, ci_fd_t fd )
{
  struct sockaddr_in* my_addr_in;
  ci_uint16 new_port;
  ci_uint32 addr_be32;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* c = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc;

  CHECK_TEP(ep);

  my_addr_in = (struct sockaddr_in*) my_addr;

  /* Check if state of the socket is OK for bind operation. */
  /* \todo Earlier (TS_TCP( epi->tcpep.state )->tcp_source_be16) is used.
   * What is better? */
  if (my_addr == NULL)
    RET_WITH_ERRNO( EINVAL );
  /* Bind is only legal on a fresh, closed socket that was never connected. */
  if (s->b.state != CI_TCP_CLOSED)
    RET_WITH_ERRNO( EINVAL );
  if (c->tcpflags & CI_TCPT_FLAG_WAS_ESTAB)
    RET_WITH_ERRNO( EINVAL );

  if( my_addr->sa_family != s->domain )
    RET_WITH_ERRNO( s->domain == PF_INET ? EAFNOSUPPORT : EINVAL );

  /* Bug 4884: Windows regularly uses addrlen > sizeof(struct sockaddr_in)
   * Linux is also relaxed about overlength data areas. */
  if (s->domain == PF_INET && addrlen < sizeof(struct sockaddr_in))
    RET_WITH_ERRNO( EINVAL );

#if CI_CFG_FAKE_IPV6
  if (s->domain == PF_INET6 && addrlen < SIN6_LEN_RFC2133)
    RET_WITH_ERRNO( EINVAL );

  /* Only IPv4-mapped IPv6 addresses are accelerated; anything else is
   * handed over to the OS stack. */
  if( s->domain == PF_INET6 && !ci_tcp_ipv6_is_ipv4(my_addr) )
    return CI_SOCKET_HANDOVER;
#endif
  addr_be32 = ci_get_ip4_addr(s->domain, my_addr);

  /* Using the port number provided, see if we can do this bind */
  new_port = my_addr_in->sin_port;

  if( CITP_OPTS.tcp_reuseports != 0 && new_port != 0 ) {
    /* The port is on the configured forced-SO_REUSEPORT list: apply
     * SO_REUSEPORT on the backing OS socket as well so the two agree. */
    struct ci_port_list *force_reuseport;
    CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link,
                        (ci_dllist*)(ci_uintptr_t)CITP_OPTS.tcp_reuseports) {
      if( force_reuseport->port == new_port ) {
        int one = 1;
        ci_fd_t os_sock = ci_get_os_sock_fd(ep, fd);
        ci_assert(CI_IS_VALID_SOCKET(os_sock));
        rc = ci_sys_setsockopt(os_sock, SOL_SOCKET, SO_REUSEPORT, &one,
                               sizeof(one));
        ci_rel_os_sock_fd(os_sock);
        /* ENOPROTOOPT => the kernel predates SO_REUSEPORT; fall back to
         * the legacy emulation flag. */
        if( rc != 0 && errno == ENOPROTOOPT )
          ep->s->s_flags |= CI_SOCK_FLAG_REUSEPORT_LEGACY;
        ep->s->s_flags |= CI_SOCK_FLAG_REUSEPORT;
        LOG_TC(log("%s "SF_FMT", applied legacy SO_REUSEPORT flag for port %u",
                   __FUNCTION__, SF_PRI_ARGS(ep, fd), new_port));
      }
    }
  }

  /* In legacy-reuseport mode the real bind is deferred; otherwise bind in
   * the stack now.  NOTE: CI_LOGLEVEL_TRY_RET returns on failure. */
  if( !(ep->s->s_flags & CI_SOCK_FLAG_REUSEPORT_LEGACY) )
    CI_LOGLEVEL_TRY_RET(LOG_TV,
                        __ci_bind(ep->netif, ep->s, addr_be32, &new_port));
  ep->s->s_flags |= CI_SOCK_FLAG_BOUND;
  /* Record the (possibly kernel-assigned) port and local address on the
   * socket and in the control-plane fields used for filter lookup. */
  sock_lport_be16(s) = new_port;
  sock_laddr_be32(s) = addr_be32;
  if( CI_IP_IS_MULTICAST(addr_be32) )
    s->cp.ip_laddr_be32 = 0;  /* multicast laddr is not usable for filters */
  else
    s->cp.ip_laddr_be32 = addr_be32;
  s->cp.lport_be16 = new_port;
  /* Not connected yet: clear remote addr/port. */
  sock_rport_be16(s) = sock_raddr_be32(s) = 0;

  LOG_TC(log(LPF "bind to %s:%u n_p:%u lp:%u", ip_addr_str(addr_be32),
             (unsigned) CI_BSWAP_BE16(my_addr_in->sin_port),
             CI_BSWAP_BE16(new_port), CI_BSWAP_BE16(sock_lport_be16(s))));

  return 0;
}
/* getsockopt() handler for TCP sockets.  Dispatches SOL_SOCKET / IPPROTO_IP
 * / IPPROTO_IPV6 levels to common helpers and implements the IPPROTO_TCP
 * options locally.  Scalar results are funnelled through [u] and the u_out
 * label to ci_getsockopt_final(), which copies out respecting *optlen.
 * [fd] is unused in the kernel version */
int ci_tcp_getsockopt(citp_socket* ep, ci_fd_t fd, int level,
                      int optname, void *optval, socklen_t *optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  /* NOTE(review): [c] is declared only under this platform #if, yet the
   * TCP_KEEPIDLE/KEEPINTVL/KEEPCNT cases below use it unconditionally --
   * presumably those optnames are only defined on these platforms; verify. */
  ci_tcp_socket_cmn *c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  unsigned u = 0;

  /* NOTE: The setsockopt() call is reflected into the os socket to
   * keep the two in sync - it's assumed that we know everything
   * to allow us to give good answers here - and therefore we don't
   * bother the os with the get call */

  /* ?? what to do about optval and optlen checking
   * Kernel can raise EFAULT, here we are a little in the dark.
   * - sockcall_intercept.c checks that optlen is non-NULL and if *optlen
   *   is non-zero that optval is non-NULL, returning EFAULT if false
   */

  if(level == SOL_SOCKET) {
    /* Common SOL_SOCKET handler */
    return ci_get_sol_socket(netif, s, optname, optval, optlen);

  } else if (level == IPPROTO_IP) {
    /* IP level options valid for TCP */
    return ci_get_sol_ip(ep, s, fd, optname, optval, optlen);

#if CI_CFG_FAKE_IPV6
  } else if (level == IPPROTO_IPV6 && s->domain == AF_INET6) {
    /* IP6 level options valid for TCP */
    return ci_get_sol_ip6(ep, s, fd, optname, optval, optlen);
#endif

  } else if (level == IPPROTO_TCP) {
    /* TCP level options valid for TCP */
    switch(optname){
    case TCP_NODELAY:
      /* gets status of TCP Nagle algorithm */
      u = ((s->s_aflags & CI_SOCK_AFLAG_NODELAY) != 0);
      goto u_out;

    case TCP_MAXSEG:
      /* gets the MSS size for this connection */
      if ((s->b.state & CI_TCP_STATE_TCP_CONN)) {
        u = tcp_eff_mss(SOCK_TO_TCP(s));
      } else {
        /* Not connected: report the classic default MSS. */
        u = 536;
      }
      goto u_out;

# ifdef TCP_CORK
    case TCP_CORK:
      /* don't send partial frames, all partial frames sent
      ** when the option is cleared */
      u = ((s->s_aflags & CI_SOCK_AFLAG_CORK) != 0);
      goto u_out;
# endif

    case TCP_KEEPIDLE:
      {
        /* idle time for keepalives */
        u = (unsigned) c->t_ka_time_in_secs;
      }
      goto u_out;

    case TCP_KEEPINTVL:
      {
        /* time between keepalives */
        u = (unsigned) c->t_ka_intvl_in_secs;
      }
      goto u_out;

    case TCP_KEEPCNT:
      {
        /* number of keepalives before giving up */
        u = c->ka_probe_th;
      }
      goto u_out;

    case TCP_INFO:
      /* struct tcp_info to be filled */
      return ci_tcp_info_get(netif, s, (struct ci_tcp_info*) optval);

    case TCP_DEFER_ACCEPT:
      {
        u = 0;
        if( c->tcp_defer_accept != OO_TCP_DEFER_ACCEPT_OFF ) {
          /* Convert the stored retransmit-count style setting back into a
           * seconds figure, mirroring the initial-RTO doubling. */
          u = ci_ip_time_ticks2ms(netif, NI_CONF(netif).tconst_rto_initial);
          u = ((u + 500) / 1000) << c->tcp_defer_accept;
        }
        goto u_out;
      }

    case TCP_QUICKACK:
      {
        /* Reported as "in faststart" for connected sockets, else 0. */
        u = 0;
        if( s->b.state & CI_TCP_STATE_TCP_CONN ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          u = ci_tcp_is_in_faststart(ts);
        }
        goto u_out;
      }

    default:
      LOG_TC( log(LPF "getsockopt: unimplemented or bad option: %i",
                  optname));
      RET_WITH_ERRNO(ENOPROTOOPT);
    }
  } else {
    SOCKOPT_RET_INVALID_LEVEL(s);
  }

  return 0;

 u_out:
  /* Copy the scalar result out to the caller honouring *optlen. */
  return ci_getsockopt_final(optval, optlen, level, &u, sizeof(u));
}
/* Returns:
 *          0                  on success
 *
 *          CI_SOCKET_ERROR (and errno set)
 *                             this is a normal error that is returned to
 *                             the application
 *
 *          CI_SOCKET_HANDOVER we tell the upper layers to handover, no need
 *                             to set errno since it isn't a real error
 *
 * Locking: takes the netif lock on entry and releases it on every exit
 * path -- either explicitly at unlock_out, or implicitly inside a callee
 * (oo_resource_op for loopback, or ci_tcp_connect_ul_* paths that drop the
 * lock and return via [out]). */
int ci_tcp_connect(citp_socket* ep, const struct sockaddr* serv_addr,
                   socklen_t addrlen, ci_fd_t fd, int *p_moved)
{
  /* Address family is validated earlier. */
  struct sockaddr_in* inaddr = (struct sockaddr_in*) serv_addr;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc = 0, crc;
  ci_uint32 dst_be32;

  /* Configuration may force all connects to the OS stack. */
  if( NI_OPTS(ep->netif).tcp_connect_handover )
    return CI_SOCKET_HANDOVER;

  /* Make sure we're up-to-date. */
  ci_netif_lock(ep->netif);
  CHECK_TEP(ep);
  ci_netif_poll(ep->netif);

  /*
   * 1. Check if state of the socket is OK for connect operation.
   */

 start_again:

  if( (rc = ci_tcp_connect_handle_so_error(s)) != 0) {
    CI_SET_ERROR(rc, rc);
    goto unlock_out;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    /* see if progress can be made on this socket before
    ** determining status (e.g. non-blocking connect and connect poll)*/
    if( s->b.state & CI_TCP_STATE_SYNCHRONISED ) {
      /* A completed non-blocking connect being polled: report success once,
       * then EISCONN thereafter. */
      if( ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT ) {
        ts->tcpflags &= ~CI_TCPT_FLAG_NONBLOCK_CONNECT;
        rc = 0;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC )
        LOG_E(ci_log("Onload does not support TCP disconnect via "
                     "connect(addr->sa_family==AF_UNSPEC)"));
      CI_SET_ERROR(rc, EISCONN);
    }
    else if( s->b.state == CI_TCP_LISTEN ) {
#if CI_CFG_POSIX_CONNECT_AFTER_LISTEN
      CI_SET_ERROR(rc, EOPNOTSUPP);
#else
      if( ci_tcp_validate_sa(s->domain, serv_addr, addrlen) ) {
        /* Request should be forwarded to OS */
        rc = CI_SOCKET_HANDOVER;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC ) {
        /* Linux does listen shutdown on disconnect (AF_UNSPEC) */
        ci_netif_unlock(ep->netif);
        rc = ci_tcp_shutdown(ep, SHUT_RD, fd);
        goto out;
      } else {
        /* Linux has curious error reporting in this case */
        CI_SET_ERROR(rc, EISCONN);
      }
#endif
    }
    else {
      /* Socket is in SYN-SENT state. Let's block for receiving SYN-ACK */
      ci_assert_equal(s->b.state, CI_TCP_SYN_SENT);
      if( s->b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) )
        CI_SET_ERROR(rc, EALREADY);
      else
        goto syn_sent;
    }
    goto unlock_out;
  }

  /* Check if we've ever been connected. */
  if( ts->tcpflags & CI_TCPT_FLAG_WAS_ESTAB ) {
    CI_SET_ERROR(rc, EISCONN);
    goto unlock_out;
  }

  /*
   * 2. Check address parameter, if it's inappropriate for handover
   *    decision or handover should be done, try to call OS and
   *    do handover on success.
   */

  if (
    /* At first, check that address family and length is OK. */
    ci_tcp_validate_sa(s->domain, serv_addr, addrlen)
    /* rfc793 p54 if the foreign socket is unspecified return          */
    /* "error: foreign socket unspecified" (EINVAL), but keep it to OS */
    || (dst_be32 = ci_get_ip4_addr(inaddr->sin_family, serv_addr)) == 0
    /* Zero destination port is tricky as well, keep it to OS */
    || inaddr->sin_port == 0 ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  /* is this a socket that we can handle? */
  rc = ci_tcp_connect_check_dest(ep, dst_be32, inaddr->sin_port);
  if( rc )  goto unlock_out;

  if( (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) &&
      OO_SP_IS_NULL(ts->local_peer) ) {
    /* Try to connect to another stack; handover if can't */
    struct oo_op_loopback_connect op;
    op.dst_port = inaddr->sin_port;
    op.dst_addr = dst_be32;
    /* this operation unlocks netif -- so all returns below must NOT go
     * through unlock_out */
    rc = oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op);
    if( rc < 0)
      return CI_SOCKET_HANDOVER;
    if( op.out_moved )
      *p_moved = 1;  /* tell the caller the socket moved to another stack */
    if( op.out_rc == -EINPROGRESS )
      RET_WITH_ERRNO( EINPROGRESS );
    else if( op.out_rc == -EAGAIN )
      return -EAGAIN;
    else if( op.out_rc != 0 )
      return CI_SOCKET_HANDOVER;
    return 0;
  }

  /* filters can't handle alien source address */
  if( (s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN) &&
      ! (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  /* Start the user-level connect state machine; [rc] carries the errno-style
   * result when crc == CI_CONNECT_UL_FAIL. */
  crc = ci_tcp_connect_ul_start(ep->netif, ts, dst_be32, inaddr->sin_port,
                                &rc);
  if( crc != CI_CONNECT_UL_OK ) {
    switch( crc ) {
    case CI_CONNECT_UL_FAIL:
      goto unlock_out;
    case CI_CONNECT_UL_LOCK_DROPPED:
      /* Callee already released the netif lock. */
      goto out;
    case CI_CONNECT_UL_START_AGAIN:
      /* Stack state changed under us; re-run the state checks. */
      goto start_again;
    }
  }
  CI_TCP_STATS_INC_ACTIVE_OPENS( ep->netif );

 syn_sent:
  /* Block (or poll) for SYN-ACK; releases/retakes the lock internally as
   * needed and leaves it held on return. */
  rc = ci_tcp_connect_ul_syn_sent(ep->netif, ts);

 unlock_out:
  ci_netif_unlock(ep->netif);
 out:
  return rc;
}