ci_inline ci_tcp_state* get_ts_from_cache(ci_netif *netif, ci_tcp_state_synrecv* tsr, ci_tcp_socket_listen* tls) { ci_tcp_state *ts = NULL; #if CI_CFG_FD_CACHING if( ci_ni_dllist_not_empty(netif, &tls->epcache.cache) ) { /* Take the entry from the cache */ ci_ni_dllist_link *link = ci_ni_dllist_pop(netif, &tls->epcache.cache); ts = CI_CONTAINER (ci_tcp_state, epcache_link, link); ci_assert (ts); ci_ni_dllist_self_link(netif, &ts->epcache_link); LOG_EP(ci_log("Taking cached fd %d off cached list, (onto acceptq)", ts->cached_on_fd)); if( tcp_laddr_be32(ts) == tsr->l_addr ) { ci_tcp_state_init(netif, ts, 1); /* Shouldn't have touched these bits of state */ ci_assert(!(ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN)); ci_assert(ci_tcp_is_cached(ts)); CITP_STATS_NETIF(++netif->state->stats.sockcache_hit); CITP_STATS_TCP_LISTEN(++tls->stats.n_sockcache_hit); } else { /* Oh dear -- the tcp-state we cached was using a different local IP * address. This means we've accepted a connection from a different * interface as we did for the thing we've cached. Which means we * can't share the hardware filter after all. For now, just bung it * back on the list. */ LOG_EP(ci_log("changed interface of cached EP, re-queueing")); ci_ni_dllist_push_tail(netif, &tls->epcache.cache, &ts->epcache_link); ts = NULL; CITP_STATS_NETIF(++netif->state->stats.sockcache_miss_intmismatch); } } #endif return ts; }
/* ** promote a synrecv structure to an established socket ** ** Assumes that the caller will handle a fail if we can't allocate a new ** tcp_state structure due to memory pressure or the like */ int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls, ci_tcp_state_synrecv* tsr, ci_ip_cached_hdrs* ipcache, ci_tcp_state** ts_out) { int rc = 0; ci_assert(netif); ci_assert(tls); ci_assert(tls->s.b.state == CI_TCP_LISTEN); ci_assert(tsr); if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) { ci_tcp_state* ts; /* grab a tcp_state structure that will go onto the accept queue. We take * from the cache of EPs if any are available */ ts = get_ts_from_cache (netif, tsr, tls); if( !ts ) { /* None on cache; try allocating a new ts */ ts = ci_tcp_get_state_buf(netif); #if CI_CFG_FD_CACHING if( ts == NULL ) { /* We've reaped. Did this result in any being cached */ ts = get_ts_from_cache(netif, tsr, tls); if (ts == NULL ) { /* No -- try again to allocate. */ ts = ci_tcp_get_state_buf(netif); } else { CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap); } } #endif if( ts == NULL ) { LOG_TV(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(netif))); CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock); CI_SET_SO_ERROR(&tls->s, ENOMEM); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); return -ENOMEM; } ci_assert(ci_tcp_is_cached(ts) || (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN)); } #ifdef ONLOAD_OFE ts->s.ofe_code_start = tls->ofe_promote; #endif if( ! ci_tcp_is_cached(ts) ) { /* Need to initialise address information for use when setting filters */ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); /* "borrow" filter from listening socket. For loopback socket, we * do not need filters, but we have to take a reference of the OS * socket. */ rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice, S_SP(tls)); if( rc < 0 ) { LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc)); /* Either put this back on the list (at the head) or free it */ ci_tcp_state_free(netif, ts); return rc; } } #if CI_CFG_FD_CACHING else { /* Now set the s/w filter. We leave the hw filter in place for cached * EPS. This will probably not have the correct raddr and rport, but as * it's sharing the listening socket's filter that's not a problem. It * will be updated if this is still around when the listener is closed. */ rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr, sock_lport_be16(&tls->s), tsr->r_addr, tsr->r_port, tcp_protocol(ts)); if (rc < 0) { /* Bung it back on the cache list */ LOG_EP(ci_log("Unable to create s/w filter!")); ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link); return rc; } /* Need to initialise address information. We do this after trying to * insert the sw filter, so we can push the tcp state back onto the * cache queue with as few changes as possible if we fail to add the * sw filter. */ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd)); ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link); } #endif ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts))); ci_assert(ts->s.b.state == CI_TCP_CLOSED); ts->s.domain = tls->s.domain; cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache); ci_pmtu_state_init(netif, &ts->s, &ts->pmtus, CI_IP_TIMER_PMTU_DISCOVER); ci_pmtu_set(netif, &ts->pmtus, CI_MIN(ts->s.pkt.mtu, tsr->tcpopts.smss + sizeof(ci_tcp_hdr) + sizeof(ci_ip4_hdr))); /* If we've got SYN via local route, we can handle it */ ci_assert_equiv(ts->s.pkt.status == retrrc_localroute, OO_SP_NOT_NULL(tsr->local_peer)); if( ts->s.pkt.status == retrrc_localroute ) ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE; ts->amss = tsr->amss; /* options and flags */ ts->tcpflags = 0; ts->tcpflags |= tsr->tcpopts.flags; ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED; ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr); if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) { ts->snd_wscl = tsr->tcpopts.wscl_shft; ts->rcv_wscl = tsr->rcv_wscl; } else { ts->snd_wscl = ts->rcv_wscl = 0u; } CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl); CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl); /* Send and receive sequence numbers */ tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) = tsr->snd_isn + 1; ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0); ci_tcp_rx_set_isn(ts, tsr->rcv_nxt); tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1); if( ts->tcpflags & CI_TCPT_FLAG_TSO ) { ts->incoming_tcp_hdr_len += 12; ts->outgoing_hdrs_len += 12; ts->tspaws = ci_tcp_time_now(netif); ts->tsrecent = tsr->tspeer; ts->tslastack = tsr->rcv_nxt; } else { /* Must be after initialising snd_una. */ ci_tcp_clear_rtt_timing(ts); ts->timed_ts = tsr->timest; } /* SACK has nothing to be done. */ /* ?? ECN */ ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr))); ts->smss = tsr->tcpopts.smss; ts->c.user_mss = tls->c.user_mss; if (ts->c.user_mss && ts->c.user_mss < ts->smss) ts->smss = ts->c.user_mss; #if CI_CFG_LIMIT_SMSS ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__); #endif ci_assert(ts->smss>0); ci_tcp_set_eff_mss(netif, ts); ci_tcp_set_initialcwnd(netif, ts); /* Copy socket options & related fields that should be inherited. * Note: Windows does not inherit rcvbuf until the call to accept * completes. The assumption here is that all options can be * inherited at the same time (most won't have an effect until there * is a socket available for use by the app.). */ ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)"); /* NB. Must have already set peer (which we have). */ ci_tcp_set_established_state(netif, ts); CITP_STATS_NETIF(++netif->state->stats.synrecv2established); ci_assert(ts->ka_probes == 0); ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts)); ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK); /* Remove the synrecv structure from the listen queue, and free the ** buffer. */ if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE ) ci_free(tsr); else { ci_tcp_listenq_remove(netif, tls, tsr); ci_tcp_synrecv_free(netif, tsr); } ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT); ci_tcp_acceptq_put(netif, tls, &ts->s.b); LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x", LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags); log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x", LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts), tcp_snd_una(ts), tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts))); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); *ts_out = ts; return 0; }
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog) { /* ** ?? error handling on possible fails not handled robustly... ** ?? Need to check port number is valid TODO */ /*! \todo If not bound then we have to be listening on all interfaces. * It's likely that we won't be coming through here as we have to * listen on the OS socket too! */ ci_tcp_state* ts; ci_tcp_socket_listen* tls; ci_netif* netif = ep->netif; ci_sock_cmn* s = ep->s; unsigned ul_backlog = backlog; int rc; oo_p sp; LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__, SK_PRI_ARGS(ep), backlog)); CHECK_TEP(ep); if( NI_OPTS(netif).tcp_listen_handover ) return CI_SOCKET_HANDOVER; if( !NI_OPTS(netif).tcp_server_loopback) { /* We should handover if the socket is bound to alien address. */ if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) return CI_SOCKET_HANDOVER; } if( ul_backlog < 0 ) ul_backlog = NI_OPTS(netif).max_ep_bufs; else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog ) ul_backlog = NI_OPTS(netif).acceptq_min_backlog; if( s->b.state == CI_TCP_LISTEN ) { tls = SOCK_TO_TCP_LISTEN(s); tls->acceptq_max = ul_backlog; ci_tcp_helper_listen_os_sock(fd, ul_backlog); return 0; } if( s->b.state != CI_TCP_CLOSED ) { CI_SET_ERROR(rc, EINVAL); return rc; } ts = SOCK_TO_TCP(s); /* Bug 3376: if socket used for a previous, failed, connect then the error * numbers will not be as expected. Only seen when not using listening * netifs (as moving the EP to the new netif resets them). */ ts->s.tx_errno = EPIPE; ts->s.rx_errno = ENOTCONN; /* fill in address/ports and all TCP state */ if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) { ci_uint16 source_be16; /* They haven't previously done a bind, so we need to choose * a port. As we haven't been given a hint we let the OS choose. */ source_be16 = 0; rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16); if (CI_LIKELY( rc==0 )) { TS_TCP(ts)->tcp_source_be16 = source_be16; ts->s.cp.lport_be16 = source_be16; LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u", LNT_PRI_ARGS(ep->netif, ts), ip_addr_str(ts->s.pkt.ip.ip_saddr_be32), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16))); } else { LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc), __FILE__, __LINE__)); return rc; } } ci_sock_lock(netif, &ts->s.b); ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN); tls = SOCK_TO_TCP_LISTEN(&ts->s); tcp_raddr_be32(tls) = 0u; tcp_rport_be16(tls) = 0u; ci_assert_equal(tls->s.tx_errno, EPIPE); ci_assert_equal(tls->s.rx_errno, ENOTCONN); /* setup listen timer - do it before the first return statement, * because __ci_tcp_listen_to_normal() will be called on error path. */ if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) { sp = TS_OFF(netif, tls); OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid)); ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq"); tls->listenq_tid.param1 = S_SP(tls); tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN; } rc = ci_tcp_listen_init(netif, tls); ci_sock_unlock(netif, &ts->s.b); if( rc != 0 ) { CI_SET_ERROR(rc, -rc); goto listen_fail; } tls->acceptq_max = ul_backlog; CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats)); /* install all the filters needed for this connection * - tcp_laddr_be32(ts) = 0 for IPADDR_ANY * * TODO: handle BINDTODEVICE by setting phys_port paramter to correct * physical L5 port index * TODO: handle REUSEADDR by setting last paramter to TRUE */ if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) { #ifdef ONLOAD_OFE if( netif->ofe != NULL ) { tls->s.ofe_code_start = ofe_socktbl_find( netif->ofe, OFE_SOCKTYPE_TCP_LISTEN, tcp_laddr_be32(tls), INADDR_ANY, tcp_lport_be16(ts), 0); tls->ofe_promote = ofe_socktbl_find( netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE, tcp_laddr_be32(tls), INADDR_ANY, tcp_lport_be16(ts), 0); } #endif rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice, OO_SP_NULL); if( rc == -EFILTERSSOME ) { if( CITP_OPTS.no_fail ) rc = 0; else { ci_tcp_ep_clear_filters(netif, S_SP(tls), 0); rc = -ENOBUFS; } } ci_assert_nequal(rc, -EFILTERSSOME); VERB(ci_log("%s: set_filters returned %d", __FUNCTION__, rc)); if (rc < 0) { CI_SET_ERROR(rc, -rc); goto post_listen_fail; } } /* * Call of system listen() is required for listen any, local host * communications server and multi-homed server (to accept connections * to L5 assigned address(es), but incoming from other interfaces). */ #ifdef __ci_driver__ { rc = efab_tcp_helper_listen_os_sock( netif2tcp_helper_resource(netif), S_SP(tls), backlog); } #else rc = ci_tcp_helper_listen_os_sock(fd, backlog); #endif if ( rc < 0 ) { /* clear the filter we've just set */ ci_tcp_ep_clear_filters(netif, S_SP(tls), 0); goto post_listen_fail; } return 0; post_listen_fail: ci_tcp_listenq_drop_all(netif, tls); listen_fail: /* revert TCP state to a non-listening socket format */ __ci_tcp_listen_to_normal(netif, tls); /* Above function sets orphan flag but we are attached to an FD. */ ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); #ifdef __ci_driver__ return rc; #else return CI_SOCKET_ERROR; #endif }