/* ** promote a synrecv structure to an established socket ** ** Assumes that the caller will handle a fail if we can't allocate a new ** tcp_state structure due to memory pressure or the like */ int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls, ci_tcp_state_synrecv* tsr, ci_ip_cached_hdrs* ipcache, ci_tcp_state** ts_out) { int rc = 0; ci_assert(netif); ci_assert(tls); ci_assert(tls->s.b.state == CI_TCP_LISTEN); ci_assert(tsr); if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) { ci_tcp_state* ts; /* grab a tcp_state structure that will go onto the accept queue. We take * from the cache of EPs if any are available */ ts = get_ts_from_cache (netif, tsr, tls); if( !ts ) { /* None on cache; try allocating a new ts */ ts = ci_tcp_get_state_buf(netif); #if CI_CFG_FD_CACHING if( ts == NULL ) { /* We've reaped. Did this result in any being cached */ ts = get_ts_from_cache(netif, tsr, tls); if (ts == NULL ) { /* No -- try again to allocate. */ ts = ci_tcp_get_state_buf(netif); } else { CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap); } } #endif if( ts == NULL ) { LOG_TV(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(netif))); CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock); CI_SET_SO_ERROR(&tls->s, ENOMEM); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); return -ENOMEM; } ci_assert(ci_tcp_is_cached(ts) || (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN)); } #ifdef ONLOAD_OFE ts->s.ofe_code_start = tls->ofe_promote; #endif if( ! ci_tcp_is_cached(ts) ) { /* Need to initialise address information for use when setting filters */ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); /* "borrow" filter from listening socket. For loopback socket, we * do not need filters, but we have to take a reference of the OS * socket. */ rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice, S_SP(tls)); if( rc < 0 ) { LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc)); /* Either put this back on the list (at the head) or free it */ ci_tcp_state_free(netif, ts); return rc; } } #if CI_CFG_FD_CACHING else { /* Now set the s/w filter. We leave the hw filter in place for cached * EPS. This will probably not have the correct raddr and rport, but as * it's sharing the listening socket's filter that's not a problem. It * will be updated if this is still around when the listener is closed. */ rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr, sock_lport_be16(&tls->s), tsr->r_addr, tsr->r_port, tcp_protocol(ts)); if (rc < 0) { /* Bung it back on the cache list */ LOG_EP(ci_log("Unable to create s/w filter!")); ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link); return rc; } /* Need to initialise address information. We do this after trying to * insert the sw filter, so we can push the tcp state back onto the * cache queue with as few changes as possible if we fail to add the * sw filter. */ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd)); ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link); } #endif ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts))); ci_assert(ts->s.b.state == CI_TCP_CLOSED); ts->s.domain = tls->s.domain; cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache); ci_pmtu_state_init(netif, &ts->s, &ts->pmtus, CI_IP_TIMER_PMTU_DISCOVER); ci_pmtu_set(netif, &ts->pmtus, CI_MIN(ts->s.pkt.mtu, tsr->tcpopts.smss + sizeof(ci_tcp_hdr) + sizeof(ci_ip4_hdr))); /* If we've got SYN via local route, we can handle it */ ci_assert_equiv(ts->s.pkt.status == retrrc_localroute, OO_SP_NOT_NULL(tsr->local_peer)); if( ts->s.pkt.status == retrrc_localroute ) ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE; ts->amss = tsr->amss; /* options and flags */ ts->tcpflags = 0; ts->tcpflags |= tsr->tcpopts.flags; ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED; ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr); if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) { ts->snd_wscl = tsr->tcpopts.wscl_shft; ts->rcv_wscl = tsr->rcv_wscl; } else { ts->snd_wscl = ts->rcv_wscl = 0u; } CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl); CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl); /* Send and receive sequence numbers */ tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) = tsr->snd_isn + 1; ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0); ci_tcp_rx_set_isn(ts, tsr->rcv_nxt); tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1); if( ts->tcpflags & CI_TCPT_FLAG_TSO ) { ts->incoming_tcp_hdr_len += 12; ts->outgoing_hdrs_len += 12; ts->tspaws = ci_tcp_time_now(netif); ts->tsrecent = tsr->tspeer; ts->tslastack = tsr->rcv_nxt; } else { /* Must be after initialising snd_una. */ ci_tcp_clear_rtt_timing(ts); ts->timed_ts = tsr->timest; } /* SACK has nothing to be done. */ /* ?? ECN */ ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr))); ts->smss = tsr->tcpopts.smss; ts->c.user_mss = tls->c.user_mss; if (ts->c.user_mss && ts->c.user_mss < ts->smss) ts->smss = ts->c.user_mss; #if CI_CFG_LIMIT_SMSS ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__); #endif ci_assert(ts->smss>0); ci_tcp_set_eff_mss(netif, ts); ci_tcp_set_initialcwnd(netif, ts); /* Copy socket options & related fields that should be inherited. * Note: Windows does not inherit rcvbuf until the call to accept * completes. The assumption here is that all options can be * inherited at the same time (most won't have an effect until there * is a socket available for use by the app.). */ ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)"); /* NB. Must have already set peer (which we have). */ ci_tcp_set_established_state(netif, ts); CITP_STATS_NETIF(++netif->state->stats.synrecv2established); ci_assert(ts->ka_probes == 0); ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts)); ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK); /* Remove the synrecv structure from the listen queue, and free the ** buffer. */ if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE ) ci_free(tsr); else { ci_tcp_listenq_remove(netif, tls, tsr); ci_tcp_synrecv_free(netif, tsr); } ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT); ci_tcp_acceptq_put(netif, tls, &ts->s.b); LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x", LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags); log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x", LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts), tcp_snd_una(ts), tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts))); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); *ts_out = ts; return 0; }
static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts, ci_uint32 dst_be32, unsigned dport_be16, int* fail_rc) { ci_ip_pkt_fmt* pkt; int rc = 0; ci_assert(ts->s.pkt.mtu); /* Now that we know the outgoing route, set the MTU related values. * Note, even these values are speculative since the real MTU * could change between now and passing the packet to the lower layers */ ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr); #if CI_CFG_LIMIT_AMSS ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__); #endif /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */ ts->smss = CI_CFG_TCP_DEFAULT_MSS; /* set pmtu, eff_mss, snd_buf and adjust windows */ ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu); ci_tcp_set_eff_mss(ni, ts); ci_tcp_set_initialcwnd(ni, ts); /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay * zero until the connection is established. */ ts->so_sndbuf_pkts = 0; /* * 3. State and address are OK. It's address routed through our NIC. * Do connect(). */ ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY); if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) { ci_sock_cmn* s = &ts->s; ci_uint16 source_be16 = 0; if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND ) rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16); else rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16); if(CI_LIKELY( rc == 0 )) { TS_TCP(ts)->tcp_source_be16 = source_be16; ts->s.cp.lport_be16 = source_be16; LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u", LNT_PRI_ARGS(ni, ts), ip_addr_str(INADDR_ANY), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16))); } else { LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc), __FILE__, __LINE__)); *fail_rc = rc; return CI_CONNECT_UL_FAIL; } if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) { CI_SET_ERROR(*fail_rc, EINVAL); return CI_CONNECT_UL_FAIL; } } ci_tcp_set_peer(ts, dst_be32, dport_be16); /* Make sure we can get a buffer before we change state. */ pkt = ci_netif_pkt_tx_tcp_alloc(ni); if( CI_UNLIKELY(! pkt) ) { /* NB. We've already done a poll above. */ rc = ci_netif_pkt_wait(ni, &ts->s, CI_SLEEP_NETIF_LOCKED|CI_SLEEP_NETIF_RQ); if( ci_netif_pkt_wait_was_interrupted(rc) ) { CI_SET_ERROR(*fail_rc, -rc); return CI_CONNECT_UL_LOCK_DROPPED; } /* OK, there are (probably) packets available - go try again. Note we * jump back to the top of the function because someone may have * connected this socket in the mean-time, so we need to check the * state once more. */ return CI_CONNECT_UL_START_AGAIN; } #ifdef ONLOAD_OFE if( ni->ofe != NULL ) ts->s.ofe_code_start = ofe_socktbl_find( ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE, tcp_laddr_be32(ts), tcp_raddr_be32(ts), tcp_lport_be16(ts), tcp_rport_be16(ts)); #endif rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice, OO_SP_NULL); if( rc < 0 ) { /* Perhaps we've run out of filters? See if we can push a socket out * of timewait and steal its filter. */ ci_assert_nequal(rc, -EFILTERSSOME); if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) || (rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice, OO_SP_NULL)) < 0 ) { ci_assert_nequal(rc, -EFILTERSSOME); /* Either a different error, or our efforts to free a filter did not * work. */ if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) { ts->s.pkt.ip.ip_saddr_be32 = 0; ts->s.cp.ip_laddr_be32 = 0; } ci_netif_pkt_release(ni, pkt); CI_SET_ERROR(*fail_rc, -rc); return CI_CONNECT_UL_FAIL; } } LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts), ip_addr_str(ts->s.pkt.ip.ip_saddr_be32), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16), ip_addr_str(ts->s.pkt.ip.ip_daddr_be32), (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16))); /* We are going to send the SYN - set states appropriately */ tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) = ci_tcp_initial_seqno(ni); ts->snd_max = tcp_snd_nxt(ts) + 1; /* Must be after initialising snd_una. */ ci_tcp_clear_rtt_timing(ts); ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN); ts->tcpflags &=~ CI_TCPT_FLAG_OPT_MASK; ts->tcpflags |= NI_OPTS(ni).syn_opts; if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) { ts->rcv_wscl = ci_tcp_wscl_by_buff(ni, ci_tcp_rcvbuf_established(ni, &ts->s)); CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl); } else { ts->rcv_wscl = 0; CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0); } ci_tcp_set_rcvbuf(ni, ts); ci_tcp_init_rcv_wnd(ts, "CONNECT"); /* outgoing_hdrs_len is initialised to include timestamp option. */ if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) ) ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr)+sizeof(ci_tcp_hdr); if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32, ts->s.pkt.ip.ip_daddr_be32) ) ts->tcpflags |= CI_TCPT_FLAG_STRIPE; ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT); /* If the app trys to send data on a socket in SYN_SENT state ** then the data is queued for send until the SYN gets ACKed. ** (rfc793 p56) ** ** Receive calls on the socket should block until data arrives ** (rfc793 p58) ** ** Clearing tx_errno and rx_errno acheive this. The transmit window ** is set to 1 byte which ensures that only the SYN packet gets ** sent until the ACK is received with more window. */ ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1); ts->s.rx_errno = 0; ts->s.tx_errno = 0; ci_tcp_enqueue_no_data(ts, ni, pkt); ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK); if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) { ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT; LOG_TC(log( LNT_FMT "Non-blocking connect - return EINPROGRESS", LNT_PRI_ARGS(ni, ts))); CI_SET_ERROR(*fail_rc, EINPROGRESS); return CI_CONNECT_UL_FAIL; } return CI_CONNECT_UL_OK; }