/* Resolve a 5-tuple to a socket via the netif filter table.  First tries
 * an exact (full 4-tuple) match; if that misses, retries with the remote
 * address/port zeroed to find a wildcard (e.g. listening/unconnected)
 * entry.  Returns the matching socket, or 0 if neither lookup hits. */
ci_sock_cmn* __ci_netif_filter_lookup(ci_netif* netif, unsigned laddr,
                                      unsigned lport, unsigned raddr,
                                      unsigned rport, unsigned protocol)
{
  int slot;

  /* try full lookup */
  slot = ci_netif_filter_lookup(netif, laddr, lport, raddr, rport, protocol);
  LOG_NV(log(LPF "FULL LOOKUP %s:%u->%s:%u rc=%d",
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             slot));

  if( slot < 0 ) {
    /* try wildcard lookup */
    raddr = rport = 0;
    slot = ci_netif_filter_lookup(netif, laddr, lport, raddr, rport,
                                  protocol);
    LOG_NV(log(LPF "WILD LOOKUP %s:%u->%s:%u rc=%d",
               ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
               ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
               slot));
    if( slot < 0 )
      return 0;
  }

  return ID_TO_SOCK(netif, netif->filter_table->table[slot].id);
}
/* Walk the filter table invoking [callback] on every socket whose 5-tuple
 * matches (laddr, lport, raddr, rport, protocol), subject to the socket's
 * interface binding (intf_i/vlan).  Iteration stops as soon as a callback
 * returns non-zero.  If [hash_out] is non-NULL it is filled with
 * tcp_hash3() of the tuple. */
void ci_netif_filter_for_each_match(ci_netif* ni, unsigned laddr,
                                    unsigned lport, unsigned raddr,
                                    unsigned rport, unsigned protocol,
                                    int intf_i, int vlan,
                                    int (*callback)(ci_sock_cmn*, void*),
                                    void* callback_arg, ci_uint32* hash_out)
{
  ci_netif_filter_table* tbl;
  unsigned hash1, hash2 = 0;
  unsigned first;

  tbl = ni->filter_table;
  if( hash_out != NULL )
    *hash_out = tcp_hash3(tbl, laddr, lport, raddr, rport, protocol);
  hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  first = hash1;
  LOG_NV(log("%s: %s %s:%u->%s:%u hash=%u:%u at=%u",
             __FUNCTION__, CI_IP_PROTOCOL_STR(protocol),
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             first, tcp_hash2(tbl, laddr, lport, raddr, rport, protocol),
             hash1));

  /* Open-addressed double-hash probe: step by hash2 until an EMPTY slot is
   * found or the probe sequence wraps around to [first]. */
  while( 1 ) {
    int id = tbl->table[hash1].id;
    if(CI_LIKELY( id >= 0 )) {
      ci_sock_cmn* s = ID_TO_SOCK(ni, id);
      /* Branch-free 5-tuple compare: OR together the differences and test
       * for all-zero. */
      if( ((laddr    - tbl->table[hash1].laddr) |
           (lport    - sock_lport_be16(s)     ) |
           (raddr    - sock_raddr_be32(s)     ) |
           (rport    - sock_rport_be16(s)     ) |
           (protocol - sock_protocol(s)       )) == 0 )
        /* CI_IFID_BAD means the socket is not bound to a device, so any
         * interface matches; otherwise check intf/vlan. */
        if(CI_LIKELY( (s->rx_bind2dev_ifindex == CI_IFID_BAD ||
                       ci_sock_intf_check(ni, s, intf_i, vlan)) ))
          if( callback(s, callback_arg) != 0 )
            return;
    }
    else if( id == EMPTY )
      break;
    /* We defer calculating hash2 until it's needed, just to make the fast
    ** case that little bit faster. */
    if( hash1 == first )
      hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
    hash1 = (hash1 + hash2) & tbl->table_size_mask;
    if( hash1 == first ) {
      /* Probe sequence wrapped without hitting EMPTY: table is full of
       * live/tombstone entries for this sequence. */
      LOG_NV(ci_log(FN_FMT "ITERATE FULL %s:%u->%s:%u hash=%u:%u",
                    FN_PRI_ARGS(ni), ip_addr_str(laddr), lport,
                    ip_addr_str(raddr), rport, hash1, hash2));
      break;
    }
  }
}
/* Intercepted sendmmsg() for an Onload UDP socket: send up to [vlen]
 * messages, storing each message's sent-byte count in msg_len.
 *
 * Returns per sendmmsg(2) semantics: the number of messages sent when at
 * least one datagram went out; an error (from ci_udp_sendmsg()) only when
 * the very first send fails.  The previous code returned the error even
 * after some messages had been sent, losing the partial-success count. */
static int citp_udp_sendmmsg(citp_fdinfo* fdinfo, struct mmsghdr* mmsg,
                             unsigned vlen, int flags)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;
  unsigned i;
  int rc;

  Log_V(log(LPF "sendmmsg(%d, msg, %u, %#x)", fdinfo->fd, vlen,
            (unsigned) flags));

  if( vlen == 0 )
    return 0;

  a.ep = &epi->sock;
  a.fd = fdinfo->fd;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);

  /* [i] is unsigned to match [vlen] and avoid a signed/unsigned compare. */
  i = 0;
  do {
    rc = ci_udp_sendmsg(&a, &mmsg[i].msg_hdr, flags);
    if(CI_LIKELY( rc >= 0 ))
      mmsg[i].msg_len = rc;
    ++i;
  } while( rc >= 0 && i < vlen );

  if( rc >= 0 )
    return i;
  /* The i-th attempt failed, so (i - 1) messages were sent.  Report that
   * count if it is non-zero; propagate the error only when nothing was
   * sent at all (sendmmsg(2)). */
  return (i > 1) ? (int) (i - 1) : rc;
}
/* check that we can handle this destination */
/* Consult the control plane about [dst_be32:dport_be16] and decide whether
 * the connect can be accelerated.  Returns 0 when Onload can handle the
 * destination (ipcache/control-plane state updated accordingly), or
 * CI_SOCKET_HANDOVER when the connection must go via the kernel stack. */
static int ci_tcp_connect_check_dest(citp_socket* ep, ci_ip_addr_t dst_be32,
                                     int dport_be16)
{
  ci_ip_cached_hdrs* ipcache = &ep->s->pkt;

  ipcache->ip.ip_daddr_be32 = dst_be32;
  ipcache->dport_be16 = dport_be16;
  cicp_user_retrieve(ep->netif, ipcache, &ep->s->cp);

  /* NOTE(review): negative [status] values are treated as onloadable here,
   * alongside success and no-MAC -- presumably errors are resolved or
   * rejected later in the connect path; confirm against cicp_user_retrieve()
   * semantics. */
  if(CI_LIKELY( ipcache->status == retrrc_success ||
                ipcache->status == retrrc_nomac   ||
                ipcache->status < 0 )) {
    /* Onloadable. */
    if( ipcache->encap.type & CICP_LLAP_TYPE_XMIT_HASH_LAYER4 )
      /* We don't yet have a local port number, so the result of that
       * lookup may be wrong. */
      ci_ip_cache_invalidate(ipcache);
    if( ipcache->ip.ip_saddr_be32 == 0 ) {
      /* Control plane has selected a source address for us -- remember it. */
      ipcache->ip.ip_saddr_be32 = ipcache->ip_saddr_be32;
      ep->s->cp.ip_laddr_be32 = ipcache->ip_saddr_be32;
    }
    return 0;
  }
  else if( ipcache->status == retrrc_localroute ) {
    /* Destination is local: consider loopback acceleration. */
    ci_tcp_state* ts = SOCK_TO_TCP(ep->s);

    if( NI_OPTS(ep->netif).tcp_client_loopback == CITP_TCP_LOOPBACK_OFF)
      return CI_SOCKET_HANDOVER;
    ep->s->s_flags |= CI_SOCK_FLAG_BOUND_ALIEN;
    if( NI_OPTS(ep->netif).tcp_server_loopback != CITP_TCP_LOOPBACK_OFF )
      /* Look for a listening socket we can short-circuit to directly. */
      ts->local_peer = ci_tcp_connect_find_local_peer(ep->netif, dst_be32,
                                                      dport_be16);
    else
      ts->local_peer = OO_SP_NULL;

    /* Accelerate if we found a same-stack peer, or if policy allows
     * loopback via some mechanism other than same-stack. */
    if( OO_SP_NOT_NULL(ts->local_peer) ||
        NI_OPTS(ep->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_SAMESTACK ) {
      ipcache->flags |= CI_IP_CACHE_IS_LOCALROUTE;
      if( ipcache->ip.ip_saddr_be32 == 0 ) {
        /* Use the destination as the source for loopback traffic. */
        ipcache->ip.ip_saddr_be32 = dst_be32;
        ep->s->cp.ip_laddr_be32 = dst_be32;
      }
      ipcache->ether_offset = 4; /* lo is non-VLAN */
      ipcache->ip_saddr_be32 = dst_be32;
      ipcache->dport_be16 = dport_be16;
      return 0;
    }
    return CI_SOCKET_HANDOVER;
  }
  return CI_SOCKET_HANDOVER;
}
/* Look up an exact 5-tuple in the netif filter table.  Returns the table
 * index of the matching entry, -ENOENT if no entry exists, or -ELOOP if
 * the probe sequence wraps without finding an empty slot.
 * Caller must hold the netif lock. */
int ci_netif_filter_lookup(ci_netif* netif, unsigned laddr, unsigned lport,
                           unsigned raddr, unsigned rport, unsigned protocol)
{
  ci_netif_filter_table* tbl;
  unsigned bucket, start, step = 0;

  ci_assert(netif);
  ci_assert(ci_netif_is_locked(netif));
  ci_assert(netif->filter_table);

  tbl = netif->filter_table;
  bucket = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  start = bucket;

  LOG_NV(log("tbl_lookup: %s %s:%u->%s:%u hash=%u:%u at=%u",
             CI_IP_PROTOCOL_STR(protocol),
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             start, tcp_hash2(tbl, laddr, lport, raddr, rport, protocol),
             bucket));

  while( 1 ) {
    int id = tbl->table[bucket].id;

    if( CI_LIKELY(id >= 0) ) {
      ci_sock_cmn* s = ID_TO_SOCK(netif, id);
      /* OR the per-field differences together: zero iff all fields match. */
      if( ((laddr    - tbl->table[bucket].laddr) |
           (lport    - sock_lport_be16(s)      ) |
           (raddr    - sock_raddr_be32(s)      ) |
           (rport    - sock_rport_be16(s)      ) |
           (protocol - sock_protocol(s)        )) == 0 )
        return bucket;
    }
    if( id == EMPTY )
      break;
    /* Compute the secondary hash lazily: the common case hits on the very
     * first bucket and never needs it. */
    if( bucket == start )
      step = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
    bucket = (bucket + step) & tbl->table_size_mask;
    if( bucket == start ) {
      LOG_E(ci_log(FN_FMT "ERROR: LOOP %s:%u->%s:%u hash=%u:%u",
                   FN_PRI_ARGS(netif), ip_addr_str(laddr), lport,
                   ip_addr_str(raddr), rport, bucket, step));
      return -ELOOP;
    }
  }

  return -ENOENT;
}
/* Kernel file-op sendpage handler for an Onload endpoint: copy [size]
 * bytes of [page] (from [offset]) into the TCP send path.  For a socket
 * that is not in a connected TCP state, returns the negated tx_errno. */
ssize_t linux_tcp_helper_fop_sendpage(struct file* filp, struct page* page,
                                      int offset, size_t size, loff_t* ppos,
                                      int flags)
{
  ci_private_t* priv = filp->private_data;
  tcp_helper_resource_t* trs = efab_priv_to_thr(priv);
  ci_sock_cmn* sock;

  OO_DEBUG_VERB(ci_log("%s: %d:%d offset=%d size=%d flags=%x", __FUNCTION__,
                       NI_ID(&trs->netif), OO_SP_FMT(priv->sock_id), offset,
                       (int) size, flags));

  ci_assert(page);
  ci_assert_ge(offset, 0);
  ci_assert_gt(size, 0);
  ci_assert_le(offset + size, CI_PAGE_SIZE);

#ifndef MSG_SENDPAGE_NOTLAST
  /* "flags" is really "more".  Convert it. */
  if( flags )
    flags = MSG_MORE;

  /* [more] is sometimes true even for the last page.  We get a little
  ** closer to the truth by spotting that we're not reading to the end of
  ** the page. - seen on 2.6.18, but not on 2.6.26 or later */
  if( offset + size < CI_PAGE_SIZE && flags )
    flags = 0;
#endif

  sock = SP_TO_SOCK(&trs->netif, priv->sock_id);
  if(CI_UNLIKELY( ! (sock->b.state & CI_TCP_STATE_TCP_CONN) ))
    /* Closed or listening.  Return epipe.  Do not send SIGPIPE, because
    ** Linux will do it for us. */
    return -sock->tx_errno;

  return sendpage_copy(&trs->netif, SOCK_TO_TCP(sock), page, offset, size,
                       flags);
}
/* Diagnostic: log every occupied slot of the netif filter table, together
 * with (when stats are compiled in) table occupancy statistics. */
void ci_netif_filter_dump(ci_netif* ni)
{
  ci_netif_filter_table* tbl;
  unsigned slot;

  ci_assert(ni);
  tbl = ni->filter_table;

  log("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
#if CI_CFG_STATS_NETIF
  log(FN_FMT "size=%d n_entries=%i n_slots=%i max=%i mean=%i",
      FN_PRI_ARGS(ni), tbl->table_size_mask + 1,
      ni->state->stats.table_n_entries, ni->state->stats.table_n_slots,
      ni->state->stats.table_max_hops, ni->state->stats.table_mean_hops);
#endif

  for( slot = 0; slot <= tbl->table_size_mask; ++slot ) {
    int id = tbl->table[slot].id;
    if( CI_LIKELY(id >= 0) ) {
      ci_sock_cmn* s = ID_TO_SOCK(ni, id);
      unsigned laddr = tbl->table[slot].laddr;
      int lport = sock_lport_be16(s);
      unsigned raddr = sock_raddr_be32(s);
      int rport = sock_rport_be16(s);
      int protocol = sock_protocol(s);
      /* Recompute both hashes so the dump shows where the entry "wants"
       * to live versus where probing placed it. */
      unsigned hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
      unsigned hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
      log("%010d id=%-10d rt_ct=%d %s "CI_IP_PRINTF_FORMAT":%d "
          CI_IP_PRINTF_FORMAT":%d %010d:%010d", slot, id,
          tbl->table[slot].route_count, CI_IP_PROTOCOL_STR(protocol),
          CI_IP_PRINTF_ARGS(&laddr), CI_BSWAP_BE16(lport),
          CI_IP_PRINTF_ARGS(&raddr), CI_BSWAP_BE16(rport), hash1, hash2);
    }
  }

  log("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++");
}
/* Intercepted sendmsg() for an Onload UDP socket.  Rejects an msghdr with
 * a NULL iov but a non-zero iovlen (EFAULT); everything else is validated
 * by ci_udp_sendmsg() itself. */
static int citp_udp_send(citp_fdinfo* fdinfo, const struct msghdr* msg,
                         int flags)
{
  citp_sock_fdi* epi = fdi_to_sock_fdi(fdinfo);
  ci_udp_iomsg_args a;

  ci_assert(msg != NULL);

  a.ep = &epi->sock;
  a.fd = fdinfo->fd;
  a.ni = epi->sock.netif;
  a.us = SOCK_TO_UDP(epi->sock.s);

  /* NB. msg_name[len] validated in ci_udp_sendmsg(). */
  if(CI_UNLIKELY( msg->msg_iov == NULL && msg->msg_iovlen != 0 )) {
    errno = EFAULT;
    return -1;
  }

  return ci_udp_sendmsg(&a, msg, flags);
}
/* Extract the next undelivered datagram from [us]'s receive queue into
 * [piov], filling in [msg] (cmsgs, flags, address) where provided.
 * Returns the number of bytes copied, a negative error from the copy, or
 * -EAGAIN when the queue has nothing readable.  Unless MSG_PEEK is set,
 * the packet is marked consumed and delivery accounting is updated.
 * NB. [msg] can be NULL for async recv. */
static int ci_udp_recvmsg_get(ci_netif* ni, ci_udp_state* us,
                              ci_iovec_ptr* piov, ci_msghdr* msg, int flags)
{
  ci_ip_pkt_fmt* pkt;
  int rc;

  if( ci_udp_recv_q_not_readable(ni, us) )
    goto recv_q_is_empty;

  /* Pair with the producer's write barrier before reading queue state. */
  ci_rmb();

  pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  if( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED ) {
    /* We know that the receive queue is not empty and if a filter is
     * involved that there are some that have passed the filter, so if
     * this pkt is already consumed, the next one must be OK to
     * receive (and already have been filtered)
     */
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
    ci_assert( !(pkt->pf.udp.rx_flags &
                 CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) );
#if CI_CFG_ZC_RECV_FILTER
    if( us->recv_q_filter )
      /* Filter should have run on this packet and marked it */
      ci_assert( (pkt->pf.udp.rx_flags &
                  (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                   CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) );
    else
      /* Bump this along as we don't have a filter installed, but want
       * to keep the filter pointer in a sane place */
      us->recv_q.filter = us->recv_q.extract;
#endif
  }

#if CI_CFG_ZC_RECV_FILTER
  /* Skip any that the filter has dropped.  This must terminate before
   * hitting the tail because we know the queue is readable. */
  while( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED ) {
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  }
#endif

#if defined(__linux__) && !defined(__KERNEL__)
  /* Deliver control messages (timestamps etc.) only when the app asked for
   * any via cmsg_flags; otherwise report an empty control buffer. */
  if( msg != NULL && msg->msg_controllen != 0 ) {
    if( CI_UNLIKELY(us->s.cmsg_flags != 0 ) )
      ci_ip_cmsg_recv(ni, us, pkt, msg, 0);
    else
      msg->msg_controllen = 0;
  }
#endif

  us->stamp = pkt->pf.udp.rx_stamp;

  rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, pkt->pf.udp.pay_len);

  if(CI_LIKELY( rc >= 0 )) {
#if HAVE_MSG_FLAGS
    /* Short copy means the datagram was truncated into the user buffer. */
    if(CI_UNLIKELY( rc < pkt->pf.udp.pay_len && msg != NULL ))
      msg->msg_flags |= LOCAL_MSG_TRUNC;
#endif
    ci_udp_recvmsg_fill_msghdr(ni, msg, pkt, &us->s);
    if( ! (flags & MSG_PEEK) ) {
      us->recv_q.bytes_delivered += pkt->pf.udp.pay_len;
      us->recv_q.pkts_delivered += 1;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED;
#if CI_CFG_ZC_RECV_FILTER
      if( !us->recv_q_filter ) {
        /* Pretend this packet passed the filter, to keep state consistent */
        ++us->recv_q.pkts_filter_passed;
        us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
        pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      }
#endif
    }
    us->udpflags |= CI_UDPF_LAST_RECV_ON;
  }

  return rc;

 recv_q_is_empty:
  return -EAGAIN;
}
/* User-level connect, phase one: configure MSS/windows, bind a local port
 * if necessary, install filters, and enqueue the SYN.
 *
 * Returns CI_CONNECT_UL_OK on success; CI_CONNECT_UL_FAIL with *fail_rc
 * set (also used for the EINPROGRESS non-blocking case);
 * CI_CONNECT_UL_LOCK_DROPPED if interrupted while waiting for a packet
 * buffer; CI_CONNECT_UL_START_AGAIN if the caller must restart because
 * the lock was dropped while waiting. */
static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts,
                                   ci_uint32 dst_be32, unsigned dport_be16,
                                   int* fail_rc)
{
  ci_ip_pkt_fmt* pkt;
  int rc = 0;

  ci_assert(ts->s.pkt.mtu);

  /* Now that we know the outgoing route, set the MTU related values.
   * Note, even these values are speculative since the real MTU
   * could change between now and passing the packet to the lower layers
   */
  ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr);
#if CI_CFG_LIMIT_AMSS
  ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__);
#endif

  /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */
  ts->smss = CI_CFG_TCP_DEFAULT_MSS;

  /* set pmtu, eff_mss, snd_buf and adjust windows */
  ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu);
  ci_tcp_set_eff_mss(ni, ts);
  ci_tcp_set_initialcwnd(ni, ts);

  /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay
   * zero until the connection is established. */
  ts->so_sndbuf_pkts = 0;

  /*
   * 3. State and address are OK. It's address routed through our NIC.
   *    Do connect().
   */
  ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY);

  if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) {
    /* No local port yet: bind now (to the bound address if one was set,
     * else let the stack/OS pick). */
    ci_sock_cmn* s = &ts->s;
    ci_uint16 source_be16 = 0;

    if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND )
      rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    else
      rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16);
    if(CI_LIKELY( rc == 0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u",
                 LNT_PRI_ARGS(ni, ts), ip_addr_str(INADDR_ANY),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      *fail_rc = rc;
      return CI_CONNECT_UL_FAIL;
    }
    if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) {
      CI_SET_ERROR(*fail_rc, EINVAL);
      return CI_CONNECT_UL_FAIL;
    }
  }

  ci_tcp_set_peer(ts, dst_be32, dport_be16);

  /* Make sure we can get a buffer before we change state. */
  pkt = ci_netif_pkt_tx_tcp_alloc(ni);
  if( CI_UNLIKELY(! pkt) ) {
    /* NB. We've already done a poll above. */
    rc = ci_netif_pkt_wait(ni, &ts->s,
                           CI_SLEEP_NETIF_LOCKED|CI_SLEEP_NETIF_RQ);
    if( ci_netif_pkt_wait_was_interrupted(rc) ) {
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_LOCK_DROPPED;
    }
    /* OK, there are (probably) packets available - go try again.  Note we
     * jump back to the top of the function because someone may have
     * connected this socket in the mean-time, so we need to check the
     * state once more. */
    return CI_CONNECT_UL_START_AGAIN;
  }

#ifdef ONLOAD_OFE
  if( ni->ofe != NULL )
    ts->s.ofe_code_start = ofe_socktbl_find(
                        ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE,
                        tcp_laddr_be32(ts), tcp_raddr_be32(ts),
                        tcp_lport_be16(ts), tcp_rport_be16(ts));
#endif

  rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice,
                             OO_SP_NULL);
  if( rc < 0 ) {
    /* Perhaps we've run out of filters?  See if we can push a socket out
     * of timewait and steal its filter. */
    ci_assert_nequal(rc, -EFILTERSSOME);
    if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) ||
        (rc = ci_tcp_ep_set_filters(ni, S_SP(ts),
                                    ts->s.cp.so_bindtodevice,
                                    OO_SP_NULL)) < 0 ) {
      ci_assert_nequal(rc, -EFILTERSSOME);
      /* Either a different error, or our efforts to free a filter did not
       * work. */
      if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
        /* Undo the implicit source-address selection so a later retry
         * starts clean. */
        ts->s.pkt.ip.ip_saddr_be32 = 0;
        ts->s.cp.ip_laddr_be32 = 0;
      }
      ci_netif_pkt_release(ni, pkt);
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_FAIL;
    }
  }

  LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts),
             ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
             (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16),
             ip_addr_str(ts->s.pkt.ip.ip_daddr_be32),
             (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16)));

  /* We are going to send the SYN - set states appropriately */
  tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
    ci_tcp_initial_seqno(ni);
  ts->snd_max = tcp_snd_nxt(ts) + 1;

  /* Must be after initialising snd_una. */
  ci_tcp_clear_rtt_timing(ts);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN);
  ts->tcpflags &=~ CI_TCPT_FLAG_OPT_MASK;
  ts->tcpflags |= NI_OPTS(ni).syn_opts;

  if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) {
    ts->rcv_wscl =
      ci_tcp_wscl_by_buff(ni, ci_tcp_rcvbuf_established(ni, &ts->s));
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl);
  }
  else {
    ts->rcv_wscl = 0;
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0);
  }
  ci_tcp_set_rcvbuf(ni, ts);
  ci_tcp_init_rcv_wnd(ts, "CONNECT");

  /* outgoing_hdrs_len is initialised to include timestamp option. */
  if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) )
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr)+sizeof(ci_tcp_hdr);
  if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32,
                        ts->s.pkt.ip.ip_daddr_be32) )
    ts->tcpflags |= CI_TCPT_FLAG_STRIPE;
  ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT);

  /* If the app trys to send data on a socket in SYN_SENT state
  ** then the data is queued for send until the SYN gets ACKed.
  ** (rfc793 p56)
  **
  ** Receive calls on the socket should block until data arrives
  ** (rfc793 p58)
  **
  ** Clearing tx_errno and rx_errno acheive this. The transmit window
  ** is set to 1 byte which ensures that only the SYN packet gets
  ** sent until the ACK is received with more window.
  */
  ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1);
  ts->s.rx_errno = 0;
  ts->s.tx_errno = 0;

  /* Queue the SYN itself (no payload) and set ACK for what follows. */
  ci_tcp_enqueue_no_data(ts, ni, pkt);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);

  if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) {
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    LOG_TC(log( LNT_FMT "Non-blocking connect - return EINPROGRESS",
                LNT_PRI_ARGS(ni, ts)));
    CI_SET_ERROR(*fail_rc, EINPROGRESS);
    return CI_CONNECT_UL_FAIL;
  }

  return CI_CONNECT_UL_OK;
}
/* Put an Onload TCP socket into the LISTEN state: choose/validate the
 * local port, install filters, start the listen-queue timer, and mirror
 * the listen on the backing OS socket.
 *
 * Returns 0 on success, CI_SOCKET_HANDOVER when the socket must be handed
 * to the kernel stack, or a CI_SOCKET_ERROR/-errno failure. */
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog)
{
  /*
  ** ?? error handling on possible fails not handled robustly...
  ** ?? Need to check port number is valid TODO
  */

  /*! \todo If not bound then we have to be listening on all interfaces.
   * It's likely that we won't be coming through here as we have to
   * listen on the OS socket too! */
  ci_tcp_state* ts;
  ci_tcp_socket_listen* tls;
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  /* FIX: must be signed.  With the previous "unsigned ul_backlog" the
   * (ul_backlog < 0) test below was always false, so a negative backlog
   * became a huge unsigned accept-queue limit instead of being clamped to
   * max_ep_bufs. */
  int ul_backlog = backlog;
  int rc;
  oo_p sp;

  LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__, SK_PRI_ARGS(ep),
             backlog));
  CHECK_TEP(ep);

  if( NI_OPTS(netif).tcp_listen_handover )
    return CI_SOCKET_HANDOVER;
  if( !NI_OPTS(netif).tcp_server_loopback) {
    /* We should handover if the socket is bound to alien address. */
    if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
      return CI_SOCKET_HANDOVER;
  }

  /* Negative backlog means "as large as we can"; otherwise enforce the
   * configured minimum. */
  if( ul_backlog < 0 )
    ul_backlog = NI_OPTS(netif).max_ep_bufs;
  else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog )
    ul_backlog = NI_OPTS(netif).acceptq_min_backlog;

  if( s->b.state == CI_TCP_LISTEN ) {
    /* Already listening: just update the accept-queue limit here and on
     * the OS socket. */
    tls = SOCK_TO_TCP_LISTEN(s);
    tls->acceptq_max = ul_backlog;
    ci_tcp_helper_listen_os_sock(fd, ul_backlog);
    return 0;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    CI_SET_ERROR(rc, EINVAL);
    return rc;
  }

  ts = SOCK_TO_TCP(s);

  /* Bug 3376: if socket used for a previous, failed, connect then the error
   * numbers will not be as expected.  Only seen when not using listening
   * netifs (as moving the EP to the new netif resets them). */
  ts->s.tx_errno = EPIPE;
  ts->s.rx_errno = ENOTCONN;

  /* fill in address/ports and all TCP state */
  if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) {
    ci_uint16 source_be16;

    /* They haven't previously done a bind, so we need to choose
     * a port.  As we haven't been given a hint we let the OS choose. */
    source_be16 = 0;
    rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32,
                   &source_be16);
    if (CI_LIKELY( rc==0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u",
                 LNT_PRI_ARGS(ep->netif, ts),
                 ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      return rc;
    }
  }

  ci_sock_lock(netif, &ts->s.b);
  ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  tcp_raddr_be32(tls) = 0u;
  tcp_rport_be16(tls) = 0u;

  ci_assert_equal(tls->s.tx_errno, EPIPE);
  ci_assert_equal(tls->s.rx_errno, ENOTCONN);

  /* setup listen timer - do it before the first return statement,
   * because __ci_tcp_listen_to_normal() will be called on error path. */
  if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
    sp = TS_OFF(netif, tls);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid));
    ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq");
    tls->listenq_tid.param1 = S_SP(tls);
    tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN;
  }

  rc = ci_tcp_listen_init(netif, tls);
  ci_sock_unlock(netif, &ts->s.b);
  if( rc != 0 ) {
    CI_SET_ERROR(rc, -rc);
    goto listen_fail;
  }
  tls->acceptq_max = ul_backlog;

  CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats));

  /* install all the filters needed for this connection
   * - tcp_laddr_be32(ts) = 0 for IPADDR_ANY
   *
   * TODO: handle BINDTODEVICE by setting phys_port paramter to correct
   *       physical L5 port index
   * TODO: handle REUSEADDR by setting last paramter to TRUE
   */
  if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
#ifdef ONLOAD_OFE
    if( netif->ofe != NULL ) {
      tls->s.ofe_code_start = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_LISTEN,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
      tls->ofe_promote = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
    }
#endif
    rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice,
                               OO_SP_NULL);
    if( rc == -EFILTERSSOME ) {
      /* Only some interfaces got a filter: tolerate that only when the
       * no_fail option says so. */
      if( CITP_OPTS.no_fail )
        rc = 0;
      else {
        ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
        rc = -ENOBUFS;
      }
    }
    ci_assert_nequal(rc, -EFILTERSSOME);
    VERB(ci_log("%s: set_filters returned %d", __FUNCTION__, rc));
    if (rc < 0) {
      CI_SET_ERROR(rc, -rc);
      goto post_listen_fail;
    }
  }

  /*
   * Call of system listen() is required for listen any, local host
   * communications server and multi-homed server (to accept connections
   * to L5 assigned address(es), but incoming from other interfaces).
   */
#ifdef __ci_driver__
  {
    rc = efab_tcp_helper_listen_os_sock( netif2tcp_helper_resource(netif),
                                         S_SP(tls), backlog);
  }
#else
  rc = ci_tcp_helper_listen_os_sock(fd, backlog);
#endif
  if ( rc < 0 ) {
    /* clear the filter we've just set */
    ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
    goto post_listen_fail;
  }
  return 0;

 post_listen_fail:
  ci_tcp_listenq_drop_all(netif, tls);
 listen_fail:
  /* revert TCP state to a non-listening socket format */
  __ci_tcp_listen_to_normal(netif, tls);
  /* Above function sets orphan flag but we are attached to an FD. */
  ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
#ifdef __ci_driver__
  return rc;
#else
  return CI_SOCKET_ERROR;
#endif
}