/* Look up the socket for a connection 4-tuple plus protocol.
 *
 * An exact match on (laddr, lport, raddr, rport, protocol) is tried first.
 * If that misses, the lookup is repeated with the remote address and port
 * zeroed, i.e. against a wildcard entry.
 *
 * Returns the matching socket, or 0 when neither lookup finds an entry.
 */
ci_sock_cmn* __ci_netif_filter_lookup(ci_netif* netif, unsigned laddr,
                                      unsigned lport, unsigned raddr,
                                      unsigned rport, unsigned protocol)
{
  int idx;

  /* First pass: fully-specified tuple. */
  idx = ci_netif_filter_lookup(netif, laddr, lport, raddr, rport, protocol);
  LOG_NV(log(LPF "FULL LOOKUP %s:%u->%s:%u rc=%d",
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             idx));
  if(CI_LIKELY( idx >= 0 ))
    return ID_TO_SOCK(netif, netif->filter_table->table[idx].id);

  /* Second pass: same local end, remote end wildcarded. */
  raddr = 0;
  rport = 0;
  idx = ci_netif_filter_lookup(netif, laddr, lport, raddr, rport, protocol);
  LOG_NV(log(LPF "WILD LOOKUP %s:%u->%s:%u rc=%d",
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             idx));

  return CI_LIKELY( idx >= 0 )
           ? ID_TO_SOCK(netif, netif->filter_table->table[idx].id)
           : 0;
}
/* Invoke [callback] on every socket in the filter table whose tuple matches
 * (laddr, lport, raddr, rport, protocol).
 *
 * A matching socket is only passed to the callback if it is not bound to a
 * device (rx_bind2dev_ifindex == CI_IFID_BAD) or ci_sock_intf_check()
 * accepts it for [intf_i]/[vlan].  Iteration stops as soon as the callback
 * returns non-zero, when an EMPTY slot is reached, or when the probe
 * sequence wraps back to its starting slot.
 *
 * If [hash_out] is non-NULL it receives tcp_hash3() of the tuple.
 */
void ci_netif_filter_for_each_match(ci_netif* ni, unsigned laddr,
                                    unsigned lport, unsigned raddr,
                                    unsigned rport, unsigned protocol,
                                    int intf_i, int vlan,
                                    int (*callback)(ci_sock_cmn*, void*),
                                    void* callback_arg, ci_uint32* hash_out)
{
  ci_netif_filter_table* tbl = ni->filter_table;
  unsigned start, slot;
  unsigned step = 0;

  if( hash_out != NULL )
    *hash_out = tcp_hash3(tbl, laddr, lport, raddr, rport, protocol);

  start = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  slot = start;

  LOG_NV(log("%s: %s %s:%u->%s:%u hash=%u:%u at=%u",
             __FUNCTION__, CI_IP_PROTOCOL_STR(protocol),
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             start, tcp_hash2(tbl, laddr, lport, raddr, rport, protocol),
             slot));

  for( ; ; ) {
    int id = tbl->table[slot].id;

    if(CI_LIKELY( id >= 0 )) {
      ci_sock_cmn* s = ID_TO_SOCK(ni, id);

      /* The tuple comparison is a single OR of differences, so it is zero
       * only when every field matches.  The device check and the callback
       * are evaluated in that order, and only for a matching tuple.
       */
      if( ((laddr    - tbl->table[slot].laddr) |
           (lport    - sock_lport_be16(s)    ) |
           (raddr    - sock_raddr_be32(s)    ) |
           (rport    - sock_rport_be16(s)    ) |
           (protocol - sock_protocol(s)      )) == 0 &&
          CI_LIKELY( (s->rx_bind2dev_ifindex == CI_IFID_BAD ||
                      ci_sock_intf_check(ni, s, intf_i, vlan)) ) &&
          callback(s, callback_arg) != 0 )
        return;
    }
    else if( id == EMPTY ) {
      /* End of the probe chain: nothing further can match. */
      return;
    }

    /* The secondary hash is only computed once we actually need to step
     * away from the first slot, keeping the common case cheap.
     */
    if( slot == start )
      step = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
    slot = (slot + step) & tbl->table_size_mask;

    if( slot == start ) {
      /* Wrapped all the way round the table. */
      LOG_NV(ci_log(FN_FMT "ITERATE FULL %s:%u->%s:%u hash=%u:%u",
                    FN_PRI_ARGS(ni), ip_addr_str(laddr), lport,
                    ip_addr_str(raddr), rport, slot, step));
      return;
    }
  }
}
void ef_driver_save_fd(void) { int rc = 0; ef_driver_handle fd; enum oo_device_type dev_type; for( dev_type = 0; dev_type < OO_MAX_DEV; dev_type++ ) { if( ! fd_is_saved[dev_type] ) { rc = ef_onload_driver_open(&fd, dev_type, 1); if( rc == 0 ) { saved_fd[dev_type] = fd; fd_is_saved[dev_type] = 1; LOG_NV(ci_log("%s: Saved fd %d %s for cloning", __func__, (int)fd, oo_device_name[dev_type])); if( oo_st_rdev[dev_type] <= 0 ) { struct stat st; fstat(fd, &st); oo_st_rdev[dev_type] = st.st_rdev; } } else { ci_log("%s: failed to open %s - rc=%d", __func__, oo_device_name[dev_type], rc); } } } }
unsigned long oo_get_st_rdev(enum oo_device_type dev_type) { if( oo_st_rdev[dev_type] == 0 ) { struct stat st; if( stat(oo_device_name[dev_type], &st) == 0 ) oo_st_rdev[dev_type] = st.st_rdev; else { LOG_NV(ci_log("%s: ERROR: stats(%s) failed errno=%d", __func__, oo_device_name[dev_type], errno)); oo_st_rdev[dev_type] = -1; } } return oo_st_rdev[dev_type]; }
/* Find the filter-table slot holding an entry for the given tuple.
 *
 * The caller must hold the netif lock (asserted).  The table is probed
 * starting at tcp_hash1() of the tuple and stepping by tcp_hash2().
 *
 * Returns the index of the matching slot (>= 0), -ENOENT if an EMPTY slot
 * is reached first, or -ELOOP if the probe wraps back to its starting slot
 * without finding either.
 */
int ci_netif_filter_lookup(ci_netif* netif, unsigned laddr, unsigned lport,
                           unsigned raddr, unsigned rport, unsigned protocol)
{
  ci_netif_filter_table* tbl;
  unsigned start, slot;
  unsigned step = 0;

  ci_assert(netif);
  ci_assert(ci_netif_is_locked(netif));
  ci_assert(netif->filter_table);

  tbl = netif->filter_table;
  start = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  slot = start;

  LOG_NV(log("tbl_lookup: %s %s:%u->%s:%u hash=%u:%u at=%u",
             CI_IP_PROTOCOL_STR(protocol),
             ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
             ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
             start, tcp_hash2(tbl, laddr, lport, raddr, rport, protocol),
             slot));

  for( ; ; ) {
    int id = tbl->table[slot].id;

    if( CI_LIKELY(id >= 0) ) {
      ci_sock_cmn* s = ID_TO_SOCK(netif, id);

      /* Zero only when every field of the tuple is equal. */
      if( ((laddr    - tbl->table[slot].laddr) |
           (lport    - sock_lport_be16(s)    ) |
           (raddr    - sock_raddr_be32(s)    ) |
           (rport    - sock_rport_be16(s)    ) |
           (protocol - sock_protocol(s)      )) == 0 )
        return slot;
    }
    if( id == EMPTY )
      return -ENOENT;

    /* Compute the secondary hash lazily: it is not needed when the first
     * slot already gives the answer.
     */
    if( slot == start )
      step = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
    slot = (slot + step) & tbl->table_size_mask;

    if( slot == start ) {
      LOG_E(ci_log(FN_FMT "ERROR: LOOP %s:%u->%s:%u hash=%u:%u",
                   FN_PRI_ARGS(netif), ip_addr_str(laddr), lport,
                   ip_addr_str(raddr), rport, slot, step));
      return -ELOOP;
    }
  }
}
/* Move the driver handle *pfd to an fd number >= CITP_OPTS.fd_base.
 *
 * The handle is duplicated with F_DUPFD (or the close-on-exec variant when
 * [do_cloexec] is set).  On success the old fd is closed, *pfd is updated
 * to the new fd and 0 is returned.  On failure *pfd is left unchanged and
 * the negative result of the dup is returned.
 */
int ef_onload_handle_move_and_do_cloexec(ef_driver_handle* pfd, int do_cloexec)
{
  int new_fd;

  new_fd = do_cloexec ? oo_fcntl_dupfd_cloexec(*pfd, CITP_OPTS.fd_base)
                      : ci_sys_fcntl(*pfd, F_DUPFD, CITP_OPTS.fd_base);

  if( new_fd < 0 ) {
    LOG_NV(ci_log("%s: Failed to move fd from %d, rc %d",
                  __func__, *pfd, new_fd));
    return new_fd;
  }

  /* A successful dup has already applied CLOEXEC where requested, so all
   * that remains is to retire the original descriptor.
   */
  ci_tcp_helper_close_no_trampoline(*pfd);
  *pfd = new_fd;
  return 0;
}
/* Open a driver handle for [dev_type] and store it in *pfd.
 *
 * The device is opened with oo_open(), requesting O_CLOEXEC when available
 * and [do_cloexec] is set.  If the open fails for a reason other than
 * EMFILE, a handle is cloned from the saved fd for that device type
 * instead, and errno is restored to its value on entry.
 *
 * The resulting fd is then moved above CITP_OPTS.fd_base where possible,
 * and close-on-exec is applied by fcntl() if O_CLOEXEC turned out not to
 * have taken effect.
 *
 * Returns 0 on success, or the non-zero result of the failing open/clone.
 */
int ef_onload_driver_open(ef_driver_handle* pfd,
                          enum oo_device_type dev_type,
                          int do_cloexec)
{
  int rc;
  int flags = 0;
  int saved_errno = errno;

#ifdef O_CLOEXEC
  if( do_cloexec )
    flags = O_CLOEXEC;
#endif

  ci_assert(pfd);
  rc = oo_open(pfd, dev_type, flags);
  /* NOTE(review): fd_is_saved[] is only ever seen holding 0 or 1 in this
   * file, so ">= 0" looks always-true - confirm whether "!= 0" was meant.
   * Left unchanged here because the declaration is not visible.
   */
  if( rc != 0 && errno != EMFILE && fd_is_saved[dev_type] >= 0 ) {
    ci_clone_fd_t op;
    op.do_cloexec = do_cloexec;
    LOG_NV(ci_log("%s: open failed, but cloning from saved fd", __func__));
    rc = ci_sys_ioctl((ci_fd_t) saved_fd[dev_type],
                      clone_ioctl[dev_type], &op);
    if( rc < 0 )
      return rc;
    errno = saved_errno;
    *pfd = op.fd;
  }

  if( rc != 0 )
    return rc;

  /* Our internal driver handles are not visible to the application.  It may
   * make assumptions about the fd space available to it, and try to dup2/3
   * onto one of our driver fds.  To try and minimise this we allow the user
   * to specify a minimum value for us to use, to try and keep out of their
   * way.
   *
   * We have to be able to cope with them coming along and trying to dup onto
   * one of these fds anyway, as they may not have set the option up.  As such
   * we treat failure to shift the fd as acceptable, and just retain the old
   * one.
   */
  if( *pfd < CITP_OPTS.fd_base )
    if( ef_onload_handle_move_and_do_cloexec(pfd, do_cloexec) == 0 )
      return 0;

  if( do_cloexec ) {
#if defined(O_CLOEXEC)
    /* -1: not probed yet; 0: O_CLOEXEC at open time works; 1: it does not. */
    static int o_cloexec_fails = -1;
    if( o_cloexec_fails < 0 ) {
      /* F_GETFD reports the descriptor flags through the return value (the
       * third argument is ignored), so test the result itself.  A negative
       * result is an error and is treated as "O_CLOEXEC did not work".
       */
      rc = ci_sys_fcntl(*(int *)pfd, F_GETFD, 0);
      if( rc >= 0 && (rc & FD_CLOEXEC) )
        o_cloexec_fails = 0;
      else
        o_cloexec_fails = 1;
    }
#else
    static const int o_cloexec_fails = 1;
#endif
    if( o_cloexec_fails )
      CI_DEBUG_TRY(ci_sys_fcntl(*(int *)pfd, F_SETFD, FD_CLOEXEC));
  }

  return 0;
}
/* Attach a socket to a cluster for SO_REUSEPORT-style binding.
 *
 * This function must be called with netif lock not held and it always
 * returns with the netif lock not held.
 *
 * [arg] is an oo_tcp_reuseport_bind_t describing the address, port, cluster
 * name and cluster size requested.  The function:
 *  - returns 0 without doing anything for sockets bound to alien addresses
 *    or when the cluster_ignore option is set;
 *  - rejects port 0, cluster sizes below 2, scalable-filter sockets and
 *    sockets that already have a filter with -EINVAL;
 *  - if the socket's stack already belongs to a cluster, just reserves the
 *    proto:port[:ip] tuple with a dummy filter on the socket;
 *  - otherwise finds or allocates a cluster and a stack within it, and
 *    moves the socket into that stack, transferring the reserved dummy
 *    filter to the moved socket.
 *
 * Returns 0 on success or a negative error code.
 */
int efab_tcp_helper_reuseport_bind(ci_private_t *priv, void *arg)
{
  oo_tcp_reuseport_bind_t* trb = arg;
  ci_netif* ni = &priv->thr->netif;
  tcp_helper_cluster_t* thc;
  tcp_helper_resource_t* thr = NULL;
  citp_waitable* waitable;
  ci_sock_cmn* sock = SP_TO_SOCK(ni, priv->sock_id);
  struct oof_manager* fm = efab_tcp_driver.filter_manager;
  struct oof_socket* oofilter;
  struct oof_socket dummy_oofilter;
  int protocol = thc_get_sock_protocol(sock);
  char name[CI_CFG_CLUSTER_NAME_LEN + 1];
  int rc, rc1;
  int flags = 0;
  tcp_helper_cluster_t* named_thc,* ported_thc;
  int alloced = 0;

  /* No clustering on sockets bound to alien addresses */
  if( sock->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
    return 0;

  if( NI_OPTS(ni).cluster_ignore == 1 ) {
    LOG_NV(ci_log("%s: Ignored attempt to use clusters due to "
                  "EF_CLUSTER_IGNORE option.", __FUNCTION__));
    return 0;
  }

  if( trb->port_be16 == 0 ) {
    ci_log("%s: Reuseport on port=0 is not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( trb->cluster_size < 2 ) {
    ci_log("%s: Cluster sizes < 2 are not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( sock->s_flags & (CI_SOCK_FLAG_TPROXY | CI_SOCK_FLAG_MAC_FILTER) ) {
    ci_log("%s: Scalable filter sockets cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  oofilter = &ci_trs_ep_get(priv->thr, priv->sock_id)->oofilter;

  if( oofilter->sf_local_port != NULL ) {
    ci_log("%s: Socket that already have filter cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  if( priv->thr->thc ) {
    /* Reserve proto:port[:ip] until bind (or close)*/
    rc = oof_socket_add(fm, oofilter,
                        OOF_SOCKET_ADD_FLAG_CLUSTERED |
                        OOF_SOCKET_ADD_FLAG_DUMMY,
                        protocol, trb->addr_be32, trb->port_be16, 0, 0,
                        &ported_thc);
    /* A positive result is treated the same as 0 here. */
    if( rc > 0 )
      rc = 0;
    if( rc == 0 )
      sock->s_flags |= CI_SOCK_FLAG_FILTER;
    return rc;
  }

  mutex_lock(&thc_init_mutex);
  /* We are going to be iterating over clusters, make sure they don't
   * change.
   */
  mutex_lock(&thc_mutex);

  /* Lookup a suitable cluster to use */

  /* We try to add dummy filter to oof to reserve proto:port[:ip] tuple,
   * if there is already a cluster at the tuple we will get reference to it,
   */
  oof_socket_ctor(&dummy_oofilter);
  rc = oof_socket_add(fm, &dummy_oofilter,
                      OOF_SOCKET_ADD_FLAG_CLUSTERED |
                      OOF_SOCKET_ADD_FLAG_DUMMY |
                      OOF_SOCKET_ADD_FLAG_NO_STACK,
                      protocol, trb->addr_be32, trb->port_be16, 0, 0,
                      &ported_thc);
  if( rc < 0 ) /* non-clustered socket on the tuple */
    goto alloc_fail0;

  if( ! gen_cluster_name(trb->cluster_name, name) ) {
    /* user requested a cluster by name.  But we need to make sure
     * that the oof_local_port that the user is interested in is not
     * being used by another cluster.  We search for cluster by name
     * and use results of prior proto:port[:ip] search oof_local_port
     * to then do some sanity checking.
     */
    rc1 = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                             &named_thc);
    if( rc1 < 0 ) {
      rc = rc1;
      goto alloc_fail;
    }

    if( rc1 == 0 ) {
      if( rc == 1 ) {
        /* search by oof_local_port found a cluster which search by
         * name didn't find.
         */
        LOG_E(ci_log("Error: Cluster with requested name %s already "
                     "bound to %s", name, ported_thc->thc_name));
        rc = -EEXIST;
        goto alloc_fail;
      }
      else {
        /* Neither searches found a cluster.  So allocate one below.
         */
      }
    }
    else {
      if( rc == 1 ) {
        /* Both searches found clusters.  Fine if they are the same or
         * else error.
         */
        if( named_thc != ported_thc ) {
          /* The cluster that does own the tuple is the one found by the
           * tuple search (ported_thc), so that is the name to report.
           * NOTE(review): port_be16 is printed as stored, without a
           * byte swap - confirm whether that is intended.
           */
          LOG_E(ci_log("Error: Cluster %s does not handle socket %s:%d. "
                       "Cluster %s does", name, FMT_PROTOCOL(protocol),
                       trb->port_be16, ported_thc->thc_name));
          rc = -EEXIST;
          goto alloc_fail;
        }
      }
      /* Search by name found a cluster no conflict with search by tuple
       * (the ported cluster is either none or the same as named)*/
      thc = named_thc;
      goto cont;
    }
  }
  else {
    /* No cluster name requested.  We have already looked for a cluster
     * handling the tuple.  If none found, then try to use an existing
     * cluster this process created.  If none found, then allocate one.
     */
    /* If rc == 0, then no cluster found - try to allocate one.
     * If rc == 1, we found cluster - make sure that euids match and
     * continue.
     */
    if( rc == 1 ) {
      thc = ported_thc;
      if( thc->thc_euid != ci_geteuid() ) {
        rc = -EADDRINUSE;
        goto alloc_fail;
      }
      goto cont;
    }
    rc = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                            &thc);
    if( rc < 0 )
      goto alloc_fail;
    if( rc == 1 )
      goto cont;
  }

  /* When an interface is in tproxy mode, all clustered listening socket
   * are assumed to be part of tproxy passive side.  This requires
   * rss context to use altered rss hashing based solely on src ip:port.
   */
  flags = tcp_helper_cluster_thc_flags(&NI_OPTS(ni));

  if( (rc = thc_alloc(name, protocol, trb->port_be16, ci_geteuid(),
                      trb->cluster_size, flags, &thc)) != 0 )
    goto alloc_fail;

  alloced = 1;

 cont:
  tcp_helper_cluster_ref(thc);

  /* At this point we have our cluster with one additional reference */

  /* Find a suitable stack within the cluster to use */
  rc = thc_get_thr(thc, &dummy_oofilter, &thr);
  if( rc != 0 )
    rc = thc_alloc_thr(thc, trb->cluster_restart_opt,
                       &ni->opts, ni->flags, &thr);

  /* If get or alloc succeeded thr holds reference to the cluster,
   * so the cluster cannot go away.  We'll drop our reference and also
   * will not be accessing state within the cluster anymore so we can
   * drop the lock.
   */
  mutex_unlock(&thc_mutex);

  if( alloced && rc == 0 && (flags & THC_FLAG_TPROXY) != 0 ) {
    /* Tproxy filter is allocated as late as here,
     * the reason is that this needs to be preceded by stack allocation
     * (firmware needs initialized vi)
     */
    rc = thc_install_tproxy(thc, NI_OPTS(ni).scalable_filter_ifindex);
    if( rc != 0 )
      efab_thr_release(thr);
  }

  tcp_helper_cluster_release(thc, NULL);

  if( rc != 0 ) {
    oof_socket_del(fm, &dummy_oofilter);
    goto alloc_fail_unlocked;
  }

  /* We have thr and we hold single reference to it. */

  /* Move the socket into the new stack */
  if( (rc = ci_netif_lock(ni)) != 0 )
    goto drop_and_done;
  waitable = SP_TO_WAITABLE(ni, priv->sock_id);
  rc = ci_sock_lock(ni, waitable);
  if( rc != 0 ) {
    ci_netif_unlock(ni);
    goto drop_and_done;
  }
  /* thr referencing scheme comes from efab_file_move_to_alien_stack_rsop */
  efab_thr_ref(thr);
  rc = efab_file_move_to_alien_stack(priv, &thr->netif, 0);
  if( rc != 0 )
    efab_thr_release(thr);
  else {
    /* beside us, socket now holds its own reference to thr */
    oofilter = &ci_trs_ep_get(thr, sock->b.moved_to_sock_id)->oofilter;
    oof_socket_replace(fm, &dummy_oofilter, oofilter);
    SP_TO_SOCK(&thr->netif, sock->b.moved_to_sock_id)->s_flags |=
                                                    CI_SOCK_FLAG_FILTER;
    ci_netif_unlock(&thr->netif);
  }

 drop_and_done:
  if( rc != 0 )
    oof_socket_del(fm, &dummy_oofilter);
  /* Drop the reference we got from thc_get_thr or thc_alloc_thr().
   * If things went wrong both stack and cluster might disappear.
   */
  efab_thr_release(thr);
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;

 alloc_fail:
  oof_socket_del(fm, &dummy_oofilter);
 alloc_fail0:
  mutex_unlock(&thc_mutex);
 alloc_fail_unlocked:
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;
}