/* Resource-op handler: move the socket identified by a file descriptor
 * into the stack that stack_priv refers to.
 *
 * stack_priv - private data of the destination stack's file; its fd_type
 *              must be CI_PRIV_TYPE_NETIF.
 * arg        - points to a ci_fixed_descriptor_t holding the socket's fd.
 *
 * Returns:
 *   0        the socket already lives in the destination stack (nothing
 *            is done), or the move succeeded;
 *   -EINVAL  the fd cannot be resolved, the file is not an endpoint
 *            socket, stack_priv is not a stack handle, or the socket's
 *            current stack is clustered;
 *   other    non-zero result of ci_sock_lock(), ci_netif_lock() or
 *            efab_file_move_to_alien_stack(), passed through unchanged.
 *
 * The file reference taken by fget() is dropped on every path.
 */
int efab_file_move_to_alien_stack_rsop(ci_private_t *stack_priv, void *arg)
{
  ci_fixed_descriptor_t sock_fd = *(ci_fixed_descriptor_t *)arg;
  struct file *sock_file = fget(sock_fd);
  ci_private_t *ep_priv;
  tcp_helper_resource_t *src_thr;
  tcp_helper_resource_t *dst_thr;
  citp_waitable *wt;
  int rc;

  if( sock_file == NULL )
    return -EINVAL;

  if( !FILE_IS_ENDPOINT_SOCK(sock_file) ||
      stack_priv->fd_type != CI_PRIV_TYPE_NETIF ) {
    rc = -EINVAL;
    goto out_fput;
  }

  ep_priv = sock_file->private_data;
  ci_assert(ep_priv->fd_type == CI_PRIV_TYPE_TCP_EP ||
            ep_priv->fd_type == CI_PRIV_TYPE_UDP_EP);

  src_thr = ep_priv->thr;
  dst_thr = stack_priv->thr;
  ci_assert(src_thr);
  ci_assert(dst_thr);

  /* Source and destination are the same stack: nothing to move. */
  if( src_thr == dst_thr ) {
    rc = 0;
    goto out_fput;
  }

  if( tcp_helper_cluster_from_cluster(src_thr) != 0 ) {
    LOG_S(ci_log("%s: move_fd() not permitted on clustered stacks", __func__));
    rc = -EINVAL;
    goto out_fput;
  }

  /* Take the socket lock first, then the lock of the stack that currently
   * owns the socket. */
  wt = SP_TO_WAITABLE(&src_thr->netif, ep_priv->sock_id);
  rc = ci_sock_lock(&src_thr->netif, wt);
  if( rc != 0 )
    goto out_fput;

  rc = ci_netif_lock(&src_thr->netif);
  if( rc != 0 ) {
    ci_sock_unlock(&src_thr->netif, wt);
    goto out_fput;
  }

  /* Extra reference on the destination stack: released below if the move
   * fails, kept if it succeeds.
   * NOTE(review): presumably efab_file_move_to_alien_stack() releases the
   * locks taken above and, on success, returns with the destination netif
   * locked - confirm against its definition. */
  efab_thr_ref(dst_thr);
  rc = efab_file_move_to_alien_stack(ep_priv, &dst_thr->netif);
  fput(sock_file);

  if( rc == 0 )
    ci_netif_unlock(&dst_thr->netif);
  else
    efab_thr_release(dst_thr);

  return rc;

 out_fput:
  fput(sock_file);
  return rc;
}
/* Try to accelerate a loopback TCP connect: walk the stacks, look for a
 * local listener matching carg->dst_addr:carg->dst_port, and connect to it
 * according to the client stack's tcp_client_loopback option.
 *
 * priv - private data of the connecting socket's file.
 * arg  - struct oo_op_loopback_connect; dst_addr and dst_port are inputs,
 *        out_rc receives the result of the loopback connect attempt and
 *        out_moved is set to 1 when the socket has been moved to another
 *        stack (it is cleared on entry).
 *
 * Returns 0 when a connect attempt was made (see carg->out_rc), -EINVAL for
 * a non-endpoint file or an unsupported tcp_client_loopback mode, -ENOENT
 * when no suitable listener was found, -ECONNREFUSED when the socket could
 * not be locked or a new stack could not be set up, or the error returned
 * by efab_file_move_to_alien_stack() in the TO_LISTSTACK case.
 *
 * Locking policy:
 *   Entry: priv->thr->netif is assumed to be locked.
 *   Exit:  all stacks (the client stack and the listener's stack) are
 *          unlocked.
 */
int efab_tcp_loopback_connect(ci_private_t *priv, void *arg)
{
  struct oo_op_loopback_connect *carg = arg;
  ci_netif *alien_ni = NULL;
  oo_sp tls_id;

  ci_assert(ci_netif_is_locked(&priv->thr->netif));
  carg->out_moved = 0;

  if( !CI_PRIV_TYPE_IS_ENDPOINT(priv->fd_type) ) {
    /* Honour the exit contract: the client stack must not stay locked. */
    ci_netif_unlock(&priv->thr->netif);
    return -EINVAL;
  }
  if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_CONNSTACK &&
      NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_LISTSTACK &&
      NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_NEWSTACK) {
    ci_netif_unlock(&priv->thr->netif);
    return -EINVAL;
  }

  while( iterate_netifs_unlocked(&alien_ni) == 0 ) {

    if( !efab_thr_can_access_stack(netif2tcp_helper_resource(alien_ni),
                                   EFAB_THR_TABLE_LOOKUP_CHECK_USER) )
      continue; /* no permission to look in here */

    if( NI_OPTS(alien_ni).tcp_server_loopback == CITP_TCP_LOOPBACK_OFF )
      continue; /* server does not accept loopback connections */

    if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_TO_LISTSTACK &&
        NI_OPTS(alien_ni).tcp_server_loopback !=
        CITP_TCP_LOOPBACK_ALLOW_ALIEN_IN_ACCEPTQ )
      continue; /* options of the stacks do not match */

    if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_TO_LISTSTACK &&
        !efab_thr_user_can_access_stack(alien_ni->uid, alien_ni->euid,
                                        &priv->thr->netif) )
      continue; /* server can't accept our socket */

    tls_id = ci_tcp_connect_find_local_peer(alien_ni, carg->dst_addr,
                                            carg->dst_port);

    if( OO_SP_NOT_NULL(tls_id) ) {
      int rc;

      /* We are going to exit in this or other way: get ref and
       * drop kref of alien_ni */
      efab_thr_ref(netif2tcp_helper_resource(alien_ni));
      iterate_netifs_unlocked_dropref(alien_ni);

      switch( NI_OPTS(&priv->thr->netif).tcp_client_loopback ) {
      case CITP_TCP_LOOPBACK_TO_CONNSTACK:
        /* connect_lo_toconn unlocks priv->thr->netif */
        carg->out_rc =
            ci_tcp_connect_lo_toconn(&priv->thr->netif, priv->sock_id,
                                     carg->dst_addr, alien_ni, tls_id);
        efab_thr_release(netif2tcp_helper_resource(alien_ni));
        return 0;

      case CITP_TCP_LOOPBACK_TO_LISTSTACK:
        /* Nobody should be using this socket, so trylock should succeed.
         * Otherwise we hand over the socket and do not accelerate this
         * loopback connection. */
        rc = ci_sock_trylock(&priv->thr->netif,
                             SP_TO_WAITABLE(&priv->thr->netif,
                                            priv->sock_id));
        if( rc == 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          return -ECONNREFUSED;
        }

        /* move_to_alien changes locks - see comments near it */
        rc = efab_file_move_to_alien_stack(priv, alien_ni);
        if( rc != 0 ) {
          /* error - everything is already unlocked */
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          /* if we return error, UL will hand the socket over. */
          return rc;
        }
        /* now alien_ni is locked; the reference taken on it above is
         * not released on this path */

        /* Connect again, using new endpoint */
        carg->out_rc =
            ci_tcp_connect_lo_samestack(
                            alien_ni,
                            SP_TO_TCP(alien_ni,
                                      SP_TO_WAITABLE(&priv->thr->netif,
                                                     priv->sock_id)
                                      ->moved_to_sock_id),
                            tls_id);
        ci_netif_unlock(alien_ni);
        carg->out_moved = 1;
        return 0;

      case CITP_TCP_LOOPBACK_TO_NEWSTACK:
      {
        tcp_helper_resource_t *new_thr;
        ci_resource_onload_alloc_t alloc;

        /* create new stack
         * todo: no hardware interfaces are necessary */
        strcpy(alloc.in_version, ONLOAD_VERSION);
        strcpy(alloc.in_uk_intf_ver, oo_uk_intf_ver);
        alloc.in_name[0] = '\0';
        alloc.in_flags = 0;

        rc = tcp_helper_alloc_kernel(&alloc, &NI_OPTS(&priv->thr->netif), 0,
                                     &new_thr);
        if( rc != 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          return -ECONNREFUSED;
        }

        rc = ci_sock_trylock(&priv->thr->netif,
                             SP_TO_WAITABLE(&priv->thr->netif,
                                            priv->sock_id));
        if( rc == 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          efab_thr_release(new_thr);
          return -ECONNREFUSED;
        }

        /* move connecting socket to the new stack */
        rc = efab_file_move_to_alien_stack(priv, &new_thr->netif);
        if( rc != 0 ) {
          /* error - everything is already unlocked */
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          efab_thr_release(new_thr);
          return -ECONNREFUSED;
        }
        /* now new_thr->netif is locked */
        carg->out_moved = 1;
        carg->out_rc = -ECONNREFUSED;

        /* now connect via CITP_TCP_LOOPBACK_TO_CONNSTACK */
        /* connect_lo_toconn unlocks new_thr->netif */
        carg->out_rc =
            ci_tcp_connect_lo_toconn(
                            &new_thr->netif,
                            SP_TO_WAITABLE(&priv->thr->netif,
                                           priv->sock_id)->moved_to_sock_id,
                            carg->dst_addr, alien_ni, tls_id);
        efab_thr_release(netif2tcp_helper_resource(alien_ni));
        return 0;
      }
      }
    }
    else if( tls_id == OO_SP_INVALID ) {
      /* Leaving the iteration early: drop the kref held on the current
       * stack, exactly as the exit path above does, so it is not leaked.
       * NOTE(review): relies on iterate_netifs_unlocked() holding a kref
       * on the stack it last returned - confirm against its definition. */
      iterate_netifs_unlocked_dropref(alien_ni);
      break;
    }
  }

  ci_netif_unlock(&priv->thr->netif);
  return -ENOENT;
}
/* Handle a reuseport bind: reserve the proto:port[:ip] tuple, find or
 * create the cluster and a stack within it, and move the socket into that
 * stack.
 *
 * priv - private data of the socket's file.
 * arg  - oo_tcp_reuseport_bind_t; the fields read here are cluster_name,
 *        cluster_size, cluster_restart_opt, addr_be32 and port_be16.
 *
 * Returns 0 on success, and also when clustering is skipped (socket bound
 * to an alien address, or the cluster_ignore option is set).  Returns
 * -EINVAL for port 0, cluster_size < 2, scalable-filter sockets or sockets
 * that already have a filter; -EEXIST / -EADDRINUSE when the tuple or the
 * name conflicts with another cluster; otherwise the error of the failing
 * helper is passed through.
 *
 * This function must be called with netif lock not held and it always
 * returns with the netif lock not held.
 */
int efab_tcp_helper_reuseport_bind(ci_private_t *priv, void *arg)
{
  oo_tcp_reuseport_bind_t* trb = arg;
  ci_netif* ni = &priv->thr->netif;
  tcp_helper_cluster_t* thc;
  tcp_helper_resource_t* thr = NULL;
  citp_waitable* waitable;
  ci_sock_cmn* sock = SP_TO_SOCK(ni, priv->sock_id);
  struct oof_manager* fm = efab_tcp_driver.filter_manager;
  struct oof_socket* oofilter;
  struct oof_socket dummy_oofilter;
  int protocol = thc_get_sock_protocol(sock);
  char name[CI_CFG_CLUSTER_NAME_LEN + 1];
  int rc, rc1;
  int flags = 0;
  tcp_helper_cluster_t* named_thc,* ported_thc;
  int alloced = 0;

  /* No clustering on sockets bound to alien addresses */
  if( sock->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
    return 0;

  if( NI_OPTS(ni).cluster_ignore == 1 ) {
    LOG_NV(ci_log("%s: Ignored attempt to use clusters due to "
                  "EF_CLUSTER_IGNORE option.", __FUNCTION__));
    return 0;
  }

  if( trb->port_be16 == 0 ) {
    ci_log("%s: Reuseport on port=0 is not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( trb->cluster_size < 2 ) {
    ci_log("%s: Cluster sizes < 2 are not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( sock->s_flags & (CI_SOCK_FLAG_TPROXY | CI_SOCK_FLAG_MAC_FILTER) ) {
    ci_log("%s: Scalable filter sockets cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  oofilter = &ci_trs_ep_get(priv->thr, priv->sock_id)->oofilter;

  if( oofilter->sf_local_port != NULL ) {
    ci_log("%s: Socket that already have filter cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  if( priv->thr->thc ) {
    /* The socket's stack already belongs to a cluster: no move is needed.
     * Reserve proto:port[:ip] until bind (or close) */
    rc = oof_socket_add(fm, oofilter,
                        OOF_SOCKET_ADD_FLAG_CLUSTERED |
                        OOF_SOCKET_ADD_FLAG_DUMMY,
                        protocol, trb->addr_be32, trb->port_be16, 0, 0,
                        &ported_thc);
    if( rc > 0 )
      rc = 0;
    if( rc == 0 )
      sock->s_flags |= CI_SOCK_FLAG_FILTER;
    return rc;
  }

  mutex_lock(&thc_init_mutex);
  /* We are going to be iterating over clusters, make sure they don't
   * change.
   */
  mutex_lock(&thc_mutex);

  /* Lookup a suitable cluster to use */

  /* We try to add dummy filter to oof to reserve proto:port[:ip] tuple,
   * if there is already a cluster at the tuple we will get reference to it,
   */
  oof_socket_ctor(&dummy_oofilter);
  rc = oof_socket_add(fm, &dummy_oofilter,
                      OOF_SOCKET_ADD_FLAG_CLUSTERED |
                      OOF_SOCKET_ADD_FLAG_DUMMY |
                      OOF_SOCKET_ADD_FLAG_NO_STACK,
                      protocol, trb->addr_be32, trb->port_be16, 0, 0,
                      &ported_thc);
  if( rc < 0 ) /* non-clustered socket on the tuple */
    goto alloc_fail0;

  /* From here until it is overwritten, rc == 1 means the tuple search found
   * a cluster (ported_thc) and rc == 0 means it found none. */
  if( ! gen_cluster_name(trb->cluster_name, name) ) {
    /* user requested a cluster by name.  But we need to make sure
     * that the oof_local_port that the user is interested in is not
     * being used by another cluster.  We search for cluster by name
     * and use results of prior proto:port[:ip] search oof_local_port
     * to then do some sanity checking.
     */
    rc1 = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                             &named_thc);
    if( rc1 < 0 ) {
      rc = rc1;
      goto alloc_fail;
    }

    if( rc1 == 0 ) {
      if( rc == 1 ) {
        /* search by oof_local_port found a cluster which search by
         * name didn't find. */
        LOG_E(ci_log("Error: Cluster with requested name %s already "
                     "bound to %s", name, ported_thc->thc_name));
        rc = -EEXIST;
        goto alloc_fail;
      }
      else {
        /* Neither searches found a cluster.  So allocate one below.
         */
      }
    }
    else {
      if( rc == 1 ) {
        /* Both searches found clusters.  Fine if they are the same or
         * else error. */
        if( named_thc != ported_thc ) {
          /* Report the cluster that owns the tuple (ported_thc), not the
           * requested one again. */
          LOG_E(ci_log("Error: Cluster %s does not handle socket %s:%d. "
                       "Cluster %s does", name, FMT_PROTOCOL(protocol),
                       trb->port_be16, ported_thc->thc_name));
          rc = -EEXIST;
          goto alloc_fail;
        }
      }
      /* Search by name found a cluster no conflict with search by tuple
       * (the ported cluster is either none or the same as named)*/
      thc = named_thc;
      goto cont;
    }
  }
  else {
    /* No cluster name requested.  We have already looked for a cluster
     * handling the tuple.  If none found, then try to use an existing
     * cluster this process created.  If none found, then allocate one.
     */
    /* If rc == 0, then no cluster found - try to allocate one.
     * If rc == 1, we found cluster - make sure that euids match and
     * continue. */
    if( rc == 1 ) {
      thc = ported_thc;
      if( thc->thc_euid != ci_geteuid() ) {
        rc = -EADDRINUSE;
        goto alloc_fail;
      }
      goto cont;
    }
    rc = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                            &thc);
    if( rc < 0 )
      goto alloc_fail;
    if( rc == 1 )
      goto cont;
  }
  /* When an interface is in tproxy mode, all clustered listening socket
   * are assumed to be part of tproxy passive side.  This requires
   * rss context to use altered rss hashing based solely on src ip:port.
   */
  flags = tcp_helper_cluster_thc_flags(&NI_OPTS(ni));

  if( (rc = thc_alloc(name, protocol, trb->port_be16, ci_geteuid(),
                      trb->cluster_size, flags, &thc)) != 0 )
    goto alloc_fail;

  alloced = 1;

 cont:
  tcp_helper_cluster_ref(thc);

  /* At this point we have our cluster with one additional reference */

  /* Find a suitable stack within the cluster to use */
  rc = thc_get_thr(thc, &dummy_oofilter, &thr);
  if( rc != 0 )
    rc = thc_alloc_thr(thc, trb->cluster_restart_opt,
                       &ni->opts, ni->flags, &thr);

  /* If get or alloc succeeded thr holds reference to the cluster,
   * so the cluster cannot go away.  We'll drop our reference and also
   * will not be accessing state within the cluster anymore so we can
   * drop the lock. */
  mutex_unlock(&thc_mutex);

  if( alloced && rc == 0 && (flags & THC_FLAG_TPROXY) != 0 ) {
    /* Tproxy filter is allocated as late as here,
     * the reason is that this needs to be preceded by stack allocation
     * (firmware needs initialized vi) */
    rc = thc_install_tproxy(thc, NI_OPTS(ni).scalable_filter_ifindex);
    if( rc != 0 )
      efab_thr_release(thr);
  }

  tcp_helper_cluster_release(thc, NULL);

  if( rc != 0 ) {
    oof_socket_del(fm, &dummy_oofilter);
    goto alloc_fail_unlocked;
  }

  /* We have thr and we hold single reference to it. */

  /* Move the socket into the new stack */
  if( (rc = ci_netif_lock(ni)) != 0 )
    goto drop_and_done;
  waitable = SP_TO_WAITABLE(ni, priv->sock_id);
  rc = ci_sock_lock(ni, waitable);
  if( rc != 0 ) {
    ci_netif_unlock(ni);
    goto drop_and_done;
  }
  /* thr referencing scheme comes from efab_file_move_to_alien_stack_rsop */
  efab_thr_ref(thr);
  /* NOTE(review): this call passes three arguments while the other callers
   * visible in this file pass two - confirm against the prototype of
   * efab_file_move_to_alien_stack() in this tree. */
  rc = efab_file_move_to_alien_stack(priv, &thr->netif, 0);
  if( rc != 0 )
    efab_thr_release(thr);
  else {
    /* beside us, socket now holds its own reference to thr */
    oofilter = &ci_trs_ep_get(thr, sock->b.moved_to_sock_id)->oofilter;
    /* Hand the tuple reservation over from the dummy filter to the moved
     * socket's own filter. */
    oof_socket_replace(fm, &dummy_oofilter, oofilter);
    SP_TO_SOCK(&thr->netif, sock->b.moved_to_sock_id)->s_flags |= CI_SOCK_FLAG_FILTER;
    ci_netif_unlock(&thr->netif);
  }

 drop_and_done:
  if( rc != 0 )
    oof_socket_del(fm, &dummy_oofilter);
  /* Drop the reference we got from thc_get_thr or thc_alloc_thr().
   * If things went wrong both stack and cluster might disappear. */
  efab_thr_release(thr);
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;

  /* Error exits, ordered so that each label undoes one step less. */
 alloc_fail:
  oof_socket_del(fm, &dummy_oofilter);
 alloc_fail0:
  mutex_unlock(&thc_mutex);
 alloc_fail_unlocked:
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;
}