/* Wake-up step of a socket sleep.
 *
 *   waiter      the waiter being woken; it is posted on the endpoint's
 *               wait queue.
 *   opaque_trs  the tcp_helper_resource_t owning the socket.
 *   opaque_op   the oo_tcp_sock_sleep_t describing the sleep request
 *               (sock_id, lock_flags, timeout_ms).
 *   rc          result of the wait so far.
 *   timeout     remaining timeout, in jiffies.
 *
 * Returns the (possibly updated) rc.  -ETIMEDOUT is reported as -EAGAIN,
 * and -EAGAIN is also returned when a non-zero op->timeout_ms has run
 * down to zero.  op->timeout_ms is rewritten with the remaining time.
 */
ci_inline int
sock_sleep__on_wakeup(ci_waiter_t* waiter, void* opaque_trs, void* opaque_op,
                      int rc, ci_waitable_timeout_t timeout)
{
  tcp_helper_resource_t* thr = (tcp_helper_resource_t*) opaque_trs;
  oo_tcp_sock_sleep_t* sleep_op = (oo_tcp_sock_sleep_t*) opaque_op;
  tcp_helper_endpoint_t* endpoint = ci_trs_ep_get(thr, sleep_op->sock_id);

  /* A timed-out wait is reported to the caller as -EAGAIN. */
  if( rc == -ETIMEDOUT )
    rc = -EAGAIN;

  ci_waiter_post(waiter, &endpoint->waitq);

  /* Stack lock requested: only go through the blocking path when the
   * lock word does not have the UNLOCKED bit set. */
  if( rc == 0 && (sleep_op->lock_flags & CI_SLEEP_NETIF_RQ) &&
      ! (thr->netif.state->lock.lock & CI_EPLOCK_UNLOCKED) ) {
    rc = efab_eplock_lock_wait(&thr->netif
                               CI_BLOCKING_CTX_ARG(CI_WAITER_BCTX(waiter)),
                               0);
    rc = CI_WAITER_CONVERT_REENTRANT(rc);
  }

  /* Socket lock requested: take the slow path only when the socket lock
   * is currently marked as locked. */
  if( rc == 0 && (sleep_op->lock_flags & CI_SLEEP_SOCK_RQ) ) {
    citp_waitable* wtbl = SP_TO_WAITABLE(&thr->netif, endpoint->id);

    if( wtbl->lock.wl_val & OO_WAITABLE_LK_LOCKED ) {
      rc = efab_tcp_helper_sock_lock_slow(thr, sleep_op->sock_id
                                          CI_BLOCKING_CTX_ARG(
                                            CI_WAITER_BCTX(waiter)));
      rc = CI_WAITER_CONVERT_REENTRANT(rc);
    }
  }

  /* A zero timeout_ms is left untouched; otherwise hand the remaining
   * time back and flag expiry. */
  if( sleep_op->timeout_ms != 0 ) {
    sleep_op->timeout_ms = jiffies_to_msecs(timeout);
    if( sleep_op->timeout_ms == 0 )
      rc = -EAGAIN;
  }

  return rc;
}
/* Drop a TCP connection that is still marked as being in an accept queue:
 * send a RST for it and drop the connection with ECONNRESET.
 *
 *   priv  not used by this function.
 *   arg   struct oo_op_tcp_drop_from_acceptq naming the stack (stack_id)
 *         and the socket (sock_id).
 *
 * Returns 0 on success; the negative error from the stack lookup;
 * -EINVAL when the endpoint is not valid or the socket is not a
 * non-listening TCP socket; -ERESTARTSYS when the stack lock could not
 * be taken.
 */
static int efab_tcp_drop_from_acceptq(ci_private_t *priv, void *arg)
{
  struct oo_op_tcp_drop_from_acceptq *carg = arg;
  tcp_helper_resource_t *thr;
  tcp_helper_endpoint_t *ep;
  citp_waitable *w;
  ci_tcp_state *ts;
  int rc;

  /* find stack */
  rc = efab_thr_table_lookup(NULL, carg->stack_id,
                             EFAB_THR_TABLE_LOOKUP_CHECK_USER |
                             EFAB_THR_TABLE_LOOKUP_NO_UL,
                             &thr);
  if( rc < 0 )
    return rc;
  ci_assert( thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND );

  /* The lookup succeeded, so rc is non-negative here.  The validation
   * failures below must report an error rather than the lookup's
   * success code. */
  rc = -EINVAL;

  /* find endpoint and drop OS socket */
  ep = ci_trs_get_valid_ep(thr, carg->sock_id);
  if( ep == NULL )
    goto fail1;

  w = SP_TO_WAITABLE(&thr->netif, carg->sock_id);
  if( !(w->state & CI_TCP_STATE_TCP) || w->state == CI_TCP_LISTEN )
    goto fail2;
  ts = SP_TO_TCP(&thr->netif, carg->sock_id);
  ci_assert(ep->os_port_keeper);
  ci_assert_equal(ep->os_socket, NULL);

  LOG_TV(ci_log("%s: send reset to non-accepted connection", __FUNCTION__));

  /* copy from ci_tcp_listen_shutdown_queues() */
  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_TCP_IN_ACCEPTQ);
  rc = ci_netif_lock(&thr->netif);
  if( rc != 0 ) {
    ci_assert_equal(rc, -EINTR);
    rc = -ERESTARTSYS;
    goto fail2;
  }
  ci_bit_clear(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT);

  /* We have no way to close this connection from the other side:
   * there was no RST from peer. */
  ci_assert_nequal(ts->s.b.state, CI_TCP_CLOSED);
  ci_assert_nequal(ts->s.b.state, CI_TCP_TIME_WAIT);
  ci_tcp_send_rst(&thr->netif, ts);
  ci_tcp_drop(&thr->netif, ts, ECONNRESET);
  ci_assert_equal(ep->os_port_keeper, NULL);
  ci_netif_unlock(&thr->netif);
  efab_tcp_helper_k_ref_count_dec(thr, 1);
  return 0;

  /* NOTE(review): fail1 calls efab_thr_release(), the success path calls
   * efab_tcp_helper_k_ref_count_dec(), and a direct jump to fail2 drops
   * nothing obtained from the lookup - confirm which reference each path
   * is expected to give back. */
fail1:
  efab_thr_release(thr);
fail2:
  ci_log("%s: inconsistent ep %d:%d", __func__,
         carg->stack_id, carg->sock_id);
  return rc;
}
/* Build (or rebuild) the user-level fdinfo for a file descriptor that the
 * driver reports as one of ours, and insert it into the fd table.
 *
 *   fd            the file descriptor being probed.
 *   info          endpoint description: fd_type, resource_id (stack) and
 *                 sock_id.
 *   print_banner  when non-zero and the stack had to be recreated, log the
 *                 "Importing ..." banner.
 *
 * Must be called with the fd table writer lock held.
 *
 * Returns the fdip for the new fdinfo (with a reference for the caller),
 * the closed-fd entry when the socket is flagged NOT_READY, fdip_passthru
 * when a pass-through fd was successfully replaced, or fdip_unknown on
 * failure.
 */
static citp_fdinfo_p
citp_fdtable_probe_restore(int fd, ci_ep_info_t * info, int print_banner)
{
  citp_protocol_impl* proto = 0;
  citp_fdinfo* fdi = 0;
  ci_netif* ni;
  int rc;
  int c_sock_fdi = 1;

  /* Must be holding the FD table writer lock */
  CITP_FDTABLE_ASSERT_LOCKED(1);
  ci_assert_nequal(info->resource_id, CI_ID_POOL_ID_NONE);

  /* Will need to review this function if the following assert fires */
  switch( info->fd_type ) {
  case CI_PRIV_TYPE_TCP_EP:
    proto = &citp_tcp_protocol_impl;
    break;
  case CI_PRIV_TYPE_UDP_EP:
    proto = &citp_udp_protocol_impl;
    break;
  case CI_PRIV_TYPE_PASSTHROUGH_EP:
    proto = &citp_passthrough_protocol_impl;
    c_sock_fdi = 0;
    break;
  case CI_PRIV_TYPE_ALIEN_EP:
    /* The protocol is chosen later from the state of the moved socket. */
    proto = NULL;
    c_sock_fdi = 0;
    break;
#if CI_CFG_USERSPACE_PIPE
  case CI_PRIV_TYPE_PIPE_READER:
    proto = &citp_pipe_read_protocol_impl;
    c_sock_fdi = 0;
    break;
  case CI_PRIV_TYPE_PIPE_WRITER:
    proto = &citp_pipe_write_protocol_impl;
    c_sock_fdi = 0;
    break;
#endif
  default:
    ci_assert(0);
  }

  /* Attempt to find the user-level netif for this endpoint */
  ni = citp_find_ul_netif(info->resource_id, 1);
  if( ! ni ) {
    ef_driver_handle netif_fd;

    /* Not found, rebuild/restore the netif for this endpoint */
    rc = citp_netif_recreate_probed(fd, &netif_fd, &ni);
    if ( rc < 0 ) {
      Log_E(log("%s: citp_netif_recreate_probed failed! (%d)",
                __FUNCTION__, rc));
      goto fail;
    }

    if( print_banner ) {
      ci_log("Importing "ONLOAD_PRODUCT" "ONLOAD_VERSION" "ONLOAD_COPYRIGHT
             " [%s]", ni->state->pretty_name);
    }
  }
  else
    citp_netif_add_ref(ni);

  /* There is a race condition where the fd can have been created, but it has
   * not yet been initialised, as we can't put a busy marker in the right place
   * in the fdtable until we know what the fd is.  In this case we don't want
   * to probe this new info, so return the closed fd.
   *
   * NOTE(review): this path returns without releasing the netif reference
   * held at this point - confirm that is intended.
   */
  if( SP_TO_WAITABLE(ni, info->sock_id)->sb_aflags & CI_SB_AFLAG_NOT_READY ) {
    citp_fdtable_busy_clear(fd, fdip_unknown, 1);
    fdi = &citp_the_closed_fd;
    citp_fdinfo_ref(fdi);
    return fdi_to_fdip(fdi);
  }

  if (c_sock_fdi) {
    citp_sock_fdi* sock_fdi;

    sock_fdi = CI_ALLOC_OBJ(citp_sock_fdi);
    if( ! sock_fdi ) {
      Log_E(log("%s: out of memory (sock_fdi)", __FUNCTION__));
      goto fail;
    }
    fdi = &sock_fdi->fdinfo;

    sock_fdi->sock.s = SP_TO_SOCK_CMN(ni, info->sock_id);
    sock_fdi->sock.netif = ni;
  }
  else if( info->fd_type == CI_PRIV_TYPE_PASSTHROUGH_EP ) {
    citp_waitable* w = SP_TO_WAITABLE(ni, info->sock_id);
    citp_alien_fdi* alien_fdi;

    /* If the fd can be replaced there is nothing for us to track. */
    if( ~w->sb_aflags & CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL &&
        fdtable_fd_move(fd, OO_IOC_FILE_MOVED) == 0 ) {
      citp_netif_release_ref(ni, 1);
      return fdip_passthru;
    }

    alien_fdi = CI_ALLOC_OBJ(citp_alien_fdi);
    if( ! alien_fdi ) {
      Log_E(log("%s: out of memory (alien_fdi)", __FUNCTION__));
      goto fail;
    }
    fdi = &alien_fdi->fdinfo;
    alien_fdi->netif = ni;
    alien_fdi->ep = SP_TO_WAITABLE(ni, info->sock_id);
    citp_passthrough_init(alien_fdi);
  }
  else if( info->fd_type == CI_PRIV_TYPE_ALIEN_EP ) {
    citp_waitable* w = SP_TO_WAITABLE(ni, info->sock_id);
    citp_sock_fdi* sock_fdi;
    ci_netif* alien_ni;

    /* Look the destination stack up before allocating, so that a failed
     * lookup has nothing to free. */
    rc = citp_netif_by_id(w->moved_to_stack_id, &alien_ni, 1);
    if( rc != 0 )
      goto fail;

    sock_fdi = CI_ALLOC_OBJ(citp_sock_fdi);
    if( ! sock_fdi ) {
      Log_E(log("%s: out of memory (alien sock_fdi)", __FUNCTION__));
      /* Assumes citp_netif_by_id() handed us a reference on alien_ni (the
       * fdinfo below keeps using alien_ni after ni is released) - TODO
       * confirm. */
      citp_netif_release_ref(alien_ni, 1);
      goto fail;
    }
    fdi = &sock_fdi->fdinfo;

    sock_fdi->sock.s = SP_TO_SOCK_CMN(alien_ni, w->moved_to_sock_id);
    sock_fdi->sock.netif = alien_ni;
    citp_netif_release_ref(ni, 1);

    /* Replace the file under this fd if possible */
    if( ~w->sb_aflags & CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL )
      fdtable_fd_move(fd, OO_IOC_FILE_MOVED);

    if( sock_fdi->sock.s->b.state & CI_TCP_STATE_TCP )
      proto = &citp_tcp_protocol_impl;
    else if( sock_fdi->sock.s->b.state == CI_TCP_STATE_UDP )
      proto = &citp_udp_protocol_impl;
    else {
      CI_TEST(0);
    }
  }
#if CI_CFG_USERSPACE_PIPE
  else {
    citp_pipe_fdi* pipe_fdi;

    pipe_fdi = CI_ALLOC_OBJ(citp_pipe_fdi);
    if( ! pipe_fdi ) {
      Log_E(log("%s: out of memory (pipe_fdi)", __FUNCTION__));
      goto fail;
    }
    fdi = &pipe_fdi->fdinfo;

    pipe_fdi->pipe = SP_TO_PIPE(ni, info->sock_id);
    pipe_fdi->ni = ni;
  }
#endif

  citp_fdinfo_init(fdi, proto);

  /* We're returning a reference to the caller. */
  citp_fdinfo_ref(fdi);
  citp_fdtable_insert(fdi, fd, 1);
  return fdi_to_fdip(fdi);

fail:
  if( ni )
    citp_netif_release_ref(ni, 1);
  return fdip_unknown;
}
static int oo_epoll2_ctl(struct oo_epoll_private *priv, int op_kepfd, int op_op, int op_fd, struct epoll_event *op_event) { tcp_helper_resource_t *fd_thr; struct file *file; int rc; ci_uint32 fd_sock_id; citp_waitable *fd_w; /* We are interested in ADD only */ if( op_op != EPOLL_CTL_ADD ) return efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); /* system poll() and friends use fget_light(), which is cheap. * But they do not export fget_light to us, so we have to use fget(). */ file = fget(op_fd); if(unlikely( file == NULL )) return -EBADF; /* Check for the dead circle. * We should check that we are not adding ourself. */ if(unlikely( file->private_data == priv )) { fput(file); return -EINVAL; } /* Is op->fd ours and if yes, which netif it has? */ /* Fixme: epoll fd - do we want to accelerate something? */ if( file->f_op != &linux_tcp_helper_fops_udp && file->f_op != &linux_tcp_helper_fops_tcp ) { int rc; #ifdef OO_EPOLL_NEED_NEST_PROTECTION struct oo_epoll_busy_task t; t.task = current; spin_lock(&priv->lock); list_add(&t.link, &priv->p.p2.busy_tasks); spin_unlock(&priv->lock); #endif #if CI_CFG_USERSPACE_PIPE if( ( file->f_op == &linux_tcp_helper_fops_pipe_reader || file->f_op == &linux_tcp_helper_fops_pipe_writer ) ) priv->p.p2.do_spin = 1; #endif fput(file); rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); #ifdef OO_EPOLL_NEED_NEST_PROTECTION spin_lock(&priv->lock); list_del(&t.link); spin_unlock(&priv->lock); #endif return rc; } /* Onload socket here! */ fd_thr = ((ci_private_t *)file->private_data)->thr; fd_sock_id = ((ci_private_t *)file->private_data)->sock_id; priv->p.p2.do_spin = 1; if(unlikely( ! oo_epoll_add_stack(priv, fd_thr) )) { static int printed; if( !printed ) ci_log("Can't add stack %d to epoll set: consider " "increasing epoll_max_stacks module option", fd_thr->id); /* fall through to sys_epoll_ctl() without interrupt */ } /* Let kernel add fd to the epoll set, but ask endpoint to avoid enabling * interrupts. 
* And we keep file ref while using fd_w to avoid nasty things. */ fd_w = SP_TO_WAITABLE(&fd_thr->netif, fd_sock_id); ci_bit_set(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT); rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event); ci_bit_clear(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT); fput(file); return rc; }
/* Try to connect a socket to a listener that lives in some Onload stack
 * on this host.
 *
 *   priv  the connecting socket's private data (priv->thr, priv->sock_id).
 *   arg   struct oo_op_loopback_connect: dst_addr/dst_port are inputs;
 *         out_rc receives the result of the loopback connect and
 *         out_moved is set to 1 when the socket was moved to another
 *         stack.
 *
 * Returns 0 when a listener was found and a connect was attempted (see
 * carg->out_rc), -EINVAL for a non-endpoint fd or an unsupported
 * tcp_client_loopback option, -ECONNREFUSED or the error from
 * efab_file_move_to_alien_stack() when preparing the socket failed, and
 * -ENOENT when no stack had a matching listener.
 *
 * Locking policy:
 * Entry: priv->thr->netif is assumed to be locked.
 * Exit: all stacks (the client stack and the listener's stack) are
 * unlocked.
 *
 * NOTE(review): the first early return below (non-endpoint fd) does not
 * unlock priv->thr->netif, unlike every other exit - confirm whether
 * that is intended.
 */
int efab_tcp_loopback_connect(ci_private_t *priv, void *arg)
{
  struct oo_op_loopback_connect *carg = arg;
  ci_netif *alien_ni = NULL;
  oo_sp tls_id;

  ci_assert(ci_netif_is_locked(&priv->thr->netif));
  carg->out_moved = 0;

  if( !CI_PRIV_TYPE_IS_ENDPOINT(priv->fd_type) )
    return -EINVAL;
  /* Only the three client loopback modes handled by the switch below
   * are accepted. */
  if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_CONNSTACK &&
      NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_LISTSTACK &&
      NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_NEWSTACK) {
    ci_netif_unlock(&priv->thr->netif);
    return -EINVAL;
  }

  /* Walk the stacks, skipping those that cannot serve this connection. */
  while( iterate_netifs_unlocked(&alien_ni) == 0 ) {

    if( !efab_thr_can_access_stack(netif2tcp_helper_resource(alien_ni),
                                   EFAB_THR_TABLE_LOOKUP_CHECK_USER) )
      continue; /* no permission to look in here */

    if( NI_OPTS(alien_ni).tcp_server_loopback == CITP_TCP_LOOPBACK_OFF )
      continue; /* server does not accept loopback connections */

    if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_TO_LISTSTACK &&
        NI_OPTS(alien_ni).tcp_server_loopback !=
        CITP_TCP_LOOPBACK_ALLOW_ALIEN_IN_ACCEPTQ )
      continue; /* options of the stacks do not match */

    if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_TO_LISTSTACK &&
        !efab_thr_user_can_access_stack(alien_ni->uid, alien_ni->euid,
                                        &priv->thr->netif) )
      continue; /* server can't accept our socket */

    tls_id = ci_tcp_connect_find_local_peer(alien_ni, carg->dst_addr,
                                            carg->dst_port);

    if( OO_SP_NOT_NULL(tls_id) ) {
      int rc;

      /* We are going to exit in this or other way: get ref and
       * drop kref of alien_ni */
      efab_thr_ref(netif2tcp_helper_resource(alien_ni));
      iterate_netifs_unlocked_dropref(alien_ni);

      switch( NI_OPTS(&priv->thr->netif).tcp_client_loopback ) {
      case CITP_TCP_LOOPBACK_TO_CONNSTACK:
        /* connect_lo_toconn unlocks priv->thr->netif */
        carg->out_rc =
            ci_tcp_connect_lo_toconn(&priv->thr->netif, priv->sock_id,
                                     carg->dst_addr, alien_ni, tls_id);
        efab_thr_release(netif2tcp_helper_resource(alien_ni));
        return 0;

      case CITP_TCP_LOOPBACK_TO_LISTSTACK:
        /* Nobody should be using this socket, so trylock should succeed.
         * Otherwise we hand over the socket and do not accelerate this
         * loopback connection. */
        rc = ci_sock_trylock(&priv->thr->netif,
                             SP_TO_WAITABLE(&priv->thr->netif,
                                            priv->sock_id));
        /* A zero result from the trylock is treated as failure. */
        if( rc == 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          return -ECONNREFUSED;
        }

        /* move_to_alien changes locks - see comments near it */
        rc = efab_file_move_to_alien_stack(priv, alien_ni);
        if( rc != 0 ) {
          /* error - everything is already unlocked */
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          /* if we return error, UL will hand the socket over. */
          return rc;
        }
        /* now alien_ni is locked */

        /* Connect again, using new endpoint */
        carg->out_rc =
            ci_tcp_connect_lo_samestack(
                            alien_ni,
                            SP_TO_TCP(alien_ni,
                                      SP_TO_WAITABLE(&priv->thr->netif,
                                                     priv->sock_id)
                                      ->moved_to_sock_id),
                            tls_id);
        ci_netif_unlock(alien_ni);
        carg->out_moved = 1;
        return 0;

      case CITP_TCP_LOOPBACK_TO_NEWSTACK:
      {
        tcp_helper_resource_t *new_thr;
        ci_resource_onload_alloc_t alloc;

        /* create new stack
         * todo: no hardware interfaces are necessary */
        strcpy(alloc.in_version, ONLOAD_VERSION);
        strcpy(alloc.in_uk_intf_ver, oo_uk_intf_ver);
        alloc.in_name[0] = '\0';
        alloc.in_flags = 0;

        rc = tcp_helper_alloc_kernel(&alloc, &NI_OPTS(&priv->thr->netif), 0,
                                     &new_thr);
        if( rc != 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          return -ECONNREFUSED;
        }

        rc = ci_sock_trylock(&priv->thr->netif,
                             SP_TO_WAITABLE(&priv->thr->netif,
                                            priv->sock_id));
        /* A zero result from the trylock is treated as failure. */
        if( rc == 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          efab_thr_release(new_thr);
          return -ECONNREFUSED;
        }

        /* move connecting socket to the new stack */
        rc = efab_file_move_to_alien_stack(priv, &new_thr->netif);
        if( rc != 0 ) {
          /* error - everything is already unlocked */
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          efab_thr_release(new_thr);
          return -ECONNREFUSED;
        }
        /* now new_thr->netif is locked */
        carg->out_moved = 1;
        /* This value is overwritten by the connect result just below. */
        carg->out_rc = -ECONNREFUSED;

        /* now connect via CITP_TCP_LOOPBACK_TO_CONNSTACK */
        /* connect_lo_toconn unlocks new_thr->netif */
        carg->out_rc =
            ci_tcp_connect_lo_toconn(
                            &new_thr->netif,
                            SP_TO_WAITABLE(&priv->thr->netif,
                                           priv->sock_id)->moved_to_sock_id,
                            carg->dst_addr, alien_ni, tls_id);
        efab_thr_release(netif2tcp_helper_resource(alien_ni));
        return 0;
      }
      }
    }
    else if( tls_id == OO_SP_INVALID )
      break;
  }

  /* No stack provided a listener: unlock the client stack and report it. */
  ci_netif_unlock(&priv->thr->netif);
  return -ENOENT;
}
/* Resource-op entry point: move the socket behind a file descriptor into
 * the stack that stack_priv refers to.
 *
 *   stack_priv  private data of the destination stack's fd; its fd_type
 *               must be CI_PRIV_TYPE_NETIF.
 *   arg         points to the ci_fixed_descriptor_t of the socket to move.
 *
 * Returns 0 on success or when the socket already lives in the destination
 * stack, -EINVAL for a bad descriptor, a non-socket file, a non-stack
 * stack_priv or a clustered source stack, otherwise the error from taking
 * the locks or from efab_file_move_to_alien_stack().
 */
int efab_file_move_to_alien_stack_rsop(ci_private_t *stack_priv, void *arg)
{
  struct file *sock_file = fget(*(ci_fixed_descriptor_t *)arg);
  ci_private_t *sock_priv;
  tcp_helper_resource_t *old_thr;
  tcp_helper_resource_t *new_thr;
  citp_waitable *w;
  int rc = -EINVAL;

  if( sock_file == NULL )
    return rc;

  if( !FILE_IS_ENDPOINT_SOCK(sock_file) ||
      stack_priv->fd_type != CI_PRIV_TYPE_NETIF )
    goto out_put;

  sock_priv = sock_file->private_data;
  ci_assert(sock_priv->fd_type == CI_PRIV_TYPE_TCP_EP ||
            sock_priv->fd_type == CI_PRIV_TYPE_UDP_EP);

  old_thr = sock_priv->thr;
  new_thr = stack_priv->thr;
  ci_assert(old_thr);
  ci_assert(new_thr);

  /* Already in the requested stack: nothing to do. */
  if( old_thr == new_thr ) {
    rc = 0;
    goto out_put;
  }

  if( tcp_helper_cluster_from_cluster(old_thr) != 0 ) {
    LOG_S(ci_log("%s: move_fd() not permitted on clustered stacks",
                 __func__));
    goto out_put;
  }

  /* Socket lock first, then the source stack lock. */
  w = SP_TO_WAITABLE(&old_thr->netif, sock_priv->sock_id);
  rc = ci_sock_lock(&old_thr->netif, w);
  if( rc != 0 )
    goto out_put;

  rc = ci_netif_lock(&old_thr->netif);
  if( rc != 0 ) {
    ci_sock_unlock(&old_thr->netif, w);
    goto out_put;
  }

  /* Take a reference on the destination stack for the move; it is given
   * back here if the move fails. */
  efab_thr_ref(new_thr);
  rc = efab_file_move_to_alien_stack(sock_priv, &stack_priv->thr->netif);
  fput(sock_file);

  if( rc == 0 )
    ci_netif_unlock(&new_thr->netif);
  else
    efab_thr_release(new_thr);
  return rc;

out_put:
  fput(sock_file);
  return rc;
}
/* Prepare a socket for SO_REUSEPORT clustering: find or create a cluster
 * for the requested proto:port[:ip] tuple, obtain a stack within it and
 * move the socket into that stack.
 *
 *   priv  the socket's private data (priv->thr, priv->sock_id).
 *   arg   oo_tcp_reuseport_bind_t: addr_be32, port_be16, cluster_size,
 *         cluster_name and cluster_restart_opt are read.
 *
 * Returns 0 on success, and also when clustering is skipped (socket bound
 * to an alien address, or cluster_ignore option set).  Returns -EINVAL for
 * unsupported requests, -EEXIST / -EADDRINUSE when the tuple or name
 * conflicts with an existing cluster, or the error of whichever step
 * failed.
 *
 * This function must be called with netif lock not held and it always
 * returns with the netif lock not held.
 */
int efab_tcp_helper_reuseport_bind(ci_private_t *priv, void *arg)
{
  oo_tcp_reuseport_bind_t* trb = arg;
  ci_netif* ni = &priv->thr->netif;
  tcp_helper_cluster_t* thc;
  tcp_helper_resource_t* thr = NULL;
  citp_waitable* waitable;
  ci_sock_cmn* sock = SP_TO_SOCK(ni, priv->sock_id);
  struct oof_manager* fm = efab_tcp_driver.filter_manager;
  struct oof_socket* oofilter;
  struct oof_socket dummy_oofilter;
  int protocol = thc_get_sock_protocol(sock);
  char name[CI_CFG_CLUSTER_NAME_LEN + 1];
  int rc, rc1;
  int flags = 0;
  tcp_helper_cluster_t* named_thc,* ported_thc;
  int alloced = 0;

  /* No clustering on sockets bound to alien addresses */
  if( sock->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
    return 0;

  if( NI_OPTS(ni).cluster_ignore == 1 ) {
    LOG_NV(ci_log("%s: Ignored attempt to use clusters due to "
                  "EF_CLUSTER_IGNORE option.", __FUNCTION__));
    return 0;
  }

  if( trb->port_be16 == 0 ) {
    ci_log("%s: Reuseport on port=0 is not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( trb->cluster_size < 2 ) {
    ci_log("%s: Cluster sizes < 2 are not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( sock->s_flags & (CI_SOCK_FLAG_TPROXY | CI_SOCK_FLAG_MAC_FILTER) ) {
    ci_log("%s: Scalable filter sockets cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  oofilter = &ci_trs_ep_get(priv->thr, priv->sock_id)->oofilter;
  if( oofilter->sf_local_port != NULL ) {
    ci_log("%s: Socket that already have filter cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  /* The socket's stack already belongs to a cluster: just reserve the
   * tuple with the socket's own filter and return. */
  if( priv->thr->thc ) {
    /* Reserve proto:port[:ip] until bind (or close)*/
    rc = oof_socket_add(fm, oofilter,
                        OOF_SOCKET_ADD_FLAG_CLUSTERED |
                        OOF_SOCKET_ADD_FLAG_DUMMY,
                        protocol, trb->addr_be32, trb->port_be16, 0, 0,
                        &ported_thc);
    if( rc > 0 )
      rc = 0;
    if( rc == 0 )
      sock->s_flags |= CI_SOCK_FLAG_FILTER;
    return rc;
  }

  mutex_lock(&thc_init_mutex);
  /* We are going to be iterating over clusters, make sure they don't
   * change. */
  mutex_lock(&thc_mutex);

  /* Lookup a suitable cluster to use */

  /* We try to add dummy filter to oof to reserve proto:port[:ip] tuple,
   * if there is already a cluster at the tuple we will get reference to it,
   */
  oof_socket_ctor(&dummy_oofilter);
  rc = oof_socket_add(fm, &dummy_oofilter,
                      OOF_SOCKET_ADD_FLAG_CLUSTERED |
                      OOF_SOCKET_ADD_FLAG_DUMMY |
                      OOF_SOCKET_ADD_FLAG_NO_STACK,
                      protocol, trb->addr_be32, trb->port_be16, 0, 0,
                      &ported_thc);
  if( rc < 0 ) /* non-clustered socket on the tuple */
    goto alloc_fail0;

  if( ! gen_cluster_name(trb->cluster_name, name) ) {
    /* user requested a cluster by name.  But we need to make sure
     * that the oof_local_port that the user is interested in is not
     * being used by another cluster.  We search for cluster by name
     * and use results of prior proto:port[:ip] search oof_local_port
     * to then do some sanity checking.
     */
    rc1 = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                             &named_thc);
    if( rc1 < 0 ) {
      rc = rc1;
      goto alloc_fail;
    }

    if( rc1 == 0 ) {
      if( rc == 1 ) {
        /* search by oof_local_port found a cluster which search by
         * name didn't find. */
        LOG_E(ci_log("Error: Cluster with requested name %s already "
                     "bound to %s", name, ported_thc->thc_name));
        rc = -EEXIST;
        goto alloc_fail;
      }
      else {
        /* Neither searches found a cluster.  So allocate one below.
         */
      }
    }
    else {
      if( rc == 1 ) {
        /* Both searches found clusters.  Fine if they are the same or
         * else error. */
        if( named_thc != ported_thc ) {
          /* Report the cluster that owns the tuple (ported_thc), not the
           * requested one again. */
          LOG_E(ci_log("Error: Cluster %s does not handle socket %s:%d. "
                       "Cluster %s does", name, FMT_PROTOCOL(protocol),
                       trb->port_be16, ported_thc->thc_name));
          rc = -EEXIST;
          goto alloc_fail;
        }
      }
      /* Search by name found a cluster no conflict with search by tuple
       * (the ported cluster is either none or the same as named)*/
      thc = named_thc;
      goto cont;
    }
  }
  else {
    /* No cluster name requested.  We have already looked for a cluster
     * handling the tuple.  If none found, then try to use an existing
     * cluster this process created.  If none found, then allocate one.
     */
    /* If rc == 0, then no cluster found - try to allocate one.
     * If rc == 1, we found cluster - make sure that euids match and
     * continue. */
    if( rc == 1 ) {
      thc = ported_thc;
      if( thc->thc_euid != ci_geteuid() ) {
        rc = -EADDRINUSE;
        goto alloc_fail;
      }
      goto cont;
    }
    rc = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                            &thc);
    if( rc < 0 )
      goto alloc_fail;
    if( rc == 1 )
      goto cont;
  }

  /* When an interface is in tproxy mode, all clustered listening socket
   * are assumed to be part of tproxy passive side.  This requires
   * rss context to use altered rss hashing based solely on src ip:port.
   */
  flags = tcp_helper_cluster_thc_flags(&NI_OPTS(ni));

  if( (rc = thc_alloc(name, protocol, trb->port_be16, ci_geteuid(),
                      trb->cluster_size, flags, &thc)) != 0 )
    goto alloc_fail;

  alloced = 1;

 cont:
  tcp_helper_cluster_ref(thc);

  /* At this point we have our cluster with one additional reference */

  /* Find a suitable stack within the cluster to use */
  rc = thc_get_thr(thc, &dummy_oofilter, &thr);
  if( rc != 0 )
    rc = thc_alloc_thr(thc, trb->cluster_restart_opt,
                       &ni->opts, ni->flags, &thr);

  /* If get or alloc succeeded thr holds reference to the cluster,
   * so the cluster cannot go away.  We'll drop our reference and also
   * will not be accessing state within the cluster anymore so we can
   * drop the lock. */
  mutex_unlock(&thc_mutex);

  if( alloced && rc == 0 && (flags & THC_FLAG_TPROXY) != 0 ) {
    /* Tproxy filter is allocated as late as here,
     * the reason is that this needs to be preceded by stack allocation
     * (firmware needs initialized vi) */
    rc = thc_install_tproxy(thc, NI_OPTS(ni).scalable_filter_ifindex);
    if( rc != 0 )
      efab_thr_release(thr);
  }

  tcp_helper_cluster_release(thc, NULL);

  if( rc != 0 ) {
    oof_socket_del(fm, &dummy_oofilter);
    goto alloc_fail_unlocked;
  }

  /* We have thr and we hold single reference to it. */

  /* Move the socket into the new stack */
  if( (rc = ci_netif_lock(ni)) != 0 )
    goto drop_and_done;
  waitable = SP_TO_WAITABLE(ni, priv->sock_id);
  rc = ci_sock_lock(ni, waitable);
  if( rc != 0 ) {
    ci_netif_unlock(ni);
    goto drop_and_done;
  }
  /* thr referencing scheme comes from efab_file_move_to_alien_stack_rsop */
  efab_thr_ref(thr);
  rc = efab_file_move_to_alien_stack(priv, &thr->netif, 0);
  if( rc != 0 )
    efab_thr_release(thr);
  else {
    /* beside us, socket now holds its own reference to thr */
    /* Hand the tuple reservation from the dummy filter to the moved
     * socket's filter. */
    oofilter = &ci_trs_ep_get(thr, sock->b.moved_to_sock_id)->oofilter;
    oof_socket_replace(fm, &dummy_oofilter, oofilter);
    SP_TO_SOCK(&thr->netif, sock->b.moved_to_sock_id)->s_flags |=
        CI_SOCK_FLAG_FILTER;
    ci_netif_unlock(&thr->netif);
  }

 drop_and_done:
  if( rc != 0 )
    oof_socket_del(fm, &dummy_oofilter);
  /* Drop the reference we got from thc_get_thr or thc_alloc_thr().
   * If things went wrong both stack and cluster might disappear. */
  efab_thr_release(thr);
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;

 alloc_fail:
  oof_socket_del(fm, &dummy_oofilter);
 alloc_fail0:
  mutex_unlock(&thc_mutex);
 alloc_fail_unlocked:
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;
}