static int efab_tcp_drop_from_acceptq(ci_private_t *priv, void *arg) { struct oo_op_tcp_drop_from_acceptq *carg = arg; tcp_helper_resource_t *thr; tcp_helper_endpoint_t *ep; citp_waitable *w; ci_tcp_state *ts; int rc = -EINVAL; /* find stack */ rc = efab_thr_table_lookup(NULL, carg->stack_id, EFAB_THR_TABLE_LOOKUP_CHECK_USER | EFAB_THR_TABLE_LOOKUP_NO_UL, &thr); if( rc < 0 ) return rc; ci_assert( thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND ); /* find endpoint and drop OS socket */ ep = ci_trs_get_valid_ep(thr, carg->sock_id); if( ep == NULL ) goto fail1; w = SP_TO_WAITABLE(&thr->netif, carg->sock_id); if( !(w->state & CI_TCP_STATE_TCP) || w->state == CI_TCP_LISTEN ) goto fail2; ts = SP_TO_TCP(&thr->netif, carg->sock_id); ci_assert(ep->os_port_keeper); ci_assert_equal(ep->os_socket, NULL); LOG_TV(ci_log("%s: send reset to non-accepted connection", __FUNCTION__)); /* copy from ci_tcp_listen_shutdown_queues() */ ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_TCP_IN_ACCEPTQ); rc = ci_netif_lock(&thr->netif); if( rc != 0 ) { ci_assert_equal(rc, -EINTR); rc = -ERESTARTSYS; goto fail2; } ci_bit_clear(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT); /* We have no way to close this connection from the other side: * there was no RST from peer. */ ci_assert_nequal(ts->s.b.state, CI_TCP_CLOSED); ci_assert_nequal(ts->s.b.state, CI_TCP_TIME_WAIT); ci_tcp_send_rst(&thr->netif, ts); ci_tcp_drop(&thr->netif, ts, ECONNRESET); ci_assert_equal(ep->os_port_keeper, NULL); ci_netif_unlock(&thr->netif); efab_tcp_helper_k_ref_count_dec(thr, 1); return 0; fail1: efab_thr_release(thr); fail2: ci_log("%s: inconsistent ep %d:%d", __func__, carg->stack_id, carg->sock_id); return rc; }
static int tcp_helper_alloc_rsop(ci_private_t *priv, void *arg) { /* Using lock to serialize multiple processes trying to create * stacks with same name. */ static DEFINE_MUTEX(ctor_mutex); ci_resource_onload_alloc_t *alloc = arg; tcp_helper_resource_t* trs; int rc; mutex_lock(&ctor_mutex); rc = tcp_helper_alloc_ul(alloc, -1, &trs); if( rc == 0 ) { rc = oo_priv_set_stack(priv, trs); if( rc == 0 ) { priv->fd_type = CI_PRIV_TYPE_NETIF; priv->sock_id = OO_SP_NULL; } else efab_thr_release(trs); } mutex_unlock(&ctor_mutex); return rc; }
int tcp_helper_cluster_alloc_thr(const char* cname, int cluster_size, int cluster_restart, int ni_flags, const ci_netif_config_opts* ni_opts, tcp_helper_resource_t** thr_out) { tcp_helper_cluster_t* thc = NULL; tcp_helper_resource_t* thr = NULL; int alloced = 0; int rc = -ENOENT; int thc_flags = tcp_helper_cluster_thc_flags(ni_opts); char name[CI_CFG_CLUSTER_NAME_LEN + 1]; mutex_lock(&thc_init_mutex); mutex_lock(&thc_mutex); gen_cluster_name(cname, name); rc = thc_search_by_name(name, 0, 0, ci_geteuid(), &thc); if( rc < 0 ) goto fail; if( rc == 1 ) rc = 0; else rc = -ENOENT; if( rc == -ENOENT ) { rc = thc_alloc(name, 0, 0, ci_geteuid(), cluster_size, thc_flags, &thc); if( rc < 0 ) goto fail; alloced = 1; } if( rc == 0 ) /* Find a suitable stack within the cluster to use */ rc = thc_alloc_thr(thc, cluster_restart, ni_opts, ni_flags, &thr); fail: mutex_unlock(&thc_mutex); mutex_unlock(&thc_init_mutex); if( rc == 0 && alloced && thc_flags & THC_FLAG_TPROXY ) { rc = thc_install_tproxy(thc, ni_opts->scalable_filter_ifindex); if( rc != 0 ) { efab_thr_release(thr); /* this should have freed the thc without other references */ alloced = 0; } } if( rc != 0 && alloced ) tcp_helper_cluster_release(thc, NULL); if( rc == 0 ) *thr_out = thr; return rc; }
int oo_create_stack_fd(tcp_helper_resource_t *thr) { int fd; efab_thr_ref(thr); fd = onload_alloc_file(thr, OO_SP_NULL, O_CLOEXEC, CI_PRIV_TYPE_NETIF); if( fd < 0 ) { efab_thr_release(thr); OO_DEBUG_ERR(ci_log("%s: onload_alloc_file failed (%d)", __FUNCTION__, fd)); return fd; } return fd; }
static int oo_priv_lookup_and_attach_stack(ci_private_t* priv, const char* name, unsigned id) { tcp_helper_resource_t* trs; int rc; if( (rc = efab_thr_table_lookup(name, id, EFAB_THR_TABLE_LOOKUP_CHECK_USER, &trs)) == 0 ) { if( (rc = oo_priv_set_stack(priv, trs)) == 0 ) { priv->fd_type = CI_PRIV_TYPE_NETIF; priv->sock_id = OO_SP_NULL; } else efab_thr_release(trs); } return rc; }
/* Locking policy: * Enterance: priv->thr->netif is assumed to be locked. * Exit: all stacks (the client stack and the listener's stack) are * unlocked. */ int efab_tcp_loopback_connect(ci_private_t *priv, void *arg) { struct oo_op_loopback_connect *carg = arg; ci_netif *alien_ni = NULL; oo_sp tls_id; ci_assert(ci_netif_is_locked(&priv->thr->netif)); carg->out_moved = 0; if( !CI_PRIV_TYPE_IS_ENDPOINT(priv->fd_type) ) return -EINVAL; if( NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_CONNSTACK && NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_LISTSTACK && NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_NEWSTACK) { ci_netif_unlock(&priv->thr->netif); return -EINVAL; } while( iterate_netifs_unlocked(&alien_ni) == 0 ) { if( !efab_thr_can_access_stack(netif2tcp_helper_resource(alien_ni), EFAB_THR_TABLE_LOOKUP_CHECK_USER) ) continue; /* no permission to look in here */ if( NI_OPTS(alien_ni).tcp_server_loopback == CITP_TCP_LOOPBACK_OFF ) continue; /* server does not accept loopback connections */ if( NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_LISTSTACK && NI_OPTS(alien_ni).tcp_server_loopback != CITP_TCP_LOOPBACK_ALLOW_ALIEN_IN_ACCEPTQ ) continue; /* options of the stacks to not match */ if( NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_LISTSTACK && !efab_thr_user_can_access_stack(alien_ni->uid, alien_ni->euid, &priv->thr->netif) ) continue; /* server can't accept our socket */ tls_id = ci_tcp_connect_find_local_peer(alien_ni, carg->dst_addr, carg->dst_port); if( OO_SP_NOT_NULL(tls_id) ) { int rc; /* We are going to exit in this or other way: get ref and * drop kref of alien_ni */ efab_thr_ref(netif2tcp_helper_resource(alien_ni)); iterate_netifs_unlocked_dropref(alien_ni); switch( NI_OPTS(&priv->thr->netif).tcp_client_loopback ) { case CITP_TCP_LOOPBACK_TO_CONNSTACK: /* connect_lo_toconn unlocks priv->thr->netif */ carg->out_rc = ci_tcp_connect_lo_toconn(&priv->thr->netif, priv->sock_id, carg->dst_addr, alien_ni, tls_id); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return 0; case CITP_TCP_LOOPBACK_TO_LISTSTACK: /* Nobody should be using this socket, so trylock should succeed. * Overwise we hand over the socket and do not accelerate this * loopback connection. */ rc = ci_sock_trylock(&priv->thr->netif, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)); if( rc == 0 ) { ci_netif_unlock(&priv->thr->netif); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return -ECONNREFUSED; } /* move_to_alien changes locks - see comments near it */ rc = efab_file_move_to_alien_stack(priv, alien_ni); if( rc != 0 ) { /* error - everything is already unlocked */ efab_thr_release(netif2tcp_helper_resource(alien_ni)); /* if we return error, UL will hand the socket over. */ return rc; } /* now alien_ni is locked */ /* Connect again, using new endpoint */ carg->out_rc = ci_tcp_connect_lo_samestack( alien_ni, SP_TO_TCP(alien_ni, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id) ->moved_to_sock_id), tls_id); ci_netif_unlock(alien_ni); carg->out_moved = 1; return 0; case CITP_TCP_LOOPBACK_TO_NEWSTACK: { tcp_helper_resource_t *new_thr; ci_resource_onload_alloc_t alloc; /* create new stack * todo: no hardware interfaces are necessary */ strcpy(alloc.in_version, ONLOAD_VERSION); strcpy(alloc.in_uk_intf_ver, oo_uk_intf_ver); alloc.in_name[0] = '\0'; alloc.in_flags = 0; rc = tcp_helper_alloc_kernel(&alloc, &NI_OPTS(&priv->thr->netif), 0, &new_thr); if( rc != 0 ) { ci_netif_unlock(&priv->thr->netif); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return -ECONNREFUSED; } rc = ci_sock_trylock(&priv->thr->netif, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)); if( rc == 0 ) { ci_netif_unlock(&priv->thr->netif); efab_thr_release(netif2tcp_helper_resource(alien_ni)); efab_thr_release(new_thr); return -ECONNREFUSED; } /* move connecting socket to the new stack */ rc = efab_file_move_to_alien_stack(priv, &new_thr->netif); if( rc != 0 ) { /* error - everything is already unlocked */ efab_thr_release(netif2tcp_helper_resource(alien_ni)); efab_thr_release(new_thr); return -ECONNREFUSED; } /* now new_thr->netif is locked */ carg->out_moved = 1; carg->out_rc = -ECONNREFUSED; /* now connect via CITP_TCP_LOOPBACK_TO_CONNSTACK */ /* connect_lo_toconn unlocks new_thr->netif */ carg->out_rc = ci_tcp_connect_lo_toconn( &new_thr->netif, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)->moved_to_sock_id, carg->dst_addr, alien_ni, tls_id); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return 0; } } } else if( tls_id == OO_SP_INVALID ) break; } ci_netif_unlock(&priv->thr->netif); return -ENOENT; }
int efab_file_move_to_alien_stack_rsop(ci_private_t *stack_priv, void *arg) { ci_fixed_descriptor_t sock_fd = *(ci_fixed_descriptor_t *)arg; struct file *sock_file = fget(sock_fd); ci_private_t *sock_priv; tcp_helper_resource_t *old_thr; tcp_helper_resource_t *new_thr; citp_waitable *w; int rc; if( sock_file == NULL ) return -EINVAL; if( !FILE_IS_ENDPOINT_SOCK(sock_file) || stack_priv->fd_type != CI_PRIV_TYPE_NETIF ) { fput(sock_file); return -EINVAL; } sock_priv = sock_file->private_data; ci_assert(sock_priv->fd_type == CI_PRIV_TYPE_TCP_EP || sock_priv->fd_type == CI_PRIV_TYPE_UDP_EP); old_thr = sock_priv->thr; new_thr = stack_priv->thr; ci_assert(old_thr); ci_assert(new_thr); if( old_thr == new_thr ) { fput(sock_file); return 0; } if( tcp_helper_cluster_from_cluster(old_thr) != 0 ) { LOG_S(ci_log("%s: move_fd() not permitted on clustered stacks", __func__)); fput(sock_file); return -EINVAL; } w = SP_TO_WAITABLE(&old_thr->netif, sock_priv->sock_id); rc = ci_sock_lock(&old_thr->netif, w); if( rc != 0 ) { fput(sock_file); return rc; } rc = ci_netif_lock(&old_thr->netif); if( rc != 0 ) { ci_sock_unlock(&old_thr->netif, w); fput(sock_file); return rc; } efab_thr_ref(new_thr); rc = efab_file_move_to_alien_stack(sock_priv, &stack_priv->thr->netif); fput(sock_file); if( rc != 0 ) efab_thr_release(new_thr); else ci_netif_unlock(&new_thr->netif); return rc; }
static int efab_tcp_helper_get_info(ci_private_t *unused, void *arg) { ci_netif_info_t *info = arg; int index, rc=0; tcp_helper_resource_t* thr = NULL; ci_netif* ni = NULL; int flags = EFAB_THR_TABLE_LOOKUP_CHECK_USER | EFAB_THR_TABLE_LOOKUP_NO_WARN; #if CI_CFG_EFAB_EPLOCK_RECORD_CONTENTIONS int j; eplock_resource_t* eplock_rs; #endif info->ni_exists = 0; info->ni_no_perms_exists = 0; if( info->ni_orphan ) { flags |= EFAB_THR_TABLE_LOOKUP_NO_UL; info->ni_orphan = 0; } rc = efab_thr_table_lookup(NULL, info->ni_index, flags, &thr); if( rc == 0 ) { info->ni_exists = 1; info->ni_orphan = (thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND); ni = &thr->netif; info->mmap_bytes = thr->mem_mmap_bytes; info->k_ref_count = thr->k_ref_count; info->rs_ref_count = oo_atomic_read(&thr->ref_count); memcpy(info->ni_name, ni->state->name, sizeof(ni->state->name)); } else if( rc == -EACCES ) { info->ni_no_perms_id = info->ni_index; if( efab_thr_get_inaccessible_stack_info(info->ni_index, &info->ni_no_perms_uid, &info->ni_no_perms_euid, &info->ni_no_perms_share_with, info->ni_no_perms_name) == 0 ) info->ni_no_perms_exists = 1; } /* sub-ops that do not need the netif to exist */ if( info->ni_subop == CI_DBG_NETIF_INFO_GET_NEXT_NETIF ) { tcp_helper_resource_t* next_thr; info->u.ni_next_ni.index = -1; for( index = info->ni_index + 1; index < 10000 /* FIXME: magic! */; ++index ) { rc = efab_thr_table_lookup(NULL, index, flags, &next_thr); if( rc == 0 ) { if( next_thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND ) efab_tcp_helper_k_ref_count_dec(next_thr, 1); else efab_thr_release(next_thr); info->u.ni_next_ni.index = index; break; } if( rc == -EACCES ) { info->u.ni_next_ni.index = index; break; } } rc = 0; } else if( info->ni_subop == CI_DBG_NETIF_INFO_NOOP ) { rc = 0; } if (!info->ni_exists) return 0; /* sub-ops that need the netif to exist */ switch (info->ni_subop) { case CI_DBG_NETIF_INFO_GET_ENDPOINT_STATE: index = info->u.ni_endpoint.index; info->u.ni_endpoint.max = thr->netif.ep_tbl_n; if ((index < 0) || (index >= (int)thr->netif.ep_tbl_n)) { info->u.ni_endpoint.state = CI_TCP_STATE_FREE; } else { citp_waitable_obj* wo = ID_TO_WAITABLE_OBJ(ni, index); info->u.ni_endpoint.state = wo->waitable.state; if( wo->waitable.state == CI_TCP_STATE_UDP ) { ci_udp_state* us = &wo->udp; info->u.ni_endpoint.udpstate = us->udpflags; info->u.ni_endpoint.rx_pkt_ul = us->recv_q.pkts_delivered; info->u.ni_endpoint.rx_pkt_kn = us->stats.n_rx_os; } else if( wo->waitable.state & CI_TCP_STATE_TCP_CONN ) { ci_tcp_state* ts = &wo->tcp; info->u.ni_endpoint.tx_pkts_max = ts->so_sndbuf_pkts; info->u.ni_endpoint.tx_pkts_num = ts->send.num; } if( wo->waitable.state & CI_TCP_STATE_SOCKET ) { ci_sock_cmn* s = &wo->sock; info->u.ni_endpoint.protocol = (int) sock_protocol(s); info->u.ni_endpoint.laddr = sock_laddr_be32(s); info->u.ni_endpoint.lport = (int) sock_lport_be16(s); info->u.ni_endpoint.raddr = sock_raddr_be32(s); info->u.ni_endpoint.rport = (int) sock_rport_be16(s); } } break; case CI_DBG_NETIF_INFO_GET_NEXT_NETIF: /* If the current netif is found, we need to succeed */ break; case CI_DBG_NETIF_INFO_NOOP: /* Always succeeds, rc already set */ break; default: rc = -EINVAL; break; } if( thr ) { /* Lookup needs a matching efab_thr_release() in case of ordinary * stack but just a ref_count_dec in case of orphan */ if( thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND ) efab_tcp_helper_k_ref_count_dec(thr, 1); else efab_thr_release(thr); } return rc; }
static int efab_tcp_helper_move_state(ci_private_t* priv, void *arg) { oo_tcp_move_state_t *op = arg; tcp_helper_endpoint_t *new_ep; tcp_helper_resource_t * new_trs = NULL; ci_netif* ni, *new_ni; ci_tcp_state * ts, *new_ts; tcp_helper_endpoint_t* ep; int rc = efab_ioctl_get_ep(priv, op->ep_id, &ep); if (rc != 0) return rc; OO_DEBUG_TCPH(ci_log("%s: (trs=%p (%u), priv=%p, ep_id=%u, new_trs_id=%u, " "new_ep_id=%u", __FUNCTION__, priv->thr, priv->thr->id, priv, OO_SP_FMT(op->ep_id), op->new_trs_id, OO_SP_FMT(op->new_ep_id))); do { /* check that the existing id is valid */ ni = &priv->thr->netif; ts = SP_TO_TCP(ni, ep->id); /* TODO: check this endpoint belongs to the tcp helper resource of priv and not * somewhere else */ /* this function does not change fd_type or fd ops, so it is not able * to cope with changing the socket type. We think this only makes sense * for TCP, so assert we are taking a TCP endpoint. */ ci_assert_equal(ts->s.pkt.ip.ip_protocol, IPPROTO_TCP); ci_assert_equal(priv->fd_type, CI_PRIV_TYPE_TCP_EP); /* get pointer to resource from handle - increments ref count */ rc = efab_thr_table_lookup(NULL, op->new_trs_id, EFAB_THR_TABLE_LOOKUP_CHECK_USER, &new_trs); if (rc < 0) { OO_DEBUG_ERR( ci_log("%s: invalid new resource handle", __FUNCTION__) ); break; } ci_assert(new_trs != NULL); /* check valid endpoint in new netif */ new_ni = &new_trs->netif; new_ep = ci_netif_get_valid_ep(new_ni, op->new_ep_id); new_ts = SP_TO_TCP(new_ni, new_ep->id); /* check the two endpoint states look valid */ if( (ts->s.pkt.ip.ip_protocol != new_ts->s.pkt.ip.ip_protocol) || (ts->s.b.state != CI_TCP_CLOSED) || (ep->oofilter.sf_local_port != NULL) ) { efab_thr_release(new_trs); rc = -EINVAL; OO_DEBUG_ERR(ci_log("%s: invalid endpoint states", __FUNCTION__)); break; } /* should be fine to complete */ ci_assert(new_trs); { tcp_helper_resource_t *old_trs; again: old_trs = priv->thr; if (ci_cas_uintptr_fail((ci_uintptr_t *)&priv->thr, (ci_uintptr_t)old_trs, (ci_uintptr_t)new_trs)) goto again; efab_thr_release(old_trs); } /* move file to hold details of new resource, new endpoint */ ci_assert(OO_SP_EQ(priv->sock_id, op->ep_id)); priv->sock_id = new_ep->id; OO_DEBUG_TCPH(ci_log("%s: set epid %u", __FUNCTION__, OO_SP_FMT(priv->sock_id))); /* copy across any necessary state */ ci_assert_equal(new_ep->os_socket, NULL); new_ep->os_socket = ep->os_socket; ep->os_socket = NULL; /* set ORPHAN flag in current as not attached to an FD */ ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); /* remove ORPHAN flag in new TCP state */ ci_atomic32_and(&new_ts->s.b.sb_aflags, ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ)); return 0; } while (0); return rc; }
static int onload_alloc_file(tcp_helper_resource_t *thr, oo_sp ep_id, int flags, int fd_type) { struct qstr name = { .name = "" }; #ifdef EFX_HAVE_STRUCT_PATH struct path path; #define my_dentry path.dentry #else struct dentry *dentry; #define my_dentry dentry #endif struct file *file; int fd; struct inode *inode; ci_private_t *priv; struct file_operations *fops; fops = oo_fops_by_type(fd_type); if( fops == NULL ) return -EINVAL; ci_assert_equal(fops->owner, THIS_MODULE); inode = new_inode(onload_mnt->mnt_sb); if( inode == NULL ) return -ENOMEM; #ifdef EFX_FSTYPE_HAS_MOUNT inode->i_ino = get_next_ino(); #endif if( fd_type == CI_PRIV_TYPE_NETIF ) inode->i_mode = S_IRWXUGO; if( fd_type == CI_PRIV_TYPE_TCP_EP || fd_type == CI_PRIV_TYPE_UDP_EP ) inode->i_mode = #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21) /* in 2.6.18 this flag makes us "socket" and sendmsg crashes; * see sock_from_file() */ S_IFSOCK | #endif S_IRWXUGO; else inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); priv = &container_of(inode, struct onload_inode, vfs_inode)->priv; priv->thr = thr; priv->sock_id = ep_id; priv->fd_type = fd_type; fd = get_unused_fd(); if( fd < 0 ) { iput(inode); return fd; } /*ci_log("[%d]%s(%d:%d) return %d priv=%p", current->pid, __func__, thr->id, ep_id, fd, priv);*/ #ifdef EFX_FSTYPE_HAS_MOUNT #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,37) path.dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name); if( path.dentry != NULL ) path.dentry->d_op = &onloadfs_dentry_operations; #else path.dentry = d_alloc_pseudo(onload_mnt->mnt_sb, &name); #endif #else /* EFX_FSTYPE_HAS_MOUNT */ #ifdef EFX_HAVE_D_DNAME my_dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name); #else { char str[32]; name.len = onloadfs_name(&container_of(inode, struct onload_inode, vfs_inode)->priv, str, sizeof(str)); name.name = str; name.hash = inode->i_ino; my_dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name); } #endif #endif /* EFX_FSTYPE_HAS_MOUNT */ if( my_dentry == NULL ) { put_unused_fd(fd); iput(inode); return -ENOMEM; } #if !defined(EFX_FSTYPE_HAS_MOUNT) || defined(EFX_OLD_MOUNT_PSEUDO) my_dentry->d_op = &onloadfs_dentry_operations; #if !defined(EFX_HAVE_STRUCT_PATH) && defined(EFX_HAVE_D_DNAME) my_dentry->d_flags &= ~DCACHE_UNHASHED; #endif #endif d_instantiate(my_dentry, inode); #ifndef EFX_HAVE_D_DNAME d_rehash(my_dentry); #endif inode->i_fop = fops; #ifdef EFX_HAVE_STRUCT_PATH path.mnt = mntget(onload_mnt); file = alloc_file(&path, FMODE_READ | FMODE_WRITE, fops); #else file = alloc_file(onload_mnt, dentry, FMODE_READ | FMODE_WRITE, fops); #endif if( file == NULL) { #ifdef EFX_HAVE_STRUCT_PATH path_put(&path); #else dput(dentry); iput(inode); #endif put_unused_fd(fd); return -ENFILE; } priv->_filp = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_pos = 0; file->private_data = priv; if( flags & O_CLOEXEC ) { struct files_struct *files = current->files; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); rcu_assign_pointer(fdt->fd[fd], file); efx_set_close_on_exec(fd, fdt); spin_unlock(&files->file_lock); } else fd_install(fd, file); try_module_get(THIS_MODULE); ci_assert_equal(file->f_op, fops); return fd; } void onload_priv_free(ci_private_t *priv) { if( priv->_filp->f_vfsmnt != onload_mnt) ci_free(priv); /* inode will free the priv automatically */ } int oo_create_fd(tcp_helper_endpoint_t* ep, int flags, int fd_type) { int fd; tcp_helper_resource_t *trs = ep->thr; citp_waitable_obj *wo = SP_TO_WAITABLE_OBJ(&trs->netif, ep->id); efab_thr_ref(trs); fd = onload_alloc_file(trs, ep->id, flags, fd_type); if( fd < 0 ) { efab_thr_release(trs); OO_DEBUG_ERR(ci_log("%s: onload_alloc_file failed (%d)", __FUNCTION__, fd)); return fd; } ci_atomic32_and(&wo-> waitable.sb_aflags, ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ)); return fd; }
/* This function must be called with netif lock not held and it always * returns with the netif lock not held. */ int efab_tcp_helper_reuseport_bind(ci_private_t *priv, void *arg) { oo_tcp_reuseport_bind_t* trb = arg; ci_netif* ni = &priv->thr->netif; tcp_helper_cluster_t* thc; tcp_helper_resource_t* thr = NULL; citp_waitable* waitable; ci_sock_cmn* sock = SP_TO_SOCK(ni, priv->sock_id); struct oof_manager* fm = efab_tcp_driver.filter_manager; struct oof_socket* oofilter; struct oof_socket dummy_oofilter; int protocol = thc_get_sock_protocol(sock); char name[CI_CFG_CLUSTER_NAME_LEN + 1]; int rc, rc1; int flags = 0; tcp_helper_cluster_t* named_thc,* ported_thc; int alloced = 0; /* No clustering on sockets bound to alien addresses */ if( sock->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) return 0; if( NI_OPTS(ni).cluster_ignore == 1 ) { LOG_NV(ci_log("%s: Ignored attempt to use clusters due to " "EF_CLUSTER_IGNORE option.", __FUNCTION__)); return 0; } if( trb->port_be16 == 0 ) { ci_log("%s: Reuseport on port=0 is not supported", __FUNCTION__); return -EINVAL; } if( trb->cluster_size < 2 ) { ci_log("%s: Cluster sizes < 2 are not supported", __FUNCTION__); return -EINVAL; } if( sock->s_flags & (CI_SOCK_FLAG_TPROXY | CI_SOCK_FLAG_MAC_FILTER) ) { ci_log("%s: Scalable filter sockets cannot be clustered", __FUNCTION__); return -EINVAL; } oofilter = &ci_trs_ep_get(priv->thr, priv->sock_id)->oofilter; if( oofilter->sf_local_port != NULL ) { ci_log("%s: Socket that already have filter cannot be clustered", __FUNCTION__); return -EINVAL; } if( priv->thr->thc ) { /* Reserve proto:port[:ip] until bind (or close)*/ rc = oof_socket_add(fm, oofilter, OOF_SOCKET_ADD_FLAG_CLUSTERED | OOF_SOCKET_ADD_FLAG_DUMMY, protocol, trb->addr_be32, trb->port_be16, 0, 0, &ported_thc); if( rc > 0 ) rc = 0; if( rc == 0 ) sock->s_flags |= CI_SOCK_FLAG_FILTER; return rc; } mutex_lock(&thc_init_mutex); /* We are going to be iterating over clusters, make sure they don't * change. */ mutex_lock(&thc_mutex); /* Lookup a suitable cluster to use */ /* We try to add dummy filter to oof to reserve proto:port[:ip] tuple, * if there is already a cluster at the tuple we will get reference to it, */ oof_socket_ctor(&dummy_oofilter); rc = oof_socket_add(fm, &dummy_oofilter, OOF_SOCKET_ADD_FLAG_CLUSTERED | OOF_SOCKET_ADD_FLAG_DUMMY | OOF_SOCKET_ADD_FLAG_NO_STACK, protocol, trb->addr_be32, trb->port_be16, 0, 0, &ported_thc); if( rc < 0 ) /* non-clustered socket on the tuple */ goto alloc_fail0; if( ! gen_cluster_name(trb->cluster_name, name) ) { /* user requested a cluster by name. But we need to make sure * that the oof_local_port that the user is interested in is not * being used by another cluster. We search for cluster by name * and use results of prior protp:port[:ip] search oof_local_port * to then do some sanity checking. */ rc1 = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(), &named_thc); if( rc1 < 0 ) { rc = rc1; goto alloc_fail; } if( rc1 == 0 ) { if( rc == 1 ) { /* search by oof_local_port found a cluster which search by * name didn't find. */ LOG_E(ci_log("Error: Cluster with requested name %s already " "bound to %s", name, ported_thc->thc_name)); rc = -EEXIST; goto alloc_fail; } else { /* Neither searches found a cluster. So allocate one below. */ } } else { if( rc == 1 ) { /* Both searches found clusters. Fine if they are the same or * else error. */ if( named_thc != ported_thc ) { LOG_E(ci_log("Error: Cluster %s does not handle socket %s:%d. " "Cluster %s does", name, FMT_PROTOCOL(protocol), trb->port_be16, named_thc->thc_name)); rc = -EEXIST; goto alloc_fail; } } /* Search by name found a cluster no conflict with search by tuple * (the ported cluster is either none or the same as named)*/ thc = named_thc; goto cont; } } else { /* No cluster name requested. We have already looked for a cluster handling * the tuple. If none found, then try to use an existing * cluster this process created. If none found, then allocate one. */ /* If rc == 0, then no cluster found - try to allocate one. * If rc == 1, we found cluster - make sure that euids match and continue. */ if( rc == 1 ) { thc = ported_thc; if( thc->thc_euid != ci_geteuid() ) { rc = -EADDRINUSE; goto alloc_fail; } goto cont; } rc = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(), &thc); if( rc < 0 ) goto alloc_fail; if( rc == 1 ) goto cont; } /* When an interface is in tproxy mode, all clustered listening socket * are assumed to be part of tproxy passive side. This requires * rss context to use altered rss hashing based solely on src ip:port. */ flags = tcp_helper_cluster_thc_flags(&NI_OPTS(ni)); if( (rc = thc_alloc(name, protocol, trb->port_be16, ci_geteuid(), trb->cluster_size, flags, &thc)) != 0 ) goto alloc_fail; alloced = 1; cont: tcp_helper_cluster_ref(thc); /* At this point we have our cluster with one additional reference */ /* Find a suitable stack within the cluster to use */ rc = thc_get_thr(thc, &dummy_oofilter, &thr); if( rc != 0 ) rc = thc_alloc_thr(thc, trb->cluster_restart_opt, &ni->opts, ni->flags, &thr); /* If get or alloc succeeded thr holds reference to the cluster, * so the cluster cannot go away. We'll drop our reference and also * will not be accessing state within the cluster anymore so we can * drop the lock. */ mutex_unlock(&thc_mutex); if( alloced && rc == 0 && (flags & THC_FLAG_TPROXY) != 0 ) { /* Tproxy filter is allocated as late as here, * the reason is that this needs to be preceded by stack allocation * (firmware needs initialized vi) */ rc = thc_install_tproxy(thc, NI_OPTS(ni).scalable_filter_ifindex); if( rc != 0 ) efab_thr_release(thr); } tcp_helper_cluster_release(thc, NULL); if( rc != 0 ) { oof_socket_del(fm, &dummy_oofilter); goto alloc_fail_unlocked; } /* We have thr and we hold single reference to it. */ /* Move the socket into the new stack */ if( (rc = ci_netif_lock(ni)) != 0 ) goto drop_and_done; waitable = SP_TO_WAITABLE(ni, priv->sock_id); rc = ci_sock_lock(ni, waitable); if( rc != 0 ) { ci_netif_unlock(ni); goto drop_and_done; } /* thr referencing scheme comes from efab_file_move_to_alien_stack_rsop */ efab_thr_ref(thr); rc = efab_file_move_to_alien_stack(priv, &thr->netif, 0); if( rc != 0 ) efab_thr_release(thr); else { /* beside us, socket now holds its own reference to thr */ oofilter = &ci_trs_ep_get(thr, sock->b.moved_to_sock_id)->oofilter; oof_socket_replace(fm, &dummy_oofilter, oofilter); SP_TO_SOCK(&thr->netif, sock->b.moved_to_sock_id)->s_flags |= CI_SOCK_FLAG_FILTER; ci_netif_unlock(&thr->netif); } drop_and_done: if( rc != 0 ) oof_socket_del(fm, &dummy_oofilter); /* Drop the reference we got from thc_get_thr or thc_alloc_thr(). * If things went wrong both stack and cluster might disappear. */ efab_thr_release(thr); oof_socket_dtor(&dummy_oofilter); mutex_unlock(&thc_init_mutex); return rc; alloc_fail: oof_socket_del(fm, &dummy_oofilter); alloc_fail0: mutex_unlock(&thc_mutex); alloc_fail_unlocked: oof_socket_dtor(&dummy_oofilter); mutex_unlock(&thc_init_mutex); return rc; }