void ci_netif_filter_remove(ci_netif* netif, oo_sp sock_p,
                            unsigned laddr, unsigned lport,
                            unsigned raddr, unsigned rport,
                            unsigned protocol)
{
  ci_netif_filter_table_entry* entry;
  unsigned hash1, hash2, tbl_i;
  ci_netif_filter_table* tbl;
  int hops = 0;
  unsigned first;

  ci_assert(ci_netif_is_locked(netif)
#ifdef __KERNEL__
            /* release_ep_tbl might be called without the stack lock.
             * Do not complain about this. */
            || (netif2tcp_helper_resource(netif)->k_ref_count &
                TCP_HELPER_K_RC_DEAD)
#endif
            );

  tbl = netif->filter_table;
  hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
  first = hash1;

  LOG_TC(ci_log("%s: [%d:%d] REMOVE %s %s:%u->%s:%u hash=%u:%u",
                __FUNCTION__, NI_ID(netif), OO_SP_FMT(sock_p),
                CI_IP_PROTOCOL_STR(protocol),
                ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
                ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
                hash1, hash2));

  tbl_i = hash1;
  while( 1 ) {
    entry = &tbl->table[tbl_i];
    if( entry->id == OO_SP_TO_INT(sock_p) ) {
      if( laddr == entry->laddr )
        break;
    }
    else if( entry->id == EMPTY ) {
      /* We allow multiple removes of the same filter -- helps avoid some
       * complexity in the filter module. */
      return;
    }
    tbl_i = (tbl_i + hash2) & tbl->table_size_mask;
    ++hops;
    if( tbl_i == first ) {
      LOG_E(ci_log(FN_FMT "ERROR: LOOP [%d] %s %s:%u->%s:%u",
                   FN_PRI_ARGS(netif), OO_SP_FMT(sock_p),
                   CI_IP_PROTOCOL_STR(protocol),
                   ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
                   ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport)));
      return;
    }
  }

  __ci_netif_filter_remove(netif, hash1, hash2, hops, tbl_i);
}
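
/* The removal walk above is plain open addressing with double hashing:
 * starting at hash1, each miss advances the probe by hash2, i.e.
 *
 *   slot(i) = (hash1 + i * hash2) & tbl->table_size_mask
 *
 * An EMPTY slot terminates the search, because an insert for this filter
 * would have used that slot -- the entry cannot lie beyond it.  Wrapping
 * all the way back to the first slot means the table is corrupt, which is
 * reported as the LOOP error above.  The (hash1, hash2, hops, tbl_i)
 * tuple passed to __ci_netif_filter_remove() describes both the entry
 * found and the probe path that reached it. */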
void oof_cb_sw_filter_remove(struct oof_socket* skf, unsigned laddr,
                             int lport, unsigned raddr, int rport,
                             int protocol, int stack_locked)
{
  ci_netif* ni = skf_to_ni(skf);
  struct tcp_helper_resource_s *trs = netif2tcp_helper_resource(ni);

  if( skf->sf_flags & OOF_SOCKET_SW_FILTER_WAS_REMOVED )
    return;

  /* We MAY call this function with an incorrect stack_locked flag
   * if the OOF_SOCKET_SW_FILTER_WAS_REMOVED flag is set. */
  ci_assert(!stack_locked || ci_netif_is_locked(ni));

  /* We are holding a spinlock, so claim to be in driverlink context here */
  if( stack_locked || efab_tcp_helper_netif_try_lock(trs, 1) ) {
    ci_netif_filter_remove(ni, OO_SP_FROM_INT(ni, skf_to_ep(skf)->id),
                           laddr, lport, raddr, rport, protocol);
    if( ! stack_locked )
      efab_tcp_helper_netif_unlock(trs, 1);
  }
  else
    oof_cb_sw_filter_postpone(skf, laddr, lport, raddr, rport, protocol,
                              OOF_CB_SW_FILTER_OP_REMOVE);
}
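
/* oof_cb_sw_filter_remove() above and oof_cb_sw_filter_insert() below
 * share one pattern: the caller may hold a spinlock, so we must never
 * block waiting for the stack lock.  If the stack is already locked
 * (stack_locked) or the trylock succeeds, the software filter table is
 * updated in place; otherwise the operation is queued via
 * oof_cb_sw_filter_postpone() and applied later by whoever holds the
 * lock. */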
static void ci_drop_orphan(ci_netif * ni)
{
  ci_irqlock_state_t lock_flags;
  tcp_helper_resource_t* trs;
  int dec_needed;

  /* Called when connection closes AFTER the file descriptor closes
   *  - in kernel mode, if user mode has gone away, we call
   *    efab_tcp_helper_k_ref_count_dec() to decrement count
   *    of such connections so we can free the stack when
   *    they've all gone away.
   */
  if( ni->flags & CI_NETIF_FLAGS_DROP_SOCK_REFS ) {
    trs = netif2tcp_helper_resource(ni);
    dec_needed = 0;

    ci_irqlock_lock(&trs->lock, &lock_flags);
    if( trs->n_ep_closing_refs > 0 ) {
      --trs->n_ep_closing_refs;
      dec_needed = 1;
    }
    ci_irqlock_unlock(&trs->lock, &lock_flags);

    if( dec_needed )
      efab_tcp_helper_k_ref_count_dec(trs, 0);
  }
}
void citp_waitable_wake_not_in_poll(ci_netif* ni, citp_waitable* sb,
                                    unsigned what)
{
  ci_assert(what);
  ci_assert((what & ~(CI_SB_FLAG_WAKE_RX|CI_SB_FLAG_WAKE_TX)) == 0u);
  ci_assert(!ni->state->in_poll);
  ci_wmb();
  if( what & CI_SB_FLAG_WAKE_RX )
    ++sb->sleep_seq.rw.rx;
  if( what & CI_SB_FLAG_WAKE_TX )
    ++sb->sleep_seq.rw.tx;
  ci_mb();

#ifdef __KERNEL__
  /* Normally we put an object on a ready list in ci_netif_put_on_post_poll,
   * but in this case we don't go via there, so have to explicitly queue on
   * the ready list here. */
  ci_ni_dllist_remove(ni, &sb->ready_link);
  ci_ni_dllist_put(ni, &ni->state->ready_lists[sb->ready_list_id],
                   &sb->ready_link);

  if( what & sb->wake_request ) {
    sb->sb_flags |= what;
    citp_waitable_wakeup(ni, sb);
  }

  /* Wake the ready list too, if that's been requested. */
  if( ni->state->ready_list_flags[sb->ready_list_id] &
      CI_NI_READY_LIST_FLAG_WAKE )
    efab_tcp_helper_ready_list_wakeup(netif2tcp_helper_resource(ni),
                                      sb->ready_list_id);
#else
  if( what & sb->wake_request ) {
    sb->sb_flags |= what;
    ci_netif_put_on_post_poll(ni, sb);
    ef_eplock_holder_set_flag(&ni->state->lock, CI_EPLOCK_NETIF_NEED_WAKE);
  }
  else {
    /* Normally we put an object on a ready list in
     * ci_netif_put_on_post_poll, but in this case we don't go via there,
     * so have to explicitly queue on the ready list here. */
    ci_ni_dllist_remove(ni, &sb->ready_link);
    ci_ni_dllist_put(ni, &ni->state->ready_lists[sb->ready_list_id],
                     &sb->ready_link);
    if( ni->state->ready_list_flags[sb->ready_list_id] &
        CI_NI_READY_LIST_FLAG_WAKE )
      ef_eplock_holder_set_flag(&ni->state->lock,
                                CI_EPLOCK_NETIF_NEED_WAKE);
  }
#endif
}
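
/* The wakeup protocol in citp_waitable_wake_not_in_poll() is
 * seqlock-like.  A would-be sleeper samples sleep_seq before blocking;
 * the waker bumps the relevant rx/tx sequence between the two barriers
 * above.  ci_wmb() orders the state change that made the waitable ready
 * before the sequence bump, and ci_mb() orders the bump before the read
 * of wake_request.  A sleeper therefore either observes the new sequence
 * and declines to block, or has already advertised itself in
 * wake_request and receives an explicit wakeup. */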
/*! Perform system bind on the OS backing socket.
 * \param ni       Stack context
 * \param s        Socket state
 * \param ip_addr_be32  Local address to which to bind
 * \param port_be16     [in] requested port [out] assigned port
 * \return 0 - success & [port_be16] updated
 *         CI_SOCKET_HANDOVER, Pass to OS, OS bound ok, (no error)
 *         CI_SOCKET_ERROR & errno set
 */
ci_inline int __ci_bind(ci_netif *ni, ci_sock_cmn *s,
                        ci_uint32 ip_addr_be32, ci_uint16* port_be16)
{
  int rc;
  ci_uint16 user_port; /* Port number specified by user, not by OS.
                        * See bug 4015 for details */
  union ci_sockaddr_u sa_u;

  ci_assert(s->domain == AF_INET || s->domain == AF_INET6);
  ci_assert( port_be16 );

  user_port = *port_be16;

#if CI_CFG_FAKE_IPV6
  ci_assert(s->domain == AF_INET || s->domain == AF_INET6);
  if( s->domain == AF_INET )
    ci_make_sockaddr(&sa_u.sin, s->domain, user_port, ip_addr_be32);
  else
    ci_make_sockaddr6(&sa_u.sin6, s->domain, user_port, ip_addr_be32);
#else
  ci_assert(s->domain == AF_INET);
  ci_make_sockaddr(&sa_u.sin, s->domain, user_port, ip_addr_be32);
#endif

#ifdef __ci_driver__
  rc = efab_tcp_helper_bind_os_sock(netif2tcp_helper_resource(ni),
                                    SC_SP(s), &sa_u.sa, sizeof(sa_u),
                                    port_be16);
#else
  rc = ci_tcp_helper_bind_os_sock(ni, SC_SP(s), &sa_u.sa, sizeof(sa_u),
                                  port_be16);
#endif

  /* bug1781: only do this if the earlier bind succeeded.
   * check if we can handle this socket */
  if( rc != 0 )
    return rc;

  if( user_port != 0 )
    s->s_flags |= CI_SOCK_FLAG_PORT_BOUND;
  if( ip_addr_be32 != INADDR_ANY )
    s->s_flags |= CI_SOCK_FLAG_ADDR_BOUND;
  s->s_flags &= ~CI_SOCK_FLAG_CONNECT_MUST_BIND;

#ifndef __ci_driver__
  /* We do not call bind() to an alien address from in-kernel code */
  if( ip_addr_be32 != INADDR_ANY &&
      !cicp_user_addr_is_local_efab(CICP_HANDLE(ni), &ip_addr_be32) )
    s->s_flags |= CI_SOCK_FLAG_BOUND_ALIEN;
#endif

  return rc;
}
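
/* Illustrative use of __ci_bind(): an ephemeral bind passes port zero
 * and reads the kernel's choice back through the in/out parameter:
 *
 *   ci_uint16 port_be16 = 0;                     -- 0: let the OS choose
 *   rc = __ci_bind(ni, s, INADDR_ANY, &port_be16);
 *   -- on success, port_be16 holds the assigned port in network order
 *
 * Because user_port was zero, CI_SOCK_FLAG_PORT_BOUND is left clear,
 * which is exactly the state ci_tcp_listen() below relies on when it
 * binds an unbound listener. */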
int efab_eplock_unlock_and_wake(ci_netif *ni, int in_dl_context)
{
  int l = ni->state->lock.lock;
  tcp_helper_resource_t *rs = netif2tcp_helper_resource(ni);

  /* Allocate more packets if necessary. */
  if( !in_dl_context && OO_STACK_NEEDS_MORE_PACKETS(ni) )
    efab_tcp_helper_more_bufs(rs);

  /* We use in_dl_context from now on, and we should remove
   * CI_NETIF_FLAG_IN_DL_CONTEXT under the stack lock. */
  if( in_dl_context )
    ni->flags &= ~CI_NETIF_FLAG_IN_DL_CONTEXT;

 again:

#ifndef NDEBUG
  if( (~l & CI_EPLOCK_LOCKED) || (l & CI_EPLOCK_UNLOCKED) ) {
    OO_DEBUG_ERR(ci_log("efab_eplock_unlock_and_wake: corrupt"
                        " (value is %x)", (unsigned) l));
    OO_DEBUG_ERR(dump_stack());
    return -EIO;
  }
#endif

  if( l & CI_EPLOCK_CALLBACK_FLAGS ) {
    /* Invoke the callback while we've still got the lock.  The callback
    ** is responsible for either
    **  - dropping the lock using ef_eplock_try_unlock(), and returning
    **    the lock value prior to unlocking, OR
    **  - keeping the eplock locked and returning CI_EPLOCK_LOCKED
    */
    l = efab_tcp_helper_netif_lock_callback(&ni->eplock_helper, l,
                                            in_dl_context);
  }
  else if( ci_cas32_fail(&ni->state->lock.lock, l, CI_EPLOCK_UNLOCKED) ) {
    /* Someone (probably) set a flag when we tried to unlock, so we'd
    ** better handle the flag(s). */
    l = ni->state->lock.lock;
    goto again;
  }

  if( l & CI_EPLOCK_FL_NEED_WAKE ) {
    CITP_STATS_NETIF_INC(ni, lock_wakes);
    wake_up_interruptible(&ni->eplock_helper.wq);
  }

  return 0;
}
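
/* The unlock path above is the standard treatment of a lock word that
 * doubles as a flag carrier: handle any callback flags while the lock is
 * still held, then attempt to swing LOCKED -> UNLOCKED with a single
 * compare-and-swap.  A failed CAS means somebody set a flag between our
 * read and the swap, so we re-read and loop; flags therefore cannot be
 * lost across an unlock.  The NEED_WAKE check uses l, the value observed
 * while the lock was still held. */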
static void oof_cb_sw_filter_postpone(struct oof_socket* skf, unsigned laddr,
                                      int lport, unsigned raddr, int rport,
                                      int protocol, int op_op)
{
  ci_netif* ni = skf_to_ni(skf);
  struct tcp_helper_resource_s *trs = netif2tcp_helper_resource(ni);
  struct oof_cb_sw_filter_op* op = CI_ALLOC_OBJ(struct oof_cb_sw_filter_op);

  if( op == NULL ) {
    /* Linux complains about failed allocations */
    return;
  }

  op->sock_id = OO_SP_FROM_INT(ni, skf_to_ep(skf)->id);
  op->laddr = laddr;
  op->raddr = raddr;
  op->lport = lport;
  op->rport = rport;
  op->protocol = protocol;
  op->op = op_op;
  op->next = NULL;

  spin_lock_bh(&ni->swf_update_lock);
  if( ni->swf_update_last == NULL )
    ni->swf_update_first = op;
  else
    ni->swf_update_last->next = op;
  ni->swf_update_last = op;
  spin_unlock_bh(&ni->swf_update_lock);

  /* We are holding a spinlock, so claim to be in driverlink context here */
  if( efab_tcp_helper_netif_lock_or_set_flags(trs,
                                              OO_TRUSTED_LOCK_SWF_UPDATE,
                                              CI_EPLOCK_NETIF_SWF_UPDATE,
                                              1) ) {
    ef_eplock_holder_set_flag(&ni->state->lock, CI_EPLOCK_NETIF_SWF_UPDATE);
    efab_tcp_helper_netif_unlock(trs, 1);
  }
}
/* Fixme: most callers of oof_cb_sw_filter_insert do not check rc. */
int oof_cb_sw_filter_insert(struct oof_socket* skf, unsigned laddr,
                            int lport, unsigned raddr, int rport,
                            int protocol, int stack_locked)
{
  ci_netif* ni = skf_to_ni(skf);
  struct tcp_helper_resource_s *trs = netif2tcp_helper_resource(ni);
  int rc = 0;

  ci_assert(!stack_locked || ci_netif_is_locked(ni));

  /* We are holding a spinlock, so claim to be in driverlink context here */
  if( stack_locked || efab_tcp_helper_netif_try_lock(trs, 1) ) {
    rc = ci_netif_filter_insert(ni, OO_SP_FROM_INT(ni, skf_to_ep(skf)->id),
                                laddr, lport, raddr, rport, protocol);
    if( ! stack_locked )
      efab_tcp_helper_netif_unlock(trs, 1);
  }
  else
    oof_cb_sw_filter_postpone(skf, laddr, lport, raddr, rport, protocol,
                              OOF_CB_SW_FILTER_OP_ADD);

  return rc;
}
/* Locking policy:
 * Entrance: priv->thr->netif is assumed to be locked.
 * Exit: all stacks (the client stack and the listener's stack) are
 * unlocked.
 */
int efab_tcp_loopback_connect(ci_private_t *priv, void *arg)
{
  struct oo_op_loopback_connect *carg = arg;
  ci_netif *alien_ni = NULL;
  oo_sp tls_id;

  ci_assert(ci_netif_is_locked(&priv->thr->netif));
  carg->out_moved = 0;

  if( !CI_PRIV_TYPE_IS_ENDPOINT(priv->fd_type) )
    return -EINVAL;
  if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_CONNSTACK &&
      NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_LISTSTACK &&
      NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
      CITP_TCP_LOOPBACK_TO_NEWSTACK ) {
    ci_netif_unlock(&priv->thr->netif);
    return -EINVAL;
  }

  while( iterate_netifs_unlocked(&alien_ni) == 0 ) {
    if( !efab_thr_can_access_stack(netif2tcp_helper_resource(alien_ni),
                                   EFAB_THR_TABLE_LOOKUP_CHECK_USER) )
      continue; /* no permission to look in here */

    if( NI_OPTS(alien_ni).tcp_server_loopback == CITP_TCP_LOOPBACK_OFF )
      continue; /* server does not accept loopback connections */

    if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_TO_LISTSTACK &&
        NI_OPTS(alien_ni).tcp_server_loopback !=
        CITP_TCP_LOOPBACK_ALLOW_ALIEN_IN_ACCEPTQ )
      continue; /* options of the stacks do not match */

    if( NI_OPTS(&priv->thr->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_TO_LISTSTACK &&
        !efab_thr_user_can_access_stack(alien_ni->uid, alien_ni->euid,
                                        &priv->thr->netif) )
      continue; /* server can't accept our socket */

    tls_id = ci_tcp_connect_find_local_peer(alien_ni, carg->dst_addr,
                                            carg->dst_port);

    if( OO_SP_NOT_NULL(tls_id) ) {
      int rc;

      /* We are going to exit one way or another: get a ref and
       * drop the kref of alien_ni */
      efab_thr_ref(netif2tcp_helper_resource(alien_ni));
      iterate_netifs_unlocked_dropref(alien_ni);

      switch( NI_OPTS(&priv->thr->netif).tcp_client_loopback ) {
      case CITP_TCP_LOOPBACK_TO_CONNSTACK:
        /* connect_lo_toconn unlocks priv->thr->netif */
        carg->out_rc =
            ci_tcp_connect_lo_toconn(&priv->thr->netif, priv->sock_id,
                                     carg->dst_addr, alien_ni, tls_id);
        efab_thr_release(netif2tcp_helper_resource(alien_ni));
        return 0;

      case CITP_TCP_LOOPBACK_TO_LISTSTACK:
        /* Nobody should be using this socket, so trylock should succeed.
         * Otherwise we hand the socket over and do not accelerate this
         * loopback connection. */
        rc = ci_sock_trylock(&priv->thr->netif,
                             SP_TO_WAITABLE(&priv->thr->netif,
                                            priv->sock_id));
        if( rc == 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          return -ECONNREFUSED;
        }

        /* move_to_alien changes locks - see comments near it */
        rc = efab_file_move_to_alien_stack(priv, alien_ni);
        if( rc != 0 ) {
          /* error - everything is already unlocked */
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          /* if we return an error, UL will hand the socket over. */
          return rc;
        }
        /* now alien_ni is locked */

        /* Connect again, using the new endpoint */
        carg->out_rc =
            ci_tcp_connect_lo_samestack(
                alien_ni,
                SP_TO_TCP(alien_ni,
                          SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)
                          ->moved_to_sock_id),
                tls_id);
        ci_netif_unlock(alien_ni);
        carg->out_moved = 1;
        return 0;

      case CITP_TCP_LOOPBACK_TO_NEWSTACK:
      {
        tcp_helper_resource_t *new_thr;
        ci_resource_onload_alloc_t alloc;

        /* create a new stack
         * todo: no hardware interfaces are necessary */
        strcpy(alloc.in_version, ONLOAD_VERSION);
        strcpy(alloc.in_uk_intf_ver, oo_uk_intf_ver);
        alloc.in_name[0] = '\0';
        alloc.in_flags = 0;

        rc = tcp_helper_alloc_kernel(&alloc, &NI_OPTS(&priv->thr->netif),
                                     0, &new_thr);
        if( rc != 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          return -ECONNREFUSED;
        }

        rc = ci_sock_trylock(&priv->thr->netif,
                             SP_TO_WAITABLE(&priv->thr->netif,
                                            priv->sock_id));
        if( rc == 0 ) {
          ci_netif_unlock(&priv->thr->netif);
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          efab_thr_release(new_thr);
          return -ECONNREFUSED;
        }

        /* move the connecting socket to the new stack */
        rc = efab_file_move_to_alien_stack(priv, &new_thr->netif);
        if( rc != 0 ) {
          /* error - everything is already unlocked */
          efab_thr_release(netif2tcp_helper_resource(alien_ni));
          efab_thr_release(new_thr);
          return -ECONNREFUSED;
        }
        /* now new_thr->netif is locked */
        carg->out_moved = 1;
        carg->out_rc = -ECONNREFUSED;

        /* now connect via CITP_TCP_LOOPBACK_TO_CONNSTACK */
        /* connect_lo_toconn unlocks new_thr->netif */
        carg->out_rc =
            ci_tcp_connect_lo_toconn(
                &new_thr->netif,
                SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)
                ->moved_to_sock_id,
                carg->dst_addr, alien_ni, tls_id);
        efab_thr_release(netif2tcp_helper_resource(alien_ni));
        return 0;
      }
      }
    }
    else if( tls_id == OO_SP_INVALID )
      break;
  }

  ci_netif_unlock(&priv->thr->netif);
  return -ENOENT;
}
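
/* efab_tcp_loopback_connect() above implements the three accelerated
 * loopback policies selected by the tcp_client_loopback option:
 *  - TO_CONNSTACK: pull the listener's end into the connecting socket's
 *    stack via ci_tcp_connect_lo_toconn();
 *  - TO_LISTSTACK: move the connecting socket into the listener's stack
 *    with efab_file_move_to_alien_stack(), then connect within it;
 *  - TO_NEWSTACK: create a fresh stack, move the connecting socket
 *    there, then proceed as for TO_CONNSTACK.
 * In each branch the last helper called is responsible for dropping the
 * stack lock, which is why every path returns immediately afterwards. */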
/* Move priv file to the alien_ni stack.
 * Should be called with the priv stack and socket locked;
 * the function returns with this stack being unlocked.
 * If rc=0, it returns with the alien_ni stack locked;
 * otherwise, both stacks are unlocked.
 * The socket is always unlocked on return. */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));

  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in epoll list should not be moved, because waitq is already
   * in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* do not copy old_s->b.bufid
   * and other fields in stack address space */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    new_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif
    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep, and registering the
   * new.  However, the notifications shouldn't be in use for sockets
   * that are in a state that can be moved, so this shouldn't be a
   * problem. */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);

  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);
    int i;

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);

    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0 )
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }

  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

 fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert old filters because hw filters
   * are already here (in case of an accepted socket) or not needed.
   * We have not removed old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
 fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
 fail2:
  ci_netif_unlock(alien_ni);
 fail1:
  ci_netif_unlock(&old_thr->netif);
 fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
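
/* The ordering in efab_file_move_to_alien_stack() above is what makes the
 * move safe against concurrent traffic:
 *  1. poll the old stack so no received data is pending;
 *  2. copy socket state via an intermediate buffer, fixing up the fields
 *     that are addresses within the old stack's shared memory;
 *  3. move the hardware/software filters and the backing OS socket;
 *  4. flip the file to the alien fops ("point of no return");
 *  5. poll the old stack once more and copy the receive queues, so
 *     packets that arrived during the filter move are not lost.
 * Failure paths before step 4 unwind completely; after it the socket
 * belongs to the new stack. */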
/* c_ni is assumed to be locked on entrance and is always unlocked on
 * exit. */
int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst,
                             ci_netif *l_ni, oo_sp l_id)
{
  ci_tcp_state *ts;
  ci_tcp_socket_listen *tls, *alien_tls;
  citp_waitable_obj *wo;
  citp_waitable *w;
  int rc;

  ci_assert(ci_netif_is_locked(c_ni));
  ci_assert(OO_SP_NOT_NULL(c_id));
  ci_assert(OO_SP_NOT_NULL(l_id));

  LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__,
             c_ni->state->stack_id, OO_SP_TO_INT(c_id),
             l_ni->state->stack_id, OO_SP_TO_INT(l_id)));

  alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id);
  if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) {
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }

  /* In c_ni, create a shadow listening socket tls (a copy of l_id) */
  ts = ci_tcp_get_state_buf(c_ni);
  if( ts == NULL ) {
    ci_netif_unlock(c_ni);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__,
                 NI_ID(c_ni)));
    return -ENOMEM;
  }

  /* init common tcp fields */
  ts->s.so = alien_tls->s.so;
  ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl;
  S_TCP_HDR(&ts->s)->tcp_source_be16 =
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16;
  ts->s.domain = alien_tls->s.domain;
  ts->c = alien_tls->c;
  ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF;

  /* make sure nobody will ever connect to our "shadow" socket
   * except us */
  ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  /* no timer: */
  tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN;
  tls->acceptq_max = 1;
  rc = ci_tcp_listen_init(c_ni, tls);
  if( rc != 0 ) {
    citp_waitable_obj_free(c_ni, &tls->s.b);
    return rc;
  }

  /* Connect c_id to tls */
  ts = SP_TO_TCP(c_ni, c_id);
  rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid);

  /* Accept as from tls */
  if( !ci_tcp_acceptq_not_empty(tls) ) {
    /* it is possible, for example, if ci_tcp_listenq_try_promote() failed
     * because there are no endpoints */
    ci_tcp_listenq_drop_all(c_ni, tls);
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }
  w = ci_tcp_acceptq_get(c_ni, tls);
  ci_assert(w);
  LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d",
                __FUNCTION__,
                c_ni->state->stack_id, OO_SP_TO_INT(c_id),
                l_ni->state->stack_id, OO_SP_TO_INT(l_id),
                c_ni->state->stack_id, tls->s.b.bufid,
                c_ni->state->stack_id, w->bufid));
  ci_assert(w->state & CI_TCP_STATE_TCP);
  ci_assert(w->state != CI_TCP_LISTEN);

  /* Destroy tls.
   * NB: nobody could possibly connect to it, so no need to do a proper
   * shutdown. */
  ci_assert_equal(ci_tcp_acceptq_n(tls), 0);
  ci_tcp_listenq_drop_all(c_ni, tls);
  citp_waitable_obj_free(c_ni, &tls->s.b);
  ci_netif_unlock(c_ni);

  /* Keep a port reference */
  {
    tcp_helper_endpoint_t *l_ep, *a_ep;
    struct oo_file_ref* os_sock_ref;
    ci_irqlock_state_t lock_flags;

    l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id);
    a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w));
    ci_irqlock_lock(&l_ep->thr->lock, &lock_flags);
    os_sock_ref = l_ep->os_socket;
    ci_assert_equal(a_ep->os_port_keeper, NULL);
    if( os_sock_ref != NULL ) {
      os_sock_ref = oo_file_ref_add(os_sock_ref);
      os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref);
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      if( os_sock_ref != NULL )
        oo_file_ref_drop(os_sock_ref);
    }
    else {
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      goto cleanup;
    }
  }

  /* lock l_ni: check that l_id is the same socket it used to be */
  /* create ref-sock in l_ni, put it into acc q */
  if( ci_netif_lock(l_ni) != 0 )
    goto cleanup;
  if( alien_tls->s.b.state != CI_TCP_LISTEN ||
      (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) ||
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16 !=
      TS_TCP(ts)->tcp_dest_be16 ||
      (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY &&
       alien_tls->s.pkt.ip.ip_saddr_be32 !=
       ts->s.pkt.ip.ip_daddr_be32) ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  ci_bit_mask_set(&w->sb_aflags,
                  CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  wo = citp_waitable_obj_alloc(l_ni);
  if( wo == NULL ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  wo->waitable.state = CI_TCP_CLOSED;
  wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY;
  wo->waitable.moved_to_stack_id = c_ni->state->stack_id;
  wo->waitable.moved_to_sock_id = W_SP(w);
  LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__,
             l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)),
             c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w))));

  ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable);
  citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b,
                                 CI_SB_FLAG_WAKE_RX);
  ci_netif_unlock(l_ni);

  return rc;

 cleanup:
  ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_bit_mask_clear(&w->sb_aflags,
                    CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni),
                                 w->bufid);
  /* we can not guarantee the c_ni lock, so we can't call
   * ci_tcp_drop(c_ni, ts).  So, we return an error; UL will hand over
   * and close the ts endpoint. */
  return -EBUSY;
}
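
/* The "shadow listener" trick above keeps the whole handshake inside the
 * connecting stack c_ni: a one-slot, orphaned copy of the real listener
 * accepts the connection locally and is then destroyed, and only a
 * placeholder waitable (marked MOVED_AWAY and pointing at the accepted
 * socket) is queued on the real listener's accept queue in l_ni.  The
 * os_port_keeper reference taken beforehand keeps the listener's OS
 * socket, and hence the port, alive while the placeholder is in
 * flight. */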
static int ci_tcp_connect_ul_syn_sent(ci_netif *ni, ci_tcp_state *ts)
{
  int rc = 0;

  if( ts->s.b.state == CI_TCP_SYN_SENT ) {
    ci_netif_poll(ni);
    if( OO_SP_NOT_NULL(ts->local_peer) ) {
      /* No reason to sleep.  The listener has evidently dropped our SYN
       * for some reason.  Go away! */
      ci_tcp_drop(ni, ts, EBUSY);
      RET_WITH_ERRNO(EBUSY);
    }
    CI_TCP_SLEEP_WHILE(ni, ts, CI_SB_FLAG_WAKE_RX,
                       ts->s.so.sndtimeo_msec,
                       ts->s.b.state == CI_TCP_SYN_SENT, &rc);
  }

  if( rc == -EAGAIN ) {
    LOG_TC(log(LNT_FMT "timeout on sleep: %d",
               LNT_PRI_ARGS(ni, ts), -rc));
    if( ! (ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT) ) {
      ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
      CI_SET_ERROR(rc, EINPROGRESS);
    }
    else
      CI_SET_ERROR(rc, EALREADY);
    return rc;
  }
  else if( rc == -EINTR ) {
    LOG_TC(log(LNT_FMT "connect() was interrupted by a signal",
               LNT_PRI_ARGS(ni, ts)));
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    CI_SET_ERROR(rc, EINTR);
    return rc;
  }

  /*! \TODO propagate the correct error code: CONNREFUSED, NOROUTE, etc. */
  if( ts->s.b.state == CI_TCP_CLOSED ) {
    /* Bug 3558:
     * Set OS socket state to allow/disallow the next bind().
     * This is a Linux hack. */
#ifdef __ci_driver__
    CI_TRY(efab_tcp_helper_set_tcp_close_os_sock(
                                         netif2tcp_helper_resource(ni),
                                         S_SP(ts)));
#else
    CI_TRY(ci_tcp_helper_set_tcp_close_os_sock(ni, S_SP(ts)));
#endif

    /* We should re-bind the socket on the next use if the port was
     * determined by the OS. */
    if( ! (ts->s.s_flags & CI_SOCK_FLAG_PORT_BOUND) )
      ts->s.s_flags |= CI_SOCK_FLAG_CONNECT_MUST_BIND;

    /* - if SO_ERROR is set, handle it and return this value;
     * - else if rx_errno is set, return it;
     * - else (TCP_RX_ERRNO==0, socket is CI_SHUT_RD) return ECONNABORTED */
    if( (rc = ci_tcp_connect_handle_so_error(&ts->s)) == 0 )
      rc = TCP_RX_ERRNO(ts) ? TCP_RX_ERRNO(ts) : ECONNABORTED;
    CI_SET_ERROR(rc, rc);

    if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
      ts->s.pkt.ip.ip_saddr_be32 = 0;
      ts->s.cp.ip_laddr_be32 = 0;
    }
    return rc;
  }
  return 0;
}
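
/* The error mapping above follows POSIX semantics for a connect that
 * cannot complete within the socket's send timeout: the first report is
 * EINPROGRESS and the socket is marked CI_TCPT_FLAG_NONBLOCK_CONNECT, so
 * any further connect() attempt while the handshake is still pending
 * reports EALREADY.  EINTR likewise leaves the connection attempt
 * running in the background. */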
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog)
{
  /*
  ** ?? error handling on possible fails not handled robustly...
  ** ?? Need to check port number is valid TODO
  */

  /*! \todo If not bound then we have to be listening on all interfaces.
   * It's likely that we won't be coming through here as we have to
   * listen on the OS socket too! */
  ci_tcp_state* ts;
  ci_tcp_socket_listen* tls;
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  int ul_backlog = backlog;
  int rc;
  oo_p sp;

  LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__,
             SK_PRI_ARGS(ep), backlog));
  CHECK_TEP(ep);

  if( NI_OPTS(netif).tcp_listen_handover )
    return CI_SOCKET_HANDOVER;

  if( !NI_OPTS(netif).tcp_server_loopback ) {
    /* We should hand over if the socket is bound to an alien address. */
    if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
      return CI_SOCKET_HANDOVER;
  }

  if( ul_backlog < 0 )
    ul_backlog = NI_OPTS(netif).max_ep_bufs;
  else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog )
    ul_backlog = NI_OPTS(netif).acceptq_min_backlog;

  if( s->b.state == CI_TCP_LISTEN ) {
    tls = SOCK_TO_TCP_LISTEN(s);
    tls->acceptq_max = ul_backlog;
    ci_tcp_helper_listen_os_sock(fd, ul_backlog);
    return 0;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    CI_SET_ERROR(rc, EINVAL);
    return rc;
  }

  ts = SOCK_TO_TCP(s);

  /* Bug 3376: if the socket was used for a previous, failed, connect then
   * the error numbers will not be as expected.  Only seen when not using
   * listening netifs (as moving the EP to the new netif resets them). */
  ts->s.tx_errno = EPIPE;
  ts->s.rx_errno = ENOTCONN;

  /* fill in address/ports and all TCP state */
  if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) {
    ci_uint16 source_be16;

    /* They haven't previously done a bind, so we need to choose
     * a port.  As we haven't been given a hint we let the OS choose. */
    source_be16 = 0;
    rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32,
                   &source_be16);
    if( CI_LIKELY( rc == 0 ) ) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u",
                 LNT_PRI_ARGS(ep->netif, ts),
                 ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      return rc;
    }
  }

  ci_sock_lock(netif, &ts->s.b);
  ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);

  tcp_raddr_be32(tls) = 0u;
  tcp_rport_be16(tls) = 0u;

  ci_assert_equal(tls->s.tx_errno, EPIPE);
  ci_assert_equal(tls->s.rx_errno, ENOTCONN);

  /* setup listen timer - do it before the first return statement,
   * because __ci_tcp_listen_to_normal() will be called on the error
   * path. */
  if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
    sp = TS_OFF(netif, tls);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid));
    ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq");
    tls->listenq_tid.param1 = S_SP(tls);
    tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN;
  }

  rc = ci_tcp_listen_init(netif, tls);
  ci_sock_unlock(netif, &ts->s.b);
  if( rc != 0 ) {
    CI_SET_ERROR(rc, -rc);
    goto listen_fail;
  }
  tls->acceptq_max = ul_backlog;

  CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats));

  /* install all the filters needed for this connection
   *    - tcp_laddr_be32(ts) = 0 for IPADDR_ANY
   *
   * TODO: handle BINDTODEVICE by setting phys_port parameter to correct
   *       physical L5 port index
   * TODO: handle REUSEADDR by setting last parameter to TRUE */
  if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
#ifdef ONLOAD_OFE
    if( netif->ofe != NULL ) {
      tls->s.ofe_code_start = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_LISTEN,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
      tls->ofe_promote = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
    }
#endif
    rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice,
                               OO_SP_NULL);
    if( rc == -EFILTERSSOME ) {
      if( CITP_OPTS.no_fail )
        rc = 0;
      else {
        ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
        rc = -ENOBUFS;
      }
    }
    ci_assert_nequal(rc, -EFILTERSSOME);
    VERB(ci_log("%s: set_filters returned %d", __FUNCTION__, rc));
    if( rc < 0 ) {
      CI_SET_ERROR(rc, -rc);
      goto post_listen_fail;
    }
  }

  /*
   * Calling the system listen() is required for a listen-any socket, a
   * local host communications server and a multi-homed server (to accept
   * connections to L5-assigned address(es), but incoming from other
   * interfaces).
   */
#ifdef __ci_driver__
  {
    rc = efab_tcp_helper_listen_os_sock(netif2tcp_helper_resource(netif),
                                        S_SP(tls), backlog);
  }
#else
  rc = ci_tcp_helper_listen_os_sock(fd, backlog);
#endif
  if( rc < 0 ) {
    /* clear the filter we've just set */
    ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
    goto post_listen_fail;
  }
  return 0;

 post_listen_fail:
  ci_tcp_listenq_drop_all(netif, tls);
 listen_fail:
  /* revert TCP state to a non-listening socket format */
  __ci_tcp_listen_to_normal(netif, tls);
  /* Above function sets the orphan flag, but we are attached to an FD. */
  ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
#ifdef __ci_driver__
  return rc;
#else
  return CI_SOCKET_ERROR;
#endif
}
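
/* Worked example of the backlog clamping at the top of ci_tcp_listen()
 * (option values are illustrative): with acceptq_min_backlog=1 and
 * max_ep_bufs=8192, listen(fd, 0) yields ul_backlog=1, listen(fd, 5)
 * keeps 5, and a negative backlog becomes 8192.  Note that the clamped
 * value sizes the Onload accept queue, while on the first listen the OS
 * backing socket is passed the caller's original backlog. */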