void ci_netif_filter_remove(ci_netif* netif, oo_sp sock_p, unsigned laddr, unsigned lport, unsigned raddr, unsigned rport, unsigned protocol) { ci_netif_filter_table_entry* entry; unsigned hash1, hash2, tbl_i; ci_netif_filter_table* tbl; int hops = 0; unsigned first; ci_assert(ci_netif_is_locked(netif) #ifdef __KERNEL__ /* release_ep_tbl might be called without the stack lock. * Do not complain about this. */ || (netif2tcp_helper_resource(netif)->k_ref_count & TCP_HELPER_K_RC_DEAD) #endif ); tbl = netif->filter_table; hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol); hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol); first = hash1; LOG_TC(ci_log("%s: [%d:%d] REMOVE %s %s:%u->%s:%u hash=%u:%u", __FUNCTION__, NI_ID(netif), OO_SP_FMT(sock_p), CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport), hash1, hash2)); tbl_i = hash1; while( 1 ) { entry = &tbl->table[tbl_i]; if( entry->id == OO_SP_TO_INT(sock_p) ) { if( laddr == entry->laddr ) break; } else if( entry->id == EMPTY ) { /* We allow multiple removes of the same filter -- helps avoid some * complexity in the filter module. */ return; } tbl_i = (tbl_i + hash2) & tbl->table_size_mask; ++hops; if( tbl_i == first ) { LOG_E(ci_log(FN_FMT "ERROR: LOOP [%d] %s %s:%u->%s:%u", FN_PRI_ARGS(netif), OO_SP_FMT(sock_p), CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport))); return; } } __ci_netif_filter_remove(netif, hash1, hash2, hops, tbl_i); }
void citp_waitable_obj_free(ci_netif* ni, citp_waitable* w) { ci_assert(ci_netif_is_locked(ni)); #ifdef __KERNEL__ { /* Avoid racing with tcp_helper_do_non_atomic(). */ tcp_helper_endpoint_t* ep = ci_netif_get_valid_ep(ni, w->bufid); unsigned ep_aflags; again: if( (ep_aflags = ep->ep_aflags) & OO_THR_EP_AFLAG_NON_ATOMIC ) { ci_assert(!(ep_aflags & OO_THR_EP_AFLAG_NEED_FREE)); if( ci_cas32_fail(&ep->ep_aflags, ep_aflags, ep_aflags | OO_THR_EP_AFLAG_NEED_FREE) ) goto again; return; } ci_rmb(); } #endif __citp_waitable_obj_free(ni, w); w->wt_next = ni->state->free_eps_head; ni->state->free_eps_head = W_SP(w); /* Must be last, as may result in stack going away. */ ci_drop_orphan(ni); }
void ci_udp_state_free(ci_netif* ni, ci_udp_state* us) { ci_assert(ci_netif_is_locked(ni)); ci_assert(us->s.b.state == CI_TCP_STATE_UDP); ci_assert(ci_ni_dllist_is_self_linked(ni, &us->s.b.post_poll_link)); ci_udp_recv_q_drop(ni, &us->timestamp_q); citp_waitable_obj_free(ni, &us->s.b); }
citp_waitable_obj* citp_waitable_obj_alloc(ci_netif* netif) { citp_waitable_obj* wo; ci_assert(netif); ci_assert(ci_netif_is_locked(netif)); if( netif->state->deferred_free_eps_head != CI_ILL_END ) { ci_uint32 link; do link = netif->state->deferred_free_eps_head; while( ci_cas32_fail(&netif->state->deferred_free_eps_head, link, CI_ILL_END)); while( link != CI_ILL_END ) { citp_waitable* w = ID_TO_WAITABLE(netif, link); link = w->next_id; CI_DEBUG(w->next_id = CI_ILL_END); ci_assert_equal(w->state, CI_TCP_STATE_FREE); ci_assert(OO_SP_IS_NULL(w->wt_next)); w->wt_next = netif->state->free_eps_head; netif->state->free_eps_head = W_SP(w); } } if( OO_SP_IS_NULL(netif->state->free_eps_head) ) { ci_tcp_helper_more_socks(netif); if( OO_SP_IS_NULL(netif->state->free_eps_head) ) ci_netif_timeout_reap(netif); } if( OO_SP_IS_NULL(netif->state->free_eps_head) ) return NULL; LOG_TV(ci_log("%s: allocating %d", __FUNCTION__, OO_SP_FMT(netif->state->free_eps_head))); ci_assert(IS_VALID_SOCK_P(netif, netif->state->free_eps_head)); #if !defined(__KERNEL__) && !defined (CI_HAVE_OS_NOPAGE) ci_netif_mmap_shmbuf(netif, (netif->state->free_eps_head >> EP_BUF_BLOCKSHIFT) + 1); #endif wo = SP_TO_WAITABLE_OBJ(netif, netif->state->free_eps_head); ci_assert(OO_SP_EQ(W_SP(&wo->waitable), netif->state->free_eps_head)); ci_assert_equal(wo->waitable.state, CI_TCP_STATE_FREE); ci_assert_equal(wo->waitable.sb_aflags, (CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_NOT_READY)); ci_assert_equal(wo->waitable.lock.wl_val, 0); netif->state->free_eps_head = wo->waitable.wt_next; CI_DEBUG(wo->waitable.wt_next = OO_SP_NULL); ci_assert_equal(wo->waitable.state, CI_TCP_STATE_FREE); return wo; }
void ci_sock_cmn_timestamp_q_reap(ci_netif* ni, ci_sock_cmn* s) { ci_assert(ci_netif_is_locked(ni)); while( ! OO_PP_EQ(s->timestamp_q.head, s->timestamp_q_extract) ) { ci_ip_pkt_fmt* pkt = PKT_CHK(ni, s->timestamp_q.head); oo_pkt_p next = pkt->tsq_next; ci_netif_pkt_release(ni, pkt); --s->timestamp_q.num; s->timestamp_q.head = next; } }
int ci_netif_filter_lookup(ci_netif* netif, unsigned laddr, unsigned lport, unsigned raddr, unsigned rport, unsigned protocol) { unsigned hash1, hash2 = 0; ci_netif_filter_table* tbl; unsigned first; ci_assert(netif); ci_assert(ci_netif_is_locked(netif)); ci_assert(netif->filter_table); tbl = netif->filter_table; hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol); first = hash1; LOG_NV(log("tbl_lookup: %s %s:%u->%s:%u hash=%u:%u at=%u", CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport), first, tcp_hash2(tbl, laddr, lport, raddr, rport, protocol), hash1)); while( 1 ) { int id = tbl->table[hash1].id; if( CI_LIKELY(id >= 0) ) { ci_sock_cmn* s = ID_TO_SOCK(netif, id); if( ((laddr - tbl->table[hash1].laddr) | (lport - sock_lport_be16(s) ) | (raddr - sock_raddr_be32(s) ) | (rport - sock_rport_be16(s) ) | (protocol - sock_protocol(s) )) == 0 ) return hash1; } if( id == EMPTY ) break; /* We defer calculating hash2 until it's needed, just to make the fast * case that little bit faster. */ if( hash1 == first ) hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol); hash1 = (hash1 + hash2) & tbl->table_size_mask; if( hash1 == first ) { LOG_E(ci_log(FN_FMT "ERROR: LOOP %s:%u->%s:%u hash=%u:%u", FN_PRI_ARGS(netif), ip_addr_str(laddr), lport, ip_addr_str(raddr), rport, hash1, hash2)); return -ELOOP; } } return -ENOENT; }
int ci_tcp_connect_lo_samestack(ci_netif *ni, ci_tcp_state *ts, oo_sp tls_id) { int crc, rc = 0; ci_assert(ci_netif_is_locked(ni)); ts->local_peer = tls_id; crc = ci_tcp_connect_ul_start(ni, ts, ts->s.pkt.ip_saddr_be32, ts->s.pkt.dport_be16, &rc); /* The connect is really finished, but we should return EINPROGRESS * for non-blocking connect and 0 for normal. */ if( crc == CI_CONNECT_UL_OK ) rc = ci_tcp_connect_ul_syn_sent(ni, ts); return rc; }
void citp_waitable_all_fds_gone(ci_netif* ni, oo_sp w_id) { citp_waitable_obj* wo; ci_assert(ni); ci_assert(IS_VALID_SOCK_P(ni, w_id)); ci_assert(ci_netif_is_locked(ni)); wo = SP_TO_WAITABLE_OBJ(ni, w_id); ci_assert(wo->waitable.state != CI_TCP_STATE_FREE); LOG_NC(ci_log("%s: %d:%d %s", __FUNCTION__, NI_ID(ni), OO_SP_FMT(w_id), ci_tcp_state_str(wo->waitable.state))); /* listening socket is closed in blocking conext, see * efab_tcp_helper_close_endpoint(). * CI_SB_AFLAG_ORPHAN is set earlier in this case.. */ CI_DEBUG(if( (wo->waitable.sb_aflags & CI_SB_AFLAG_ORPHAN) && wo->waitable.state != CI_TCP_LISTEN ) ci_log("%s: %d:%d already orphan", __FUNCTION__, NI_ID(ni), OO_SP_FMT(w_id))); /* It's essential that an ORPHANed socket not be on the deferred * socket list, because the same link field is used as timewait * list, free list etc. We must purge the deferred list before * setting the orphan flag. * * NB. This socket cannot now be added to the deferred list, because * no-one has a reference to it. */ ci_netif_purge_deferred_socket_list(ni); ci_bit_set(&wo->waitable.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); /* We also need to remove the socket from the post-poll list. It may * have been left there because the stack believes a wakeup is needed. */ ci_ni_dllist_remove_safe(ni, &wo->waitable.post_poll_link); ci_ni_dllist_remove_safe(ni, &wo->waitable.ready_link); wo->waitable.ready_list_id = 0; citp_waitable_cleanup(ni, wo, 1); }
void ci_udp_all_fds_gone(ci_netif* netif, oo_sp sock_id, int do_free) { /* All process references to this socket have gone. So we should * shutdown() if necessary, and arrange for all resources to eventually * get cleaned up. * * This is called by the driver only. [sock_id] is trusted. */ ci_udp_state* us = SP_TO_UDP(netif, sock_id); ci_assert(ci_netif_is_locked(netif)); ci_assert(us->s.b.state == CI_TCP_STATE_UDP); LOG_UC(ci_log("ci_udp_all_fds_gone: "NTS_FMT, NTS_PRI_ARGS(netif, us))); if( UDP_GET_FLAG(us, CI_UDPF_FILTERED) ) { UDP_CLR_FLAG(us, CI_UDPF_FILTERED); ci_tcp_ep_clear_filters(netif, S_SP(us), 0); } ci_udp_recv_q_drop(netif, &us->recv_q); ci_ni_dllist_remove(netif, &us->s.reap_link); if( OO_PP_NOT_NULL(us->zc_kernel_datagram) ) { ci_netif_pkt_release_rx(netif, PKT_CHK(netif, us->zc_kernel_datagram)); us->zc_kernel_datagram = OO_PP_NULL; us->zc_kernel_datagram_count = 0; } /* Only free state if no outstanding tx packets: otherwise it'll get * freed by the tx completion event. */ if( do_free ) { if( us->tx_count == 0 ) ci_udp_state_free(netif, us); else CITP_STATS_NETIF_INC(netif, udp_free_with_tx_active); } }
/* Fixme: most callers of oof_cb_sw_filter_insert do not check rc. */ int oof_cb_sw_filter_insert(struct oof_socket* skf, unsigned laddr, int lport, unsigned raddr, int rport, int protocol, int stack_locked) { ci_netif* ni = skf_to_ni(skf); struct tcp_helper_resource_s *trs = netif2tcp_helper_resource(ni); int rc = 0; ci_assert(!stack_locked || ci_netif_is_locked(ni)); /* We are holding a spinlock, so claim to be in driverlink context here */ if( stack_locked || efab_tcp_helper_netif_try_lock(trs, 1) ) { rc = ci_netif_filter_insert(ni, OO_SP_FROM_INT(ni, skf_to_ep(skf)->id), laddr, lport, raddr, rport, protocol); if( ! stack_locked ) efab_tcp_helper_netif_unlock(trs, 1); } else oof_cb_sw_filter_postpone(skf, laddr, lport, raddr, rport, protocol, OOF_CB_SW_FILTER_OP_ADD); return rc; }
/* Insert for either TCP or UDP */ int ci_netif_filter_insert(ci_netif* netif, oo_sp tcp_id, unsigned laddr, unsigned lport, unsigned raddr, unsigned rport, unsigned protocol) { ci_netif_filter_table_entry* entry; unsigned hash1, hash2; ci_netif_filter_table* tbl; #if !defined(NDEBUG) || CI_CFG_STATS_NETIF unsigned hops = 1; #endif unsigned first; ci_assert(netif); ci_assert(ci_netif_is_locked(netif)); ci_assert(netif->filter_table); tbl = netif->filter_table; hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol); hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol); first = hash1; /* Find a free slot. */ while( 1 ) { entry = &tbl->table[hash1]; if( entry->id < 0 ) break; ++entry->route_count; #if !defined(NDEBUG) || CI_CFG_STATS_NETIF ++hops; #endif /* A socket can only have multiple entries in the filter table if each * entry has a different [laddr]. */ ci_assert( !((entry->id == OO_SP_TO_INT(tcp_id)) && (laddr == entry->laddr)) ); hash1 = (hash1 + hash2) & tbl->table_size_mask; if( hash1 == first ) { ci_sock_cmn *s = SP_TO_SOCK_CMN(netif, tcp_id); if( ! (s->s_flags & CI_SOCK_FLAG_SW_FILTER_FULL) ) { LOG_E(ci_log(FN_FMT "%d FULL %s %s:%u->%s:%u hops=%u", FN_PRI_ARGS(netif), OO_SP_FMT(tcp_id), CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport), hops)); s->s_flags |= CI_SOCK_FLAG_SW_FILTER_FULL; } CITP_STATS_NETIF_INC(netif, sw_filter_insert_table_full); return -ENOBUFS; } } /* Now insert the new entry. */ LOG_TC(ci_log(FN_FMT "%d INSERT %s %s:%u->%s:%u hash=%u:%u at=%u " "over=%d hops=%u", FN_PRI_ARGS(netif), OO_SP_FMT(tcp_id), CI_IP_PROTOCOL_STR(protocol), ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport), ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport), first, hash2, hash1, entry->id, hops)); #if CI_CFG_STATS_NETIF if( hops > netif->state->stats.table_max_hops ) netif->state->stats.table_max_hops = hops; /* Keep a rolling average of the number of hops per entry. */ if( netif->state->stats.table_mean_hops == 0 ) netif->state->stats.table_mean_hops = 1; netif->state->stats.table_mean_hops = (netif->state->stats.table_mean_hops * 9 + hops) / 10; if( entry->id == EMPTY ) ++netif->state->stats.table_n_slots; ++netif->state->stats.table_n_entries; #endif entry->id = OO_SP_TO_INT(tcp_id); entry->laddr = laddr; return 0; }
/* Locking policy: * Enterance: priv->thr->netif is assumed to be locked. * Exit: all stacks (the client stack and the listener's stack) are * unlocked. */ int efab_tcp_loopback_connect(ci_private_t *priv, void *arg) { struct oo_op_loopback_connect *carg = arg; ci_netif *alien_ni = NULL; oo_sp tls_id; ci_assert(ci_netif_is_locked(&priv->thr->netif)); carg->out_moved = 0; if( !CI_PRIV_TYPE_IS_ENDPOINT(priv->fd_type) ) return -EINVAL; if( NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_CONNSTACK && NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_LISTSTACK && NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_NEWSTACK) { ci_netif_unlock(&priv->thr->netif); return -EINVAL; } while( iterate_netifs_unlocked(&alien_ni) == 0 ) { if( !efab_thr_can_access_stack(netif2tcp_helper_resource(alien_ni), EFAB_THR_TABLE_LOOKUP_CHECK_USER) ) continue; /* no permission to look in here */ if( NI_OPTS(alien_ni).tcp_server_loopback == CITP_TCP_LOOPBACK_OFF ) continue; /* server does not accept loopback connections */ if( NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_LISTSTACK && NI_OPTS(alien_ni).tcp_server_loopback != CITP_TCP_LOOPBACK_ALLOW_ALIEN_IN_ACCEPTQ ) continue; /* options of the stacks to not match */ if( NI_OPTS(&priv->thr->netif).tcp_client_loopback != CITP_TCP_LOOPBACK_TO_LISTSTACK && !efab_thr_user_can_access_stack(alien_ni->uid, alien_ni->euid, &priv->thr->netif) ) continue; /* server can't accept our socket */ tls_id = ci_tcp_connect_find_local_peer(alien_ni, carg->dst_addr, carg->dst_port); if( OO_SP_NOT_NULL(tls_id) ) { int rc; /* We are going to exit in this or other way: get ref and * drop kref of alien_ni */ efab_thr_ref(netif2tcp_helper_resource(alien_ni)); iterate_netifs_unlocked_dropref(alien_ni); switch( NI_OPTS(&priv->thr->netif).tcp_client_loopback ) { case CITP_TCP_LOOPBACK_TO_CONNSTACK: /* connect_lo_toconn unlocks priv->thr->netif */ carg->out_rc = ci_tcp_connect_lo_toconn(&priv->thr->netif, priv->sock_id, carg->dst_addr, alien_ni, tls_id); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return 0; case CITP_TCP_LOOPBACK_TO_LISTSTACK: /* Nobody should be using this socket, so trylock should succeed. * Overwise we hand over the socket and do not accelerate this * loopback connection. */ rc = ci_sock_trylock(&priv->thr->netif, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)); if( rc == 0 ) { ci_netif_unlock(&priv->thr->netif); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return -ECONNREFUSED; } /* move_to_alien changes locks - see comments near it */ rc = efab_file_move_to_alien_stack(priv, alien_ni); if( rc != 0 ) { /* error - everything is already unlocked */ efab_thr_release(netif2tcp_helper_resource(alien_ni)); /* if we return error, UL will hand the socket over. */ return rc; } /* now alien_ni is locked */ /* Connect again, using new endpoint */ carg->out_rc = ci_tcp_connect_lo_samestack( alien_ni, SP_TO_TCP(alien_ni, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id) ->moved_to_sock_id), tls_id); ci_netif_unlock(alien_ni); carg->out_moved = 1; return 0; case CITP_TCP_LOOPBACK_TO_NEWSTACK: { tcp_helper_resource_t *new_thr; ci_resource_onload_alloc_t alloc; /* create new stack * todo: no hardware interfaces are necessary */ strcpy(alloc.in_version, ONLOAD_VERSION); strcpy(alloc.in_uk_intf_ver, oo_uk_intf_ver); alloc.in_name[0] = '\0'; alloc.in_flags = 0; rc = tcp_helper_alloc_kernel(&alloc, &NI_OPTS(&priv->thr->netif), 0, &new_thr); if( rc != 0 ) { ci_netif_unlock(&priv->thr->netif); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return -ECONNREFUSED; } rc = ci_sock_trylock(&priv->thr->netif, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)); if( rc == 0 ) { ci_netif_unlock(&priv->thr->netif); efab_thr_release(netif2tcp_helper_resource(alien_ni)); efab_thr_release(new_thr); return -ECONNREFUSED; } /* move connecting socket to the new stack */ rc = efab_file_move_to_alien_stack(priv, &new_thr->netif); if( rc != 0 ) { /* error - everything is already unlocked */ efab_thr_release(netif2tcp_helper_resource(alien_ni)); efab_thr_release(new_thr); return -ECONNREFUSED; } /* now new_thr->netif is locked */ carg->out_moved = 1; carg->out_rc = -ECONNREFUSED; /* now connect via CITP_TCP_LOOPBACK_TO_CONNSTACK */ /* connect_lo_toconn unlocks new_thr->netif */ carg->out_rc = ci_tcp_connect_lo_toconn( &new_thr->netif, SP_TO_WAITABLE(&priv->thr->netif, priv->sock_id)->moved_to_sock_id, carg->dst_addr, alien_ni, tls_id); efab_thr_release(netif2tcp_helper_resource(alien_ni)); return 0; } } } else if( tls_id == OO_SP_INVALID ) break; } ci_netif_unlock(&priv->thr->netif); return -ENOENT; }
/* Move priv file to the alien_ni stack. * Should be called with the locked priv stack and socket; * the function returns with this stack being unlocked. * If rc=0, it returns with alien_ni stack locked; * otherwise, both stacks are unlocked. * Socket is always unlocked on return. */ int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni) { tcp_helper_resource_t *old_thr = priv->thr; tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni); ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id); ci_sock_cmn *new_s; ci_sock_cmn *mid_s; tcp_helper_endpoint_t *old_ep, *new_ep; int rc, i; int pollwait_register = 0; #if CI_CFG_FD_CACHING oo_p sp; #endif OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__, old_thr->id, priv->sock_id, new_thr->id)); /* Poll the old stack - deliver all data to our socket */ ci_netif_poll(&old_thr->netif); /* Endpoints in epoll list should not be moved, because waitq is already * in the epoll internal structures (bug 41152). */ if( !list_empty(&priv->_filp->f_ep_links) ) { rc = -EBUSY; goto fail1; } if( !efab_file_move_supported(&old_thr->netif, old_s) ) { rc = -EINVAL; goto fail1; } /* Lock the second stack */ i = 0; while( ! ci_netif_trylock(alien_ni) ) { ci_netif_unlock(&old_thr->netif); if( i++ >= 1000 ) { rc = -EBUSY; goto fail1_ni_unlocked; } rc = ci_netif_lock(&old_thr->netif); if( rc != 0 ) goto fail1_ni_unlocked; } /* Allocate a new socket in the alien_ni stack */ rc = -ENOMEM; if( old_s->b.state == CI_TCP_STATE_UDP ) { ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni); if( new_us == NULL ) goto fail2; new_s = &new_us->s; } else { ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni); if( new_ts == NULL ) goto fail2; new_s = &new_ts->s; } /* Allocate an intermediate "socket" outside of everything */ mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state))); if( mid_s == NULL ) goto fail3; OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__, old_thr->id, priv->sock_id, new_thr->id, new_s->b.bufid)); /* Copy TCP/UDP state */ memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state))); /* do not copy old_s->b.bufid * and other fields in stack adress space */ mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN; mid_s->b.bufid = new_s->b.bufid; mid_s->b.post_poll_link = new_s->b.post_poll_link; mid_s->b.ready_link = new_s->b.ready_link; mid_s->reap_link = new_s->reap_link; if( old_s->b.state & CI_TCP_STATE_TCP ) { ci_tcp_state *new_ts = SOCK_TO_TCP(new_s); ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s); mid_ts->timeout_q_link = new_ts->timeout_q_link; mid_ts->tx_ready_link = new_ts->tx_ready_link; mid_ts->rto_tid = new_ts->rto_tid; mid_ts->delack_tid = new_ts->delack_tid; mid_ts->zwin_tid = new_ts->zwin_tid; mid_ts->kalive_tid = new_ts->kalive_tid; mid_ts->cork_tid = new_ts->cork_tid; ci_ip_queue_init(&mid_ts->recv1); ci_ip_queue_init(&mid_ts->recv2); ci_ip_queue_init(&mid_ts->send); ci_ip_queue_init(&mid_ts->retrans); mid_ts->send_prequeue = OO_PP_ID_NULL; new_ts->retrans_ptr = OO_PP_NULL; mid_ts->tmpl_head = OO_PP_NULL; oo_atomic_set(&mid_ts->send_prequeue_in, 0); *new_ts = *mid_ts; ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus, CI_IP_TIMER_PMTU_DISCOVER); #if CI_CFG_FD_CACHING sp = TS_OFF(alien_ni, new_ts); OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link)); ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch"); ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link); sp = TS_OFF(alien_ni, new_ts); OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link)); ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd"); ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link); #endif /* free temporary mid_ts storage */ CI_FREE_OBJ(mid_ts); } else { ci_udp_state *mid_us = SOCK_TO_UDP(mid_s); *SOCK_TO_UDP(new_s) = *mid_us; CI_FREE_OBJ(mid_us); } /* Move the filter */ old_ep = ci_trs_ep_get(old_thr, priv->sock_id); new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid); rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep); if( rc != 0 ) { rc = -EINVAL; goto fail3; } /* Allocate a new file for the new endpoint */ rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags, priv->fd_type, &old_ep->alien_ref); if( rc != 0 ) goto fail4; ci_assert(old_ep->alien_ref); /* Copy F_SETOWN_EX, F_SETSIG to the new file */ #ifdef F_SETOWN_EX rcu_read_lock(); __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid, priv->_filp->f_owner.pid_type, 1); rcu_read_unlock(); #endif old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum; old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK; /* Move os_socket from one ep to another */ if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) & OO_THR_EP_AFLAG_ATTACHED ) { fput(old_ep->alien_ref->_filp); rc = -EBUSY; goto fail2; /* state & filters are cleared by fput() */ } /********* Point of no return **********/ ci_wmb(); priv->fd_type = CI_PRIV_TYPE_ALIEN_EP; priv->_filp->f_op = &linux_tcp_helper_fops_alien; ci_wmb(); oo_file_moved(priv); /* Read all already-arrived packets after the filters move but before * copying of the receive queue. */ ci_netif_poll(&old_thr->netif); tcp_helper_endpoint_move_filters_post(old_ep, new_ep); ci_assert( efab_file_move_supported(&old_thr->netif, old_s)); /* There's a gap between un-registering the old ep, and registering the * the new. However, the notifications shouldn't be in use for sockets * that are in a state that can be moved, so this shouldn't be a problem. */ if( old_ep->os_sock_pt.whead ) { pollwait_register = 1; efab_tcp_helper_os_pollwait_unregister(old_ep); } ci_assert_equal(new_ep->os_socket, NULL); new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL); ci_assert_equal(old_ep->os_socket, NULL); if( pollwait_register ) efab_tcp_helper_os_pollwait_register(new_ep); ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); if( new_s->b.state == CI_TCP_ESTABLISHED ) CI_TCP_STATS_INC_CURR_ESTAB(alien_ni); /* Copy recv queue */ if( new_s->b.state & CI_TCP_STATE_TCP ) { ci_tcp_state *new_ts = SOCK_TO_TCP(new_s); ci_tcp_state *old_ts = SOCK_TO_TCP(old_s); int i; /* Stop timers */ ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid); ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid); efab_ip_queue_copy(alien_ni, &new_ts->recv1, &old_thr->netif, &old_ts->recv1); efab_ip_queue_copy(alien_ni, &new_ts->recv2, &old_thr->netif, &old_ts->recv2); new_ts->recv1_extract = new_ts->recv1.head; /* Drop reorder buffer */ ci_ip_queue_init(&new_ts->rob); new_ts->dsack_block = OO_PP_INVALID; new_ts->dsack_start = new_ts->dsack_end = 0; for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ ) new_ts->last_sack[i] = OO_PP_NULL; } else { /* There should not be any recv q, but drop it to be sure */ ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q); } /* Old stack can be unlocked */ old_s->b.sb_flags |= CI_SB_FLAG_MOVED; ci_netif_unlock(&old_thr->netif); ci_assert( efab_file_move_supported(alien_ni, new_s) ); /* Move done: poll for any new data. */ ci_netif_poll(alien_ni); if( new_s->b.state & CI_TCP_STATE_TCP ) { ci_tcp_state *new_ts = SOCK_TO_TCP(new_s); /* Timers setup: delack, keepalive */ if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0) ci_tcp_timeout_delack(alien_ni, new_ts); ci_tcp_kalive_reset(alien_ni, new_ts); } /* Old ep: we are done. */ ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT); old_s->b.moved_to_stack_id = alien_ni->state->stack_id; old_s->b.moved_to_sock_id = new_s->b.bufid; if( ! list_empty(&priv->_filp->f_ep_links) ) ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT); ci_sock_unlock(&old_thr->netif, &old_s->b); ci_sock_unlock(alien_ni, &new_s->b); ci_assert(ci_netif_is_locked(alien_ni)); OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__, new_thr->id, new_s->b.bufid, ci_tcp_state_str(new_s->b.state))); return 0; fail4: /* We clear the filters from the new ep. * For now, we do not need to re-insert old filters because hw filters * are alredy here (in case of accepted socket) or not needed. * We have not removed old sw filters yet. */ tcp_helper_endpoint_move_filters_undo(old_ep, new_ep); fail3: if( new_s->b.state & CI_TCP_STATE_TCP ) ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s)); else ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s)); fail2: ci_netif_unlock(alien_ni); fail1: ci_netif_unlock(&old_thr->netif); fail1_ni_unlocked: ci_sock_unlock(&old_thr->netif, &old_s->b); OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc)); return rc; }
/* c_ni is assumed to be locked on enterance and is always unlocked on * exit. */ int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst, ci_netif *l_ni, oo_sp l_id) { ci_tcp_state *ts; ci_tcp_socket_listen *tls, *alien_tls; citp_waitable_obj *wo; citp_waitable *w; int rc; ci_assert(ci_netif_is_locked(c_ni)); ci_assert(OO_SP_NOT_NULL(c_id)); ci_assert(OO_SP_NOT_NULL(l_id)); LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__, c_ni->state->stack_id, OO_SP_TO_INT(c_id), l_ni->state->stack_id, OO_SP_TO_INT(l_id))); alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id); if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) { ci_netif_unlock(c_ni); return -EBUSY; } /* In c_ni, create shadow listening socket tls (copy l_id) */ ts = ci_tcp_get_state_buf(c_ni); if( ts == NULL ) { ci_netif_unlock(c_ni); LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(c_ni))); return -ENOMEM; } /* init common tcp fields */ ts->s.so = alien_tls->s.so; ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl; S_TCP_HDR(&ts->s)->tcp_source_be16 = S_TCP_HDR(&alien_tls->s)->tcp_source_be16; ts->s.domain = alien_tls->s.domain; ts->c = alien_tls->c; ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF; /* make sure nobody will ever connect to our "shadow" socket * except us */ ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT); ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN); tls = SOCK_TO_TCP_LISTEN(&ts->s); /* no timer: */ tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN; tls->acceptq_max = 1; rc = ci_tcp_listen_init(c_ni, tls); if( rc != 0 ) { citp_waitable_obj_free(c_ni, &tls->s.b); return rc; } /* Connect c_id to tls */ ts = SP_TO_TCP(c_ni, c_id); rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid); /* Accept as from tls */ if( !ci_tcp_acceptq_not_empty(tls) ) { /* it is possible, for example, if ci_tcp_listenq_try_promote() failed * because there are no endpoints */ ci_tcp_listenq_drop_all(c_ni, tls); citp_waitable_obj_free(c_ni, &tls->s.b); ci_netif_unlock(c_ni); return -EBUSY; } w = ci_tcp_acceptq_get(c_ni, tls); ci_assert(w); LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d", __FUNCTION__, c_ni->state->stack_id, OO_SP_TO_INT(c_id), l_ni->state->stack_id, OO_SP_TO_INT(l_id), c_ni->state->stack_id, tls->s.b.bufid, c_ni->state->stack_id, w->bufid)); ci_assert(w->state & CI_TCP_STATE_TCP); ci_assert(w->state != CI_TCP_LISTEN); /* Destroy tls. * NB: nobody could possibly connect to it, so no need to do proper * shutdown. */ ci_assert_equal(ci_tcp_acceptq_n(tls), 0); ci_tcp_listenq_drop_all(c_ni, tls); citp_waitable_obj_free(c_ni, &tls->s.b); ci_netif_unlock(c_ni); /* Keep a port reference */ { tcp_helper_endpoint_t *l_ep, *a_ep; struct oo_file_ref* os_sock_ref; ci_irqlock_state_t lock_flags; l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id); a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w)); ci_irqlock_lock(&l_ep->thr->lock, &lock_flags); os_sock_ref = l_ep->os_socket; ci_assert_equal(a_ep->os_port_keeper, NULL); if( os_sock_ref != NULL ) { os_sock_ref = oo_file_ref_add(os_sock_ref); os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref); ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags); if( os_sock_ref != NULL ) oo_file_ref_drop(os_sock_ref); } else { ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags); goto cleanup; } } /* lock l_ni: Check that l_id is the same socket it used to be */ /* create ref-sock in l_ni, put it into acc q */ if( ci_netif_lock(l_ni) != 0 ) goto cleanup; if( alien_tls->s.b.state != CI_TCP_LISTEN || (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) || S_TCP_HDR(&alien_tls->s)->tcp_source_be16 != TS_TCP(ts)->tcp_dest_be16 || (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY && alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) { ci_netif_unlock(l_ni); goto cleanup; } ci_bit_mask_set(&w->sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN); wo = citp_waitable_obj_alloc(l_ni); if( wo == NULL ) { ci_netif_unlock(l_ni); goto cleanup; } wo->waitable.state = CI_TCP_CLOSED; wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY; wo->waitable.moved_to_stack_id = c_ni->state->stack_id; wo->waitable.moved_to_sock_id = W_SP(w); LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__, l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)), c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w)))); ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable); citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX); ci_netif_unlock(l_ni); return rc; cleanup: ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN); ci_bit_mask_clear(&w->sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN); efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid); /* we can not guarantee c_ni lock, so we can' call * ci_tcp_drop(c_ni, ts). So, we return error; UL will handover * and close ts endpoint. */ return -EBUSY; }