/* Remove the software filter entry owned by socket [sock_p] for the given
 * protocol and local/remote address tuple.
 *
 * The table is probed starting at tcp_hash1() and stepping by tcp_hash2().
 * An entry is taken as the match when its id equals [sock_p] and its laddr
 * equals [laddr].  The search gives up without removing anything when it
 * reaches an EMPTY entry, or when the probe sequence wraps round to its
 * starting slot (the latter is logged as an error).
 *
 * The stack lock must be held (asserted).
 */
void ci_netif_filter_remove(ci_netif* netif, oo_sp sock_p,
                            unsigned laddr, unsigned lport,
                            unsigned raddr, unsigned rport,
                            unsigned protocol)
{
  ci_netif_filter_table* tbl;
  ci_netif_filter_table_entry* entry;
  unsigned hash1, hash2, slot;
  int hops = 0;

#ifdef __KERNEL__
  /* release_ep_tbl might be called without the stack lock; do not complain
   * when the TCP_HELPER_K_RC_DEAD bit is set. */
  ci_assert(ci_netif_is_locked(netif) ||
            (netif2tcp_helper_resource(netif)->k_ref_count &
             TCP_HELPER_K_RC_DEAD));
#else
  ci_assert(ci_netif_is_locked(netif));
#endif

  tbl = netif->filter_table;
  hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);

  LOG_TC(ci_log("%s: [%d:%d] REMOVE %s %s:%u->%s:%u hash=%u:%u",
                __FUNCTION__, NI_ID(netif), OO_SP_FMT(sock_p),
                CI_IP_PROTOCOL_STR(protocol),
                ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
                ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
                hash1, hash2));

  for( slot = hash1; ; ) {
    entry = &tbl->table[slot];

    if( entry->id == OO_SP_TO_INT(sock_p) ) {
      if( entry->laddr == laddr )
        break;  /* found the entry to remove */
    }
    else if( entry->id == EMPTY ) {
      /* Multiple removes of the same filter are allowed -- this helps
       * avoid some complexity in the filter module. */
      return;
    }

    /* Advance along the probe sequence, counting the steps taken. */
    slot = (slot + hash2) & tbl->table_size_mask;
    ++hops;

    if( slot == hash1 ) {
      /* Back at the first slot without finding the entry. */
      LOG_E(ci_log(FN_FMT "ERROR: LOOP [%d] %s %s:%u->%s:%u",
                   FN_PRI_ARGS(netif), OO_SP_FMT(sock_p),
                   CI_IP_PROTOCOL_STR(protocol),
                   ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
                   ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport)));
      return;
    }
  }

  __ci_netif_filter_remove(netif, hash1, hash2, hops, slot);
}
/* Create a new stack fd for the stack that [priv] is attached to and
 * report it, together with the stack's nic set and mmap size, through the
 * oo_stack_attach_t pointed to by [arg].
 *
 * Returns 0 on success, -EINVAL if [priv] has no stack, or the negative
 * value returned by oo_create_stack_fd().
 */
static int efab_tcp_helper_stack_attach(ci_private_t* priv, void *arg)
{
  tcp_helper_resource_t* trs = priv->thr;
  oo_stack_attach_t* attach = arg;
  int stack_fd;

  if( trs == NULL ) {
    LOG_E(ci_log("%s: ERROR: not attached to a stack", __FUNCTION__));
    return -EINVAL;
  }
  OO_DEBUG_TCPH(ci_log("%s: [%d]", __FUNCTION__, NI_ID(&trs->netif)));

  stack_fd = oo_create_stack_fd(trs);
  if( stack_fd < 0 ) {
    OO_DEBUG_ERR(ci_log("%s: oo_create_stack_fd failed (%d)",
                        __FUNCTION__, stack_fd));
    return stack_fd;
  }
  attach->fd = stack_fd;

  /* Re-read the OS socket buffer size settings, so that up-to-date values
   * are used for this new socket. */
  efab_get_os_settings(&NI_OPTS_TRS(trs));

  attach->out_nic_set = trs->netif.nic_set;
  attach->out_map_size = trs->mem_mmap_bytes;
  return 0;
}
/* Close the O/S socket descriptor [fd] and drop the read hold on
 * citp_dup2_lock.  A failing close is logged but otherwise ignored.
 *
 * errno is captured immediately after ci_sys_close(): the unlock call that
 * follows runs before the log statement and is not guaranteed to leave
 * errno untouched, so reading errno afterwards could report the wrong
 * failure reason.
 */
void oo_os_sock_release(ci_netif* ni, oo_os_file fd)
{
  int rc = ci_sys_close(fd);
  int close_errno = errno;

  oo_rwlock_unlock_read(&citp_dup2_lock);
  if( rc != 0 )
    LOG_E(ci_log("%s: [%d] ci_sys_close returned %d (errno=%d)",
                 __FUNCTION__, NI_ID(ni), rc, close_errno));
}
/* Fetch the O/S socket backing socket [sock_p] of stack [ni] into
 * [os_sock_out].
 *
 * Returns 0 on success, -EINVAL when the socket id does not survive
 * TRUSTED_SOCK_ID(), or -ENOENT when oo_os_sock_get_from_ep() reports a
 * failure for the endpoint.
 */
int oo_os_sock_get(ci_netif* ni, oo_sp sock_p, oo_os_file* os_sock_out)
{
  int id = OO_SP_TO_INT(sock_p);

  if( TRUSTED_SOCK_ID(ni, id) != id ) {
    LOG_E(ci_log("%s: ERROR: %d:%d bad sock_id",
                 __FUNCTION__, NI_ID(ni), id));
    return -EINVAL;
  }

  if( oo_os_sock_get_from_ep(ci_netif_ep_get(ni, sock_p),
                             os_sock_out) != 0 ) {
    LOG_E(ci_log("%s: ERROR: %d:%d has no O/S socket",
                 __FUNCTION__, NI_ID(ni), id));
    return -ENOENT;
  }
  return 0;
}
/* Find the synrecv state matching received packet [rxp] in the listen
 * queue hash, starting from [bucket] at depth [level].
 *
 * The slot for this level is derived from rxp->hash.  If the slot holds a
 * nested bucket the search recurses one level deeper; otherwise the slot
 * heads a list of synrecv states linked through bucket_link, which is
 * walked comparing remote address, local address and remote port.
 *
 * Returns the matching synrecv state, or NULL when there is none.  In
 * kernel builds an over-deep table or an over-long list is reported via
 * ci_netif_error_detected() and NULL is returned.
 */
static ci_tcp_state_synrecv*
ci_tcp_listenq_bucket_lookup(ci_netif* ni, ci_tcp_listen_bucket* bucket,
                             ciip_tcp_rx_pkt* rxp, int level)
{
  int slot = ci_tcp_listenq_hash2idx(rxp->hash, level);
  ci_ni_aux_mem* node;
  ci_tcp_state_synrecv* cand;
  unsigned peer_addr, local_addr, peer_port;
#ifdef __KERNEL__
  int walked = 0;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__);
    return NULL;
  }
#endif

  LOG_TV(ci_log("%s([%d] level=%d hash:%x l:%s r:%s:%d)", __func__,
                NI_ID(ni), level, rxp->hash,
                ip_addr_str(oo_ip_hdr(rxp->pkt)->ip_daddr_be32),
                ip_addr_str(oo_ip_hdr(rxp->pkt)->ip_saddr_be32),
                CI_BSWAP_BE16(rxp->tcp->tcp_source_be16)));

  if( OO_P_IS_NULL(bucket->bucket[slot]) )
    return NULL;

  node = ci_ni_aux_p2aux(ni, bucket->bucket[slot]);
  if( node->type == CI_TCP_AUX_TYPE_BUCKET )
    return ci_tcp_listenq_bucket_lookup(ni, &node->u.bucket, rxp, level + 1);

  /* The slot heads a list of synrecv states: scan it for the packet's
   * address tuple. */
  peer_addr = oo_ip_hdr(rxp->pkt)->ip_saddr_be32;
  local_addr = oo_ip_hdr(rxp->pkt)->ip_daddr_be32;
  peer_port = rxp->tcp->tcp_source_be16;

  cand = &node->u.synrecv;
  for( ; ; ) {
    if( peer_addr == cand->r_addr &&
        local_addr == cand->l_addr &&
        peer_port == cand->r_port )
      return cand;

    if( OO_P_IS_NULL(cand->bucket_link) )
      return NULL;
    node = ci_ni_aux_p2aux(ni, cand->bucket_link);
    cand = &node->u.synrecv;

#ifdef __KERNEL__
    if( walked++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
      ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                              __FUNCTION__);
      return NULL;
    }
#endif
  }
}
/* Insert synrecv state [tsr] into the listen queue hash of [tls], starting
 * from [bucket] at depth [level].
 *
 * - An empty slot simply receives [tsr].
 * - A slot holding a nested bucket is handled by recursing one level down.
 * - A slot holding a list of synrecv states gets [tsr] pushed on the front
 *   of that list.  Then, unless the depth limit has been passed, a
 *   next-level bucket is allocated and every entry of the list is
 *   re-inserted into it.
 *
 * If the next-level bucket cannot be allocated the slot keeps the list
 * (with [tsr] at its head).  The allocation result is therefore held in a
 * temporary and only stored in the slot on success; writing it to the slot
 * directly would replace the list head with a null pointer on failure and
 * make every entry of the list unreachable from the table.
 */
static void
ci_tcp_listenq_bucket_insert(ci_netif* ni, ci_tcp_socket_listen* tls,
                             ci_tcp_listen_bucket* bucket,
                             ci_tcp_state_synrecv* tsr, int level)
{
  ci_ni_aux_mem* aux;
  int idx = ci_tcp_listenq_hash2idx(tsr->hash, level);
  oo_p tsr_p = ci_tcp_synrecv2p(ni, tsr);
  oo_p new_bucket_p;
#ifdef __KERNEL__
  int i = 0;
#endif

  LOG_TV(ci_log("%s([%d] level=%d "TSR_FMT")", __func__,
                NI_ID(ni), level, TSR_ARGS(tsr)));

  if( OO_P_IS_NULL(bucket->bucket[idx]) ) {
    bucket->bucket[idx] = tsr_p;
    return;
  }

  level++;
  aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]);
  if( aux->type == CI_TCP_AUX_TYPE_BUCKET ) {
    ci_tcp_listenq_bucket_insert(ni, tls, &aux->u.bucket, tsr, level);
    return;
  }

  /* So, this bucket contains a list of other synrecv states.  We add
   * our tsr to this list and try to improve things by allocating a
   * next-level bucket. */
  tsr->bucket_link = bucket->bucket[idx];
  bucket->bucket[idx] = tsr_p;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) )
    return;

  /* Commit the new bucket to the slot only once we know we have one, so
   * that the list survives an allocation failure. */
  new_bucket_p = ci_ni_aux_alloc_bucket(ni);
  if( OO_P_IS_NULL(new_bucket_p) )
    return;
  bucket->bucket[idx] = new_bucket_p;
  bucket = ci_ni_aux_p2bucket(ni, new_bucket_p);
  tls->n_buckets++;

  /* Move every entry of the old list (tsr_p is its head) into the new
   * bucket. */
  while( OO_P_NOT_NULL(tsr_p) ) {
    tsr = &ci_ni_aux_p2aux(ni, tsr_p)->u.synrecv;

#ifdef __KERNEL__
    if( i++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
      /* List is longer than allowed: insert the remainder as it is (its
       * bucket_link is left intact) and report the error. */
      ci_tcp_listenq_bucket_insert(ni, tls, bucket, tsr, level);
      ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                              __FUNCTION__);
      return;
    }
#endif

    tsr_p = tsr->bucket_link;
    tsr->bucket_link = OO_P_NULL;
    ci_tcp_listenq_bucket_insert(ni, tls, bucket, tsr, level);
  }
}
/* Mark waitable [w_id] of stack [ni] as an orphan, take it off the
 * post-poll and ready lists, and hand it to citp_waitable_cleanup().
 *
 * The stack lock must be held and the waitable must not be in the FREE
 * state (both asserted).
 */
void citp_waitable_all_fds_gone(ci_netif* ni, oo_sp w_id)
{
  citp_waitable_obj* obj;

  ci_assert(ni);
  ci_assert(IS_VALID_SOCK_P(ni, w_id));
  ci_assert(ci_netif_is_locked(ni));

  obj = SP_TO_WAITABLE_OBJ(ni, w_id);
  ci_assert(obj->waitable.state != CI_TCP_STATE_FREE);

  LOG_NC(ci_log("%s: %d:%d %s", __FUNCTION__, NI_ID(ni), OO_SP_FMT(w_id),
                ci_tcp_state_str(obj->waitable.state)));

  /* A listening socket is closed in blocking context, see
   * efab_tcp_helper_close_endpoint().  CI_SB_AFLAG_ORPHAN is set earlier
   * in that case, so only complain for non-listening sockets. */
  CI_DEBUG(if( (obj->waitable.sb_aflags & CI_SB_AFLAG_ORPHAN) &&
               obj->waitable.state != CI_TCP_LISTEN )
             ci_log("%s: %d:%d already orphan", __FUNCTION__,
                    NI_ID(ni), OO_SP_FMT(w_id)));

  /* An ORPHANed socket must not be on the deferred socket list, because
   * the same link field is used for the timewait list, free list etc.
   * So the deferred list is purged before the orphan flag is set.
   *
   * NB. The socket cannot be added to the deferred list from now on,
   * because no-one has a reference to it. */
  ci_netif_purge_deferred_socket_list(ni);
  ci_bit_set(&obj->waitable.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  /* The socket may have been left on the post-poll list because the stack
   * believes a wakeup is needed; remove it from there, and from the ready
   * list too. */
  ci_ni_dllist_remove_safe(ni, &obj->waitable.post_poll_link);
  ci_ni_dllist_remove_safe(ni, &obj->waitable.ready_link);
  obj->waitable.ready_list_id = 0;

  citp_waitable_cleanup(ni, obj, 1);
}
/* Dump a description of waitable [w] through [logger], prefixing every
 * line with [pf].
 *
 * The first line identifies the waitable (with local/remote addresses and
 * state when it is a socket).  Waitables in the FREE or AUXBUF state get
 * nothing more; all others also get their lock word, wake sequence numbers
 * and flags, and spin-cycle settings.
 */
static void citp_waitable_dump2(ci_netif* ni, citp_waitable* w,
                                const char* pf,
                                oo_dump_log_fn_t logger, void* log_arg)
{
  unsigned lock_val;
  const char* rx_rq;
  const char* tx_rq;

  if( ! CI_TCP_STATE_IS_SOCKET(w->state) ) {
    logger(log_arg, "%s%s "NT_FMT, pf,
           citp_waitable_type_str(w), NI_ID(ni), W_FMT(w));
  }
  else {
    ci_sock_cmn* s = CI_CONTAINER(ci_sock_cmn, b, w);
    logger(log_arg, "%s%s "NT_FMT"lcl="OOF_IP4PORT" rmt="OOF_IP4PORT" %s",
           pf, citp_waitable_type_str(w), NI_ID(ni), W_FMT(w),
           OOFA_IP4PORT(sock_laddr_be32(s), sock_lport_be16(s)),
           OOFA_IP4PORT(sock_raddr_be32(s), sock_rport_be16(s)),
           ci_tcp_state_str(w->state));
  }

  /* Nothing further is reported for these two states. */
  if( w->state == CI_TCP_STATE_FREE || w->state == CI_TCP_STATE_AUXBUF )
    return;

  lock_val = w->lock.wl_val;
  logger(log_arg, "%s lock: %x %s%s", pf, lock_val,
         (lock_val & OO_WAITABLE_LK_LOCKED) ? "LOCKED" : "",
         (lock_val & OO_WAITABLE_LK_NEED_WAKE) ? " CONTENDED": "");

  /* "(RQ)" marks a direction whose wake-request bit is set. */
  rx_rq = ci_bit_test(&w->wake_request, CI_SB_FLAG_WAKE_RX_B) ? "(RQ)":" ";
  tx_rq = ci_bit_test(&w->wake_request, CI_SB_FLAG_WAKE_TX_B) ? "(RQ)":" ";
  logger(log_arg, "%s rx_wake=%08x%s tx_wake=%08x%s flags: "CI_SB_FLAGS_FMT,
         pf,
         w->sleep_seq.rw.rx, rx_rq,
         w->sleep_seq.rw.tx, tx_rq,
         CI_SB_FLAGS_PRI_ARG(w));

  if( w->spin_cycles != -1 )
    logger(log_arg, "%s ul_poll: %llu spin cycles %u usec", pf,
           w->spin_cycles, oo_cycles64_to_usec(ni, w->spin_cycles));
  else
    logger(log_arg, "%s ul_poll: -1 spin cycles -1 usecs", pf);
}
/* Construct a UDP endpoint in stack [netif] and bind it to [ep].
 *
 * Takes the stack lock for the duration of the call.  A UDP state buffer
 * is obtained, marked as IPPROTO_UDP, and attached to a file descriptor
 * via ci_tcp_helper_sock_attach().
 *
 * Returns the new descriptor on success, -ENOMEM if no state buffer is
 * available, or the negative value returned by the attach call.
 */
ci_fd_t ci_udp_ep_ctor(citp_socket* ep, ci_netif* netif, int domain, int type)
{
  ci_udp_state* us;
  ci_fd_t fd;

  VERB( log(LPFIN "ctor( )" ) );

  ci_assert(ep);
  ci_assert(netif);

  ci_netif_lock(netif);

  us = ci_udp_get_state_buf(netif);
  if( us == NULL ) {
    ci_netif_unlock(netif);
    LOG_E(ci_log("%s: [%d] out of socket buffers",
                 __FUNCTION__,NI_ID(netif)));
    return -ENOMEM;
  }

  /* The protocol must be set before ci_tcp_helper_sock_attach(), since it
   * is used to determine whether TCP or UDP file operations should be
   * attached to the file descriptor in the kernel. */
  sock_protocol(&us->s) = IPPROTO_UDP;

  /* NB: this attach will close the os_sock_fd */
  fd = ci_tcp_helper_sock_attach(ci_netif_get_driver_handle(netif),
                                 SC_SP(&us->s), domain, type);
  if( fd < 0 ) {
    /* An unsupported address family is logged at the LOG_U level, any
     * other failure at LOG_E. */
    if( fd == -EAFNOSUPPORT ) {
      LOG_U(ci_log("%s: ci_tcp_helper_sock_attach (domain=%d, type=%d) "
                   "failed %d", __FUNCTION__, domain, type, fd));
    }
    else {
      LOG_E(ci_log("%s: ci_tcp_helper_sock_attach (domain=%d, type=%d) "
                   "failed %d", __FUNCTION__, domain, type, fd));
    }
    ci_netif_unlock(netif);
    return fd;
  }

  ci_assert(~us->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);

  /* Start with a clean error state and the UDP-wild flag set. */
  us->s.rx_errno = 0;
  us->s.tx_errno = 0;
  us->s.so_error = 0;
  us->s.cp.sock_cp_flags |= OO_SCP_UDP_WILD;

  ep->s = &us->s;
  ep->netif = netif;
  CHECK_UEP(ep);

  ci_netif_unlock(netif);
  return fd;
}
/* Initialise pass-through fd info [epi] by fetching the O/S socket of its
 * endpoint into epi->os_socket.
 *
 * On success the O/S socket fd is reserved in the fdtable and the read
 * hold on citp_dup2_lock is released.  On failure epi->os_socket is set
 * to -1.
 */
void citp_passthrough_init(citp_alien_fdi* epi)
{
  int rc = oo_os_sock_get(epi->netif, epi->ep->bufid, &epi->os_socket);

  if( rc == 0 ) {
    __citp_fdtable_reserve(epi->os_socket, 1);
    /* Getting the O/S socket leaves citp_dup2_lock read-held (per the
     * original note here): release it. */
    oo_rwlock_unlock_read(&citp_dup2_lock);
    return;
  }

  /* No sensible way to handle oo_os_sock_get failure.  Just record it. */
  Log_U(ci_log("%s: oo_os_sock_get([%d:%d]) returned %d", __func__,
               NI_ID(epi->netif), epi->ep->bufid, rc));
  epi->os_socket = -1;
}
static int efab_stacks_seq_show(struct seq_file *seq, void *v) { ci_netif *ni = v; ci_netif_stats* s = &ni->state->stats; seq_printf(seq, "%d: %d %d %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u\n", NI_ID(ni), (int) ni->state->pid, (int) ni->state->uid, s->periodic_polls, s->periodic_evs, s->timeout_interrupts, s->interrupts, s->interrupt_polls, s->interrupt_wakes, s->interrupt_evs, s->interrupt_primes, s->select_primes, s->sock_wakes_rx + s->sock_wakes_tx + s->sock_wakes_rx_os + s->sock_wakes_tx_os, s->pkt_wakes, s->unlock_slow, s->lock_wakes, s->deferred_work, s->sock_lock_sleeps, s->rx_evs, s->tx_evs); return 0; }
struct oof_socket* oof_cb_sw_filter_lookup(struct tcp_helper_resource_s* stack, unsigned laddr, int lport, unsigned raddr, int rport, int protocol) { ci_netif* ni = &stack->netif; int sock_id, tbl_idx; tbl_idx = ci_netif_filter_lookup(ni, laddr, lport, raddr, rport, protocol); if( tbl_idx < 0 ) return NULL; sock_id = ni->filter_table->table[tbl_idx].id; if( ! IS_VALID_SOCK_ID(ni, sock_id) ) { OO_DEBUG_ERR(ci_log("%s: ERROR: %d %s "IPPORT_FMT" "IPPORT_FMT, __FUNCTION__, NI_ID(ni), FMT_PROTOCOL(protocol), IPPORT_ARG(laddr, lport), IPPORT_ARG(raddr, rport)); ci_log("--> idx=%d sock_id=%d sock_id_max=%d", tbl_idx, sock_id, ni->ep_tbl_n)); return NULL; }
/* Construct a TCP endpoint in stack [netif] and bind it to [ep].
 *
 * Takes the stack lock for the duration of the call.  A TCP state buffer
 * is obtained and attached to a file descriptor via
 * ci_tcp_helper_sock_attach(); on success the default send/receive buffer
 * sizes from the stack options are applied.
 *
 * Returns the new descriptor on success, -ENOMEM if no state buffer is
 * available, or the negative value returned by the attach call.
 */
ci_fd_t ci_tcp_ep_ctor(citp_socket* ep, ci_netif* netif, int domain, int type)
{
  ci_tcp_state* ts;
  ci_fd_t fd;

  ci_assert(ep);
  ci_assert(netif);

  ci_netif_lock(netif);

  ts = ci_tcp_get_state_buf(netif);
  if( ts == NULL ) {
    ci_netif_unlock(netif);
    LOG_E(ci_log("%s: [%d] out of socket buffers",
                 __FUNCTION__,NI_ID(netif)));
    return -ENOMEM;
  }

  fd = ci_tcp_helper_sock_attach(ci_netif_get_driver_handle(netif),
                                 S_SP(ts), domain, type);
  if( fd < 0 ) {
    /* An unsupported address family is logged at the LOG_U level, any
     * other failure at LOG_E. */
    if( fd == -EAFNOSUPPORT ) {
      LOG_U(ci_log("%s: ci_tcp_helper_sock_attach"
                   "(domain=%d, type=%d) failed %d",
                   __FUNCTION__, domain, type, fd));
    }
    else {
      LOG_E(ci_log("%s: ci_tcp_helper_sock_attach"
                   "(domain=%d, type=%d) failed %d",
                   __FUNCTION__, domain, type, fd));
    }
    ci_netif_unlock(netif);
    return fd;
  }

  ci_assert(~ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);

  /* Apply default sockbuf sizes now we've updated them from the kernel
   * defaults. */
  ts->s.so.sndbuf = NI_OPTS(netif).tcp_sndbuf_def;
  ts->s.so.rcvbuf = NI_OPTS(netif).tcp_rcvbuf_def;

  ep->netif = netif;
  ep->s = &ts->s;
  CHECK_TEP(ep);

  ci_netif_unlock(netif);
  return fd;
}
/* Issue ioctl [request] with argument [arg] on the O/S socket backing
 * socket [sock_p] of stack [ni].
 *
 * If the O/S socket cannot be obtained, the error from oo_os_sock_get() is
 * logged and returned.  Otherwise the ioctl result (converted to -errno on
 * failure) is delivered in one of two ways:
 *   - [ioctl_rc] non-NULL: stored in *ioctl_rc, and 0 is returned;
 *   - [ioctl_rc] NULL: returned directly.
 */
int oo_os_sock_ioctl(ci_netif* ni, oo_sp sock_p, int request, void* arg,
                     int* ioctl_rc)
{
  oo_os_file os_sock_fd;
  int rc;

  rc = oo_os_sock_get(ni, sock_p, &os_sock_fd);
  if( rc != 0 ) {
    LOG_E(ci_log("%s: [%d:%d] ERROR: failed to get kernel sock fd "
                 "(rc=%d req=%d)", __FUNCTION__, NI_ID(ni),
                 OO_SP_FMT(sock_p), rc, request));
    return rc;
  }

  rc = ci_sys_ioctl(os_sock_fd, request, arg);
  if( rc < 0 )
    rc = -errno;
  oo_os_sock_release(ni, os_sock_fd);

  if( ioctl_rc == NULL )
    return rc;

  *ioctl_rc = rc;
  return 0;
}
/* sendpage file operation: send [size] bytes of [page] starting at
 * [offset] on the socket behind [filp].
 *
 * For a socket whose state has CI_TCP_STATE_TCP_CONN set, the data is
 * handed to sendpage_copy() and its result returned.  For any other
 * socket the negated tx_errno of the socket is returned.
 */
ssize_t linux_tcp_helper_fop_sendpage(struct file* filp, struct page* page,
                                      int offset, size_t size,
                                      loff_t* ppos, int flags)
{
  ci_private_t* priv = filp->private_data;
  tcp_helper_resource_t* trs = efab_priv_to_thr(priv);
  ci_sock_cmn* s;

  OO_DEBUG_VERB(ci_log("%s: %d:%d offset=%d size=%d flags=%x", __FUNCTION__,
                       NI_ID(&trs->netif), OO_SP_FMT(priv->sock_id), offset,
                       (int) size, flags));

  ci_assert(page);
  ci_assert_ge(offset, 0);
  ci_assert_gt(size, 0);
  ci_assert_le(offset + size, CI_PAGE_SIZE);

#ifndef MSG_SENDPAGE_NOTLAST
  /* "flags" is really "more", so it is converted to MSG_MORE.  [more] is
   * sometimes true even for the last page; we get a little closer to the
   * truth by spotting that we're not reading to the end of the page --
   * seen on 2.6.18, but not on 2.6.26 or later.  Hence MSG_MORE is kept
   * only when the data reaches the end of the page. */
  flags = (flags && offset + size >= CI_PAGE_SIZE) ? MSG_MORE : 0;
#endif

  s = SP_TO_SOCK(&trs->netif, priv->sock_id);
  if(CI_LIKELY( s->b.state & CI_TCP_STATE_TCP_CONN ))
    return sendpage_copy(&trs->netif,SOCK_TO_TCP(s),page,offset,size,flags);

  /* Closed or listening.  Return epipe.  Do not send SIGPIPE, because
   * Linux will do it for us. */
  return -s->tx_errno;
}
/* Loopback-connect socket [c_id] of stack [c_ni] to listening socket
 * [l_id] of a different stack [l_ni].
 *
 * Outline:
 *  1. A "shadow" listening socket copying [l_id] is created in c_ni.
 *  2. [c_id] is connected to the shadow with
 *     ci_tcp_connect_lo_samestack(), and the accepted socket is taken off
 *     the shadow's accept queue.  The shadow is then destroyed.
 *  3. The accepted endpoint takes a reference to the listener's O/S socket
 *     (os_port_keeper).
 *  4. With l_ni locked, and after re-checking that [l_id] is still the
 *     same listening socket, a MOVED_AWAY placeholder referencing the
 *     accepted socket is put on the real listener's accept queue and the
 *     listener is woken.
 *
 * Locking: c_ni is assumed to be locked on entrance and is always unlocked
 * on exit, on every return path.  l_ni is locked and unlocked internally.
 *
 * [dst] is not referenced by this function.
 *
 * Returns the result of ci_tcp_connect_lo_samestack() when the placeholder
 * was queued; -EBUSY if the listener's accept queue is full, if nothing
 * was accepted on the shadow, or if any later step had to be cleaned up;
 * -ENOMEM if no socket buffer is available for the shadow; or the error
 * from ci_tcp_listen_init().
 */
int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst,
                             ci_netif *l_ni, oo_sp l_id)
{
  ci_tcp_state *ts;
  ci_tcp_socket_listen *tls, *alien_tls;
  citp_waitable_obj *wo;
  citp_waitable *w;
  int rc;

  ci_assert(ci_netif_is_locked(c_ni));
  ci_assert(OO_SP_NOT_NULL(c_id));
  ci_assert(OO_SP_NOT_NULL(l_id));

  LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__,
             c_ni->state->stack_id, OO_SP_TO_INT(c_id),
             l_ni->state->stack_id, OO_SP_TO_INT(l_id)));

  alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id);
  if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) {
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }

  /* In c_ni, create shadow listening socket tls (copy l_id) */
  ts = ci_tcp_get_state_buf(c_ni);
  if( ts == NULL ) {
    ci_netif_unlock(c_ni);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__,
                 NI_ID(c_ni)));
    return -ENOMEM;
  }

  /* init common tcp fields */
  ts->s.so = alien_tls->s.so;
  ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl;
  S_TCP_HDR(&ts->s)->tcp_source_be16 =
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16;
  ts->s.domain = alien_tls->s.domain;
  ts->c = alien_tls->c;
  ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF;

  /* make sure nobody will ever connect to our "shadow" socket
   * except us */
  ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  /* no timer: */
  tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN;

  tls->acceptq_max = 1;
  rc = ci_tcp_listen_init(c_ni, tls);
  if( rc != 0 ) {
    citp_waitable_obj_free(c_ni, &tls->s.b);
    /* Honour the "always unlocked on exit" contract on this error path
     * too; returning with c_ni still locked would leak the stack lock. */
    ci_netif_unlock(c_ni);
    return rc;
  }

  /* Connect c_id to tls */
  ts = SP_TO_TCP(c_ni, c_id);
  rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid);

  /* Accept as from tls */
  if( !ci_tcp_acceptq_not_empty(tls) ) {
    /* it is possible, for example, if ci_tcp_listenq_try_promote() failed
     * because there are no endpoints */
    ci_tcp_listenq_drop_all(c_ni, tls);
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }
  w = ci_tcp_acceptq_get(c_ni, tls);
  ci_assert(w);
  LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d",
                __FUNCTION__,
                c_ni->state->stack_id, OO_SP_TO_INT(c_id),
                l_ni->state->stack_id, OO_SP_TO_INT(l_id),
                c_ni->state->stack_id, tls->s.b.bufid,
                c_ni->state->stack_id, w->bufid));

  ci_assert(w->state & CI_TCP_STATE_TCP);
  ci_assert(w->state != CI_TCP_LISTEN);

  /* Destroy tls.
   * NB: nobody could possibly connect to it, so no need to do proper
   * shutdown.
   */
  ci_assert_equal(ci_tcp_acceptq_n(tls), 0);
  ci_tcp_listenq_drop_all(c_ni, tls);
  citp_waitable_obj_free(c_ni, &tls->s.b);
  ci_netif_unlock(c_ni);

  /* Keep a port reference */
  {
    tcp_helper_endpoint_t *l_ep, *a_ep;
    struct oo_file_ref* os_sock_ref;
    ci_irqlock_state_t lock_flags;

    l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id);
    a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w));
    ci_irqlock_lock(&l_ep->thr->lock, &lock_flags);
    os_sock_ref = l_ep->os_socket;
    ci_assert_equal(a_ep->os_port_keeper, NULL);
    if( os_sock_ref != NULL ) {
      /* Take a new reference for the accepted endpoint and drop whatever
       * reference the exchange hands back. */
      os_sock_ref = oo_file_ref_add(os_sock_ref);
      os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref);
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      if( os_sock_ref != NULL )
        oo_file_ref_drop(os_sock_ref);
    }
    else {
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      goto cleanup;
    }
  }

  /* lock l_ni: Check that l_id is the same socket it used to be */
  /* create ref-sock in l_ni, put it into acc q */
  if( ci_netif_lock(l_ni) != 0 )
    goto cleanup;
  if( alien_tls->s.b.state != CI_TCP_LISTEN ||
      (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) ||
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16 !=
          TS_TCP(ts)->tcp_dest_be16 ||
      (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY &&
       alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }

  ci_bit_mask_set(&w->sb_aflags,
                  CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);

  wo = citp_waitable_obj_alloc(l_ni);
  if( wo == NULL ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  /* The placeholder in l_ni records where the real socket lives. */
  wo->waitable.state = CI_TCP_CLOSED;
  wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY;
  wo->waitable.moved_to_stack_id = c_ni->state->stack_id;
  wo->waitable.moved_to_sock_id = W_SP(w);
  LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__,
             l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)),
             c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w))));

  ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable);
  citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX);
  ci_netif_unlock(l_ni);

  return rc;

cleanup:
  ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_bit_mask_clear(&w->sb_aflags,
                    CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid);
  /* we can not guarantee c_ni lock, so we can't call
   * ci_tcp_drop(c_ni, ts).  So, we return error; UL will handover
   * and close ts endpoint. */
  return -EBUSY;
}
/* Remove synrecv state [tsr] from the listen queue hash of [tls],
 * starting from [bucket] at depth [level].
 *
 * Returns 1 if [bucket] has no occupied slots left after the removal,
 * 0 otherwise.  A nested bucket that reports itself empty is freed and
 * tls->n_buckets is decremented.
 *
 * Fixme: only empty buckets are removed.  In theory it may be useful to
 * remove a bucket holding a single non-empty list, but that would make the
 * code more complicated.
 *
 * In kernel builds an over-deep table, an unexpectedly empty slot or an
 * over-long list is reported via ci_netif_error_detected() and 0 is
 * returned.
 */
static int
ci_tcp_listenq_bucket_remove(ci_netif* ni, ci_tcp_socket_listen* tls,
                             ci_tcp_listen_bucket* bucket,
                             ci_tcp_state_synrecv* tsr, int level)
{
  ci_ni_aux_mem* aux;
  int idx = ci_tcp_listenq_hash2idx(tsr->hash, level);
  oo_p tsr_p = ci_tcp_synrecv2p(ni, tsr);
  int slot;
#ifdef __KERNEL__
  int walked = 0;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__);
    return 0;
  }
#endif

  LOG_TV(ci_log("%s([%d] level=%d "TSR_FMT")", __func__,
                NI_ID(ni), level, TSR_ARGS(tsr)));

  ci_assert( OO_P_NOT_NULL(bucket->bucket[idx]) );
#ifdef __KERNEL__
  if( OO_P_IS_NULL(bucket->bucket[idx]) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__);
    return 0;
  }
#endif

  aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]);

  if( aux->type == CI_TCP_AUX_TYPE_BUCKET ) {
    /* Nested bucket: remove from it, and free it if that emptied it. */
    if( ! ci_tcp_listenq_bucket_remove(ni, tls, &aux->u.bucket, tsr,
                                       level + 1) )
      return 0;
    bucket->bucket[idx] = OO_P_NULL;
    ci_ni_aux_free(ni, aux);
    tls->n_buckets--;
  }
  else if( bucket->bucket[idx] == tsr_p ) {
    /* [tsr] heads the list: the slot takes over its successor. */
    bucket->bucket[idx] = tsr->bucket_link;
    if( ! OO_P_IS_NULL(bucket->bucket[idx]) )
      return 0;
  }
  else {
    /* [tsr] is further down the list: find its predecessor and unlink. */
    ci_tcp_state_synrecv* prev = &aux->u.synrecv;

    while( prev->bucket_link != tsr_p ) {
      aux = ci_ni_aux_p2aux(ni, prev->bucket_link);
      prev = &aux->u.synrecv;
#ifdef __KERNEL__
      if( walked++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
        ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                                __FUNCTION__);
        return 0;
      }
#endif
    }
    prev->bucket_link = tsr->bucket_link;
    return 0;
  }

  /* Slot [idx] has just become empty: the bucket as a whole is empty only
   * if every slot is. */
  for( slot = 0; slot < CI_TCP_LISTEN_BUCKET_SIZE; slot++ )
    if( OO_P_NOT_NULL(bucket->bucket[slot]) )
      return 0;
  return 1;
}
/* Return the id of [stack] as given by NI_ID() on its netif, or -1 when
 * [stack] is NULL. */
int oof_cb_stack_id(struct tcp_helper_resource_s* stack)
{
  if( ! stack )
    return -1;
  return NI_ID(&stack->netif);
}
/* ** promote a synrecv structure to an established socket ** ** Assumes that the caller will handle a fail if we can't allocate a new ** tcp_state structure due to memory pressure or the like */ int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls, ci_tcp_state_synrecv* tsr, ci_ip_cached_hdrs* ipcache, ci_tcp_state** ts_out) { int rc = 0; ci_assert(netif); ci_assert(tls); ci_assert(tls->s.b.state == CI_TCP_LISTEN); ci_assert(tsr); if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) { ci_tcp_state* ts; /* grab a tcp_state structure that will go onto the accept queue. We take * from the cache of EPs if any are available */ ts = get_ts_from_cache (netif, tsr, tls); if( !ts ) { /* None on cache; try allocating a new ts */ ts = ci_tcp_get_state_buf(netif); #if CI_CFG_FD_CACHING if( ts == NULL ) { /* We've reaped. Did this result in any being cached */ ts = get_ts_from_cache(netif, tsr, tls); if (ts == NULL ) { /* No -- try again to allocate. */ ts = ci_tcp_get_state_buf(netif); } else { CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap); } } #endif if( ts == NULL ) { LOG_TV(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(netif))); CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock); CI_SET_SO_ERROR(&tls->s, ENOMEM); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); return -ENOMEM; } ci_assert(ci_tcp_is_cached(ts) || (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN)); } #ifdef ONLOAD_OFE ts->s.ofe_code_start = tls->ofe_promote; #endif if( ! ci_tcp_is_cached(ts) ) { /* Need to initialise address information for use when setting filters */ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); /* "borrow" filter from listening socket. For loopback socket, we * do not need filters, but we have to take a reference of the OS * socket. 
*/ rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice, S_SP(tls)); if( rc < 0 ) { LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc)); /* Either put this back on the list (at the head) or free it */ ci_tcp_state_free(netif, ts); return rc; } } #if CI_CFG_FD_CACHING else { /* Now set the s/w filter. We leave the hw filter in place for cached * EPS. This will probably not have the correct raddr and rport, but as * it's sharing the listening socket's filter that's not a problem. It * will be updated if this is still around when the listener is closed. */ rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr, sock_lport_be16(&tls->s), tsr->r_addr, tsr->r_port, tcp_protocol(ts)); if (rc < 0) { /* Bung it back on the cache list */ LOG_EP(ci_log("Unable to create s/w filter!")); ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link); return rc; } /* Need to initialise address information. We do this after trying to * insert the sw filter, so we can push the tcp state back onto the * cache queue with as few changes as possible if we fail to add the * sw filter. 
*/ ci_tcp_set_addr_on_promote(netif, ts, tsr, tls); LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd)); ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link); } #endif ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts))); ci_assert(ts->s.b.state == CI_TCP_CLOSED); ts->s.domain = tls->s.domain; cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache); ci_pmtu_state_init(netif, &ts->s, &ts->pmtus, CI_IP_TIMER_PMTU_DISCOVER); ci_pmtu_set(netif, &ts->pmtus, CI_MIN(ts->s.pkt.mtu, tsr->tcpopts.smss + sizeof(ci_tcp_hdr) + sizeof(ci_ip4_hdr))); /* If we've got SYN via local route, we can handle it */ ci_assert_equiv(ts->s.pkt.status == retrrc_localroute, OO_SP_NOT_NULL(tsr->local_peer)); if( ts->s.pkt.status == retrrc_localroute ) ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE; ts->amss = tsr->amss; /* options and flags */ ts->tcpflags = 0; ts->tcpflags |= tsr->tcpopts.flags; ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED; ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr); if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) { ts->snd_wscl = tsr->tcpopts.wscl_shft; ts->rcv_wscl = tsr->rcv_wscl; } else { ts->snd_wscl = ts->rcv_wscl = 0u; } CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl); CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl); /* Send and receive sequence numbers */ tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) = tsr->snd_isn + 1; ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0); ci_tcp_rx_set_isn(ts, tsr->rcv_nxt); tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1); if( ts->tcpflags & CI_TCPT_FLAG_TSO ) { ts->incoming_tcp_hdr_len += 12; ts->outgoing_hdrs_len += 12; ts->tspaws = ci_tcp_time_now(netif); ts->tsrecent = tsr->tspeer; ts->tslastack = tsr->rcv_nxt; } else { /* Must be after initialising snd_una. */ ci_tcp_clear_rtt_timing(ts); ts->timed_ts = tsr->timest; } /* SACK has nothing to be done. */ /* ?? 
ECN */ ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr))); ts->smss = tsr->tcpopts.smss; ts->c.user_mss = tls->c.user_mss; if (ts->c.user_mss && ts->c.user_mss < ts->smss) ts->smss = ts->c.user_mss; #if CI_CFG_LIMIT_SMSS ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__); #endif ci_assert(ts->smss>0); ci_tcp_set_eff_mss(netif, ts); ci_tcp_set_initialcwnd(netif, ts); /* Copy socket options & related fields that should be inherited. * Note: Windows does not inherit rcvbuf until the call to accept * completes. The assumption here is that all options can be * inherited at the same time (most won't have an effect until there * is a socket available for use by the app.). */ ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)"); /* NB. Must have already set peer (which we have). */ ci_tcp_set_established_state(netif, ts); CITP_STATS_NETIF(++netif->state->stats.synrecv2established); ci_assert(ts->ka_probes == 0); ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts)); ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK); /* Remove the synrecv structure from the listen queue, and free the ** buffer. */ if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE ) ci_free(tsr); else { ci_tcp_listenq_remove(netif, tls, tsr); ci_tcp_synrecv_free(netif, tsr); } ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT); ci_tcp_acceptq_put(netif, tls, &ts->s.b); LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x", LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags); log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x", LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts), tcp_snd_una(ts), tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts))); citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX); *ts_out = ts; return 0; }