static int
efab_vi_rm_mmap_io(struct efrm_vi* virs, unsigned long* bytes,
                   void* opaque, int* map_num, unsigned long* offset)
{
  int rc;
  int len;
  int instance;
  int base;
  unsigned vi_stride;
  struct efhw_nic* nic;

  nic = efrm_client_get_nic(virs->rs.rs_client);
  instance = virs->rs.rs_instance;

  len = CI_MIN(*bytes, CI_PAGE_SIZE);
  *bytes -= len;

  /* Make sure we can get away with a single page here. */
  switch( nic->devtype.arch ) {
  case EFHW_ARCH_FALCON:
    ci_assert_lt(falcon_tx_dma_page_offset(instance), CI_PAGE_SIZE);
    ci_assert_lt(falcon_rx_dma_page_offset(instance), CI_PAGE_SIZE);
    ci_assert_equal(falcon_tx_dma_page_base(instance),
                    falcon_rx_dma_page_base(instance));
    base = falcon_tx_dma_page_base(instance);
    break;
  case EFHW_ARCH_EF10:
    vi_stride = nic->vi_stride;
    ci_assert_lt(ef10_tx_dma_page_offset(vi_stride, instance), CI_PAGE_SIZE);
    ci_assert_lt(ef10_rx_dma_page_offset(vi_stride, instance), CI_PAGE_SIZE);
    ci_assert_equal(ef10_tx_dma_page_base(vi_stride, instance),
                    ef10_rx_dma_page_base(vi_stride, instance));
    base = ef10_tx_dma_page_base(vi_stride, instance);
    break;
  default:
    EFCH_ERR("%s: ERROR: unknown nic type (%d)", __FUNCTION__,
             nic->devtype.arch);
    base = 0; /* To quiet the compiler */
    BUG();
  }

  rc = ci_mmap_bar(nic, base, len, opaque, map_num, offset, 0);
  if( rc < 0 ) {
    EFCH_ERR("%s: ERROR: ci_mmap_bar failed rc=%d", __FUNCTION__, rc);
    return rc;
  }

  return 0;
}
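/* Illustrative sketch (not part of the driver): the single-page trick above
 * relies on the TX and RX doorbells of one VI sharing a page, which the
 * assertions verify, so a single ci_mmap_bar() call covers both.  The helper
 * below shows how a hypothetical page-base/offset pair could be derived from
 * a per-VI register stride; the real falcon_*/ef10_*_dma_page_* macros may
 * compute this differently. */
static inline unsigned long example_vi_page_base(unsigned vi_stride,
                                                 int instance,
                                                 unsigned long page_size)
{
  /* Round the VI's register window down to a page boundary. */
  return ((unsigned long)instance * vi_stride) & ~(page_size - 1);
}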
void __citp_fdtable_reserve(int fd, int protect)
{
  /* Must be holding the lock. */
  CITP_FDTABLE_ASSERT_LOCKED(1);
  ci_assert_lt((unsigned) fd, citp_fdtable.size);

  if( protect )  citp_fdtable_new_fd_set(fd, fdip_reserved, 1);
  else           fdtable_swap(fd, fdip_reserved, fdip_unknown, 1);
}
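/* Hypothetical usage sketch (not in the original source): reserving an fd so
 * the interposing library will not hand it out, then returning it to the
 * "unknown" state once done.  The caller must hold the fdtable lock, as the
 * assertion above requires. */
static void example_reserve_fd(int fd)
{
  CITP_FDTABLE_LOCK();
  __citp_fdtable_reserve(fd, 1);   /* mark entry fdip_reserved */
  /* ... use [fd] internally, e.g. for a netif ... */
  __citp_fdtable_reserve(fd, 0);   /* swap fdip_reserved -> fdip_unknown */
  CITP_FDTABLE_UNLOCK();
}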
void __citp_fdinfo_ref_count_zero(citp_fdinfo* fdi, int fdt_locked)
{
  Log_V(log("%s: fd=%d on_rcz=%d", __FUNCTION__, fdi->fd,
            fdi->on_ref_count_zero));

  citp_fdinfo_assert_valid(fdi);
  ci_assert(oo_atomic_read(&fdi->ref_count) == 0);
  ci_assert_ge(fdi->fd, 0);
  ci_assert_lt(fdi->fd, citp_fdtable.inited_count);
  ci_assert_nequal(fdi_to_fdip(fdi), citp_fdtable.table[fdi->fd].fdip);

  switch( fdi->on_ref_count_zero ) {
  case FDI_ON_RCZ_CLOSE:
#if CI_CFG_FD_CACHING
    if( citp_fdinfo_get_ops(fdi)->cache(fdi) == 1 ) {
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_LOCK();
      fdtable_swap(fdi->fd, fdip_closing, fdip_unknown,
                   fdt_locked | fdtable_strict());
      citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked | fdtable_strict());
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_UNLOCK();
      citp_fdinfo_free(fdi);
      break;
    }
    else
#endif
    {
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_LOCK();
      ci_tcp_helper_close_no_trampoline(fdi->fd);
      /* The swap must occur after the close, otherwise another thread could
       * cause a probe of the old endpoint info, which is about to be freed.
       */
      fdtable_swap(fdi->fd, fdip_closing, fdip_unknown,
                   fdt_locked | fdtable_strict());
      citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked | fdtable_strict());
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_UNLOCK();
      citp_fdinfo_free(fdi);
      break;
    }
  case FDI_ON_RCZ_DUP2:
    dup2_complete(fdi, fdi_to_fdip(fdi), fdt_locked);
    break;
  case FDI_ON_RCZ_HANDOVER:
    citp_fdinfo_do_handover(fdi, fdt_locked);
    break;
  case FDI_ON_RCZ_MOVED:
    citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
    citp_fdinfo_free(fdi);
    break;
  default:
    CI_DEBUG(ci_log("%s: fd=%d on_ref_count_zero=%d", __FUNCTION__,
                    fdi->fd, fdi->on_ref_count_zero));
    ci_assert(0);
  }
}
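/* Sketch of the expected caller (hedged: the real release path in this
 * codebase may differ in detail).  __citp_fdinfo_ref_count_zero() runs only
 * on the last reference drop; oo_atomic_dec_and_test() is assumed here to
 * return true on the 1 -> 0 transition. */
static inline void example_fdinfo_release_ref(citp_fdinfo* fdi,
                                              int fdt_locked)
{
  if( oo_atomic_dec_and_test(&fdi->ref_count) )
    __citp_fdinfo_ref_count_zero(fdi, fdt_locked);
}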
citp_fdinfo_p
citp_fdtable_new_fd_set(unsigned fd, citp_fdinfo_p new_fdip, int fdt_locked)
{
  volatile citp_fdinfo_p* p_fdip;
  citp_fdinfo_p prev;

  if( fd >= citp_fdtable.inited_count ) {
    ci_assert_lt(fd, citp_fdtable.size);
    if( ! fdt_locked )  CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(fd);
    if( ! fdt_locked )  CITP_FDTABLE_UNLOCK();
  }

  p_fdip = &citp_fdtable.table[fd].fdip;

  do {
    prev = *p_fdip;

    /* Busy?  Perhaps just closed, but not yet marked unknown.  Or perhaps it
    ** is being probed. */
    if( fdip_is_busy(prev) )
      prev = citp_fdtable_busy_wait(fd, fdt_locked);

    /* There is a close in progress, so we wait until it is resolved. */
    if( fdip_is_closing(prev) )
      prev = citp_fdtable_closing_wait(fd, fdt_locked);

    /* Reserved?  Perhaps it was a netif fd that has just been closed.  So it
    ** should be about to be unreserved. */
  } while( fdip_is_reserved(prev) || fdip_cas_fail(p_fdip, prev, new_fdip) );

  if( fdip_is_normal(prev) ) {
    /* We can get here if close-trampolining fails.  So for release builds we
    ** accept that the user-level state got out-of-sync, and leak [fdi] since
    ** it seems like a suitably cautious thing to do. */
    ci_log("%s: ERROR: Orphaned entry in user-level fd-table",
           __FUNCTION__);
  }
  else
    /* We (at time of writing) only register a trampoline handler when we
    ** create a netif, so we can miss the closing of pass-through
    ** descriptors. */
    ci_assert(fdip_is_unknown(prev) || fdip_is_passthru(prev));

  return prev;
}
int citp_fdtable_ctor()
{
  struct rlimit rlim;
  int rc;

  Log_S(log("%s:", __FUNCTION__));

  /* How big should our fdtable be by default?  It's pretty arbitrary, but we
  ** have seen a few apps that use setrlimit to set the fdtable to 4096
  ** entries on start-up (see bugs 3253 and 3373), so we choose that.
  ** (Note: we can't grow the table if the app later does setrlimit, and
  ** unused entries consume virtual space only, so it's worth allocating a
  ** table of reasonable size.) */
  citp_fdtable.size = 4096;

  if( getrlimit(RLIMIT_NOFILE, &rlim) == 0 ) {
    citp_fdtable.size = rlim.rlim_max;
    if( CITP_OPTS.fdtable_size != 0 &&
        CITP_OPTS.fdtable_size != rlim.rlim_max ) {
      Log_S(ci_log("Setting the limit on the number of open files "
                   "to the EF_FDTABLE_SIZE=%u value.",
                   CITP_OPTS.fdtable_size));
      rlim.rlim_max = CITP_OPTS.fdtable_size;
      if( rlim.rlim_cur > rlim.rlim_max )
        rlim.rlim_cur = rlim.rlim_max;
      if( ci_sys_setrlimit(RLIMIT_NOFILE, &rlim) == 0 )
        citp_fdtable.size = rlim.rlim_max;
      else {
        /* Most probably, we've got EPERM. */
        ci_assert_lt(citp_fdtable.size, CITP_OPTS.fdtable_size);
        ci_log("Can't set EF_FDTABLE_SIZE=%u; using %u",
               CITP_OPTS.fdtable_size, citp_fdtable.size);
        rlim.rlim_max = rlim.rlim_cur = citp_fdtable.size;
        CI_TRY(ci_sys_setrlimit(RLIMIT_NOFILE, &rlim));
      }
    }
  }
  else
    Log_S(ci_log("Assuming EF_FDTABLE_SIZE=%u", citp_fdtable.size));

  citp_fdtable.inited_count = 0;

  citp_fdtable.table =
    ci_libc_malloc(sizeof(citp_fdtable_entry) * citp_fdtable.size);
  if( ! citp_fdtable.table ) {
    Log_U(log("%s: failed to allocate fdtable (0x%x)", __FUNCTION__,
              citp_fdtable.size));
    return -1;
  }

  /* The whole table is not initialised at start-of-day, but is initialised
  ** on demand.  citp_fdtable.inited_count counts the number of initialised
  ** entries. */

  if( (rc = oo_rwlock_ctor(&citp_ul_lock)) != 0 ) {
    Log_E(log("%s: oo_rwlock_ctor %d", __FUNCTION__, rc));
    return -1;
  }

  /* Install SIGONLOAD handler. */
  {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa)); /* sa_flags and sa_mask = 0 */
    sa.sa_handler = sighandler_do_nothing;
    sigaction(SIGONLOAD, &sa, NULL);
  }

  return 0;
}
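/* Usage note (illustrative): EF_FDTABLE_SIZE overrides the table size chosen
 * above, e.g.:
 *
 *   EF_FDTABLE_SIZE=8192 onload ./my_app
 *
 * If the accompanying setrlimit() call fails (typically with EPERM), the
 * code falls back to the current hard limit, as handled above. */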
static int ci_udp_filter_kernel_pkt(ci_netif* ni, ci_udp_state* us,
                                    struct msghdr* msg, int* bytes)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  unsigned cb_flags = 0;
  int i = 0, bytes_remaining = *bytes;

  if( msg->msg_iovlen > CI_UDP_ZC_IOVEC_MAX ) {
    LOG_U(log("%s: too many fragments (%d), passing packet unfiltered",
              __FUNCTION__, (int)msg->msg_iovlen));
    return 1;
  }

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr = *msg;
  zc_msg.msghdr.msg_iov = NULL;

  ci_assert_gt(msg->msg_iovlen, 0);

  do {
    zc_msg.iov[i].iov_base = msg->msg_iov[i].iov_base;
    zc_msg.iov[i].iov_len = msg->msg_iov[i].iov_len > bytes_remaining ?
      bytes_remaining : msg->msg_iov[i].iov_len;
    zc_msg.iov[i].buf = ONLOAD_ZC_HANDLE_NONZC;
    zc_msg.iov[i].iov_flags = 0;
    bytes_remaining -= zc_msg.iov[i].iov_len;
  } while( ++i < msg->msg_iovlen && bytes_remaining );

  zc_msg.msghdr.msg_iovlen = i;

  rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
    (&zc_msg, (void*)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

  ci_assert(!(rc & ONLOAD_ZC_KEEP));

  if( rc & ONLOAD_ZC_TERMINATE )
    return 0;
  else {
    if( rc & ONLOAD_ZC_MODIFIED ) {
      int new_len = 0;
#ifndef NDEBUG
      int found_shortened_iov = 0;
#endif
      for( i = 0; i < zc_msg.msghdr.msg_iovlen; ++i ) {
        new_len += zc_msg.iov[i].iov_len;
#ifndef NDEBUG
        if( found_shortened_iov )
          ci_assert_equal(zc_msg.iov[i].iov_len, 0);
        ci_assert_equal(zc_msg.iov[i].iov_base, msg->msg_iov[i].iov_base);
        if( zc_msg.iov[i].iov_len != msg->msg_iov[i].iov_len ) {
          ci_assert_lt(zc_msg.iov[i].iov_len, msg->msg_iov[i].iov_len);
          found_shortened_iov = 1;
        }
#endif
      }
#ifndef NDEBUG
      if( found_shortened_iov )
        ci_assert_lt(new_len, *bytes);
      else
        ci_assert_equal(new_len, *bytes);
#endif
      *bytes = new_len;
    }
  }
  return 1;
}
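/* Example filter callback of the type invoked above: an illustrative sketch
 * based on the onload_zc_recv_filter_callback signature from the Onload
 * zero-copy extensions header (assumed to be onload/extensions_zc.h).  It
 * drops datagrams whose first payload byte is zero and passes everything
 * else through unmodified. */
static enum onload_zc_callback_rc
example_udp_filter(struct onload_zc_msg* msg, void* arg, int flags)
{
  if( msg->msghdr.msg_iovlen > 0 && msg->iov[0].iov_len > 0 &&
      ((const char*)msg->iov[0].iov_base)[0] == 0 )
    return ONLOAD_ZC_TERMINATE;   /* discard this datagram */
  return ONLOAD_ZC_CONTINUE;      /* deliver unmodified */
}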
int ci_udp_filter_recved_pkts(ci_netif* ni, ci_udp_state* us)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  ci_ip_pkt_fmt* pkt;
  unsigned cb_flags;
  int dropped_bytes;

  ci_assert(ci_sock_is_locked(ni, &us->s.b));

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr.msg_controllen = 0;
  zc_msg.msghdr.msg_flags = 0;

  while( us->recv_q.pkts_added !=
         us->recv_q.pkts_filter_passed + us->recv_q.pkts_filter_dropped ) {
    ci_rmb();

    pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
    if( pkt->pf.udp.rx_flags &
        (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
         CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) ) {
      /* We know this can't go past tail because of the while loop
       * condition. */
      us->recv_q.filter = pkt->next;
      pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
      ci_assert( !(pkt->pf.udp.rx_flags &
                   (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                    CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) );
    }

    ci_udp_pkt_to_zc_msg(ni, pkt, &zc_msg);

    cb_flags = CI_IP_IS_MULTICAST(oo_ip_hdr(pkt)->ip_daddr_be32) ?
      ONLOAD_ZC_MSG_SHARED : 0;
    rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
      (&zc_msg, (void*)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

    ci_assert(!(rc & ONLOAD_ZC_KEEP));

    if( rc & ONLOAD_ZC_TERMINATE ) {
      us->recv_q.bytes_filter_dropped += pkt->pf.udp.pay_len;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED;
      ++us->recv_q.pkts_filter_dropped;
    }
    else {
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      ++us->recv_q.pkts_filter_passed;
      if( rc & ONLOAD_ZC_MODIFIED ) {
        ci_assert(!(cb_flags & ONLOAD_ZC_MSG_SHARED));
        dropped_bytes = ci_zc_msg_to_udp_pkt(ni, &zc_msg, pkt);
        ci_assert_gt(dropped_bytes, 0);
        ci_assert_lt(dropped_bytes, pkt->pf.udp.pay_len);
        pkt->pf.udp.pay_len -= dropped_bytes;
        us->recv_q.bytes_filter_dropped += dropped_bytes;
      }
      us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
      return 1;
    }
  }

  return us->recv_q.pkts_filter_passed != us->recv_q.pkts_delivered;
}
static int ci_zc_msg_to_udp_pkt(ci_netif* ni,
                                struct onload_zc_msg* zc_msg,
                                ci_ip_pkt_fmt* pkt)
{
  int i, n_buffers = pkt->n_buffers, dropped_bytes = 0;
  ci_ip_pkt_fmt* frag;
  ci_ip_pkt_fmt* prev_frag = NULL;
  frag = pkt;
  i = 0;
  ci_assert_nequal(zc_msg->iov, NULL);

  /* Ignore first frag if zero length and there is another frag. */
  if( oo_offbuf_left(&frag->buf) == 0 && OO_PP_NOT_NULL(frag->frag_next) ) {
    frag = PKT_CHK_NNL(ni, frag->frag_next);
    --n_buffers;
  }

  CI_TEST(zc_msg->msghdr.msg_iovlen <= n_buffers);
  CI_TEST(zc_msg->msghdr.msg_iovlen > 0);

  do {
    CI_TEST(zc_msg->iov[i].buf == (onload_zc_handle)frag);
    CI_TEST(zc_msg->iov[i].iov_len != 0);
    if( i < zc_msg->msghdr.msg_iovlen ) {
      if( zc_msg->iov[i].iov_base != oo_offbuf_ptr(&frag->buf) ) {
        ci_assert_gt((char*)zc_msg->iov[i].iov_base,
                     oo_offbuf_ptr(&frag->buf));
        dropped_bytes += ((char*)zc_msg->iov[i].iov_base -
                          oo_offbuf_ptr(&frag->buf));
        oo_offbuf_set_start(&frag->buf, (char*)zc_msg->iov[i].iov_base);
      }
      if( zc_msg->iov[i].iov_len != oo_offbuf_left(&frag->buf) ) {
        ci_assert_lt(zc_msg->iov[i].iov_len, oo_offbuf_left(&frag->buf));
        dropped_bytes += (oo_offbuf_left(&frag->buf) -
                          zc_msg->iov[i].iov_len);
        oo_offbuf_set_len(&frag->buf, zc_msg->iov[i].iov_len);
      }
    }
    else {
      /* All remaining fragments should be discarded.  Should not get here
       * on the first frag as msg_iovlen > 0. */
      ci_assert(prev_frag != NULL);
      prev_frag->frag_next = OO_PP_NULL;
      /* Remember frag so we can release it after counting dropped bytes. */
      prev_frag = frag;
      do {
        dropped_bytes += oo_offbuf_left(&frag->buf);
        if( ++i == n_buffers )
          break;
        frag = PKT_CHK_NNL(ni, frag->frag_next);
      } while( 1 );
      ci_netif_pkt_release(ni, prev_frag);
      pkt->n_buffers -= (n_buffers - zc_msg->msghdr.msg_iovlen);
      return dropped_bytes;
    }

    ci_assert_lt(oo_offbuf_offset(&frag->buf) + oo_offbuf_left(&frag->buf),
                 CI_CFG_PKT_BUF_SIZE);

    if( ++i == n_buffers )
      break;
    prev_frag = frag;
    frag = PKT_CHK_NNL(ni, frag->frag_next);
  } while( 1 );

  return dropped_bytes;
}
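/* Worked example of the accounting above (illustrative): a 3-fragment packet
 * carrying 100+100+50 payload bytes, where the callback shortens iov[1] to
 * 40 bytes and drops iov[2] entirely (msg_iovlen = 2), yields
 * dropped_bytes = (100 - 40) + 50 = 110, releases the third fragment, and
 * reduces pkt->n_buffers by one. */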
void ci_ip_send_tcp_slow(ci_netif* ni, ci_tcp_state* ts, ci_ip_pkt_fmt* pkt)
{
  /* We're here because the ipcache is not valid. */
  int rc, prev_mtu = ts->s.pkt.mtu;

  cicp_user_retrieve(ni, &ts->s.pkt, &ts->s.cp);

  if( ts->s.pkt.status == retrrc_success ) {
    if( ts->s.pkt.mtu != prev_mtu )
      CI_PMTU_TIMER_NOW(ni, &ts->pmtus);
    ci_ip_set_mac_and_port(ni, &ts->s.pkt, pkt);
    ci_netif_send(ni, pkt);
    return;
  }
  else if( ts->s.pkt.status == retrrc_localroute &&
           (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) )
    ci_ip_local_send(ni, pkt, &ts->s, OO_SP_NULL);

  /* For TCP, we want the ipcache to only be valid when onloadable. */
  ci_ip_cache_invalidate(&ts->s.pkt);

  switch( ts->s.pkt.status ) {
  case retrrc_nomac:
    rc = 0;

    /* If we resend a SYN and there is no MAC, it means ARP failed, and
     * connect() should return EHOSTUNREACH.
     * We check twice - on the first and on the second retransmit.
     * Very hackish. */
    if( ts->s.b.state == CI_TCP_SYN_SENT ) {
      if( ts->retransmits == 1 )
        ts->tcpflags |= CI_TCPT_FLAG_NO_ARP;
      else if( (ts->tcpflags & CI_TCPT_FLAG_NO_ARP) &&
               ts->retransmits == 2 ) {
        ci_tcp_drop(ni, ts, EHOSTUNREACH);
        return;
      }
    }

    cicp_user_defer_send(ni, retrrc_nomac, &rc, OO_PKT_P(pkt),
                         ts->s.pkt.ifindex);
    ++ts->stats.tx_nomac_defer;
    return;

  case retrrc_noroute:
    rc = -EHOSTUNREACH;
    break;

  case retrrc_alienroute:
  case retrrc_localroute:
    /* ?? TODO: inc some stat */
    return;

  default:
    ci_assert_lt(ts->s.pkt.status, 0);
    if( ts->s.pkt.status < 0 )
      rc = ts->s.pkt.status;
    else
      /* belt and braces... */
      rc = 0;
  }

  ci_assert_le(rc, 0);

  /* In most cases we should ignore the return code: the packet will be
   * resent later because of the RTO.  However, in SYN-SENT we should pass
   * errors to the user.  At the same time, we should not pass ENOBUFS to
   * the user - it is an internal problem of the cplane, so we should try
   * again.  Possibly, there are other internal problems, such as ENOMEM.
   *
   * Also, do not break the connection when the first SYN fails:
   * - Linux does not do it;
   * - the cplane has some latency, so we get false positives here;
   * - ci_tcp_connect() does not expect it.
   */
  if( ts->s.b.state == CI_TCP_SYN_SENT && rc < 0 && ts->retransmits > 0 &&
      (rc == -EHOSTUNREACH || rc == -ENETUNREACH || rc == -ENETDOWN) )
    ci_tcp_drop(ni, ts, -rc);
}