void citp_waitable_obj_free(ci_netif* ni, citp_waitable* w)
{
  ci_assert(ci_netif_is_locked(ni));

#ifdef __KERNEL__
  {
    /* Avoid racing with tcp_helper_do_non_atomic(). */
    tcp_helper_endpoint_t* ep = ci_netif_get_valid_ep(ni, w->bufid);
    unsigned ep_aflags;
  again:
    if( (ep_aflags = ep->ep_aflags) & OO_THR_EP_AFLAG_NON_ATOMIC ) {
      ci_assert(!(ep_aflags & OO_THR_EP_AFLAG_NEED_FREE));
      if( ci_cas32_fail(&ep->ep_aflags, ep_aflags,
                        ep_aflags | OO_THR_EP_AFLAG_NEED_FREE) )
        goto again;
      return;
    }
    ci_rmb();
  }
#endif

  __citp_waitable_obj_free(ni, w);
  w->wt_next = ni->state->free_eps_head;
  ni->state->free_eps_head = W_SP(w);
  /* Must be last, as may result in stack going away. */
  ci_drop_orphan(ni);
}
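The kernel-only branch above hands the free off to tcp_helper_do_non_atomic() by CAS-ing OO_THR_EP_AFLAG_NEED_FREE into the flags word, retrying whenever the flags change underneath it, and only frees in place (after ci_rmb()) once the non-atomic flag is clear. Below is a minimal, self-contained sketch of that hand-off using C11 atomics in place of ci_cas32_fail()/ci_rmb(); the FLAG_* constants and try_free_or_defer() are illustrative names, not Onload API.

#include <stdatomic.h>
#include <stdbool.h>

#define FLAG_NON_ATOMIC  0x1u   /* stand-in for OO_THR_EP_AFLAG_NON_ATOMIC */
#define FLAG_NEED_FREE   0x2u   /* stand-in for OO_THR_EP_AFLAG_NEED_FREE */

/* Returns true if the caller may free the object now; false if the free has
 * been deferred to the non-atomic worker by setting FLAG_NEED_FREE. */
static bool try_free_or_defer(atomic_uint* flags)
{
  unsigned old = atomic_load(flags);
  while( old & FLAG_NON_ATOMIC ) {
    /* Worker still owns the object: ask it to free on our behalf.  On CAS
     * failure [old] is reloaded with the current value, so just retry. */
    if( atomic_compare_exchange_weak(flags, &old, old | FLAG_NEED_FREE) )
      return false;
  }
  /* Pair with the worker's release when it cleared FLAG_NON_ATOMIC, so its
   * writes are visible before we free (the role ci_rmb() plays above). */
  atomic_thread_fence(memory_order_acquire);
  return true;
}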
int
__oo_cp_route_resolve(struct oo_cplane_handle* cp, cicp_verinfo_t* verinfo,
                      struct cp_fwd_key* key, int/*bool*/ ask_server,
                      struct cp_fwd_data* data)
{
  struct cp_mibs* mib = &cp->mib[0];
  cp_version_t ver, old_ver;
  cicp_mac_rowid_t id;
  struct cp_fwd_row* fwd;

 find_again:
  id = cp_fwd_find_match(mib, key);
  if( id == CICP_MAC_ROWID_BAD ||
      ~(fwd = cp_get_fwd_by_id(mib, id))->flags & CICP_FWD_FLAG_DATA_VALID ||
      ! cp_fwd_find_row_found_perfect_match(mib, id, key) ) {
    if( ! ask_server )
      return -ENOENT;
    oo_op_route_resolve(cp, key);
    ask_server = CI_FALSE;
    goto find_again;
  }

  ver = OO_ACCESS_ONCE(*cp_fwd_version(fwd));
  do {
    if( ~ fwd->flags & CICP_FWD_FLAG_DATA_VALID ||
        ! cp_fwd_key_match(fwd, key) )
      goto find_again;
    ci_rmb();
    *data = *cp_get_fwd_data_current(fwd);
    /* We can accidentally increase TTL for a wrong row - we do not care */
    if( fwd->flags & CICP_FWD_FLAG_STALE )
      mib->fwd_rw[id].frc_used = ci_frc64_get();
    old_ver = ver;
    ci_rmb();
  } while( old_ver != (ver = OO_ACCESS_ONCE(*cp_fwd_version(fwd))) );

  verinfo->id = id;
  verinfo->version = ver;

  /* Cplane server will refresh ARP when it reads fwd_rw[id], but it may
   * happen after some time.  Ask for the ARP immediately. */
  if( ask_server && ! data->arp_valid )
    oo_cp_arp_resolve(cp, verinfo);

  return 0;
}
void ci_synchronise_clock(ci_netif *ni, struct oo_timesync* oo_ts_local)
{
  ci_uint32 gc;
  struct oo_timesync *oo_ts;

  oo_ts = oo_timesync_state(CICP_HANDLE(ni));

  /* Check if our current datapoint for clock_gettime is up to date,
   * and take another if not */
  if( oo_ts_local->generation_count != oo_ts->generation_count ) {
    do {
      gc = oo_ts->generation_count;
      ci_rmb();
      oo_ts_local->smoothed_ticks = oo_ts->smoothed_ticks;
      oo_ts_local->smoothed_ns = oo_ts->smoothed_ns;
      oo_ts_local->clock.tv_sec = oo_ts->clock.tv_sec;
      oo_ts_local->clock.tv_nsec = oo_ts->clock.tv_nsec;
      oo_ts_local->clock_made = oo_ts->clock_made;
      ci_rmb();
    } while (gc & 1 || gc != oo_ts->generation_count);
    oo_ts_local->generation_count = gc;
  }
}
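ci_synchronise_clock() is a seqlock-style reader: the timesync writer is expected to bump generation_count to an odd value before updating the shared datapoint and to the next even value afterwards, so the reader retries whenever it sampled an odd count or the count changed across the copy. Below is a self-contained sketch of just the reader side, with C11 acquire fences standing in for ci_rmb(); the struct and function names are illustrative, not Onload types. The torn copy that can be observed mid-update is simply discarded by the retry.

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative shared datapoint.  The writer is assumed to set [gen] odd
 * before updating [data] and even afterwards, with a write barrier around
 * each store, as the oo_timesync updater does. */
struct snapshot   { uint64_t ticks, ns; };
struct seq_shared { atomic_uint gen; struct snapshot data; };

static struct snapshot seq_read(const struct seq_shared* s)
{
  struct snapshot out;
  unsigned gc;
  do {
    gc = atomic_load_explicit(&s->gen, memory_order_acquire);
    atomic_thread_fence(memory_order_acquire);            /* ~ci_rmb() */
    out = s->data;        /* may be torn; discarded by the retry below */
    atomic_thread_fence(memory_order_acquire);            /* ~ci_rmb() */
  } while( gc & 1 ||      /* writer was mid-update when we started */
           gc != atomic_load_explicit(&s->gen, memory_order_relaxed) );
  return out;
}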
void ci_tcp_linger(ci_netif* ni, ci_tcp_state* ts)
{
  /* This is called at user-level when a socket is closed if linger is
  ** enabled and has a timeout, and there is TX data outstanding.
  **
  ** Our job is to block until all data is successfully sent and acked, or
  ** until timeout.
  */
  ci_uint64 sleep_seq;
  int rc = 0;
  ci_uint32 timeout = ts->s.so.linger * 1000;

  LOG_TC(log("%s: "NTS_FMT, __FUNCTION__, NTS_PRI_ARGS(ni, ts)));

  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_IN_SO_LINGER);
  ci_assert(ts->s.s_flags & CI_SOCK_FLAG_LINGER);
  ci_assert(ts->s.b.state != CI_TCP_LISTEN);

  while( 1 ) {
    sleep_seq = ts->s.b.sleep_seq.all;
    ci_rmb();
    if( SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) )
      return;
    rc = ci_sock_sleep(ni, &ts->s.b, CI_SB_FLAG_WAKE_TX, 0, sleep_seq,
                       &timeout);
    if( rc )
      break;
  }

  if( ! SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) ) {
    ci_netif_lock(ni);
    /* check we are working with the same socket, and it was not closed and
     * dropped under our feet. */
    if( ! SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) &&
        (ts->s.b.sb_aflags & CI_SB_AFLAG_IN_SO_LINGER) )
      ci_tcp_drop(ni, ts, 0);
    ci_netif_unlock(ni);
  }
}
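ci_tcp_linger() avoids the lost-wakeup race by sampling sleep_seq before re-checking the "all data acked" condition (ci_rmb() orders the two reads), then handing that sample to ci_sock_sleep(), which only blocks if the sequence is still current. A minimal pthread-based sketch of the same idea follows; the waiter/waker structure and function names are hypothetical, and the mutex-protected re-check plays the role the sequence comparison plays inside ci_sock_sleep().

#include <pthread.h>
#include <stdint.h>

/* Hypothetical wait object: [seq] counts wakeups, analogous to sleep_seq. */
struct waiter {
  pthread_mutex_t lock;
  pthread_cond_t  cond;
  uint64_t        seq;
};

/* Caller pattern, mirroring the loop in ci_tcp_linger():
 *   snap = snapshot_seq(w);            -- read sleep_seq, ci_rmb()
 *   if( condition_already_true() ) return;
 *   wait_if_seq_unchanged(w, snap);    -- ci_sock_sleep(..., sleep_seq, ...)
 */
static uint64_t snapshot_seq(struct waiter* w)
{
  pthread_mutex_lock(&w->lock);
  uint64_t snap = w->seq;
  pthread_mutex_unlock(&w->lock);
  return snap;
}

static void wait_if_seq_unchanged(struct waiter* w, uint64_t snapshot)
{
  pthread_mutex_lock(&w->lock);
  /* If a wakeup arrived between the snapshot and this point, seq has moved
   * on and we return immediately instead of sleeping through it. */
  while( w->seq == snapshot )
    pthread_cond_wait(&w->cond, &w->lock);
  pthread_mutex_unlock(&w->lock);
}

static void wake(struct waiter* w)
{
  pthread_mutex_lock(&w->lock);
  w->seq++;                          /* the TX-ack path bumps the sequence */
  pthread_cond_broadcast(&w->cond);  /* and wakes anyone slept on old value */
  pthread_mutex_unlock(&w->lock);
}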
int citp_ep_dup3(unsigned fromfd, unsigned tofd, int flags)
{
  volatile citp_fdinfo_p* p_tofdip;
  citp_fdinfo_p tofdip;
  unsigned max;

  Log_V(log("%s(%d, %d)", __FUNCTION__, fromfd, tofd));

  /* Must be checked by callers. */
  ci_assert(fromfd != tofd);

  /* Hack: if [tofd] is the fd we're using for logging, we'd better choose
  ** a different one!
  */
  if( tofd == citp.log_fd )  citp_log_change_fd();

  ci_assert(citp.init_level >= CITP_INIT_FDTABLE);

  max = CI_MAX(fromfd, tofd);
  if( max >= citp_fdtable.inited_count ) {
    ci_assert(max < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(max);
    CITP_FDTABLE_UNLOCK();
  }

  /* Bug1151: Concurrent threads doing dup2(x,y) and dup2(y,x) can deadlock
  ** against one another.  So we take out a fat lock to prevent concurrent
  ** dup2()s.
  */
  /* Lock tofd.  We need to interlock against select and poll etc, so we
  ** also grab the exclusive lock.  Also grab the bug1151 lock.
  */
  pthread_mutex_lock(&citp_dup_lock);
  CITP_FDTABLE_LOCK();
  p_tofdip = &citp_fdtable.table[tofd].fdip;
 lock_tofdip_again:
  tofdip = *p_tofdip;
  if( fdip_is_busy(tofdip) )
    tofdip = citp_fdtable_busy_wait(tofd, 1);
  if( fdip_is_closing(tofdip) )
    tofdip = citp_fdtable_closing_wait(tofd, 1);
  if( fdip_is_reserved(tofdip) ) {
    /* ?? FIXME: we can't cope with this at the moment */
    CITP_FDTABLE_UNLOCK();
    Log_U(log("%s(%d, %d): target is reserved", __FUNCTION__, fromfd, tofd));
    errno = EBUSY;
    tofd = -1;
    goto out;
  }
  if( fdip_cas_fail(p_tofdip, tofdip, fdip_busy) )
    goto lock_tofdip_again;
  CITP_FDTABLE_UNLOCK();
  ci_assert(fdip_is_normal(tofdip) | fdip_is_passthru(tofdip) |
            fdip_is_unknown(tofdip));

  if( fdip_is_normal(tofdip) ) {
    /* We're duping onto a user-level socket. */
    citp_fdinfo* tofdi = fdip_to_fdi(tofdip);

    if( tofdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(tofdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, tofdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }

    ci_assert_equal(tofdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    tofdi->on_ref_count_zero = FDI_ON_RCZ_DUP2;
    tofdi->on_rcz.dup3_args.fd = fromfd;
    tofdi->on_rcz.dup3_args.flags = flags;
    citp_fdinfo_release_ref(tofdi, 0);

    {
      int i = 0;
      /* We need to free this fdi.  If someone is using it right now,
       * we are in trouble.  So, we spin for a while and interrupt the
       * user.  See bug 28123. */
      while( tofdi->on_ref_count_zero != FDI_ON_RCZ_DONE ) {
        if( ci_is_multithreaded() && i % 10000 == 9999 ) {
          pthread_t pth = tofdi->thread_id;
          if( pth != pthread_self() && pth != PTHREAD_NULL ) {
            pthread_kill(pth, SIGONLOAD);
            sleep(1);
          }
        }
        ci_spinloop_pause();
        i++;
      }
      ci_rmb();
    }

    if( tofdi->on_rcz.dup2_result < 0 ) {
      errno = -tofdi->on_rcz.dup2_result;
      /* Need to re-insert [tofdi] into the table. */
      ci_assert_equal(oo_atomic_read(&tofdi->ref_count), 0);
      oo_atomic_set(&tofdi->ref_count, 1);
      CI_DEBUG(tofdi->on_ref_count_zero = FDI_ON_RCZ_NONE);
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else {
      ci_assert(tofdi->on_rcz.dup2_result == tofd);
      citp_fdinfo_get_ops(tofdi)->dtor(tofdi, 0);
      citp_fdinfo_free(tofdi);
    }
    goto out;
  }

  ci_assert(fdip_is_passthru(tofdip) | fdip_is_unknown(tofdip));

  {
    /* We're dupping onto an O/S descriptor, or it may be closed.  Create a
    ** dummy [citp_fdinfo], just so we can share code with the case above.
    */
    citp_fdinfo fdi;
    fdi.fd = tofd;
    fdi.on_rcz.dup3_args.fd = fromfd;
    fdi.on_rcz.dup3_args.flags = flags;
    dup2_complete(&fdi, tofdip, 0);
    if( fdi.on_rcz.dup2_result < 0 ) {
      errno = -fdi.on_rcz.dup2_result;
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else
      ci_assert(fdi.on_rcz.dup2_result == tofd);
  }

 out:
  pthread_mutex_unlock(&citp_dup_lock);
  return tofd;
}
static int ci_udp_recvmsg_socklocked_slowpath(ci_udp_iomsg_args* a,
                                              ci_msghdr* msg,
                                              ci_iovec_ptr *piov, int flags)
{
  int rc = 0;
  ci_netif* ni = a->ni;
  ci_udp_state* us = a->us;

  if(CI_UNLIKELY( ni->state->rxq_low ))
    ci_netif_rxq_low_on_recv(ni, &us->s,
                             1 /* assume at least one pkt freed */);

  /* In the kernel recv() with flags is not called.
   * only read().  So flags may only contain MSG_DONTWAIT */
#ifdef __KERNEL__
  ci_assert_equal(flags, 0);
#endif

#ifndef __KERNEL__
  if( flags & MSG_ERRQUEUE_CHK ) {
    if( OO_PP_NOT_NULL(us->timestamp_q.extract) ) {
      ci_ip_pkt_fmt* pkt;
      struct timespec ts[3];
      struct cmsg_state cmsg_state;
      ci_udp_hdr* udp;
      int paylen;

      /* TODO is this necessary? - mirroring ci_udp_recvmsg_get() */
      ci_rmb();

      pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
      if( pkt->tx_hw_stamp.tv_sec == CI_PKT_TX_HW_STAMP_CONSUMED ) {
        if( OO_PP_IS_NULL(pkt->tsq_next) )
          goto errqueue_empty;
        us->timestamp_q.extract = pkt->tsq_next;
        pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
        ci_assert(pkt->tx_hw_stamp.tv_sec != CI_PKT_TX_HW_STAMP_CONSUMED);
      }

      udp = oo_ip_data(pkt);
      paylen = CI_BSWAP_BE16(oo_ip_hdr(pkt)->ip_tot_len_be16) -
               sizeof(ci_ip4_hdr) - sizeof(udp);

      msg->msg_flags = 0;
      cmsg_state.msg = msg;
      cmsg_state.cm = msg->msg_control;
      cmsg_state.cmsg_bytes_used = 0;
      ci_iovec_ptr_init_nz(piov, msg->msg_iov, msg->msg_iovlen);
      memset(ts, 0, sizeof(ts));

      if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_RAW_HARDWARE ) {
        ts[2].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[2].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      if( (us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_SYS_HARDWARE) &&
          (pkt->tx_hw_stamp.tv_nsec & CI_IP_PKT_HW_STAMP_FLAG_IN_SYNC) ) {
        ts[1].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[1].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      ci_put_cmsg(&cmsg_state, SOL_SOCKET, ONLOAD_SCM_TIMESTAMPING,
                  sizeof(ts), &ts);
      oo_offbuf_set_start(&pkt->buf, udp + 1);
      oo_offbuf_set_len(&pkt->buf, paylen);
      rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, paylen);

      /* Mark this packet/timestamp as consumed */
      pkt->tx_hw_stamp.tv_sec = CI_PKT_TX_HW_STAMP_CONSUMED;

      ci_ip_cmsg_finish(&cmsg_state);
      msg->msg_flags |= MSG_ERRQUEUE_CHK;
      return rc;
    }
  errqueue_empty:
    /* ICMP is handled via OS, so get OS error */
    rc = oo_os_sock_recvmsg(ni, SC_SP(&us->s), msg, flags);
    if( rc < 0 ) {
      ci_assert(-rc == errno);
      return -1;
    }
    else
      return rc;
  }
#endif

  if( (rc = ci_get_so_error(&us->s)) != 0 ) {
    CI_SET_ERROR(rc, rc);
    return rc;
  }
  if( msg->msg_iovlen > 0 && msg->msg_iov == NULL ) {
    CI_SET_ERROR(rc, EFAULT);
    return rc;
  }
#if MSG_OOB_CHK
  if( flags & MSG_OOB_CHK ) {
    CI_SET_ERROR(rc, EOPNOTSUPP);
    return rc;
  }
#endif
#if CI_CFG_POSIX_RECV
  if( ! udp_lport_be16(us) ) {
    LOG_UV(log("%s: -1 (ENOTCONN)", __FUNCTION__));
    CI_SET_ERROR(rc, ENOTCONN);
    return rc;
  }
#endif
  if( msg->msg_iovlen == 0 ) {
    /* We have a difference in behaviour from the Linux stack here.  When
    ** msg_iovlen is 0 Linux 2.4.21-15.EL does not set MSG_TRUNC when a
    ** datagram has non-zero length.  We do. */
    CI_IOVEC_LEN(&piov->io) = piov->iovlen = 0;
    return IOVLEN_WORKAROUND_RC_VALUE;
  }
  return 0;
}
static int ci_udp_recvmsg_get(ci_netif* ni, ci_udp_state* us,
                              ci_iovec_ptr* piov, ci_msghdr* msg, int flags)
{
  ci_ip_pkt_fmt* pkt;
  int rc;

  /* NB. [msg] can be NULL for async recv. */

  if( ci_udp_recv_q_not_readable(ni, us) )
    goto recv_q_is_empty;

  ci_rmb();

  pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  if( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED ) {
    /* We know that the receive queue is not empty and if a filter is
     * involved that there are some that have passed the filter, so if
     * this pkt is already consumed, the next one must be OK to
     * receive (and already have been filtered) */
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
    ci_assert( !(pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) );
#if CI_CFG_ZC_RECV_FILTER
    if( us->recv_q_filter )
      /* Filter should have run on this packet and marked it */
      ci_assert( pkt->pf.udp.rx_flags &
                 (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                  CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) );
    else
      /* Bump this along as we don't have a filter installed, but want
       * to keep the filter pointer in a sane place */
      us->recv_q.filter = us->recv_q.extract;
#endif
  }

#if CI_CFG_ZC_RECV_FILTER
  /* Skip any that the filter has dropped.  This must terminate before
   * hitting the tail because we know the queue is readable. */
  while( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED ) {
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  }
#endif

#if defined(__linux__) && !defined(__KERNEL__)
  if( msg != NULL && msg->msg_controllen != 0 ) {
    if( CI_UNLIKELY(us->s.cmsg_flags != 0) )
      ci_ip_cmsg_recv(ni, us, pkt, msg, 0);
    else
      msg->msg_controllen = 0;
  }
#endif

  us->stamp = pkt->pf.udp.rx_stamp;

  rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, pkt->pf.udp.pay_len);

  if(CI_LIKELY( rc >= 0 )) {
#if HAVE_MSG_FLAGS
    if(CI_UNLIKELY( rc < pkt->pf.udp.pay_len && msg != NULL ))
      msg->msg_flags |= LOCAL_MSG_TRUNC;
#endif
    ci_udp_recvmsg_fill_msghdr(ni, msg, pkt, &us->s);
    if( ! (flags & MSG_PEEK) ) {
      us->recv_q.bytes_delivered += pkt->pf.udp.pay_len;
      us->recv_q.pkts_delivered  += 1;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED;
#if CI_CFG_ZC_RECV_FILTER
      if( ! us->recv_q_filter ) {
        /* Pretend this packet passed the filter, to keep state consistent */
        ++us->recv_q.pkts_filter_passed;
        us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
        pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      }
#endif
    }
    us->udpflags |= CI_UDPF_LAST_RECV_ON;
  }

  return rc;

 recv_q_is_empty:
  return -EAGAIN;
}
int ci_udp_filter_recved_pkts(ci_netif* ni, ci_udp_state* us)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  ci_ip_pkt_fmt* pkt;
  unsigned cb_flags;
  int dropped_bytes;

  ci_assert(ci_sock_is_locked(ni, &us->s.b));

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr.msg_controllen = 0;
  zc_msg.msghdr.msg_flags = 0;

  while( us->recv_q.pkts_added !=
         us->recv_q.pkts_filter_passed + us->recv_q.pkts_filter_dropped ) {
    ci_rmb();

    pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
    if( pkt->pf.udp.rx_flags &
        (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
         CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) ) {
      /* We know this can't go past tail because of the while loop condition */
      us->recv_q.filter = pkt->next;
      pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
      ci_assert( !(pkt->pf.udp.rx_flags &
                   (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                    CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) );
    }

    ci_udp_pkt_to_zc_msg(ni, pkt, &zc_msg);

    cb_flags = CI_IP_IS_MULTICAST(oo_ip_hdr(pkt)->ip_daddr_be32) ?
               ONLOAD_ZC_MSG_SHARED : 0;
    rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
         (&zc_msg, (void *)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

    ci_assert(!(rc & ONLOAD_ZC_KEEP));

    if( rc & ONLOAD_ZC_TERMINATE ) {
      us->recv_q.bytes_filter_dropped += pkt->pf.udp.pay_len;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED;
      ++us->recv_q.pkts_filter_dropped;
    }
    else {
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      ++us->recv_q.pkts_filter_passed;
      if( rc & ONLOAD_ZC_MODIFIED ) {
        ci_assert(!(cb_flags & ONLOAD_ZC_MSG_SHARED));
        dropped_bytes = ci_zc_msg_to_udp_pkt(ni, &zc_msg, pkt);
        ci_assert_gt(dropped_bytes, 0);
        ci_assert_lt(dropped_bytes, pkt->pf.udp.pay_len);
        pkt->pf.udp.pay_len -= dropped_bytes;
        us->recv_q.bytes_filter_dropped += dropped_bytes;
      }
      us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
      return 1;
    }
  }

  return us->recv_q.pkts_filter_passed != us->recv_q.pkts_delivered;
}