int efab_eplock_lock_wait(ci_netif* ni)
{
  wait_queue_t wait;
  int rc;

#if CI_CFG_EFAB_EPLOCK_RECORD_CONTENTIONS
  efab_eplock_record_pid(ni);
#endif

  init_waitqueue_entry(&wait, current);
  add_wait_queue(&ni->eplock_helper.wq, &wait);

  while( 1 ) {
    set_current_state(TASK_INTERRUPTIBLE);
    rc = efab_eplock_is_unlocked_or_request_wake(&ni->state->lock);
    if( rc <= 0 )
      break;
    schedule();
    if(CI_UNLIKELY( signal_pending(current) )) {
      rc = -ERESTARTSYS;
      break;
    }
  }

  remove_wait_queue(&ni->eplock_helper.wq, &wait);
  set_current_state(TASK_RUNNING);
  return rc;
}
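/* The loop above is in effect an open-coded wait_event_interruptible():
 * sleep on the wait queue until the lock-condition helper reports success
 * (rc <= 0), turning a pending signal into -ERESTARTSYS.  A rough
 * equivalent sketch, assuming the helper's <= 0-on-success convention:
 *
 *   if( wait_event_interruptible(ni->eplock_helper.wq,
 *         efab_eplock_is_unlocked_or_request_wake(&ni->state->lock) <= 0) )
 *     return -ERESTARTSYS;
 *   return 0;
 *
 * The open-coded form lets the helper's own return code propagate to the
 * caller, which the macro form would discard.
 */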
unsigned ci_ip_checksum(const ci_ip4_hdr* ip)
{
  const ci_uint16* p = (const ci_uint16*) ip;
  unsigned csum;
  int bytes;

  csum  = p[0];
  csum += p[1];
  csum += p[2];
  csum += p[3];
  csum += p[4];
  /* omit ip_check_be16 */
  csum += p[6];
  csum += p[7];
  csum += p[8];
  csum += p[9];

  bytes = CI_IP4_IHL(ip);
  if(CI_UNLIKELY( bytes > 20 )) {
    p += 10;
    bytes -= 20;
    do {
      csum += *p++;
      bytes -= 2;
    } while( bytes );
  }

  return ci_ip_hdr_csum_finish(csum);
}
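/* ci_ip_hdr_csum_finish() is defined elsewhere.  A minimal sketch of the
 * standard RFC 1071 finishing step it is assumed to perform (hypothetical
 * stand-in, not the actual implementation): fold the 32-bit accumulator
 * into 16 bits, then take the one's complement.
 */
static inline unsigned example_ip_csum_finish(unsigned csum)
{
  csum = (csum >> 16u) + (csum & 0xffff);  /* fold high half into low */
  csum += csum >> 16u;                     /* add back any carry */
  return ~csum & 0xffff;                   /* one's complement result */
}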
static int citp_pipe_epoll_writer(citp_fdinfo* fdinfo,
                                  struct citp_epoll_member* eitem,
                                  struct oo_ul_epoll_state* eps,
                                  int* stored_event)
{
  unsigned mask;
  struct oo_pipe* pipe = fdi_to_pipe_fdi(fdinfo)->pipe;
  ci_uint64 sleep_seq;
  int seq_mismatch = 0;

#if CI_CFG_SPIN_STATS
  if( CI_UNLIKELY(! eps->stat_incremented) ) {
    fdi_to_pipe_fdi(fdinfo)->ni->state->stats.spin_epoll++;
    eps->stat_incremented = 1;
  }
#endif

  /* Snapshot the sleep sequence before reading the event mask, so a
   * concurrent wakeup between the two reads shows up as a sequence
   * mismatch and the caller can retry. */
  sleep_seq = pipe->b.sleep_seq.all;
  mask = oo_pipe_poll_write_events(pipe);
  *stored_event = citp_ul_epoll_set_ul_events(eps, eitem, mask, sleep_seq,
                                              &pipe->b.sleep_seq.all,
                                              &seq_mismatch);
  return seq_mismatch;
}
static int citp_pipe_poll_writer(citp_fdinfo* fdinfo, struct pollfd* pfd,
                                 struct oo_ul_poll_state* ps)
{
  citp_pipe_fdi* epi;
  struct oo_pipe* p;
  unsigned mask;

  epi = fdi_to_pipe_fdi(fdinfo);
  p = epi->pipe;

#if CI_CFG_SPIN_STATS
  if( CI_UNLIKELY(! ps->stat_incremented) ) {
    epi->ni->state->stats.spin_poll++;
    ps->stat_incremented = 1;
  }
#endif

  /* set mask */
  mask = oo_pipe_poll_write_events(p);
  /* set revents: POLLERR and POLLHUP are always reported, even when not
   * requested in events */
  pfd->revents = mask & (pfd->events | POLLERR | POLLHUP);

  return 1;
}
static int citp_pipe_select_reader(citp_fdinfo* fdinfo, int* n, int rd,
                                   int wr, int ex,
                                   struct oo_ul_select_state* ss)
{
  citp_pipe_fdi* epi;
  struct oo_pipe* p;
  unsigned mask = 0;

  epi = fdi_to_pipe_fdi(fdinfo);
  p = epi->pipe;

#if CI_CFG_SPIN_STATS
  if( CI_UNLIKELY(! ss->stat_incremented) ) {
    epi->ni->state->stats.spin_select++;
    ss->stat_incremented = 1;
  }
#endif

  /* set mask */
  mask = oo_pipe_poll_read_events(p);

  if( rd && (mask & SELECT_RD_SET) ) {
    FD_SET(fdinfo->fd, ss->rdu);
    ++*n;
  }

  return 1;
}
extern int ci_cfg_rd_trylock(void)
{
  int rc = 0;

  if (!ci_cfg_handle_open) {
    ci_log("config: attempt to access configuration "
           "before initialization");
    rc = -ENXIO; /* "no such device or address"? */
  } else {
    int readers;
    ci_lock_lock(&ci_cfg_handle.lock);
    if (CI_UNLIKELY(ci_cfg_handle.writing == 1)) {
      DEBUG_LOCK(DPRINTF("config: read denied during a write"););
      rc = -EAGAIN;
    } else if (CI_UNLIKELY((readers = ci_cfg_readers()) >= CI_CFG_READERS_MAX))
static int oo_copy_pkt_to_iovec_no_adv(ci_netif* ni, const ci_ip_pkt_fmt* pkt,
                                       ci_iovec_ptr* piov, int bytes_to_copy)
{
  /* Copy data from [pkt] to [piov], following [pkt->frag_next] as
   * necessary.  Does not modify [pkt].  May or may not advance [piov].
   * The packet must contain at least [bytes_to_copy] of data in the
   * [pkt->buf].  [piov] may contain an arbitrary amount of space.
   *
   * Returns number of bytes copied on success, or -EFAULT otherwise.
   */
  int n, pkt_left, pkt_off = 0;
  int bytes_copied = 0;

  while( 1 ) {
    pkt_left = oo_offbuf_left(&pkt->buf) - pkt_off;
    n = CI_MIN(pkt_left, CI_IOVEC_LEN(&piov->io));
    n = CI_MIN(n, bytes_to_copy);
    if(CI_UNLIKELY( do_copy(CI_IOVEC_BASE(&piov->io),
                            oo_offbuf_ptr(&pkt->buf) + pkt_off, n) != 0 ))
      return -EFAULT;
    bytes_copied += n;
    pkt_off += n;
    if( n == bytes_to_copy )
      return bytes_copied;

    bytes_to_copy -= n;

    if( n == pkt_left ) {
      /* Caller guarantees that packet contains at least [bytes_to_copy]. */
      ci_assert(OO_PP_NOT_NULL(pkt->frag_next));
      ci_iovec_ptr_advance(piov, n);
      pkt = PKT_CHK_NNL(ni, pkt->frag_next);
      pkt_off = 0;
      /* We're unlikely to hit end-of-pkt-buf and end-of-iovec at the same
       * time, and if we do, just go round the loop again.
       */
      continue;
    }

    ci_assert_equal(n, CI_IOVEC_LEN(&piov->io));
    if( piov->iovlen == 0 )
      return bytes_copied;
    piov->io = *piov->iov++;
    --piov->iovlen;
  }
}
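/* Usage sketch (hypothetical caller, mirroring the recvmsg paths below):
 * initialise an iovec cursor over the caller's buffers, then copy a
 * datagram's payload without consuming the packet.
 *
 *   ci_iovec_ptr piov;
 *   ci_iovec_ptr_init_nz(&piov, msg->msg_iov, msg->msg_iovlen);
 *   rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, &piov, pkt->pf.udp.pay_len);
 *   if( rc < 0 )
 *     return rc;   (-EFAULT: a user buffer was not accessible)
 */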
/* Looks up the user-level 'FD info' for a given file descriptor.
** Returns pointer to the user-level 'FD info' for a given file
** descriptor, or NULL if the FD is not user-level.
** NOTE: The reference count of the 'FD info' is incremented; the
**       caller should ensure the reference is dropped when no
**       longer needed by calling citp_fdinfo_release_ref().
*/
citp_fdinfo* citp_fdtable_lookup_noprobe(unsigned fd)
{
  /* Need to be initialised before we can try and grab the lock at the
  ** moment.  TODO: make this more efficient by using a trylock to grab the
  ** fdtable lock, and on fail see if we need to initialise it.
  */
  if( CI_UNLIKELY(citp.init_level < CITP_INIT_FDTABLE) ) {
    if( _citp_do_init_inprogress == 0 )
      CI_TRY(citp_do_init(CITP_INIT_ALL));
    else
      CI_TRY(citp_do_init(CITP_INIT_FDTABLE));  /* get what we need */
    return NULL;
  }

  if( fd < citp_fdtable.inited_count ) {
    volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
    citp_fdinfo_p fdip;

  again:
    /* Swap in the busy marker. */
    fdip = *p_fdip;
    if( fdip_is_normal(fdip) ) {
      if( fdip_cas_succeed(p_fdip, fdip, fdip_busy) ) {
        /* Bump the reference count. */
        citp_fdinfo* fdi = fdip_to_fdi(fdip);
        citp_fdinfo_ref(fdi);
        /* Swap the busy marker out again. */
        citp_fdtable_busy_clear(fd, fdip, 0);
        return fdi;
      }
      goto again;
    }
    /* Not normal! */
    else if( fdip_is_busy(fdip) ) {
      citp_fdtable_busy_wait(fd, 0);
      goto again;
    }
  }

  return NULL;
}
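/* Usage sketch (hypothetical caller): the returned fdinfo carries an extra
 * reference, so every successful lookup must be paired with a release.
 * The second argument to citp_fdinfo_release_ref() is assumed here to say
 * whether the fdtable lock is held, matching the calls elsewhere in this
 * file.
 *
 *   citp_fdinfo* fdi = citp_fdtable_lookup_noprobe(fd);
 *   if( fdi != NULL ) {
 *     (use fdi ...)
 *     citp_fdinfo_release_ref(fdi, CI_FALSE);
 *   }
 */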
/*
** Why do these live here?  Because they need to hack into the low-level
** dirty nastiness of the fdtable.
*/
int citp_ep_dup(unsigned oldfd, int (*syscall)(int oldfd, long arg),
                long arg)
{
  /* This implements dup(oldfd) and fcntl(oldfd, F_DUPFD, arg). */
  volatile citp_fdinfo_p* p_oldfdip;
  citp_fdinfo_p oldfdip;
  citp_fdinfo* newfdi = 0;
  citp_fdinfo* oldfdi;
  int newfd;

  Log_V(log("%s(%d)", __FUNCTION__, oldfd));

  if(CI_UNLIKELY( citp.init_level < CITP_INIT_FDTABLE ||
                  oo_per_thread_get()->in_vfork_child ))
    /* Lib not initialised, so no U/L state, and therefore system dup()
    ** will do just fine.
    */
    return syscall(oldfd, arg);

  if( oldfd >= citp_fdtable.inited_count ) {
    /* NB. We can't just pass through in this case because we need to worry
    ** about other threads racing with us.  So we need to be able to lock
    ** this fd while we do the dup.
    */
    ci_assert(oldfd < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(oldfd);
    CITP_FDTABLE_UNLOCK();
  }

  p_oldfdip = &citp_fdtable.table[oldfd].fdip;
 again:
  oldfdip = *p_oldfdip;
  if( fdip_is_busy(oldfdip) )
    oldfdip = citp_fdtable_busy_wait(oldfd, 0);
  if( fdip_is_closing(oldfdip) | fdip_is_reserved(oldfdip) ) {
    errno = EBADF;
    return -1;
  }
#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(oldfdip) ) {
    CITP_FDTABLE_LOCK();
    oldfdi = citp_fdtable_probe_locked(oldfd, CI_FALSE, CI_FALSE);
    CITP_FDTABLE_UNLOCK();
    if( oldfdi == &citp_the_closed_fd ) {
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
      errno = EBADF;
      return -1;
    }
    if( oldfdi )
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
  }
#endif
  if( fdip_cas_fail(p_oldfdip, oldfdip, fdip_busy) )
    goto again;

#if CI_CFG_FD_CACHING
  /* May end up with multiple refs to this, don't allow it to be cached. */
  if( fdip_is_normal(oldfdip) )
    fdip_to_fdi(oldfdip)->can_cache = 0;
#endif

  if( fdip_is_normal(oldfdip) &&
      (((oldfdi = fdip_to_fdi(oldfdip))->protocol->type) == CITP_EPOLL_FD) ) {
    newfdi = citp_fdinfo_get_ops(oldfdi)->dup(oldfdi);
    if( ! newfdi ) {
      citp_fdtable_busy_clear(oldfd, oldfdip, 0);
      errno = ENOMEM;
      return -1;
    }

    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 )
      citp_fdtable_new_fd_set(newfd, fdip_busy, fdtable_strict());
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
    if( newfd >= 0 ) {
      citp_fdtable_insert(newfdi, newfd, 0);
      newfdi = 0;
    }
  }
  else {
    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 && newfd < citp_fdtable.inited_count ) {
      /* Mark newfd as unknown.  When used, it'll get probed.
       *
       * We are not just being lazy here: Setting to unknown rather than
       * installing a proper fdi (when oldfd is accelerated) is essential to
       * vfork()+dup()+exec() working properly.  Reason is that child and
       * parent share address space, so child is modifying the parent's
       * fdtable.  Setting an entry to unknown is safe.
       */
      citp_fdtable_new_fd_set(newfd, fdip_unknown, fdtable_strict());
    }
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
  }

  citp_fdtable_busy_clear(oldfd, oldfdip, 0);
  if( newfdi )  citp_fdinfo_free(newfdi);
  return newfd;
}
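/* Usage sketch (hypothetical glue; the real interposers live elsewhere in
 * the library): both dup() and fcntl(F_DUPFD) funnel through citp_ep_dup()
 * via a trampoline matching the (int, long) signature.  ci_sys_dup() is
 * assumed here as a pass-through wrapper in the style of the other
 * ci_sys_*() calls.
 *
 *   static int dup_via_syscall(int oldfd, long arg)
 *   { (void) arg; return ci_sys_dup(oldfd); }
 *
 *   ...
 *   return citp_ep_dup(oldfd, dup_via_syscall, 0);
 */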
static int ci_udp_recvmsg_block(ci_udp_iomsg_args* a, ci_netif* ni,
                                ci_udp_state* us, int timeout)
{
  int rc;

#ifndef __KERNEL__
  {
    citp_signal_info* si;
    struct pollfd pfd;
#if !CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    int inside_lib;
#endif
    pfd.fd = a->fd;
    pfd.events = POLLIN;

    if( timeout == 0 )
      timeout = -1;

    /* Ideally, we should do the same as in citp_tcp_accept(), but since
     * we do not have lib_context and citp_exit_lib() outside of the unix/
     * subdirectory, we copy its contents here.
     */
    si = citp_signal_get_specific_inited();
  continue_to_block:
#if !CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    inside_lib = si->inside_lib;
    ci_assert_gt(inside_lib, 0);
#endif
    /* Leave the "inside library" state so that signals arriving while we
     * block in poll() are handled promptly rather than deferred. */
    si->inside_lib = 0;
    ci_compiler_barrier();
    if(CI_UNLIKELY( si->aflags & OO_SIGNAL_FLAG_HAVE_PENDING ))
      citp_signal_run_pending(si);

    rc = ci_sys_poll(&pfd, 1, timeout);

#if CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    si->inside_lib = 1;
#else
    si->inside_lib = inside_lib;
#endif

    if( rc > 0 )
      return 0;
    else if( rc == 0 )
      rc = -EAGAIN;
    else if( errno == EINTR &&
             (si->aflags & OO_SIGNAL_FLAG_NEED_RESTART) &&
             timeout == -1 ) {
      /* Blocking recv() should only be restarted if there is no timeout. */
      goto continue_to_block;
    }
    else
      rc = -errno;

    return rc;
  }
#else  /* __KERNEL__ */
  {
    int mask;
    s64 t;

    if( timeout == 0 )
      t = -1;
    else
      t = msecs_to_jiffies(timeout);

    mask = POLLIN;
    rc = efab_tcp_helper_poll_udp(a->filp, &mask, &t);
    if( rc == 0 ) {
      if( mask )
        return 0;
      else
        rc = -EAGAIN;
    }
    else if( rc == -ERESTARTSYS && us->s.so.rcvtimeo_msec )
      rc = -EINTR;
  }
  return rc;
#endif  /* __KERNEL__ */
}
static int ci_udp_recvmsg_socklocked_slowpath(ci_udp_iomsg_args* a,
                                              ci_msghdr* msg,
                                              ci_iovec_ptr* piov, int flags)
{
  int rc = 0;
  ci_netif* ni = a->ni;
  ci_udp_state* us = a->us;

  if(CI_UNLIKELY( ni->state->rxq_low ))
    ci_netif_rxq_low_on_recv(ni, &us->s,
                             1 /* assume at least one pkt freed */);

  /* In the kernel, recv() with flags is never called, only read(), so
   * flags may only contain MSG_DONTWAIT. */
#ifdef __KERNEL__
  ci_assert_equal(flags, 0);
#endif

#ifndef __KERNEL__
  if( flags & MSG_ERRQUEUE_CHK ) {
    if( OO_PP_NOT_NULL(us->timestamp_q.extract) ) {
      ci_ip_pkt_fmt* pkt;
      struct timespec ts[3];
      struct cmsg_state cmsg_state;
      ci_udp_hdr* udp;
      int paylen;

      /* TODO is this necessary? - mirroring ci_udp_recvmsg_get() */
      ci_rmb();

      pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
      if( pkt->tx_hw_stamp.tv_sec == CI_PKT_TX_HW_STAMP_CONSUMED ) {
        if( OO_PP_IS_NULL(pkt->tsq_next) )
          goto errqueue_empty;
        us->timestamp_q.extract = pkt->tsq_next;
        pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
        ci_assert(pkt->tx_hw_stamp.tv_sec != CI_PKT_TX_HW_STAMP_CONSUMED);
      }

      udp = oo_ip_data(pkt);
      /* NB: sizeof(*udp), not sizeof(udp) - we want the UDP header size,
       * not the size of a pointer. */
      paylen = CI_BSWAP_BE16(oo_ip_hdr(pkt)->ip_tot_len_be16) -
               sizeof(ci_ip4_hdr) - sizeof(*udp);

      msg->msg_flags = 0;
      cmsg_state.msg = msg;
      cmsg_state.cm = msg->msg_control;
      cmsg_state.cmsg_bytes_used = 0;
      ci_iovec_ptr_init_nz(piov, msg->msg_iov, msg->msg_iovlen);

      memset(ts, 0, sizeof(ts));
      if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_RAW_HARDWARE ) {
        ts[2].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[2].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      if( (us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_SYS_HARDWARE) &&
          (pkt->tx_hw_stamp.tv_nsec & CI_IP_PKT_HW_STAMP_FLAG_IN_SYNC) ) {
        ts[1].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[1].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      ci_put_cmsg(&cmsg_state, SOL_SOCKET, ONLOAD_SCM_TIMESTAMPING,
                  sizeof(ts), &ts);
      oo_offbuf_set_start(&pkt->buf, udp + 1);
      oo_offbuf_set_len(&pkt->buf, paylen);
      rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, paylen);

      /* Mark this packet/timestamp as consumed */
      pkt->tx_hw_stamp.tv_sec = CI_PKT_TX_HW_STAMP_CONSUMED;

      ci_ip_cmsg_finish(&cmsg_state);
      msg->msg_flags |= MSG_ERRQUEUE_CHK;
      return rc;
    }
  errqueue_empty:
    /* ICMP is handled via OS, so get OS error */
    rc = oo_os_sock_recvmsg(ni, SC_SP(&us->s), msg, flags);
    if( rc < 0 ) {
      ci_assert(-rc == errno);
      return -1;
    }
    else
      return rc;
  }
#endif

  if( (rc = ci_get_so_error(&us->s)) != 0 ) {
    CI_SET_ERROR(rc, rc);
    return rc;
  }
  if( msg->msg_iovlen > 0 && msg->msg_iov == NULL ) {
    CI_SET_ERROR(rc, EFAULT);
    return rc;
  }
#if MSG_OOB_CHK
  if( flags & MSG_OOB_CHK ) {
    CI_SET_ERROR(rc, EOPNOTSUPP);
    return rc;
  }
#endif
#if CI_CFG_POSIX_RECV
  if( ! udp_lport_be16(us) ) {
    LOG_UV(log("%s: -1 (ENOTCONN)", __FUNCTION__));
    CI_SET_ERROR(rc, ENOTCONN);
    return rc;
  }
#endif
  if( msg->msg_iovlen == 0 ) {
    /* We have a difference in behaviour from the Linux stack here.  When
    ** msg_iovlen is 0 Linux 2.4.21-15.EL does not set MSG_TRUNC when a
    ** datagram has non-zero length.  We do.
    */
    CI_IOVEC_LEN(&piov->io) = piov->iovlen = 0;
    return IOVLEN_WORKAROUND_RC_VALUE;
  }

  return 0;
}
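/* User-side sketch (hypothetical application code) for retrieving the TX
 * timestamps queued by the MSG_ERRQUEUE path above.  The ts[3] layout
 * mirrors SO_TIMESTAMPING; this path fills ts[1] (system-synchronised
 * hardware time, when in sync) and ts[2] (raw hardware time) only.
 *
 *   struct msghdr msg;
 *   char control[256];
 *   (set up msg.msg_control = control, msg.msg_controllen = sizeof(control))
 *   if( recvmsg(fd, &msg, MSG_ERRQUEUE) >= 0 ) {
 *     struct cmsghdr* cm;
 *     for( cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm) )
 *       if( cm->cmsg_level == SOL_SOCKET &&
 *           cm->cmsg_type == ONLOAD_SCM_TIMESTAMPING ) {
 *         struct timespec* ts = (struct timespec*) CMSG_DATA(cm);
 *         (ts[2] holds the raw hardware timestamp)
 *       }
 *   }
 */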
static int ci_udp_recvmsg_get(ci_netif* ni, ci_udp_state* us,
                              ci_iovec_ptr* piov,
                              ci_msghdr* msg, int flags)
{
  ci_ip_pkt_fmt* pkt;
  int rc;

  /* NB. [msg] can be NULL for async recv. */

  if( ci_udp_recv_q_not_readable(ni, us) )
    goto recv_q_is_empty;

  ci_rmb();

  pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  if( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED ) {
    /* We know that the receive queue is not empty and if a filter is
     * involved that there are some that have passed the filter, so if
     * this pkt is already consumed, the next one must be OK to
     * receive (and already have been filtered)
     */
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
    ci_assert( !(pkt->pf.udp.rx_flags &
                 CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) );
#if CI_CFG_ZC_RECV_FILTER
    if( us->recv_q_filter )
      /* Filter should have run on this packet and marked it */
      ci_assert( pkt->pf.udp.rx_flags &
                 (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                  CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) );
    else
      /* Bump this along as we don't have a filter installed, but want
       * to keep the filter pointer in a sane place */
      us->recv_q.filter = us->recv_q.extract;
#endif
  }

#if CI_CFG_ZC_RECV_FILTER
  /* Skip any that the filter has dropped.  This must terminate before
   * hitting the tail because we know the queue is readable.
   */
  while( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED ) {
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  }
#endif

#if defined(__linux__) && !defined(__KERNEL__)
  if( msg != NULL && msg->msg_controllen != 0 ) {
    if( CI_UNLIKELY(us->s.cmsg_flags != 0 ) )
      ci_ip_cmsg_recv(ni, us, pkt, msg, 0);
    else
      msg->msg_controllen = 0;
  }
#endif
  us->stamp = pkt->pf.udp.rx_stamp;

  rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, pkt->pf.udp.pay_len);

  if(CI_LIKELY( rc >= 0 )) {
#if HAVE_MSG_FLAGS
    if(CI_UNLIKELY( rc < pkt->pf.udp.pay_len && msg != NULL ))
      msg->msg_flags |= LOCAL_MSG_TRUNC;
#endif
    ci_udp_recvmsg_fill_msghdr(ni, msg, pkt, &us->s);
    if( ! (flags & MSG_PEEK) ) {
      us->recv_q.bytes_delivered += pkt->pf.udp.pay_len;
      us->recv_q.pkts_delivered += 1;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED;
#if CI_CFG_ZC_RECV_FILTER
      if( !us->recv_q_filter ) {
        /* Pretend this packet passed the filter, to keep state consistent */
        ++us->recv_q.pkts_filter_passed;
        us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
        pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      }
#endif
    }
    us->udpflags |= CI_UDPF_LAST_RECV_ON;
  }

  return rc;

 recv_q_is_empty:
  return -EAGAIN;
}
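/* Behaviour sketch (hypothetical callers): with MSG_PEEK the CONSUMED flag
 * is never set, so the same datagram is returned by the next call; without
 * it the packet is marked consumed and the delivery accounting above is
 * updated.
 *
 *   rc = ci_udp_recvmsg_get(ni, us, &piov, msg, MSG_PEEK);  (datagram stays)
 *   rc = ci_udp_recvmsg_get(ni, us, &piov, msg, 0);         (same datagram,
 *                                                            now consumed)
 */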
static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts,
                                   ci_uint32 dst_be32, unsigned dport_be16,
                                   int* fail_rc)
{
  ci_ip_pkt_fmt* pkt;
  int rc = 0;

  ci_assert(ts->s.pkt.mtu);

  /* Now that we know the outgoing route, set the MTU related values.
   * Note, even these values are speculative since the real MTU
   * could change between now and passing the packet to the lower layers
   */
  ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr);
#if CI_CFG_LIMIT_AMSS
  ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__);
#endif

  /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */
  ts->smss = CI_CFG_TCP_DEFAULT_MSS;

  /* set pmtu, eff_mss, snd_buf and adjust windows */
  ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu);
  ci_tcp_set_eff_mss(ni, ts);
  ci_tcp_set_initialcwnd(ni, ts);

  /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay
   * zero until the connection is established.
   */
  ts->so_sndbuf_pkts = 0;

  /*
   * 3. State and address are OK.  It's an address routed through our NIC.
   *    Do connect().
   */
  ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY);

  if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) {
    ci_sock_cmn* s = &ts->s;
    ci_uint16 source_be16 = 0;

    if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND )
      rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    else
      rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16);
    if(CI_LIKELY( rc == 0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u",
                 LNT_PRI_ARGS(ni, ts), ip_addr_str(INADDR_ANY),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      *fail_rc = rc;
      return CI_CONNECT_UL_FAIL;
    }
    if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) {
      CI_SET_ERROR(*fail_rc, EINVAL);
      return CI_CONNECT_UL_FAIL;
    }
  }

  ci_tcp_set_peer(ts, dst_be32, dport_be16);

  /* Make sure we can get a buffer before we change state. */
  pkt = ci_netif_pkt_tx_tcp_alloc(ni);
  if( CI_UNLIKELY(! pkt) ) {
    /* NB. We've already done a poll above. */
    rc = ci_netif_pkt_wait(ni, &ts->s,
                           CI_SLEEP_NETIF_LOCKED | CI_SLEEP_NETIF_RQ);
    if( ci_netif_pkt_wait_was_interrupted(rc) ) {
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_LOCK_DROPPED;
    }
    /* OK, there are (probably) packets available - go try again.  Note we
     * jump back to the top of the function because someone may have
     * connected this socket in the mean-time, so we need to check the
     * state once more.
     */
    return CI_CONNECT_UL_START_AGAIN;
  }

#ifdef ONLOAD_OFE
  if( ni->ofe != NULL )
    ts->s.ofe_code_start = ofe_socktbl_find(
                        ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE,
                        tcp_laddr_be32(ts), tcp_raddr_be32(ts),
                        tcp_lport_be16(ts), tcp_rport_be16(ts));
#endif

  rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice,
                             OO_SP_NULL);
  if( rc < 0 ) {
    /* Perhaps we've run out of filters?  See if we can push a socket out
     * of timewait and steal its filter.
     */
    ci_assert_nequal(rc, -EFILTERSSOME);
    if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) ||
        (rc = ci_tcp_ep_set_filters(ni, S_SP(ts),
                                    ts->s.cp.so_bindtodevice,
                                    OO_SP_NULL)) < 0 ) {
      ci_assert_nequal(rc, -EFILTERSSOME);
      /* Either a different error, or our efforts to free a filter did not
       * work.
       */
      if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
        ts->s.pkt.ip.ip_saddr_be32 = 0;
        ts->s.cp.ip_laddr_be32 = 0;
      }
      ci_netif_pkt_release(ni, pkt);
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_FAIL;
    }
  }

  LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts),
             ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
             (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16),
             ip_addr_str(ts->s.pkt.ip.ip_daddr_be32),
             (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16)));

  /* We are going to send the SYN - set states appropriately */
  tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
    ci_tcp_initial_seqno(ni);
  ts->snd_max = tcp_snd_nxt(ts) + 1;

  /* Must be after initialising snd_una. */
  ci_tcp_clear_rtt_timing(ts);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN);
  ts->tcpflags &= ~CI_TCPT_FLAG_OPT_MASK;
  ts->tcpflags |= NI_OPTS(ni).syn_opts;

  if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) {
    ts->rcv_wscl = ci_tcp_wscl_by_buff(ni,
                                       ci_tcp_rcvbuf_established(ni, &ts->s));
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl);
  }
  else {
    ts->rcv_wscl = 0;
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0);
  }
  ci_tcp_set_rcvbuf(ni, ts);
  ci_tcp_init_rcv_wnd(ts, "CONNECT");

  /* outgoing_hdrs_len is initialised to include timestamp option. */
  if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) )
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr);
  if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32,
                        ts->s.pkt.ip.ip_daddr_be32) )
    ts->tcpflags |= CI_TCPT_FLAG_STRIPE;
  ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT);

  /* If the app tries to send data on a socket in SYN_SENT state
  ** then the data is queued for send until the SYN gets ACKed.
  ** (rfc793 p56)
  **
  ** Receive calls on the socket should block until data arrives
  ** (rfc793 p58)
  **
  ** Clearing tx_errno and rx_errno achieves this.  The transmit window
  ** is set to 1 byte which ensures that only the SYN packet gets
  ** sent until the ACK is received with more window.
  */
  ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1);
  ts->s.rx_errno = 0;
  ts->s.tx_errno = 0;
  ci_tcp_enqueue_no_data(ts, ni, pkt);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);

  if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) {
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    LOG_TC(log(LNT_FMT "Non-blocking connect - return EINPROGRESS",
               LNT_PRI_ARGS(ni, ts)));
    CI_SET_ERROR(*fail_rc, EINPROGRESS);
    return CI_CONNECT_UL_FAIL;
  }

  return CI_CONNECT_UL_OK;
}
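/* Caller-side sketch (hypothetical) of the CI_CONNECT_UL_* protocol used
 * above: START_AGAIN means re-validate socket state and retry (another
 * thread may have connected the socket meanwhile); LOCK_DROPPED means the
 * netif lock was released while waiting for packet buffers; FAIL reports
 * the error via *fail_rc (including the non-blocking EINPROGRESS case).
 *
 *   again:
 *     rc = ci_tcp_connect_ul_start(ni, ts, dst_be32, dport_be16, &err);
 *     switch( rc ) {
 *       case CI_CONNECT_UL_OK:           break;       (SYN is on the wire)
 *       case CI_CONNECT_UL_START_AGAIN:  goto again;  (re-check state first)
 *       case CI_CONNECT_UL_LOCK_DROPPED: (re-acquire the netif lock) break;
 *       case CI_CONNECT_UL_FAIL:         return err;
 *     }
 */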