int
efab_eplock_lock_wait(ci_netif* ni)
{
    wait_queue_t wait;
    int rc;

#if CI_CFG_EFAB_EPLOCK_RECORD_CONTENTIONS
    efab_eplock_record_pid(ni);
#endif

    init_waitqueue_entry(&wait, current);
    add_wait_queue(&ni->eplock_helper.wq, &wait);

    while( 1 ) {
        set_current_state(TASK_INTERRUPTIBLE);
        rc = efab_eplock_is_unlocked_or_request_wake(&ni->state->lock);
        if( rc <= 0 )
            break;
        schedule();
        if(CI_UNLIKELY( signal_pending(current) )) {
            rc = -ERESTARTSYS;
            break;
        }
    }

    remove_wait_queue(&ni->eplock_helper.wq, &wait);
    set_current_state(TASK_RUNNING);
    return rc;
}
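The loop above is the classic open-coded kernel wait: the task state is set
to TASK_INTERRUPTIBLE before the lock is re-checked, so a wakeup that races
with the check merely makes the following schedule() return immediately
instead of being lost.  A minimal sketch of the same shape using the
kernel's wait_event_interruptible() helper from <linux/wait.h>, which hides
the set_current_state()/schedule() dance (unlike the loop above, it cannot
propagate a negative return from the condition function):

static int efab_eplock_lock_wait_sketch(ci_netif* ni)
{
  /* Re-checks the condition with the task state already set, exactly
   * as the open-coded loop does; returns -ERESTARTSYS on a signal. */
  return wait_event_interruptible(ni->eplock_helper.wq,
      efab_eplock_is_unlocked_or_request_wake(&ni->state->lock) <= 0);
}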
Example No. 2
unsigned ci_ip_checksum(const ci_ip4_hdr* ip)
{
  const ci_uint16* p = (const ci_uint16*) ip;
  unsigned csum;
  int bytes;

  csum  = p[0];
  csum += p[1];
  csum += p[2];
  csum += p[3];
  csum += p[4];
  /* p[5] is the checksum field (ip_check_be16) itself: skip it */
  csum += p[6];
  csum += p[7];
  csum += p[8];
  csum += p[9];

  bytes = CI_IP4_IHL(ip);
  if(CI_UNLIKELY( bytes > 20 )) {
    p += 10;
    bytes -= 20;
    do {
      csum += *p++;
      bytes -= 2;
    } while( bytes );
  }

  return ci_ip_hdr_csum_finish(csum);
}
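ci_ip_hdr_csum_finish() is not shown in this listing; the conventional
finishing step (RFC 1071) folds the 32-bit accumulator back into 16 bits
and takes the one's complement.  A sketch under that assumption:

/* Sketch of a conventional Internet-checksum finish: fold the carries
 * out of the top 16 bits, then complement.  The real
 * ci_ip_hdr_csum_finish() may differ in detail. */
static unsigned ip_hdr_csum_finish_sketch(unsigned csum)
{
  csum = (csum & 0xffff) + (csum >> 16);  /* fold once           */
  csum = (csum & 0xffff) + (csum >> 16);  /* fold possible carry */
  return ~csum & 0xffff;                  /* one's complement    */
}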
Example No. 3
static int citp_pipe_epoll_writer(citp_fdinfo* fdinfo,
                                  struct citp_epoll_member* eitem,
                                  struct oo_ul_epoll_state* eps,
                                  int* stored_event)
{
  unsigned mask;
  struct oo_pipe* pipe = fdi_to_pipe_fdi(fdinfo)->pipe;
  ci_uint64 sleep_seq;
  int seq_mismatch = 0;

#if CI_CFG_SPIN_STATS
  if( CI_UNLIKELY(! eps->stat_incremented) ) {
    fdi_to_pipe_fdi(fdinfo)->ni->state->stats.spin_epoll++;
    eps->stat_incremented = 1;
  }
#endif

  sleep_seq = pipe->b.sleep_seq.all;
  mask = oo_pipe_poll_write_events(pipe);
  *stored_event = citp_ul_epoll_set_ul_events(eps, eitem, mask, sleep_seq,
                                              &pipe->b.sleep_seq.all,
                                              &seq_mismatch);

  return seq_mismatch;
}
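The ordering here matters: sleep_seq is snapshotted before the events are
computed, so citp_ul_epoll_set_ul_events() (not shown) can compare the
snapshot against the live value and flag seq_mismatch if a sleeper raced in
between, telling the caller to retry.  A generic sketch of that
snapshot-validate idiom, with illustrative names rather than the real
Onload API:

static int set_events_if_seq_unchanged(ci_uint64 snapshot,
                                       const volatile ci_uint64* live_seq,
                                       unsigned events, int* seq_mismatch)
{
  if( *live_seq != snapshot ) {
    *seq_mismatch = 1;        /* a sleeper raced us; caller retries */
    return 0;                 /* nothing stored */
  }
  /* ...record [events] for the epoll caller... */
  return events != 0;
}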
Example No. 4
static int citp_pipe_poll_writer(citp_fdinfo* fdinfo, struct pollfd* pfd,
                                 struct oo_ul_poll_state* ps)
{
  citp_pipe_fdi* epi;
  struct oo_pipe* p;
  unsigned mask;

  epi = fdi_to_pipe_fdi(fdinfo);
  p = epi->pipe;

#if CI_CFG_SPIN_STATS
  if( CI_UNLIKELY(! ps->stat_incremented) ) {
    epi->ni->state->stats.spin_poll++;
    ps->stat_incremented = 1;
  }
#endif

  /* set mask */
  mask = oo_pipe_poll_write_events(p);

  /* set revents */
  pfd->revents = mask & (pfd->events | POLLERR | POLLHUP);

  return 1;
}
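Note the final mask: POLLERR and POLLHUP are ORed into the caller's
interest set because poll() must report them even when they were not
requested, per POSIX.  A caller-side illustration (pipe_wr_fd is a
hypothetical descriptor):

#include <poll.h>

/* A writer that only asked for POLLOUT still receives POLLERR/POLLHUP
 * in revents when the read end has gone, which is why the mask above
 * ORs them in unconditionally. */
static int write_end_usable(int pipe_wr_fd)
{
  struct pollfd pfd = { .fd = pipe_wr_fd, .events = POLLOUT };
  if( poll(&pfd, 1, 0) > 0 && (pfd.revents & (POLLERR | POLLHUP)) )
    return 0;   /* peer closed: a write would raise EPIPE/SIGPIPE */
  return 1;
}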
Example No. 5
static int citp_pipe_select_reader(citp_fdinfo* fdinfo, int* n,
                                   int rd, int wr, int ex,
                                   struct oo_ul_select_state* ss)
{
  citp_pipe_fdi* epi;
  struct oo_pipe* p;
  unsigned mask = 0;

  epi = fdi_to_pipe_fdi(fdinfo);
  p = epi->pipe;

#if CI_CFG_SPIN_STATS
  if( CI_UNLIKELY(! ss->stat_incremented) ) {
    epi->ni->state->stats.spin_select++;
    ss->stat_incremented = 1;
  }
#endif

  /* set mask */
  mask = oo_pipe_poll_read_events(p);

  if( rd && (mask & SELECT_RD_SET) ) {
    FD_SET(fdinfo->fd, ss->rdu);
    ++*n;
  }

  return 1;
}
Example No. 6
extern int ci_cfg_rd_trylock(void)
{
  int rc = 0;
  if (!ci_cfg_handle_open)
  {   ci_log("config: attempt to access configuration "
             "before initialization");
      rc = -ENXIO; /* "no such device or address"? */
  } else
  {   int readers;
      ci_lock_lock(&ci_cfg_handle.lock);
      if (CI_UNLIKELY(ci_cfg_handle.writing == 1))
      {   DEBUG_LOCK(DPRINTF("config: read denied during a write"););
          rc = -EAGAIN;
      } else if (CI_UNLIKELY((readers=ci_cfg_readers()) >= CI_CFG_READERS_MAX))
      {   /* The listing is truncated here in the source; presumably the
           * reader limit has been hit and the lock is denied. */
          DEBUG_LOCK(DPRINTF("config: too many readers (%d)", readers););
          rc = -EAGAIN;
      }
      /* ...remainder truncated in the source: on success the reader is
       * presumably registered here before the handle lock is dropped. */
      ci_lock_unlock(&ci_cfg_handle.lock);
  }
  return rc;
}
Example No. 7
static int
oo_copy_pkt_to_iovec_no_adv(ci_netif* ni, const ci_ip_pkt_fmt* pkt,
                            ci_iovec_ptr* piov, int bytes_to_copy)
{
  /* Copy data from [pkt] to [piov], following [pkt->frag_next] as
   * necessary.  Does not modify [pkt].  May or may not advance [piov].
   * The packet must contain at least [bytes_to_copy] of data in the
   * [pkt->buf].  [piov] may contain an arbitrary amount of space.
   *
   * Returns number of bytes copied on success, or -EFAULT otherwise.
   */
  int n, pkt_left, pkt_off = 0;
  int bytes_copied = 0;

  while( 1 ) {
    pkt_left = oo_offbuf_left(&pkt->buf) - pkt_off;
    n = CI_MIN(pkt_left, CI_IOVEC_LEN(&piov->io));
    n = CI_MIN(n, bytes_to_copy);
    if(CI_UNLIKELY( do_copy(CI_IOVEC_BASE(&piov->io),
                            oo_offbuf_ptr(&pkt->buf) + pkt_off, n) != 0 ))
      return -EFAULT;

    bytes_copied += n;
    pkt_off += n;
    if( n == bytes_to_copy )
      return bytes_copied;

    bytes_to_copy -= n;
    if( n == pkt_left ) {
      /* Caller guarantees that packet contains at least [bytes_to_copy]. */
      ci_assert(OO_PP_NOT_NULL(pkt->frag_next));
      ci_iovec_ptr_advance(piov, n);
      pkt = PKT_CHK_NNL(ni, pkt->frag_next);
      pkt_off = 0;
      /* We're unlikely to hit end-of-pkt-buf and end-of-iovec at the same
       * time, and if we do, just go round the loop again.
       */
      continue;
    }

    ci_assert_equal(n, CI_IOVEC_LEN(&piov->io));
    if( piov->iovlen == 0 )
      return bytes_copied;
    piov->io = *piov->iov++;
    --piov->iovlen;
  }
}
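The loop walks two cursors at once: the packet fragment chain on one side
and the user's iovec array on the other, advancing whichever is exhausted.
The same shape is easier to see with plain types; a self-contained sketch
with simplified stand-ins for the Onload structures:

#include <string.h>
#include <sys/uio.h>

/* Stand-ins: a bare fragment chain instead of [pkt->frag_next], and a
 * raw (iov, iovlen) cursor instead of ci_iovec_ptr. */
struct frag { const char* data; int len; struct frag* next; };

static int gather_copy_sketch(const struct frag* f, struct iovec* iov,
                              int iovlen, int bytes_to_copy)
{
  int copied = 0, frag_off = 0;
  size_t io_off = 0;

  while( bytes_to_copy > 0 && iovlen > 0 && f != NULL ) {
    size_t io_left = iov->iov_len - io_off;
    int n = f->len - frag_off;              /* left in this fragment */
    if( (size_t) n > io_left )
      n = (int) io_left;                    /* left in this iovec    */
    if( n > bytes_to_copy )
      n = bytes_to_copy;                    /* left to copy at all   */
    memcpy((char*) iov->iov_base + io_off, f->data + frag_off, n);
    copied += n;  frag_off += n;  io_off += n;  bytes_to_copy -= n;
    if( frag_off == f->len ) {              /* fragment exhausted    */
      f = f->next;  frag_off = 0;
    }
    if( io_off == iov->iov_len ) {          /* iovec entry exhausted */
      ++iov;  --iovlen;  io_off = 0;
    }
  }
  return copied;
}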
Example No. 8
/* Looks up the user-level 'FD info' for a given file descriptor.
** Returns a pointer to the 'FD info', or NULL if the FD is not
** user-level.
** NOTE: The reference count of the 'FD info' is incremented; the
**       caller must drop the reference when it is no longer needed
**       by calling citp_fdinfo_release_ref().
*/
citp_fdinfo* citp_fdtable_lookup_noprobe(unsigned fd)
{
  /* Need to be initialised before we can try and grab the lock at the
  ** moment.  TODO: make this more efficient by using a trylock to grab the
  ** fdtable lock, and on fail see if we need to initialise it.
  */
  if( CI_UNLIKELY(citp.init_level < CITP_INIT_FDTABLE) ) {
    if (_citp_do_init_inprogress == 0)
      CI_TRY(citp_do_init(CITP_INIT_ALL));
    else
      CI_TRY(citp_do_init(CITP_INIT_FDTABLE)); /* get what we need */

    return NULL;
  }

  if( fd < citp_fdtable.inited_count ) {

    volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
    citp_fdinfo_p fdip;

  again:
    /* Swap in the busy marker. */
    fdip = *p_fdip;
    if( fdip_is_normal(fdip) ) {
      if( fdip_cas_succeed(p_fdip, fdip, fdip_busy) ) {
	/* Bump the reference count. */
	citp_fdinfo* fdi = fdip_to_fdi(fdip);
	citp_fdinfo_ref(fdi);
	/* Swap the busy marker out again. */
	citp_fdtable_busy_clear(fd, fdip, 0);
        return fdi;
      }
      goto again;
    }
    /* Not normal! */
    else if( fdip_is_busy(fdip) ) {
      citp_fdtable_busy_wait(fd, 0);
      goto again;
    }

  }

  return NULL;
}
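Entries are claimed by compare-and-swapping a busy marker into the table,
so concurrent lookups serialize on a single word without taking a lock.  A
minimal generic sketch of that claim/release protocol using GCC's legacy
atomics (illustrative only; fdip_cas_succeed() and
citp_fdtable_busy_clear() wrap something equivalent):

#include <stdint.h>

#define SLOT_BUSY ((uintptr_t) 1)  /* illustrative busy marker */

/* Claim a slot by CAS-ing the busy marker in; any concurrent claimant
 * retries until the marker is swapped back out.  Returns the value the
 * slot held before we claimed it. */
static uintptr_t slot_claim(volatile uintptr_t* slot)
{
  for( ; ; ) {
    uintptr_t v = *slot;
    if( v != SLOT_BUSY &&
        __sync_bool_compare_and_swap(slot, v, SLOT_BUSY) )
      return v;
    /* lost the race, or someone else holds the slot: retry */
  }
}

static void slot_release(volatile uintptr_t* slot, uintptr_t v)
{
  *slot = v;  /* the real code adds barriers and wakes any waiters */
}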
Example No. 9
/*
** Why do these live here?  Because they need to hack into the low-level
** dirty nastiness of the fdtable.
*/
int citp_ep_dup(unsigned oldfd, int (*syscall)(int oldfd, long arg),
		long arg)
{
  /* This implements dup(oldfd) and fcntl(oldfd, F_DUPFD, arg). */

  volatile citp_fdinfo_p* p_oldfdip;
  citp_fdinfo_p oldfdip;
  citp_fdinfo* newfdi = 0;
  citp_fdinfo* oldfdi;
  int newfd;

  Log_V(log("%s(%d)", __FUNCTION__, oldfd));

  if(CI_UNLIKELY( citp.init_level < CITP_INIT_FDTABLE ||
                  oo_per_thread_get()->in_vfork_child ))
    /* Lib not initialised, so no U/L state, and therefore system dup()
    ** will do just fine. */
    return syscall(oldfd, arg);

  if( oldfd >= citp_fdtable.inited_count ) {
    /* NB. We can't just pass through in this case because we need to worry
    ** about other threads racing with us.  So we need to be able to lock
    ** this fd while we do the dup. */
    ci_assert(oldfd < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(oldfd);
    CITP_FDTABLE_UNLOCK();
  }

  p_oldfdip = &citp_fdtable.table[oldfd].fdip;
 again:
  oldfdip = *p_oldfdip;
  if( fdip_is_busy(oldfdip) )
    oldfdip = citp_fdtable_busy_wait(oldfd, 0);
  if( fdip_is_closing(oldfdip) | fdip_is_reserved(oldfdip) ) {
    errno = EBADF;
    return -1;
  }
#if CI_CFG_FD_CACHING
  /* Need to check in case this sucker's cached */
  if( fdip_is_unknown(oldfdip) ) {
    CITP_FDTABLE_LOCK();
    oldfdi = citp_fdtable_probe_locked(oldfd, CI_FALSE, CI_FALSE);
    CITP_FDTABLE_UNLOCK();
    if( oldfdi == &citp_the_closed_fd ) {
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
      errno = EBADF;
      return -1;
    }
    if( oldfdi )
      citp_fdinfo_release_ref(oldfdi, CI_TRUE);
  }
#endif
  if( fdip_cas_fail(p_oldfdip, oldfdip, fdip_busy) )
    goto again;

#if CI_CFG_FD_CACHING
  /* May end up with multiple refs to this, don't allow it to be cached. */
  if( fdip_is_normal(oldfdip) )
    fdip_to_fdi(oldfdip)->can_cache = 0;
#endif

  if( fdip_is_normal(oldfdip) &&
      (((oldfdi = fdip_to_fdi(oldfdip))->protocol->type) == CITP_EPOLL_FD) ) {
    newfdi = citp_fdinfo_get_ops(oldfdi)->dup(oldfdi);
    if( ! newfdi ) {
      citp_fdtable_busy_clear(oldfd, oldfdip, 0);
      errno = ENOMEM;
      return -1;
    }

    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 )
      citp_fdtable_new_fd_set(newfd, fdip_busy, fdtable_strict());
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
    if( newfd >= 0 ) {
      citp_fdtable_insert(newfdi, newfd, 0);
      newfdi = 0;
    }
  }
  else {
    if( fdtable_strict() )  CITP_FDTABLE_LOCK();
    newfd = syscall(oldfd, arg);
    if( newfd >= 0 && newfd < citp_fdtable.inited_count ) {
      /* Mark newfd as unknown.  When used, it'll get probed.
       *
       * We are not just being lazy here: Setting to unknown rather than
       * installing a proper fdi (when oldfd is accelerated) is essential to
       * vfork()+dup()+exec() working properly.  Reason is that child and
       * parent share address space, so child is modifying the parent's
       * fdtable.  Setting an entry to unknown is safe.
       */
      citp_fdtable_new_fd_set(newfd, fdip_unknown, fdtable_strict());
    }
    if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
  }

  citp_fdtable_busy_clear(oldfd, oldfdip, 0);
  if( newfdi )  citp_fdinfo_free(newfdi);
  return newfd;
}
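Both dup() and fcntl(F_DUPFD) funnel through the same [syscall] callback.
A sketch of the two shims, with hypothetical names (the real intercepts
route through the library's own syscall trampolines, but the shape is the
same):

#include <fcntl.h>
#include <unistd.h>

static int dup_syscall(int oldfd, long arg)
{ (void) arg; return dup(oldfd); }

static int dupfd_syscall(int oldfd, long arg)
{ return fcntl(oldfd, F_DUPFD, (int) arg); }

/* dup(fd)               ~  citp_ep_dup(fd, dup_syscall, 0);
 * fcntl(fd, F_DUPFD, n) ~  citp_ep_dup(fd, dupfd_syscall, n); */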
Example No. 10
static int 
ci_udp_recvmsg_block(ci_udp_iomsg_args* a, ci_netif* ni, ci_udp_state* us,
                     int timeout)
{
  int rc;

#ifndef __KERNEL__
  {
    citp_signal_info* si;
    struct pollfd pfd;
#if !CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    int inside_lib;
#endif
    pfd.fd = a->fd;
    pfd.events = POLLIN;

    if( timeout == 0 )
      timeout = -1;

    /* Ideally, we should do the same as in citp_tcp_accept(), but since
     * lib_context and citp_exit_lib() are not available outside the
     * unix/ subdirectory, we open-code their logic here. */
    si = citp_signal_get_specific_inited();
  continue_to_block:
#if !CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    inside_lib = si->inside_lib;
    ci_assert_gt(inside_lib, 0);
#endif
    si->inside_lib = 0;
    ci_compiler_barrier();
    if(CI_UNLIKELY( si->aflags & OO_SIGNAL_FLAG_HAVE_PENDING ))
      citp_signal_run_pending(si);

    rc = ci_sys_poll(&pfd, 1, timeout);

#if CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    si->inside_lib = 1;
#else
    si->inside_lib = inside_lib;
#endif

    if( rc > 0 )
      return 0;
    else if( rc == 0 )
      rc = -EAGAIN;
    else if( errno == EINTR && (si->aflags & OO_SIGNAL_FLAG_NEED_RESTART) &&
             timeout == -1 ) {
      /* Blocking recv() should only be restarted if there is no timeout. */
      goto continue_to_block;
    } else 
      rc = -errno;

    return rc;
  }
#else  /* __KERNEL__ */
  {
    int mask;
    s64 t;

    if( timeout == 0 )
      t = -1;
    else
      t = msecs_to_jiffies(timeout);

    mask = POLLIN;
    rc = efab_tcp_helper_poll_udp(a->filp, &mask, &t);
    if( rc == 0 ) {
      if( mask ) {
        return 0;
      }
      else
        rc = -EAGAIN;
    }
    else if( rc == -ERESTARTSYS &&  us->s.so.rcvtimeo_msec )
      rc = -EINTR;
  }
  return rc;
#endif /* __KERNEL__ */
}
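The inside_lib bracket around ci_sys_poll() exists because the library
defers signal handlers while its internal state is inconsistent: the flag
is dropped (and anything already queued is run) just before genuinely
blocking in the kernel, then raised again afterwards.  A stripped-down
sketch of that bracket; the struct and helpers here are stand-ins, not the
real Onload API:

struct sig_state { volatile int inside_lib; volatile int have_pending; };

static void run_pending_sketch(struct sig_state* si)
{ si->have_pending = 0; /* would run the queued handlers here */ }

static int blocking_call_sketch(struct sig_state* si,
                                int (*blocking_fn)(void*), void* arg)
{
  int rc;
  si->inside_lib = 0;                       /* handlers may run now   */
  __asm__ __volatile__("" ::: "memory");    /* compiler barrier       */
  if( si->have_pending )
    run_pending_sketch(si);                 /* flush deferred signals */
  rc = blocking_fn(arg);                    /* really block           */
  si->inside_lib = 1;                       /* deferred again         */
  return rc;
}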
Example No. 11
static int ci_udp_recvmsg_socklocked_slowpath(ci_udp_iomsg_args* a, 
                                              ci_msghdr* msg,
                                              ci_iovec_ptr *piov, int flags)
{
  int rc = 0;
  ci_netif* ni = a->ni;
  ci_udp_state* us = a->us;

  if(CI_UNLIKELY( ni->state->rxq_low ))
    ci_netif_rxq_low_on_recv(ni, &us->s,
                             1 /* assume at least one pkt freed */);
  /* In the kernel, recv() with flags is never called, only read(), so
   * flags may only contain MSG_DONTWAIT. */
#ifdef __KERNEL__
  ci_assert_equal(flags, 0);
#endif

#ifndef __KERNEL__
  if( flags & MSG_ERRQUEUE_CHK ) {
    if( OO_PP_NOT_NULL(us->timestamp_q.extract) ) {
      ci_ip_pkt_fmt* pkt;
      struct timespec ts[3];
      struct cmsg_state cmsg_state;
      ci_udp_hdr* udp;
      int paylen;

      /* TODO is this necessary? - mirroring ci_udp_recvmsg_get() */
      ci_rmb();
      
      pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
      if( pkt->tx_hw_stamp.tv_sec == CI_PKT_TX_HW_STAMP_CONSUMED ) {
        if( OO_PP_IS_NULL(pkt->tsq_next) )
          goto errqueue_empty;
        us->timestamp_q.extract = pkt->tsq_next;
        pkt = PKT_CHK_NNL(ni, us->timestamp_q.extract);
        ci_assert(pkt->tx_hw_stamp.tv_sec != CI_PKT_TX_HW_STAMP_CONSUMED);
      }

      udp = oo_ip_data(pkt);
      paylen = CI_BSWAP_BE16(oo_ip_hdr(pkt)->ip_tot_len_be16) -
                        sizeof(ci_ip4_hdr) - sizeof(ci_udp_hdr);

      msg->msg_flags = 0;
      cmsg_state.msg = msg;
      cmsg_state.cm = msg->msg_control;
      cmsg_state.cmsg_bytes_used = 0;
      ci_iovec_ptr_init_nz(piov, msg->msg_iov, msg->msg_iovlen);
      memset(ts, 0, sizeof(ts));

      if( us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_RAW_HARDWARE ) {
        ts[2].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[2].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      if( (us->s.timestamping_flags & ONLOAD_SOF_TIMESTAMPING_SYS_HARDWARE) &&
          (pkt->tx_hw_stamp.tv_nsec & CI_IP_PKT_HW_STAMP_FLAG_IN_SYNC) ) {
        ts[1].tv_sec = pkt->tx_hw_stamp.tv_sec;
        ts[1].tv_nsec = pkt->tx_hw_stamp.tv_nsec;
      }
      ci_put_cmsg(&cmsg_state, SOL_SOCKET, ONLOAD_SCM_TIMESTAMPING,
                  sizeof(ts), &ts);
      oo_offbuf_set_start(&pkt->buf, udp + 1);
      oo_offbuf_set_len(&pkt->buf, paylen);
      rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, paylen);

      /* Mark this packet/timestamp as consumed */
      pkt->tx_hw_stamp.tv_sec = CI_PKT_TX_HW_STAMP_CONSUMED;

      ci_ip_cmsg_finish(&cmsg_state);
      msg->msg_flags |= MSG_ERRQUEUE_CHK;
      return rc;
    }
  errqueue_empty:
    /* ICMP is handled via OS, so get OS error */
    rc = oo_os_sock_recvmsg(ni, SC_SP(&us->s), msg, flags);
    if( rc < 0 ) {
      ci_assert(-rc == errno);
      return -1;
    }
    else
      return rc;
  }
#endif
  if( (rc = ci_get_so_error(&us->s)) != 0 ) {
    CI_SET_ERROR(rc, rc);
    return rc;
  }
  if( msg->msg_iovlen > 0 && msg->msg_iov == NULL ) {
    CI_SET_ERROR(rc, EFAULT);
    return rc;
  }
#if MSG_OOB_CHK
  if( flags & MSG_OOB_CHK ) {
    CI_SET_ERROR(rc, EOPNOTSUPP);
    return rc;
  }
#endif
#if CI_CFG_POSIX_RECV  
  if( ! udp_lport_be16(us)) {
    LOG_UV(log("%s: -1 (ENOTCONN)", __FUNCTION__));
    CI_SET_ERROR(rc, ENOTCONN);
    return rc;
  }
#endif
  if( msg->msg_iovlen == 0 ) {
    /* We have a difference in behaviour from the Linux stack here.  When
    ** msg_iovlen is 0 Linux 2.4.21-15.EL does not set MSG_TRUNC when a
    ** datagram has non-zero length.  We do. */
    CI_IOVEC_LEN(&piov->io) = piov->iovlen = 0;
    return IOVLEN_WORKAROUND_RC_VALUE;
  }
  return 0;
}
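The ts[3] array above mirrors the layout of the kernel's struct
scm_timestamping: index 0 is the software stamp, 1 the hardware stamp
converted to system time, 2 the raw hardware stamp.  An application drains
these from the error queue with recvmsg(MSG_ERRQUEUE); a minimal consumer
sketch against the standard Linux API (the Onload variant above delivers
the same timespec[3] payload under ONLOAD_SCM_TIMESTAMPING):

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <time.h>

#ifndef SCM_TIMESTAMPING
# define SCM_TIMESTAMPING 37   /* == SO_TIMESTAMPING on most Linux ABIs */
#endif

static int read_tx_timestamp(int fd, struct timespec ts[3])
{
  char data[256], control[512];
  struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
  struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
                        .msg_control = control,
                        .msg_controllen = sizeof(control) };
  struct cmsghdr* cm;

  if( recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) < 0 )
    return -1;
  for( cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm) )
    if( cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMPING ) {
      memcpy(ts, CMSG_DATA(cm), 3 * sizeof(struct timespec));
      return 0;
    }
  return -1;   /* no timestamp cmsg found */
}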
Example No. 12
static int ci_udp_recvmsg_get(ci_netif* ni, ci_udp_state* us,
                              ci_iovec_ptr* piov, 
                              ci_msghdr* msg, int flags)
{
  ci_ip_pkt_fmt* pkt;
  int rc;

  /* NB. [msg] can be NULL for async recv. */

  if( ci_udp_recv_q_not_readable(ni, us) )
    goto recv_q_is_empty;

  ci_rmb();

  pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  if( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED ) {
    /* We know the receive queue is not empty and, if a filter is
     * involved, that some packets have passed it.  So if this pkt is
     * already consumed, the next one must be OK to receive (and must
     * already have been filtered).
     */
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
    ci_assert( !(pkt->pf.udp.rx_flags & 
                 CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) );
#if CI_CFG_ZC_RECV_FILTER
    if( us->recv_q_filter )
      /* Filter should have run on this packet and marked it */
      ci_assert( (pkt->pf.udp.rx_flags & 
                  (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED | 
                   CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) ); 
    else
      /* Bump this along as we don't have a filter installed, but want
       * to keep the filter pointer in a sane place
       */
      us->recv_q.filter = us->recv_q.extract;
#endif
  }

#if CI_CFG_ZC_RECV_FILTER
  /* Skip any that the filter has dropped.  This must terminate before
   * hitting the tail because we know the queue is readable.
   */
  while( pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED ) {
    us->recv_q.extract = pkt->next;
    pkt = PKT_CHK_NNL(ni, us->recv_q.extract);
  }
#endif

#if defined(__linux__) && !defined(__KERNEL__)
  if( msg != NULL && msg->msg_controllen != 0 ) {
    if( CI_UNLIKELY(us->s.cmsg_flags != 0 ) )
      ci_ip_cmsg_recv(ni, us, pkt, msg, 0);
    else
      msg->msg_controllen = 0;
  }
#endif
  us->stamp = pkt->pf.udp.rx_stamp;

  rc = oo_copy_pkt_to_iovec_no_adv(ni, pkt, piov, pkt->pf.udp.pay_len);

  if(CI_LIKELY( rc >= 0 )) {
#if HAVE_MSG_FLAGS
    if(CI_UNLIKELY( rc < pkt->pf.udp.pay_len && msg != NULL ))
      msg->msg_flags |= LOCAL_MSG_TRUNC;
#endif
    ci_udp_recvmsg_fill_msghdr(ni, msg, pkt, &us->s);
    if( ! (flags & MSG_PEEK) ) {
      us->recv_q.bytes_delivered += pkt->pf.udp.pay_len;
      us->recv_q.pkts_delivered  += 1;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED;
#if CI_CFG_ZC_RECV_FILTER
      if( !us->recv_q_filter ) {
        /* Pretend this packet passed the filter, to keep state consistent */
        ++us->recv_q.pkts_filter_passed;
        us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
        pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      }
#endif
    }
    us->udpflags |= CI_UDPF_LAST_RECV_ON;
  }

  return rc;

 recv_q_is_empty:
  return -EAGAIN;
}
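Note that delivery never unlinks a packet: it is flagged CONSUMED and
[extract] is left pointing at it, which is what makes MSG_PEEK cheap (a
peek simply skips the flag update).  A toy sketch of that consume-flag
discipline, with illustrative types; as above, the caller must guarantee
the queue is readable first:

struct node { int consumed; int payload; struct node* next; };

static int queue_recv_sketch(struct node** p_extract, int peek)
{
  struct node* n = *p_extract;
  int payload;

  if( n->consumed )       /* step past the node a previous call used */
    n = n->next;
  payload = n->payload;
  if( ! peek ) {
    n->consumed = 1;      /* deliver: mark, do not unlink */
    *p_extract = n;       /* extract lags on the consumed node */
  }
  return payload;
}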
Example No. 13
static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts,
				   ci_uint32 dst_be32, unsigned dport_be16,
                                   int* fail_rc)
{
  ci_ip_pkt_fmt* pkt;
  int rc = 0;

  ci_assert(ts->s.pkt.mtu);

  /* Now that we know the outgoing route, set the MTU related values.
   * Note, even these values are speculative since the real MTU
   * could change between now and passing the packet to the lower layers
   */
  ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr);
#if CI_CFG_LIMIT_AMSS
  ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__);
#endif

  /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */
  ts->smss = CI_CFG_TCP_DEFAULT_MSS;

  /* set pmtu, eff_mss, snd_buf and adjust windows */
  ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu);
  ci_tcp_set_eff_mss(ni, ts);
  ci_tcp_set_initialcwnd(ni, ts);

  /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay
   * zero until the connection is established.
   */
  ts->so_sndbuf_pkts = 0;

  /*
   * 3. State and address are OK, and the address is routed through our
   *    NIC.  Do connect().
   */
  ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY);

  if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) {
    ci_sock_cmn* s = &ts->s;
    ci_uint16 source_be16 = 0;

    if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND )
      rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    else 
      rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16);
    if(CI_LIKELY( rc == 0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u", 
                 LNT_PRI_ARGS(ni, ts),
                 ip_addr_str(INADDR_ANY),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      *fail_rc = rc;
      return CI_CONNECT_UL_FAIL;
    }
    if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) {
      CI_SET_ERROR(*fail_rc, EINVAL);
      return CI_CONNECT_UL_FAIL;
    }
  }

  ci_tcp_set_peer(ts, dst_be32, dport_be16);

  /* Make sure we can get a buffer before we change state. */
  pkt = ci_netif_pkt_tx_tcp_alloc(ni);
  if( CI_UNLIKELY(! pkt) ) {
    /* NB. We've already done a poll above. */
    rc = ci_netif_pkt_wait(ni, &ts->s, CI_SLEEP_NETIF_LOCKED|CI_SLEEP_NETIF_RQ);
    if( ci_netif_pkt_wait_was_interrupted(rc) ) {
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_LOCK_DROPPED;
    }
    /* OK, there are (probably) packets available - go try again.  Note we
     * jump back to the top of the function because someone may have
     * connected this socket in the mean-time, so we need to check the
     * state once more.
     */
    return CI_CONNECT_UL_START_AGAIN;
  }

#ifdef ONLOAD_OFE
    if( ni->ofe != NULL )
      ts->s.ofe_code_start = ofe_socktbl_find(
                        ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE,
                        tcp_laddr_be32(ts), tcp_raddr_be32(ts),
                        tcp_lport_be16(ts), tcp_rport_be16(ts));
#endif

  rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice,
                             OO_SP_NULL);
  if( rc < 0 ) {
    /* Perhaps we've run out of filters?  See if we can push a socket out
     * of timewait and steal its filter.
     */
    ci_assert_nequal(rc, -EFILTERSSOME);
    if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) ||
        (rc = ci_tcp_ep_set_filters(ni, S_SP(ts),
                                    ts->s.cp.so_bindtodevice,
                                    OO_SP_NULL)) < 0 ) {
      ci_assert_nequal(rc, -EFILTERSSOME);
      /* Either a different error, or our efforts to free a filter did not
       * work.
       */
      if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
        ts->s.pkt.ip.ip_saddr_be32 = 0;
        ts->s.cp.ip_laddr_be32 = 0;
      }
      ci_netif_pkt_release(ni, pkt);
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_FAIL;
    }
  }

  LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts),
	     ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
	     (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16),
	     ip_addr_str(ts->s.pkt.ip.ip_daddr_be32),
	     (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16)));

  /* We are going to send the SYN - set states appropriately */
  tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
    ci_tcp_initial_seqno(ni);
  ts->snd_max = tcp_snd_nxt(ts) + 1;

  /* Must be after initialising snd_una. */
  ci_tcp_clear_rtt_timing(ts);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN);
  ts->tcpflags &= ~CI_TCPT_FLAG_OPT_MASK;
  ts->tcpflags |= NI_OPTS(ni).syn_opts;

  if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) {
    ts->rcv_wscl = ci_tcp_wscl_by_buff(ni, ci_tcp_rcvbuf_established(ni, &ts->s));
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl);
  }
  else {
    ts->rcv_wscl = 0;
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0);
  }
  ci_tcp_set_rcvbuf(ni, ts);
  ci_tcp_init_rcv_wnd(ts, "CONNECT");

  /* outgoing_hdrs_len is initialised to include timestamp option. */
  if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) )
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr)+sizeof(ci_tcp_hdr);
  if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32,
			ts->s.pkt.ip.ip_daddr_be32) )
    ts->tcpflags |= CI_TCPT_FLAG_STRIPE;
  ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT);

  /* If the app tries to send data on a socket in SYN_SENT state
  ** then the data is queued for send until the SYN gets ACKed.
  ** (rfc793 p56)
  **
  ** Receive calls on the socket should block until data arrives
  ** (rfc793 p58)
  **
  ** Clearing tx_errno and rx_errno achieves this.  The transmit window
  ** is set to 1 byte, which ensures that only the SYN packet gets
  ** sent until the ACK is received with more window.
  */
  ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1);
  ts->s.rx_errno = 0;
  ts->s.tx_errno = 0; 
  ci_tcp_enqueue_no_data(ts, ni, pkt);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);  

  if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) {
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    LOG_TC(log( LNT_FMT "Non-blocking connect - return EINPROGRESS",
		LNT_PRI_ARGS(ni, ts)));
    CI_SET_ERROR(*fail_rc, EINPROGRESS);
    return CI_CONNECT_UL_FAIL;
  }

  return CI_CONNECT_UL_OK;
}
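ci_tcp_wscl_by_buff() is not shown; RFC 1323 window scaling only needs the
smallest shift that makes the receive buffer representable in the 16-bit
window field, capped at the protocol maximum of 14.  A sketch under that
assumption:

/* Sketch of a conventional window-scale computation: pick the smallest
 * shift such that (rcvbuf >> shift) fits in 16 bits.  The real
 * ci_tcp_wscl_by_buff() may differ in detail. */
static unsigned tcp_wscl_by_buff_sketch(unsigned rcvbuf)
{
  unsigned wscl = 0;
  while( wscl < 14 && (rcvbuf >> wscl) > 0xffff )
    ++wscl;
  return wscl;
}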