Example #1
static int
efab_tcp_drop_from_acceptq(ci_private_t *priv, void *arg)
{
  struct oo_op_tcp_drop_from_acceptq *carg = arg;
  tcp_helper_resource_t *thr;
  tcp_helper_endpoint_t *ep;
  citp_waitable *w;
  ci_tcp_state *ts;
  int rc = -EINVAL;

  /* find stack */
  rc = efab_thr_table_lookup(NULL, carg->stack_id,
                             EFAB_THR_TABLE_LOOKUP_CHECK_USER |
                             EFAB_THR_TABLE_LOOKUP_NO_UL,
                             &thr);

  if( rc < 0 )
    return rc;
  ci_assert( thr->k_ref_count & TCP_HELPER_K_RC_NO_USERLAND );

  /* find endpoint and drop OS socket */
  ep = ci_trs_get_valid_ep(thr, carg->sock_id);
  if( ep == NULL ) {
    rc = -EINVAL;
    goto fail1;
  }

  w = SP_TO_WAITABLE(&thr->netif, carg->sock_id);
  if( !(w->state & CI_TCP_STATE_TCP) || w->state == CI_TCP_LISTEN ) {
    rc = -EINVAL;
    goto fail2;
  }
  ts = SP_TO_TCP(&thr->netif, carg->sock_id);
  ci_assert(ep->os_port_keeper);
  ci_assert_equal(ep->os_socket, NULL);

  LOG_TV(ci_log("%s: send reset to non-accepted connection", __FUNCTION__));

  /* copy from ci_tcp_listen_shutdown_queues() */
  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_TCP_IN_ACCEPTQ);
  rc = ci_netif_lock(&thr->netif);
  if( rc != 0 ) {
    ci_assert_equal(rc, -EINTR);
    rc = -ERESTARTSYS;
    goto fail2;
  }
  ci_bit_clear(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT);
  /* We have no way to close this connection from the other side:
   * there was no RST from peer. */
  ci_assert_nequal(ts->s.b.state, CI_TCP_CLOSED);
  ci_assert_nequal(ts->s.b.state, CI_TCP_TIME_WAIT);
  ci_tcp_send_rst(&thr->netif, ts);
  ci_tcp_drop(&thr->netif, ts, ECONNRESET);
  ci_assert_equal(ep->os_port_keeper, NULL);
  ci_netif_unlock(&thr->netif);
  efab_tcp_helper_k_ref_count_dec(thr, 1);
  return 0;

fail1:
fail2:
  /* both failure paths still hold the stack reference from
   * efab_thr_table_lookup(), so drop it here */
  efab_thr_release(thr);
  ci_log("%s: inconsistent ep %d:%d", __func__, carg->stack_id, carg->sock_id);
  return rc;
}
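
A hedged sketch of the user-level call that would reach this handler. The struct fields match the usage above (carg->stack_id, carg->sock_id), but the ioctl id OO_IOC_TCP_DROP_FROM_ACCEPTQ and the wrapper itself are assumptions, modelled on the oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op) pattern in Example #15.

/* Hypothetical wrapper: ask the driver to send RST to, and drop, a
 * connection that is still sitting on a listener's accept queue. */
static int drop_from_acceptq(int fd, int stack_id, int sock_id)
{
  struct oo_op_tcp_drop_from_acceptq op;
  op.stack_id = stack_id;  /* fields as read by the handler above */
  op.sock_id = sock_id;
  /* OO_IOC_TCP_DROP_FROM_ACCEPTQ is an assumed ioctl id (see lead-in) */
  return oo_resource_op(fd, OO_IOC_TCP_DROP_FROM_ACCEPTQ, &op);
}
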
Example #2
int ci_udp_ioctl(citp_socket *ep, ci_fd_t fd, int request, void* arg)
{
  ci_netif* ni = ep->netif;
  ci_udp_state* us = SOCK_TO_UDP(ep->s);
  int rc;

  ci_netif_lock(ni);
  rc = ci_udp_ioctl_locked(ni, us, fd, request, arg);
  ci_netif_unlock(ni);
  return rc;
}
Example #3
ci_fd_t ci_udp_ep_ctor(citp_socket* ep, ci_netif* netif, int domain, int type)
{
  ci_udp_state* us;
  ci_fd_t fd;

  VERB( log(LPFIN "ctor( )" ) );

  ci_assert(ep);
  ci_assert(netif);

  ci_netif_lock(netif);
  us = ci_udp_get_state_buf(netif);
  if (!us) {
    ci_netif_unlock(netif);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(netif)));
    return -ENOMEM;
  }

  /* It's required to set the protocol before ci_tcp_helper_sock_attach()
   * since it's used to determine whether TCP or UDP file operations should
   * be attached to the file descriptor in the kernel. */
  sock_protocol(&us->s) = IPPROTO_UDP;

  /* NB: this attach will close the os_sock_fd */
  fd = ci_tcp_helper_sock_attach(ci_netif_get_driver_handle(netif),  
                                 SC_SP(&us->s), domain, type);
  if( fd < 0 ) {
    if( fd == -EAFNOSUPPORT )
      LOG_U(ci_log("%s: ci_tcp_helper_sock_attach (domain=%d, type=%d) "
                   "failed %d", __FUNCTION__, domain, type, fd));
    else
      LOG_E(ci_log("%s: ci_tcp_helper_sock_attach (domain=%d, type=%d) "
                   "failed %d", __FUNCTION__, domain, type, fd));
    ci_netif_unlock(netif);
    return fd;
  }

  ci_assert(~us->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);

  us->s.rx_errno = 0;
  us->s.tx_errno = 0;
  us->s.so_error = 0;
  us->s.cp.sock_cp_flags |= OO_SCP_UDP_WILD;

  ep->s = &us->s;
  ep->netif = netif;
  CHECK_UEP(ep);
  ci_netif_unlock(netif);
  return fd;
}
Example #4
ci_fd_t ci_tcp_ep_ctor(citp_socket* ep, ci_netif* netif, int domain, int type)
{
  ci_tcp_state* ts;
  ci_fd_t fd;

  ci_assert(ep);
  ci_assert(netif);

  ci_netif_lock(netif);
  ts = ci_tcp_get_state_buf(netif);
  if( ts == NULL ) {
    ci_netif_unlock(netif);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(netif)));
    return -ENOMEM;
  }

  fd = ci_tcp_helper_sock_attach(ci_netif_get_driver_handle(netif), S_SP(ts),
                                 domain, type);
  if( fd < 0 ) {
    if( fd == -EAFNOSUPPORT )
      LOG_U(ci_log("%s: ci_tcp_helper_sock_attach "
                   "(domain=%d, type=%d) failed %d",
                   __FUNCTION__, domain, type, fd));
    else
      LOG_E(ci_log("%s: ci_tcp_helper_sock_attach "
                   "(domain=%d, type=%d) failed %d",
                   __FUNCTION__, domain, type, fd));
  }
  else {
    ci_assert(~ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);
    /* Apply default sockbuf sizes now we've updated them from the kernel
    ** defaults. */
    ts->s.so.sndbuf = NI_OPTS(netif).tcp_sndbuf_def;
    ts->s.so.rcvbuf = NI_OPTS(netif).tcp_rcvbuf_def;
    ep->netif = netif;
    ep->s = &ts->s;
    CHECK_TEP(ep);
  }

  ci_netif_unlock(netif);
  return fd;
}
Example #5
void ci_tcp_linger(ci_netif* ni, ci_tcp_state* ts)
{
  /* This is called at user-level when a socket is closed if linger is
  ** enabled and has a timeout, and there is TX data outstanding.
  **
  ** Our job is to block until all data is successfully sent and acked, or
  ** until timeout.
  */
  ci_uint64 sleep_seq;
  int rc = 0;
  ci_uint32 timeout = ts->s.so.linger * 1000;

  LOG_TC(log("%s: "NTS_FMT, __FUNCTION__, NTS_PRI_ARGS(ni, ts)));

  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_assert(ts->s.b.sb_aflags & CI_SB_AFLAG_IN_SO_LINGER);
  ci_assert(ts->s.s_flags & CI_SOCK_FLAG_LINGER);
  ci_assert(ts->s.b.state != CI_TCP_LISTEN);

  while( 1 ) {
    sleep_seq = ts->s.b.sleep_seq.all;
    ci_rmb();
    if( SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) )
      return;
    rc = ci_sock_sleep(ni, &ts->s.b, CI_SB_FLAG_WAKE_TX, 0, sleep_seq,
                       &timeout);
    if( rc )
      break;
  }

  if( ! SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) ) {
    ci_netif_lock(ni);
    /* check we are working with the same socket, and it was not closed and
     * dropped under our feet. */
    if( ! SEQ_EQ(tcp_enq_nxt(ts), tcp_snd_una(ts)) &&
        (ts->s.b.sb_aflags & CI_SB_AFLAG_IN_SO_LINGER) )
      ci_tcp_drop(ni, ts, 0);
    ci_netif_unlock(ni);
  }
}
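
The linger path above is driven by the standard SO_LINGER socket option; a minimal application-side sketch, assuming only the standard sockets API (the helper name is illustrative):

#include <sys/socket.h>
#include <unistd.h>

/* Enable a 5-second linger-on-close: if unacked TX data is pending,
 * close() blocks until it is acked or the timeout expires, which is the
 * condition ci_tcp_linger() waits on. */
static int close_with_linger(int fd)
{
  struct linger l = { .l_onoff = 1, .l_linger = 5 };
  if( setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) < 0 )
    return -1;
  return close(fd);
}
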
Example #6
static int oo_pipe_ctor(ci_netif* netif, struct oo_pipe** out_pipe,
                        int fds[2], int flags)
{
  struct oo_pipe* p;
  int rc;

  ci_assert(netif);

  ci_netif_lock(netif);
  p = oo_pipe_buf_get(netif);
  if( !p ) {
    rc = -1;
    errno = ENOMEM;
    goto out;
  }

  if( flags & O_NONBLOCK ) {
    p->aflags = (CI_PFD_AFLAG_NONBLOCK << CI_PFD_AFLAG_READER_SHIFT) |
        (CI_PFD_AFLAG_NONBLOCK << CI_PFD_AFLAG_WRITER_SHIFT);
  }

  /* attach */
  rc = ci_tcp_helper_pipe_attach(ci_netif_get_driver_handle(netif),
                                 W_SP(&p->b), flags, fds);
  if( rc < 0 ) {
    LOG_E(ci_log("%s: ci_tcp_helper_pipe_attach %d", __FUNCTION__, rc));
    errno = -rc;
    rc = -1;
    goto out;
  }

  *out_pipe = p;

out:
  ci_netif_unlock(netif);

  return rc;
}
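
For reference, the user-visible call that presumably lands in this constructor is an ordinary pipe2()/pipe() that Onload intercepts; a minimal sketch using only the standard API:

#define _GNU_SOURCE  /* for pipe2() */
#include <fcntl.h>
#include <unistd.h>

/* Create a non-blocking pipe; O_NONBLOCK corresponds to the
 * CI_PFD_AFLAG_NONBLOCK reader/writer flags set in oo_pipe_ctor(). */
static int make_nonblocking_pipe(int fds[2])
{
  return pipe2(fds, O_NONBLOCK);
}
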
Example #7
static int ci_tcp_shutdown_listen(citp_socket* ep, int how, ci_fd_t fd)
{
  ci_tcp_socket_listen* tls = SOCK_TO_TCP_LISTEN(ep->s);

  if( how == SHUT_WR )
    return 0;

  ci_sock_lock(ep->netif, &tls->s.b);
  ci_netif_lock(ep->netif);
  LOG_TC(ci_log(SK_FMT" shutdown(SHUT_RD)", SK_PRI_ARGS(ep)));
  __ci_tcp_listen_shutdown(ep->netif, tls, fd);
  __ci_tcp_listen_to_normal(ep->netif, tls);
  {
    ci_fd_t os_sock = ci_get_os_sock_fd(ep, fd);
    int flags = ci_sys_fcntl(os_sock, F_GETFL);
    flags &= (~O_NONBLOCK);
    CI_TRY(ci_sys_fcntl(os_sock, F_SETFL, flags));
    ci_rel_os_sock_fd(os_sock);
  }
  ci_netif_unlock(ep->netif);
  ci_sock_unlock(ep->netif, &tls->s.b);
  return 0;
}
Example #8
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs,
                            int iovecs_len, 
                            enum onload_zc_buffer_type_flags flags)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt *pkt;
  unsigned max_len;

  Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs,
                  iovecs_len, flags));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < iovecs_len; ++i ) {
        max_len = CI_CFG_PKT_BUF_SIZE;
        pkt = ci_netif_pkt_tx_tcp_alloc(ni);
        if( pkt == NULL ) {
          while( --i >= 0 )
            ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf);
          rc = -ENOMEM;
          ci_netif_unlock(ni);
          goto out;
        }
        /* Make sure this is clear as it affects behaviour when freeing */
        pkt->pf.udp.rx_flags = 0;
        iovecs[i].buf = (struct oo_zc_buf *)pkt;
        if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) {
          if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) &&
              (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) {
            ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s);
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) + 
              ts->outgoing_hdrs_len;
            max_len = tcp_eff_mss(ts);
          } 
          else {
            /* Best guess.  We can fix it up later.  Magic 12 leaves
             * space for time stamp option (common case)
             */
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base =
              (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_tcp_hdr) + 12;
          }
        }
        else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) {
          oo_tx_pkt_layout_init(pkt);
          iovecs[i].iov_base =
            (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_udp_hdr);
        }
        else 
          iovecs[i].iov_base = PKT_START(pkt);
        iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE - 
          ((char *)iovecs[i].iov_base - (char *)pkt);
        if( iovecs[i].iov_len > max_len )
          iovecs[i].iov_len = max_len;
      }
      ni->state->n_async_pkts += iovecs_len;
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    case CITP_PASSTHROUGH_FD:
      rc = -ESOCKTNOSUPPORT;
      break;
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

 out:
  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
Example #9
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        ci_ip_pkt_fmt* pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i )
          ci_netif_pkt_release_check_keep(ni, (ci_ip_pkt_fmt*)bufs[i]);
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }

    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);

  return rc;
}
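
A hedged sketch of how an application might pair the two calls above from the Onload zero-copy extensions (the include path is assumed; error handling is abbreviated). iovecs[i].buf is the opaque handle that onload_zc_release_buffers() expects, as the release loop above shows.

#include <onload/extensions_zc.h>

#define N_BUFS 8

/* Allocate N_BUFS zero-copy buffers on an accelerated socket, then hand
 * them all back without using them. */
static int alloc_then_release(int fd)
{
  struct onload_zc_iovec iovecs[N_BUFS];
  onload_zc_handle handles[N_BUFS];
  int i, rc;

  rc = onload_zc_alloc_buffers(fd, iovecs, N_BUFS, ONLOAD_ZC_BUFFER_HDR_TCP);
  if( rc < 0 )
    return rc;

  /* ... fill iovecs[i].iov_base with up to iovecs[i].iov_len bytes ... */

  for( i = 0; i < N_BUFS; ++i )
    handles[i] = iovecs[i].buf;
  return onload_zc_release_buffers(fd, handles, N_BUFS);
}
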
Example #10
int efab_file_move_to_alien_stack_rsop(ci_private_t *stack_priv, void *arg)
{
  ci_fixed_descriptor_t sock_fd = *(ci_fixed_descriptor_t *)arg;
  struct file *sock_file = fget(sock_fd);
  ci_private_t *sock_priv;
  tcp_helper_resource_t *old_thr;
  tcp_helper_resource_t *new_thr;
  citp_waitable *w;
  int rc;

  if( sock_file == NULL )
    return -EINVAL;
  if( !FILE_IS_ENDPOINT_SOCK(sock_file) ||
      stack_priv->fd_type != CI_PRIV_TYPE_NETIF ) {
    fput(sock_file);
    return -EINVAL;
  }
  sock_priv = sock_file->private_data;
  ci_assert(sock_priv->fd_type == CI_PRIV_TYPE_TCP_EP ||
            sock_priv->fd_type == CI_PRIV_TYPE_UDP_EP);

  old_thr = sock_priv->thr;
  new_thr = stack_priv->thr;
  ci_assert(old_thr);
  ci_assert(new_thr);

  if( old_thr == new_thr ) {
    fput(sock_file);
    return 0;
  }

  if( tcp_helper_cluster_from_cluster(old_thr) != 0 ) {
    LOG_S(ci_log("%s: move_fd() not permitted on clustered stacks", __func__));
    fput(sock_file);
    return -EINVAL;
  }

  w = SP_TO_WAITABLE(&old_thr->netif, sock_priv->sock_id);
  rc = ci_sock_lock(&old_thr->netif, w);
  if( rc != 0 ) {
    fput(sock_file);
    return rc;
  }

  rc = ci_netif_lock(&old_thr->netif);
  if( rc != 0 ) {
    ci_sock_unlock(&old_thr->netif, w);
    fput(sock_file);
    return rc;
  }

  efab_thr_ref(new_thr);
  rc = efab_file_move_to_alien_stack(sock_priv, &stack_priv->thr->netif);
  fput(sock_file);

  if( rc != 0 )
    efab_thr_release(new_thr);
  else
    ci_netif_unlock(&new_thr->netif);

  return rc;
}
Example #11
/* Move priv file to the alien_ni stack.
 * Should be called with the locked priv stack and socket;
 * the function returns with this stack being unlocked.
 * If rc=0, it returns with alien_ni stack locked;
 * otherwise, both stacks are unlocked.
 * Socket is always unlocked on return. */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));
  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in epoll list should not be moved, because waitq is already
   * in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* do not copy old_s->b.bufid
   * and other fields in stack address space */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    mid_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif
   
    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return  **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep, and registering
   * the new.  However, the notifications shouldn't be in use for sockets
   * that are in a state that can be moved, so this shouldn't be a problem.
   */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);


  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);
    int i;

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0)
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }


  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert the old filters because hw filters
   * are already here (in the case of an accepted socket) or not needed.
   * We have not removed the old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
fail2:
  ci_netif_unlock(alien_ni);
fail1:
  ci_netif_unlock(&old_thr->netif);
fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
Example #12
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i, rx_pkt, released;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt* pkt;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i ) {
          pkt = (ci_ip_pkt_fmt*)bufs[i];
          /* If we are releasing a packet without the RX_FLAG then the user
           * allocated and then freed the packet (without using it).
           * We detect this to decrement n_async_pkts.
           * RX packets (kept via ONLOAD_ZC_KEEP) are counted differently,
           * so don't decrement for them (but do release them).
           */
          rx_pkt = pkt->flags & CI_PKT_FLAG_RX;
          released = ci_netif_pkt_release_check_keep(ni, pkt);
          if ( ! rx_pkt ) {
            ci_assert(released == 1);
            (void) released;
            --ni->state->n_async_pkts;
          }
        }
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }

    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);

  return rc;
}
Example #13
static int ci_udp_ioctl_locked(ci_netif* ni, ci_udp_state* us,
                               ci_fd_t fd, int request, void* arg)
{
  int rc;

  switch( request ) {
  case FIONREAD: /* synonym of SIOCINQ */
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      return -EFAULT;
    rc = 1;
#if CI_CFG_ZC_RECV_FILTER
# ifndef __KERNEL__
    if( us->recv_q_filter ) {
      ci_netif_unlock(ni);
      ci_sock_lock(ni, &us->s.b);
      rc = ci_udp_recv_q_readable(ni, us);
      ci_sock_unlock(ni, &us->s.b);
      ci_netif_lock(ni);
    }
# endif
#endif
    if( rc ) {
      /* Return the size of the datagram at the head of the receive queue.
       *
       * Careful: extract side of receive queue is owned by sock lock,
       * which we don't have.  However, freeing of bufs is owned by netif
       * lock, which we do have.  So we're safe so long as we only read
       * [extract] once.
       */
      oo_pkt_p extract = us->recv_q.extract;
      if( OO_PP_NOT_NULL(extract) ) {
        ci_ip_pkt_fmt* pkt = PKT_CHK(ni, extract);
        if( (pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) &&
            OO_PP_NOT_NULL(pkt->next) )
          pkt = PKT_CHK(ni, pkt->next);
        if( !(pkt->pf.udp.rx_flags & CI_IP_PKT_FMT_PREFIX_UDP_RX_CONSUMED) ) {
          *(int*) arg = pkt->pf.udp.pay_len;
          return 0;
        }
      }
    }
    /* Nothing in the user-level receive queue, so take the value returned
     * by the O/S socket.
     */
    if( !(us->s.os_sock_status & OO_OS_STATUS_RX) ) {
      *(int*)arg = 0;
      return 0;
    }
    goto sys_ioctl;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
    if( ! CI_IOCTL_ARG_OK(int, arg) )
      return -EFAULT;

    *(int*)arg = us->tx_count + oo_atomic_read(&us->tx_async_q_level);
    return 0;

  case SIOCGSTAMP:
#if defined( __linux__) && defined(__KERNEL__)
/* The following code assumes the width of the timespec and timeval fields */
# error "Need to consider 32-on-64 bit setting of timeval arg" 
#endif
    if( ! (us->udpflags & CI_UDPF_LAST_RECV_ON) )
      return oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL);
    return ci_udp_ioctl_siocgstamp(ni, us, arg, 1);
  case SIOCGSTAMPNS:
    if( ! (us->udpflags & CI_UDPF_LAST_RECV_ON) )
      return oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL);
    return ci_udp_ioctl_siocgstamp(ni, us, arg, 0);
  }

  return ci_udp_ioctl_slow(ni, us, fd, request, arg);

 sys_ioctl:
  return oo_os_sock_ioctl(ni, us->s.b.bufid, request, arg, NULL);
}
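
From the application side, the FIONREAD branch above is reached through the ordinary ioctl() entry point; a minimal sketch using only the standard API:

#include <sys/ioctl.h>

/* Returns the payload length of the datagram at the head of the receive
 * queue, 0 if neither the Onload nor the OS queue has data, -1 on error. */
static int pending_datagram_bytes(int fd)
{
  int n = 0;
  if( ioctl(fd, FIONREAD, &n) < 0 )
    return -1;
  return n;
}
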
Example #14
/* c_ni is assumed to be locked on entrance and is always unlocked on
 * exit. */
int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst,
                             ci_netif *l_ni, oo_sp l_id)
{
  ci_tcp_state *ts;
  ci_tcp_socket_listen *tls, *alien_tls;
  citp_waitable_obj *wo;
  citp_waitable *w;
  int rc;

  ci_assert(ci_netif_is_locked(c_ni));
  ci_assert(OO_SP_NOT_NULL(c_id));
  ci_assert(OO_SP_NOT_NULL(l_id));

  LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__,
             c_ni->state->stack_id, OO_SP_TO_INT(c_id),
             l_ni->state->stack_id, OO_SP_TO_INT(l_id)));

  alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id);
  if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) {
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }

  /* In c_ni, create shadow listening socket tls (copy l_id) */
  ts = ci_tcp_get_state_buf(c_ni);
  if( ts == NULL ) {
    ci_netif_unlock(c_ni);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(c_ni)));
    return -ENOMEM;
  }

  /* init common tcp fields */
  ts->s.so = alien_tls->s.so;
  ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl;
  S_TCP_HDR(&ts->s)->tcp_source_be16 =
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16;
  ts->s.domain = alien_tls->s.domain;
  ts->c = alien_tls->c;
  ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF;

  /* make sure nobody will ever connect to our "shadow" socket
   * except us */
  ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  /* no timer: */
  tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN;

  tls->acceptq_max = 1;
  rc = ci_tcp_listen_init(c_ni, tls);
  if( rc != 0 ) {
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return rc;
  }

  /* Connect c_id to tls */
  ts = SP_TO_TCP(c_ni, c_id);
  rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid);

  /* Accept as from tls */
  if( !ci_tcp_acceptq_not_empty(tls) ) {
    /* it is possible, for example, if ci_tcp_listenq_try_promote() failed
     * because there are no endpoints */
    ci_tcp_listenq_drop_all(c_ni, tls);
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }
  w = ci_tcp_acceptq_get(c_ni, tls);
  ci_assert(w);
  LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d",
                __FUNCTION__,
                c_ni->state->stack_id, OO_SP_TO_INT(c_id),
                l_ni->state->stack_id, OO_SP_TO_INT(l_id),
                c_ni->state->stack_id, tls->s.b.bufid,
                c_ni->state->stack_id, w->bufid));

  ci_assert(w->state & CI_TCP_STATE_TCP);
  ci_assert(w->state != CI_TCP_LISTEN);

  /* Destroy tls.
   * NB: nobody could possibly connect to it, so no need to do proper
   * shutdown.
   */
  ci_assert_equal(ci_tcp_acceptq_n(tls), 0);
  ci_tcp_listenq_drop_all(c_ni, tls);
  citp_waitable_obj_free(c_ni, &tls->s.b);
  ci_netif_unlock(c_ni);

  /* Keep a port reference */
  {
    tcp_helper_endpoint_t *l_ep, *a_ep;
    struct oo_file_ref* os_sock_ref;
    ci_irqlock_state_t lock_flags;

    l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id);
    a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w));
    ci_irqlock_lock(&l_ep->thr->lock, &lock_flags);
    os_sock_ref = l_ep->os_socket;
    ci_assert_equal(a_ep->os_port_keeper, NULL);
    if( os_sock_ref != NULL ) {
      os_sock_ref = oo_file_ref_add(os_sock_ref);
      os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref);
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      if( os_sock_ref != NULL )
        oo_file_ref_drop(os_sock_ref);
    }
    else {
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      goto cleanup;
    }
  }

  /* lock l_ni: Check that l_id is the same socket it used to be */
  /* create ref-sock in l_ni, put it into acc q */
  if( ci_netif_lock(l_ni) != 0 )
    goto cleanup;
  if( alien_tls->s.b.state != CI_TCP_LISTEN ||
      (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) ||
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16 != TS_TCP(ts)->tcp_dest_be16 ||
      (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY &&
       alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }

  ci_bit_mask_set(&w->sb_aflags,
                  CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);

  wo = citp_waitable_obj_alloc(l_ni);
  if( wo == NULL ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  wo->waitable.state = CI_TCP_CLOSED;
  wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY;
  wo->waitable.moved_to_stack_id = c_ni->state->stack_id;
  wo->waitable.moved_to_sock_id = W_SP(w);
  LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__,
             l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)),
             c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w))));

  ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable);
  citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX);
  ci_netif_unlock(l_ni);

  return rc;

cleanup:
  ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_bit_mask_clear(&w->sb_aflags,
                    CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid);
  /* we cannot guarantee the c_ni lock, so we can't call
   * ci_tcp_drop(c_ni, ts).  So we return an error; UL will handover
   * and close the ts endpoint. */
  return -EBUSY;
}
Example #15
/* Returns:
 *          0                  on success
 *
 *          CI_SOCKET_ERROR (and errno set)
 *                             this is a normal error that is returned to
 *                             the application
 *
 *          CI_SOCKET_HANDOVER we tell the upper layers to handover, no need
 *                             to set errno since it isn't a real error
 */
int ci_tcp_connect(citp_socket* ep, const struct sockaddr* serv_addr,
		   socklen_t addrlen, ci_fd_t fd, int *p_moved)
{
  /* Address family is validated earlier. */
  struct sockaddr_in* inaddr = (struct sockaddr_in*) serv_addr;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc = 0, crc;
  ci_uint32 dst_be32;

  if( NI_OPTS(ep->netif).tcp_connect_handover )
    return CI_SOCKET_HANDOVER;

  /* Make sure we're up-to-date. */
  ci_netif_lock(ep->netif);
  CHECK_TEP(ep);
  ci_netif_poll(ep->netif);

  /*
   * 1. Check if state of the socket is OK for connect operation.
   */

 start_again:

  if( (rc = ci_tcp_connect_handle_so_error(s)) != 0) {
    CI_SET_ERROR(rc, rc);
    goto unlock_out;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    /* see if progress can be made on this socket before
    ** determining status (e.g. non-blocking connect and connect poll) */
    if( s->b.state & CI_TCP_STATE_SYNCHRONISED ) {
      if( ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT ) {
        ts->tcpflags &= ~CI_TCPT_FLAG_NONBLOCK_CONNECT;
        rc = 0;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC )
        LOG_E(ci_log("Onload does not support TCP disconnect via "
                     "connect(addr->sa_family==AF_UNSPEC)"));
      CI_SET_ERROR(rc, EISCONN);
    }
    else if( s->b.state == CI_TCP_LISTEN ) {
#if CI_CFG_POSIX_CONNECT_AFTER_LISTEN
      CI_SET_ERROR(rc, EOPNOTSUPP);
#else
      if( ci_tcp_validate_sa(s->domain, serv_addr, addrlen) ) {
        /* Request should be forwarded to OS */
        rc = CI_SOCKET_HANDOVER;
        goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC ) {
        /* Linux does listen shutdown on disconnect (AF_UNSPEC) */
        ci_netif_unlock(ep->netif);
        rc = ci_tcp_shutdown(ep, SHUT_RD, fd);
        goto out;
      } else {
        /* Linux has curious error reporting in this case */
        CI_SET_ERROR(rc, EISCONN);
      }
#endif
    }
    else {
      /* Socket is in SYN-SENT state.  Let's block for receiving SYN-ACK */
      ci_assert_equal(s->b.state, CI_TCP_SYN_SENT);
      if( s->b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) )
        CI_SET_ERROR(rc, EALREADY);
      else
        goto syn_sent;
    }
    goto unlock_out;
  }

  /* Check if we've ever been connected. */
  if( ts->tcpflags & CI_TCPT_FLAG_WAS_ESTAB ) {
    CI_SET_ERROR(rc, EISCONN);
    goto unlock_out;
  }

  /*
   * 2. Check the address parameter; if it is inappropriate for the
   *    handover decision, or handover should be done, try to call the
   *    OS and do handover on success.
   */

  if (
    /* At first, check that the address family and length are OK. */
    ci_tcp_validate_sa(s->domain, serv_addr, addrlen)
    /* rfc793 p54 if the foreign socket is unspecified return          */
    /* "error: foreign socket unspecified" (EINVAL), but keep it to OS */
    || (dst_be32 = ci_get_ip4_addr(inaddr->sin_family, serv_addr)) == 0
    /* Zero destination port is tricky as well, keep it to OS */
    || inaddr->sin_port == 0 )
  {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }
  
  /* is this a socket that we can handle? */
  rc = ci_tcp_connect_check_dest(ep, dst_be32, inaddr->sin_port);
  if( rc )  goto unlock_out;

  if( (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) &&
      OO_SP_IS_NULL(ts->local_peer) ) {
    /* Try to connect to another stack; handover if can't */
    struct oo_op_loopback_connect op;
    op.dst_port = inaddr->sin_port;
    op.dst_addr = dst_be32;
    /* this operation unlocks netif */
    rc = oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op);
    if( rc < 0)
      return CI_SOCKET_HANDOVER;
    if( op.out_moved )
      *p_moved = 1;
    if( op.out_rc == -EINPROGRESS )
      RET_WITH_ERRNO( EINPROGRESS );
    else if( op.out_rc == -EAGAIN )
      return -EAGAIN;
    else if( op.out_rc != 0 )
      return CI_SOCKET_HANDOVER;
    return 0;
  }

  /* filters can't handle alien source address */
  if( (s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN) &&
      ! (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  crc = ci_tcp_connect_ul_start(ep->netif, ts, dst_be32, inaddr->sin_port, &rc);
  if( crc != CI_CONNECT_UL_OK ) {
    switch( crc ) {
    case CI_CONNECT_UL_FAIL:
      goto unlock_out;
    case CI_CONNECT_UL_LOCK_DROPPED:
      goto out;
    case CI_CONNECT_UL_START_AGAIN:
      goto start_again;
    }
  }
  CI_TCP_STATS_INC_ACTIVE_OPENS( ep->netif );

 syn_sent:
  rc = ci_tcp_connect_ul_syn_sent(ep->netif, ts);

 unlock_out:
  ci_netif_unlock(ep->netif);
 out:
  return rc;
}
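
The header comment above documents three outcomes; a hedged sketch of the dispatch a caller would perform on them. handover_to_os() is a hypothetical stand-in for the transport layer's real handover mechanism, and the *p_moved handling is omitted.

/* Hedged sketch: dispatch on the documented return values of
 * ci_tcp_connect(). */
static int connect_or_handover(citp_socket* ep, const struct sockaddr* sa,
                               socklen_t sa_len, ci_fd_t fd)
{
  int moved = 0;
  int rc = ci_tcp_connect(ep, sa, sa_len, fd, &moved);

  if( rc == CI_SOCKET_HANDOVER )
    return handover_to_os(ep, fd);  /* hypothetical helper */
  /* rc is 0 on success, or CI_SOCKET_ERROR with errno already set */
  return rc;
}
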
Example #16
/* This function must be called with the netif lock not held and it always
 * returns with the netif lock not held.
 */
int efab_tcp_helper_reuseport_bind(ci_private_t *priv, void *arg)
{
  oo_tcp_reuseport_bind_t* trb = arg;
  ci_netif* ni = &priv->thr->netif;
  tcp_helper_cluster_t* thc;
  tcp_helper_resource_t* thr = NULL;
  citp_waitable* waitable;
  ci_sock_cmn* sock = SP_TO_SOCK(ni, priv->sock_id);
  struct oof_manager* fm = efab_tcp_driver.filter_manager;
  struct oof_socket* oofilter;
  struct oof_socket dummy_oofilter;
  int protocol = thc_get_sock_protocol(sock);
  char name[CI_CFG_CLUSTER_NAME_LEN + 1];
  int rc, rc1;
  int flags = 0;
  tcp_helper_cluster_t* named_thc,* ported_thc;
  int alloced = 0;

  /* No clustering on sockets bound to alien addresses */
  if( sock->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
    return 0;

  if( NI_OPTS(ni).cluster_ignore == 1 ) {
    LOG_NV(ci_log("%s: Ignored attempt to use clusters due to "
                  "EF_CLUSTER_IGNORE option.", __FUNCTION__));
    return 0;
  }

  if( trb->port_be16 == 0 ) {
    ci_log("%s: Reuseport on port=0 is not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( trb->cluster_size < 2 ) {
    ci_log("%s: Cluster sizes < 2 are not supported", __FUNCTION__);
    return -EINVAL;
  }

  if( sock->s_flags & (CI_SOCK_FLAG_TPROXY | CI_SOCK_FLAG_MAC_FILTER) ) {
    ci_log("%s: Scalable filter sockets cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  oofilter = &ci_trs_ep_get(priv->thr, priv->sock_id)->oofilter;

  if( oofilter->sf_local_port != NULL ) {
    ci_log("%s: Socket that already have filter cannot be clustered",
           __FUNCTION__);
    return -EINVAL;
  }

  if( priv->thr->thc ) {
    /* Reserve proto:port[:ip] until bind (or close) */
    rc = oof_socket_add(fm, oofilter,
                        OOF_SOCKET_ADD_FLAG_CLUSTERED |
                        OOF_SOCKET_ADD_FLAG_DUMMY,
                        protocol, trb->addr_be32, trb->port_be16, 0, 0,
                        &ported_thc);
    if( rc > 0 )
      rc = 0;
    if( rc == 0 )
      sock->s_flags |= CI_SOCK_FLAG_FILTER;
    return rc;
  }

  mutex_lock(&thc_init_mutex);
  /* We are going to be iterating over clusters, make sure they don't
   * change.
   */
  mutex_lock(&thc_mutex);

  /* Lookup a suitable cluster to use */

  /* We try to add a dummy filter to oof to reserve the proto:port[:ip]
   * tuple; if there is already a cluster at the tuple we will get a
   * reference to it.
   */
  oof_socket_ctor(&dummy_oofilter);
  rc = oof_socket_add(fm, &dummy_oofilter,
                      OOF_SOCKET_ADD_FLAG_CLUSTERED |
                      OOF_SOCKET_ADD_FLAG_DUMMY |
                      OOF_SOCKET_ADD_FLAG_NO_STACK,
                      protocol, trb->addr_be32, trb->port_be16, 0, 0,
                      &ported_thc);
  if( rc < 0 ) /* non-clustered socket on the tuple */
    goto alloc_fail0;

  if( ! gen_cluster_name(trb->cluster_name, name) ) {
    /* user requested a cluster by name.  But we need to make sure
     * that the oof_local_port that the user is interested in is not
     * being used by another cluster.  We search for the cluster by name
     * and use the results of the prior proto:port[:ip] search of the
     * oof_local_port to do some sanity checking.
     */
    rc1 = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                             &named_thc);
    if( rc1 < 0 ) {
      rc = rc1;
      goto alloc_fail;
    }

    if( rc1 == 0 ) {
      if( rc == 1 ) {
        /* search by oof_local_port found a cluster which search by
         * name didn't find. */
        LOG_E(ci_log("Error: Cluster with requested name %s already "
                     "bound to %s", name, ported_thc->thc_name));
        rc = -EEXIST;
        goto alloc_fail;
      }
      else {
        /* Neither search found a cluster, so allocate one below. */
      }
    }
    else {
      if( rc == 1 ) {
        /* Both searches found clusters.  Fine if they are the same or
         * else error. */
        if( named_thc != ported_thc ) {
          LOG_E(ci_log("Error: Cluster %s does not handle socket %s:%d.  "
                       "Cluster %s does", name, FMT_PROTOCOL(protocol),
                       trb->port_be16, named_thc->thc_name));
          rc = -EEXIST;
          goto alloc_fail;
        }
      }
      /* Search by name found a cluster, with no conflict with the search
       * by tuple (the ported cluster is either none or the same as the
       * named one). */
      thc = named_thc;
      goto cont;
    }
  }
  else {
    /* No cluster name requested.  We have already looked for a cluster
     * handling the tuple.  If none was found, try to use an existing
     * cluster this process created; failing that, allocate one.
     */
    /* If rc == 0, then no cluster was found - try to allocate one.
     * If rc == 1, we found a cluster - make sure the euids match and
     * continue. */
    if( rc == 1 ) {
      thc = ported_thc;
      if( thc->thc_euid != ci_geteuid() ) {
        rc = -EADDRINUSE;
        goto alloc_fail;
      }
      goto cont;
    }
    rc = thc_search_by_name(name, protocol, trb->port_be16, ci_geteuid(),
                            &thc);
    if( rc < 0 )
      goto alloc_fail;
    if( rc == 1 )
      goto cont;
  }
  /* When an interface is in tproxy mode, all clustered listening sockets
   * are assumed to be part of the tproxy passive side.  This requires the
   * RSS context to use altered RSS hashing based solely on src ip:port.
   */
  flags = tcp_helper_cluster_thc_flags(&NI_OPTS(ni));

  if( (rc = thc_alloc(name, protocol, trb->port_be16, ci_geteuid(),
                      trb->cluster_size, flags, &thc)) != 0 )
    goto alloc_fail;

  alloced = 1;

 cont:
  tcp_helper_cluster_ref(thc);

  /* At this point we have our cluster with one additional reference */

  /* Find a suitable stack within the cluster to use */
  rc = thc_get_thr(thc, &dummy_oofilter, &thr);
  if( rc != 0 )
    rc = thc_alloc_thr(thc, trb->cluster_restart_opt,
                       &ni->opts, ni->flags, &thr);

  /* If get or alloc succeeded thr holds reference to the cluster,
   * so the cluster cannot go away.  We'll drop our reference and also
   * will not be accessing state within the cluster anymore so we can
   * drop the lock. */
  mutex_unlock(&thc_mutex);

  if( alloced && rc == 0 && (flags & THC_FLAG_TPROXY) != 0 ) {
    /* The tproxy filter is allocated as late as this because it must be
     * preceded by stack allocation (the firmware needs an initialised vi) */
    rc = thc_install_tproxy(thc, NI_OPTS(ni).scalable_filter_ifindex);
    if( rc != 0 )
      efab_thr_release(thr);
  }

  tcp_helper_cluster_release(thc, NULL);

  if( rc != 0 ) {
    oof_socket_del(fm, &dummy_oofilter);
    goto alloc_fail_unlocked;
  }

  /* We have thr and we hold single reference to it. */

  /* Move the socket into the new stack */
  if( (rc = ci_netif_lock(ni)) != 0 )
    goto drop_and_done;
  waitable = SP_TO_WAITABLE(ni, priv->sock_id);
  rc = ci_sock_lock(ni, waitable);
  if( rc != 0 ) {
    ci_netif_unlock(ni);
    goto drop_and_done;
  }
  /* thr referencing scheme comes from efab_file_move_to_alien_stack_rsop */
  efab_thr_ref(thr);
  rc = efab_file_move_to_alien_stack(priv, &thr->netif);
  if( rc != 0 )
    efab_thr_release(thr);
  else {
    /* besides us, the socket now holds its own reference to thr */
    oofilter = &ci_trs_ep_get(thr, sock->b.moved_to_sock_id)->oofilter;
    oof_socket_replace(fm, &dummy_oofilter, oofilter);
    SP_TO_SOCK(&thr->netif, sock->b.moved_to_sock_id)->s_flags |= CI_SOCK_FLAG_FILTER;
    ci_netif_unlock(&thr->netif);
  }

 drop_and_done:
  if( rc != 0 )
    oof_socket_del(fm, &dummy_oofilter);
  /* Drop the reference we got from thc_get_thr or thc_alloc_thr().
   * If things went wrong both stack and cluster might disappear. */
  efab_thr_release(thr);
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;

 alloc_fail:
  oof_socket_del(fm, &dummy_oofilter);
 alloc_fail0:
  mutex_unlock(&thc_mutex);
 alloc_fail_unlocked:
  oof_socket_dtor(&dummy_oofilter);
  mutex_unlock(&thc_init_mutex);
  return rc;
}
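
The clustering path above is triggered by an ordinary SO_REUSEPORT bind at user level (subject to the EF_CLUSTER_IGNORE option checked earlier); a minimal sketch using only the standard sockets API:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

/* Bind with SO_REUSEPORT: under Onload this reaches
 * efab_tcp_helper_reuseport_bind(), which places the socket in a cluster
 * of stacks sharing the port. */
static int bind_clustered(int fd, uint16_t port)
{
  int one = 1;
  struct sockaddr_in sin;

  if( setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0 )
    return -1;
  memset(&sin, 0, sizeof(sin));
  sin.sin_family = AF_INET;
  sin.sin_addr.s_addr = htonl(INADDR_ANY);
  sin.sin_port = htons(port);
  return bind(fd, (struct sockaddr*)&sin, sizeof(sin));
}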