Example #1
int ci_tcp_shutdown(citp_socket* ep, int how, ci_fd_t fd)
{
  ci_sock_cmn* s = ep->s;
  int rc;

  if( s->b.state == CI_TCP_LISTEN )
    return ci_tcp_shutdown_listen(ep, how, fd);

  if( SOCK_TO_TCP(s)->snd_delegated ) {
    /* While a delegated send is outstanding we do not know which sequence
     * number to use; the caller must first call
     * onload_delegated_send_cancel(). */
    CI_SET_ERROR(rc, EBUSY);
    return rc;
  }

  if( ! ci_netif_trylock(ep->netif) ) {
    /* Can't get lock, so try to defer shutdown to the lock holder. */
    unsigned flags = 0;
    switch( s->b.state ) {
    case CI_TCP_CLOSED:
    case CI_TCP_TIME_WAIT:
      CI_SET_ERROR(rc, ENOTCONN);
      return rc;
    }
    if( how == SHUT_RD || how == SHUT_RDWR )
      flags |= CI_SOCK_AFLAG_NEED_SHUT_RD;
    if( how == SHUT_WR || how == SHUT_RDWR )
      flags |= CI_SOCK_AFLAG_NEED_SHUT_WR;
    ci_atomic32_or(&s->s_aflags, flags);
    if( ! ci_netif_lock_or_defer_work(ep->netif, &s->b) )
      return 0;
    ci_atomic32_and(&s->s_aflags, ~flags);
  }

  if( 0 ) {
    /* Poll to get up-to-date.  This is slightly spurious but done to ensure
     * ordered response to all packets preceding this FIN (e.g. ANVL tcp_core
     * 9.18)
     *
     * DJR: I've disabled this because it can hurt performance for
     * high-connection-rate apps.  May consider adding back (as option?) if
     * needed.
     */
    ci_netif_poll(ep->netif);
  }
  rc = __ci_tcp_shutdown(ep->netif, SOCK_TO_TCP(s), how);
  if( rc < 0 )
    CI_SET_ERROR(rc, -rc);
  ci_netif_unlock(ep->netif);
  return rc;
}
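/* Illustrative sketch (not from the tree above): the "defer to the lock
 * holder" pattern used by ci_tcp_shutdown(), restated with C11 atomics in
 * place of ci_atomic32_or()/ci_atomic32_and().  try_lock() and
 * defer_work() are hypothetical stand-ins for ci_netif_trylock() and
 * ci_netif_lock_or_defer_work(). */
#include <stdatomic.h>
#include <stdbool.h>

struct sock_stub {
  atomic_uint aflags;                      /* cf. s->s_aflags */
};

/* Returns true if the shutdown was handed off to the current lock holder. */
static bool shutdown_or_defer(struct sock_stub* s, unsigned need_flags,
                              bool (*try_lock)(void),
                              bool (*defer_work)(void))
{
  if( try_lock() )
    return false;                 /* we hold the lock: do the work here */
  /* Publish the request before attempting the hand-off, so the current
   * lock holder cannot miss it. */
  atomic_fetch_or(&s->aflags, need_flags);
  if( defer_work() )
    return true;                  /* the lock holder will do the work */
  /* We got the lock after all: withdraw the request and do it ourselves. */
  atomic_fetch_and(&s->aflags, ~need_flags);
  return false;
}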
Example #2
/* check that we can handle this destination */
static int ci_tcp_connect_check_dest(citp_socket* ep, ci_ip_addr_t dst_be32,
                                     int dport_be16)
{
  ci_ip_cached_hdrs* ipcache = &ep->s->pkt;

  ipcache->ip.ip_daddr_be32 = dst_be32;
  ipcache->dport_be16 = dport_be16;
  cicp_user_retrieve(ep->netif, ipcache, &ep->s->cp);

  if(CI_LIKELY( ipcache->status == retrrc_success ||
                ipcache->status == retrrc_nomac   ||
                ipcache->status < 0 )) {
    /* Onloadable. */
    if( ipcache->encap.type & CICP_LLAP_TYPE_XMIT_HASH_LAYER4 )
      /* We don't yet have a local port number, so the result of that
       * lookup may be wrong.
       */
      ci_ip_cache_invalidate(ipcache);
    if( ipcache->ip.ip_saddr_be32 == 0 ) {
      /* Control plane has selected a source address for us -- remember it. */
      ipcache->ip.ip_saddr_be32 = ipcache->ip_saddr_be32;
      ep->s->cp.ip_laddr_be32 = ipcache->ip_saddr_be32;
    }
    return 0;
  }
  else if( ipcache->status == retrrc_localroute ) {
    ci_tcp_state* ts = SOCK_TO_TCP(ep->s);

    if( NI_OPTS(ep->netif).tcp_client_loopback == CITP_TCP_LOOPBACK_OFF)
      return CI_SOCKET_HANDOVER;

    ep->s->s_flags |= CI_SOCK_FLAG_BOUND_ALIEN;
    if( NI_OPTS(ep->netif).tcp_server_loopback != CITP_TCP_LOOPBACK_OFF )
      ts->local_peer = ci_tcp_connect_find_local_peer(ep->netif, dst_be32,
                                                      dport_be16);
    else
      ts->local_peer = OO_SP_NULL;

    if( OO_SP_NOT_NULL(ts->local_peer) ||
        NI_OPTS(ep->netif).tcp_client_loopback !=
        CITP_TCP_LOOPBACK_SAMESTACK ) {
      ipcache->flags |= CI_IP_CACHE_IS_LOCALROUTE;
      if( ipcache->ip.ip_saddr_be32 == 0 ) {
        ipcache->ip.ip_saddr_be32 = dst_be32;
        ep->s->cp.ip_laddr_be32 = dst_be32;
      }
      ipcache->ether_offset = 4; /* lo is non-VLAN */
      ipcache->ip_saddr_be32 = dst_be32;
      ipcache->dport_be16 = dport_be16;
      return 0;
    }
    return CI_SOCKET_HANDOVER;
  }

  return CI_SOCKET_HANDOVER;
}
Example #3
/* Returns true if a move of this endpoint between stacks is supported */
static int efab_file_move_supported(ci_netif *ni, ci_sock_cmn *s)
{

  /* We do not copy TX timestamping queue yet. */
  if( s->timestamping_flags != 0 )
    return false;

  /* UDP:  */
  if( s->b.state == CI_TCP_STATE_UDP )
    return efab_file_move_supported_udp(ni, SOCK_TO_UDP(s));

  /* TCP or UDP only */
  if( ! (s->b.state & CI_TCP_STATE_TCP ) )
    return false;

  /* No listening sockets */
  if( s->b.state == CI_TCP_LISTEN )
    return false;

  return efab_file_move_supported_tcp(ni, SOCK_TO_TCP(s));
}
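/* Illustrative sketch: the dispatch above relies on the socket-state
 * encoding, in which "class" bits such as CI_TCP_STATE_TCP can be tested
 * by mask while concrete states such as CI_TCP_LISTEN are tested by
 * equality.  The bit values below are made up for the illustration. */
#include <assert.h>
#include <stdbool.h>

enum {
  STATE_TCP    = 0x100,                 /* class bit: any TCP state */
  STATE_UDP    = 0x200,                 /* a UDP socket */
  STATE_LISTEN = STATE_TCP | 0x1,
  STATE_ESTAB  = STATE_TCP | 0x2,
};

static bool move_supported(int state)
{
  if( state == STATE_UDP )              /* UDP: separate checks */
    return true;
  if( ! (state & STATE_TCP) )           /* neither TCP nor UDP */
    return false;
  return state != STATE_LISTEN;         /* no listening sockets */
}

int main(void)
{
  assert(move_supported(STATE_ESTAB));
  assert(! move_supported(STATE_LISTEN));
  return 0;
}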
Example #4
ssize_t linux_tcp_helper_fop_sendpage(struct file* filp, struct page* page, 
                                      int offset, size_t size,
                                      loff_t* ppos, int flags)
{
  ci_private_t* priv = filp->private_data;
  tcp_helper_resource_t* trs = efab_priv_to_thr(priv);
  ci_sock_cmn* s;

  OO_DEBUG_VERB(ci_log("%s: %d:%d offset=%d size=%d flags=%x", __FUNCTION__,
                       NI_ID(&trs->netif), OO_SP_FMT(priv->sock_id), offset,
                       (int) size, flags));

  ci_assert(page);
  ci_assert_ge(offset, 0);
  ci_assert_gt(size, 0);
  ci_assert_le(offset + size, CI_PAGE_SIZE);

#ifndef MSG_SENDPAGE_NOTLAST
  /* "flags" is really "more".  Convert it. */
  if( flags )
    flags = MSG_MORE;

  /* [more] is sometimes true even for the last page.  We get a little
  ** closer to the truth by spotting that we're not reading to the end of
  ** the page. - seen on 2.6.18, but not on 2.6.26 or later
  */
  if( offset + size < CI_PAGE_SIZE && flags )
    flags = 0;
#endif

  s = SP_TO_SOCK(&trs->netif, priv->sock_id);
  if(CI_LIKELY( s->b.state & CI_TCP_STATE_TCP_CONN ))
    return sendpage_copy(&trs->netif,SOCK_TO_TCP(s),page,offset,size,flags);
  else
    /* Closed or listening.  Return EPIPE.  Do not send SIGPIPE, because
    ** Linux will do it for us. */
    return -s->tx_errno;
}
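/* Illustrative sketch: the file-op above is typically reached through the
 * kernel's sendfile()/splice() machinery.  A minimal user-level trigger,
 * using only the standard sendfile(2) API, with error handling trimmed: */
#include <fcntl.h>
#include <sys/sendfile.h>
#include <unistd.h>

/* Stream a whole file over a connected (and possibly accelerated) TCP
 * socket; each chunk ends up in the stack's sendpage/sendmsg path. */
static int send_file_over(int sock_fd, const char* path)
{
  int file_fd = open(path, O_RDONLY);
  off_t off = 0;
  ssize_t n;

  if( file_fd < 0 )
    return -1;
  do
    n = sendfile(sock_fd, file_fd, &off, 65536);
  while( n > 0 );
  close(file_fd);
  return n < 0 ? -1 : 0;
}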
Example #5
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs,
                            int iovecs_len, 
                            enum onload_zc_buffer_type_flags flags)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt *pkt;
  unsigned max_len;

  Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs,
                  iovecs_len, flags));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < iovecs_len; ++i ) {
        max_len = CI_CFG_PKT_BUF_SIZE;
        pkt = ci_netif_pkt_tx_tcp_alloc(ni);
        if( pkt == NULL ) {
          while( --i >= 0 )
            ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf);
          rc = -ENOMEM;
          ci_netif_unlock(ni);
          goto out;
        }
        /* Make sure this is clear as it affects behaviour when freeing */
        pkt->pf.udp.rx_flags = 0;
        iovecs[i].buf = (struct oo_zc_buf *)pkt;
        if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) {
          if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) &&
              (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) {
            ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s);
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) + 
              ts->outgoing_hdrs_len;
            max_len = tcp_eff_mss(ts);
          } 
          else {
            /* Best guess.  We can fix it up later.  The magic 12 leaves
             * space for the TCP timestamp option (the common case).
             */
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base =
              (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_tcp_hdr) + 12;
          }
        }
        else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) {
          oo_tx_pkt_layout_init(pkt);
          iovecs[i].iov_base =
            (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_udp_hdr);
        }
        else 
          iovecs[i].iov_base = PKT_START(pkt);
        iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE - 
          ((char *)iovecs[i].iov_base - (char *)pkt);
        if( iovecs[i].iov_len > max_len )
          iovecs[i].iov_len = max_len;
      }
      ni->state->n_async_pkts += iovecs_len;
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    case CITP_PASSTHROUGH_FD:
      rc = -ESOCKTNOSUPPORT;
      break;
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

 out:
  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
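/* Illustrative usage sketch for the allocator above, assuming the
 * standard Onload zero-copy extension API from onload/extensions_zc.h
 * (onload_zc_send() and onload_zc_release_buffers() are part of that
 * API).  Error handling is simplified: a careful caller would also check
 * the per-message mmsg.rc and the allocated iov_len. */
#include <onload/extensions_zc.h>
#include <string.h>

static int send_one_zc(int fd, const char* data, size_t len)
{
  struct onload_zc_iovec iov[1];
  struct onload_zc_mmsg mmsg;
  int rc;

  rc = onload_zc_alloc_buffers(fd, iov, 1, ONLOAD_ZC_BUFFER_HDR_TCP);
  if( rc < 0 )
    return rc;

  /* Assumes len fits in the allocated buffer (iov[0].iov_len). */
  memcpy(iov[0].iov_base, data, len);
  iov[0].iov_len = len;

  memset(&mmsg, 0, sizeof(mmsg));
  mmsg.fd = fd;
  mmsg.msg.iov = iov;
  mmsg.msg.msghdr.msg_iovlen = 1;

  /* On success, buffer ownership passes to the stack. */
  rc = onload_zc_send(&mmsg, 1, 0);
  if( rc < 0 )
    onload_zc_release_buffers(fd, &iov[0].buf, 1);
  return rc;
}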
Example #6
/* NOTE: in the kernel version [fd] is unused and, if it's a ptr, [arg] will
 * be in user-space and may need to be fetched into kernel memory. */
static int ci_tcp_ioctl_lk(citp_socket* ep, ci_fd_t fd, int request,
                           void* arg)
{
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = NULL;
  int rc = 0;
  int os_socket_exists = s->b.sb_aflags & CI_SB_AFLAG_OS_BACKED;

  if( s->b.state != CI_TCP_LISTEN )
    ts = SOCK_TO_TCP(s);

  /* Keep the os socket in sync.  If this is a "get" request then the
   * return will be based on our support, not the os's (except for EFAULT
   * handling which we get for free).
   * Exceptions:
   *  - FIONBIO is applied just in time on handover if needed (listening
   *    sockets always have a non-blocking OS socket)
   *  - FIONREAD, TIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are useless on the
   *    OS socket, so we avoid the syscall.
   */
  if( os_socket_exists && request != FIONREAD && request != SIOCATMARK &&
      request != FIOASYNC && request != TIOCOUTQ && request != SIOCOUTQNSD &&
      request != (int) FIONBIO ) {
    rc = oo_os_sock_ioctl(netif, s->b.bufid, request, arg, NULL);
    if( rc < 0 )
      return rc;
  }

  /* The ioctl requests are listed in `man ioctl_list`, and the CI
   * equivalents are in include/ci/net/ioctls.h  */
  LOG_TV( ci_log("%s:  request = %d, arg = %ld", __FUNCTION__, request, 
		 (long)arg));

  switch( request ) {
  case FIONBIO:
    if( CI_IOCTL_ARG_OK(int, arg) ) {
      CI_CMN_IOCTL_FIONBIO(ep->s, arg);
      rc = 0;
      break;
    }
    goto fail_fault;
  case FIONREAD: /* synonym of SIOCINQ */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state == CI_TCP_SYN_SENT ) {
      CI_IOCTL_SETARG((int*)arg, 0);
    } else {
      /* If SO_OOBINLINE is set, return the total number of bytes in the
         receive queue.  Otherwise return the number of bytes up to, but
         not counting, the urgent mark. */
      int bytes_in_rxq = tcp_rcv_usr(ts);
      if (bytes_in_rxq && ! (ts->s.s_flags & CI_SOCK_FLAG_OOBINLINE)) {

        if (tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID) {
          /*! \TODO: what if FIN has been received? */
          unsigned int readnxt = tcp_rcv_nxt(ts) - bytes_in_rxq;
          if (SEQ_LT(readnxt, tcp_rcv_up(ts))) {
            bytes_in_rxq = tcp_rcv_up(ts) - readnxt;
          } else if (SEQ_EQ(readnxt, tcp_rcv_up(ts))) {
            bytes_in_rxq--;
          }
        }

      }
      CI_IOCTL_SETARG((int*)arg, bytes_in_rxq);
    }
    break;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
  case SIOCOUTQNSD:
    {
    CI_BUILD_ASSERT(TIOCOUTQ == SIOCOUTQ);
    int outq_bytes = 0;

    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state != CI_TCP_SYN_SENT ) {

      /* TIOCOUTQ counts all unacknowledged data, so includes retrans queue. */
      if( request == TIOCOUTQ )
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_una(ts));
      else
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_nxt(ts));
    }
    CI_IOCTL_SETARG((int*)arg, outq_bytes);
    }
    break;

  case SIOCATMARK:
    {
     if( !CI_IOCTL_ARG_OK(int, arg) )
       goto fail_fault;

      /* Return true if we are at the out-of-band byte. */
      CI_IOCTL_SETARG((int*)arg, 0);
      if( s->b.state != CI_TCP_LISTEN ) {
	int readnxt;

        readnxt = SEQ_SUB(tcp_rcv_nxt(ts), tcp_rcv_usr(ts));
        if( ~ts->s.b.state & CI_TCP_STATE_ACCEPT_DATA )
          readnxt = SEQ_SUB(readnxt, 1);
        if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID )
          CI_IOCTL_SETARG((int*)arg, readnxt == tcp_rcv_up(ts));
        LOG_URG(log(NTS_FMT "SIOCATMARK atmark=%d  readnxt=%u rcv_up=%u%s",
                    NTS_PRI_ARGS(ep->netif, ts), readnxt == tcp_rcv_up(ts), 
		    readnxt,  tcp_rcv_up(SOCK_TO_TCP(ep->s)),
                    (tcp_urg_data(ts)&CI_TCP_URG_PTR_VALID)?"":" (invalid)"));
      }
      break;
    }

#ifndef __KERNEL__
  case FIOASYNC:
    /* Need to apply this to [fd] so that our fasync file-op will be
     * invoked.
     */
    rc = ci_sys_ioctl(fd, request, arg);
    break;

  case SIOCSPGRP:
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    /* Need to apply this to [fd] to get signal delivery to work.  However,
     * SIOCSPGRP is only supported on sockets, so we need to convert to
     * fcntl().
     */
    rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg));
    if( rc == 0 ) {
      rc = ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
    }
    else {
      CI_SET_ERROR(rc, -rc);
    }
    break;
#endif

  default:
    return ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
  }

  return rc;

  /* Error exits used by the argument checks above. */
 fail_fault:
  CI_SET_ERROR(rc, EFAULT);
  return rc;
 fail_inval:
  CI_SET_ERROR(rc, EINVAL);
  return rc;
}
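/* Illustrative user-level view of the handlers above: the same requests
 * issued through the standard ioctl(2) interface (error checks trimmed;
 * <linux/sockios.h> supplies SIOCATMARK and SIOCOUTQ). */
#include <linux/sockios.h>
#include <sys/ioctl.h>

static void query_sock(int fd)
{
  int unread, unsent, atmark;

  ioctl(fd, FIONREAD, &unread);    /* bytes waiting in the receive queue */
  ioctl(fd, SIOCOUTQ, &unsent);    /* unacked bytes, incl. retransmit queue */
  ioctl(fd, SIOCATMARK, &atmark);  /* nonzero when at the urgent-data mark */
}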
Example #7
/* Move the priv file to the alien_ni stack.
 * Must be called with the priv stack and socket locked;
 * the function returns with that stack unlocked.
 * If rc=0, it returns with the alien_ni stack locked;
 * otherwise both stacks are unlocked.
 * The socket is always unlocked on return. */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));
  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in epoll list should not be moved, because waitq is already
   * in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* Do not copy old_s->b.bufid and other fields that live in the
   * stack's address space. */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    mid_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif
   
    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return  **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep and registering the
   * new one.  However, the notifications shouldn't be in use for sockets
   * in a state that can be moved, so this shouldn't be a problem.
   */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);


  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);
    int i;

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0)
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }


  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert the old filters because the hw
   * filters are already in place (for an accepted socket) or not needed.
   * We have not removed the old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
fail2:
  ci_netif_unlock(alien_ni);
fail1:
  ci_netif_unlock(&old_thr->netif);
fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
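/* Illustrative sketch: the "lock both stacks" loop above avoids deadlock
 * by never blocking on the second lock while holding the first, and by
 * bounding the number of retries.  The same pattern restated with
 * pthreads (hypothetical helper, not from the tree above): */
#include <errno.h>
#include <pthread.h>

static int lock_pair(pthread_mutex_t* a, pthread_mutex_t* b)
{
  int i;

  pthread_mutex_lock(a);
  for( i = 0; i < 1000; i++ ) {
    if( pthread_mutex_trylock(b) == 0 )
      return 0;                    /* both locks held */
    /* Drop the first lock so a thread taking them in the other order can
     * make progress, then re-take it and retry. */
    pthread_mutex_unlock(a);
    pthread_mutex_lock(a);
  }
  pthread_mutex_unlock(a);
  return -EBUSY;
}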
Example #8
static int
ci_tcp_info_get(ci_netif* netif, ci_sock_cmn* s, struct ci_tcp_info* info)
{
  ci_iptime_t now = ci_ip_time_now(netif);

  memset(info, 0, sizeof(*info));

  info->tcpi_state = ci_sock_states_linux_map[CI_TCP_STATE_NUM(s->b.state)];
  /* info->tcpi_backoff = 0; */

  info->tcpi_ato = 
    ci_ip_time_ticks2ms(netif, netif->state->conf.tconst_delack) * 1000;
  info->tcpi_rcv_mss    = 536; /* no way to get the actual value */
  /* info->tcpi_sacked     = 0; */ /* there is no way to get any of these */
  /* info->tcpi_lost       = 0; */
  /* info->tcpi_fackets    = 0; */
  /* info->tcpi_reordering = 0; */
  /* info->tcpi_last_ack_sent = 0; */
  /* info->tcpi_last_ack_recv = 0; */

  if( s->b.state != CI_TCP_LISTEN ) {
    ci_tcp_state* ts = SOCK_TO_TCP(s);

    info->tcpi_pmtu       = ts->pmtus.pmtu;
    info->tcpi_ca_state = sock_congstate_linux_map[ts->congstate];
    info->tcpi_retransmits = ts->retransmits;
    info->tcpi_probes = ts->ka_probes;

    /* info->tcpi_options = 0; */
    if( ts->tcpflags & CI_TCPT_FLAG_TSO )
      info->tcpi_options |= CI_TCPI_OPT_TIMESTAMPS;
    if( ts->tcpflags & CI_TCPT_FLAG_ECN )
      info->tcpi_options |= CI_TCPI_OPT_ECN;
    if( ts->tcpflags & CI_TCPT_FLAG_SACK )
      info->tcpi_options |= CI_TCPI_OPT_SACK;

    if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) {
      info->tcpi_options |= CI_TCPI_OPT_WSCALE;
      info->tcpi_snd_wscale = ts->snd_wscl;
      info->tcpi_rcv_wscale = ts->rcv_wscl;
    }

    info->tcpi_rto = ci_ip_time_ticks2ms(netif, ts->rto) * 1000;
    info->tcpi_snd_mss    = ts->eff_mss;
    info->tcpi_unacked    = ts->acks_pending & CI_TCP_ACKS_PENDING_MASK;
#if CI_CFG_TCP_SOCK_STATS
    info->tcpi_retrans    = ts->stats_cumulative.count.tx_retrans_pkt;
#endif
#if CI_CFG_CONGESTION_WINDOW_VALIDATION
    info->tcpi_last_data_sent = ci_ip_time_ticks2ms(netif,
						    now - ts->t_last_sent);
#else
    info->tcpi_last_data_sent = 0;
#endif
    info->tcpi_last_data_recv = ci_ip_time_ticks2ms(netif,
						    now - ts->tspaws);
    
    info->tcpi_rtt = ci_ip_time_ticks2ms(netif, ts->sa) * 1000 / 8;
    info->tcpi_rttvar = ci_ip_time_ticks2ms(netif, ts->sv) * 1000 / 4;
    info->tcpi_rcv_ssthresh = ts->ssthresh;
    if( tcp_eff_mss(ts) != 0 ) {
      info->tcpi_snd_ssthresh = ts->ssthresh / tcp_eff_mss(ts);
      info->tcpi_snd_cwnd     = ts->cwnd / tcp_eff_mss(ts);
    }
    else { /* non-initialised connection */
      info->tcpi_snd_ssthresh = 0;
      info->tcpi_snd_cwnd     = 0;
    }
    info->tcpi_advmss     = ts->amss;
  }

  return 0;
}
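/* Illustrative sketch: the structure filled in above is what a user-level
 * caller sees through the standard TCP_INFO sockopt (glibc's struct
 * tcp_info from <netinet/tcp.h>; tcpi_rtt and tcpi_rttvar are in
 * microseconds, matching the "* 1000" scaling above). */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

static void print_tcp_info(int fd)
{
  struct tcp_info ti;
  socklen_t len = sizeof(ti);

  if( getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0 )
    printf("rtt=%uus rttvar=%uus snd_cwnd=%u\n",
           ti.tcpi_rtt, ti.tcpi_rttvar, ti.tcpi_snd_cwnd);
}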
Example #9
static int ci_tcp_setsockopt_lk(citp_socket* ep, ci_fd_t fd, int level,
				int optname, const void* optval,
				socklen_t optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  ci_tcp_socket_cmn* c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  int zeroval = 0;
  int rc;

  /* ?? what to do about optval and optlen checking
  ** Kernel can raise EFAULT, here we are a little in the dark.
  ** Note: If the OS sock is sync'd then we get this checking for free.
  */

  if (optlen == 0) {
    /* Match kernel behaviour: if the length is 0, treat the value as 0;
     * some applications rely on this.
     */
    optval = &zeroval;
    optlen = sizeof(zeroval);
  }

  /* If you're adding to this please remember to look in common_sockopts.c
   * and decide if the option is common to all protocols. */

  if(level == SOL_SOCKET) {
    switch(optname) {
    case SO_KEEPALIVE:
      /* Override the default common handler.
       * Enables or disables sending of keep-alive probes. */
      if( (rc = opt_not_ok(optval, optlen, unsigned)) )
      	goto fail_inval;

      if( *(unsigned*) optval ) {
	unsigned prev_flags = s->s_flags;
	s->s_flags |= CI_SOCK_FLAG_KALIVE;
	/* Set KEEPALIVE timer only if we are not in
	** CLOSE or LISTENING state. */
	if( s->b.state != CI_TCP_CLOSED && s->b.state != CI_TCP_LISTEN &&
	    !(prev_flags & CI_SOCK_FLAG_KALIVE) ) {
	  ci_tcp_state* ts = SOCK_TO_TCP(s);
	  LOG_TV(log("%s: "NSS_FMT" run KEEPALIVE timer from setsockopt()",
		     __FUNCTION__, NSS_PRI_ARGS(netif, s)));
	  ci_assert(ts->ka_probes == 0);
	  ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
	}
      }
      else {
      	s->s_flags &=~ CI_SOCK_FLAG_KALIVE;
	if( s->b.state != CI_TCP_LISTEN ) {
	  ci_tcp_state* ts = SOCK_TO_TCP(s);
	  ci_tcp_kalive_check_and_clear(netif, ts);
	  ts->ka_probes = 0;
	}
      }
      break;

    default:
      {
        /* Common socket level options */
        return ci_set_sol_socket(netif, s, optname, optval, optlen);
      }
    }
  }
  else if( level == IPPROTO_IP ) {
Example #10
/* [fd] is unused in the kernel version */
int ci_tcp_getsockopt(citp_socket* ep, ci_fd_t fd, int level,
		      int optname, void *optval, socklen_t *optlen )
{
  ci_sock_cmn* s = ep->s;
#if defined(__linux__) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_THRESHOLD) || \
    defined(__sun__) && defined(TCP_KEEPALIVE_ABORT_THRESHOLD)
  ci_tcp_socket_cmn *c = &(SOCK_TO_WAITABLE_OBJ(s)->tcp.c);
#endif
  ci_netif* netif = ep->netif;
  unsigned u = 0;

  /* NOTE: setsockopt() calls are reflected into the OS socket to keep the
   * two in sync.  It's assumed that we know enough to give good answers
   * here, so we don't bother the OS with the get call. */

  /* ?? what to do about optval and optlen checking
   * Kernel can raise EFAULT, here we are a little in the dark.
   *  - sockcall_intercept.c checks that optlen is non-NULL and if *optlen
   *    is non-zero that optval is non-NULL, returning EFAULT if false
   */

  if(level == SOL_SOCKET) {
    /* Common SOL_SOCKET handler */
    return ci_get_sol_socket(netif, s, optname, optval, optlen);

  } else if (level ==  IPPROTO_IP) {
    /* IP level options valid for TCP */
    return ci_get_sol_ip(ep, s, fd, optname, optval, optlen);

#if CI_CFG_FAKE_IPV6
  } else if (level ==  IPPROTO_IPV6 && s->domain == AF_INET6) {
    /* IP6 level options valid for TCP */
    return ci_get_sol_ip6(ep, s, fd, optname, optval, optlen);
#endif

  } else if (level == IPPROTO_TCP) {
    /* TCP level options valid for TCP */
    switch(optname){
    case TCP_NODELAY:
      /* gets status of TCP Nagle algorithm  */
      u = ((s->s_aflags & CI_SOCK_AFLAG_NODELAY) != 0);
      goto u_out;
    case TCP_MAXSEG:
      /* gets the MSS size for this connection */
      if ((s->b.state & CI_TCP_STATE_TCP_CONN)) {
        u = tcp_eff_mss(SOCK_TO_TCP(s));
      } else {
        u = 536;
      }
      goto u_out;
# ifdef TCP_CORK
    case TCP_CORK:
      /* Don't send partial frames; any partial frames are sent
      ** when the option is cleared. */
      u = ((s->s_aflags & CI_SOCK_AFLAG_CORK) != 0);
      goto u_out;
# endif


    case TCP_KEEPIDLE:
      {
        /* idle time for keepalives  */
        u = (unsigned) c->t_ka_time_in_secs;
      }
      goto u_out;
    case TCP_KEEPINTVL:
      {
        /* time between keepalives */
        u = (unsigned) c->t_ka_intvl_in_secs;
      }
      goto u_out;
    case TCP_KEEPCNT:
      {
        /* number of keepalives before giving up */
        u = c->ka_probe_th;
      }
      goto u_out;
    case TCP_INFO:
      /* struct tcp_info to be filled */
      return ci_tcp_info_get(netif, s, (struct ci_tcp_info*) optval);
    case TCP_DEFER_ACCEPT:
      {
        u = 0;
        if( c->tcp_defer_accept != OO_TCP_DEFER_ACCEPT_OFF ) {
          u = ci_ip_time_ticks2ms(netif, NI_CONF(netif).tconst_rto_initial);
          u = ((u + 500) / 1000) << c->tcp_defer_accept;
        }
        goto u_out;
      }
    case TCP_QUICKACK:
      {
        u = 0;
        if( s->b.state & CI_TCP_STATE_TCP_CONN ) {
          ci_tcp_state* ts = SOCK_TO_TCP(s);
          u = ci_tcp_is_in_faststart(ts);
        }
        goto u_out;
      }
    default:
      LOG_TC( log(LPF "getsockopt: unimplemented or bad option: %i", 
                  optname));
      RET_WITH_ERRNO(ENOPROTOOPT);
    }
  } else {
    SOCKOPT_RET_INVALID_LEVEL(s);
  }

  return 0;

 u_out:
  return ci_getsockopt_final(optval, optlen, level, &u, sizeof(u));
}
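/* Illustrative sketch of the TCP_DEFER_ACCEPT readback above: the stored
 * value is a retransmission count, reported back as an approximate
 * timeout in seconds (the initial RTO rounded to whole seconds, doubled
 * per permitted retransmission).  A worked example: */
#include <stdio.h>

static unsigned defer_accept_secs(unsigned rto_initial_ms, unsigned retries)
{
  return ((rto_initial_ms + 500) / 1000) << retries;
}

int main(void)
{
  /* e.g. a 1000ms initial RTO with 3 retries reads back as 8 seconds */
  printf("%u\n", defer_accept_secs(1000, 3));
  return 0;
}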
Example #11
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog)
{
  /* 
  ** ?? error handling on possible fails not handled robustly...
  ** ?? Need to check port number is valid TODO
  */

  /*! \todo If not bound then we have to be listening on all interfaces.
   * It's likely that we won't be coming through here as we have to
   * listen on the OS socket too! */
  ci_tcp_state* ts;
  ci_tcp_socket_listen* tls;
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  unsigned ul_backlog = backlog;
  int rc;
  oo_p sp;

  LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__, SK_PRI_ARGS(ep), 
             backlog));
  CHECK_TEP(ep);

  if( NI_OPTS(netif).tcp_listen_handover )
    return CI_SOCKET_HANDOVER;
  if( !NI_OPTS(netif).tcp_server_loopback) {
    /* We should handover if the socket is bound to alien address. */
    if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
      return CI_SOCKET_HANDOVER;
  }

  if( backlog < 0 )
    ul_backlog = NI_OPTS(netif).max_ep_bufs;
  else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog )
    ul_backlog = NI_OPTS(netif).acceptq_min_backlog;

  if( s->b.state == CI_TCP_LISTEN ) {
    tls = SOCK_TO_TCP_LISTEN(s);
    tls->acceptq_max = ul_backlog;
    ci_tcp_helper_listen_os_sock(fd, ul_backlog);
    return 0;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    CI_SET_ERROR(rc, EINVAL);
    return rc;
  }


  ts = SOCK_TO_TCP(s);

  /* Bug 3376: if socket used for a previous, failed, connect then the error
   * numbers will not be as expected.  Only seen when not using listening
   * netifs (as moving the EP to the new netif resets them). 
   */

  ts->s.tx_errno = EPIPE;



  ts->s.rx_errno = ENOTCONN;

  /* fill in address/ports and all TCP state */
  if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) {
    ci_uint16 source_be16;

    /* They haven't previously done a bind, so we need to choose 
     * a port.  As we haven't been given a hint we let the OS choose. */

    source_be16 = 0;
    rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    if (CI_LIKELY( rc==0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u", 
                 LNT_PRI_ARGS(ep->netif, ts),
                 ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));

    } else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      return rc;
    }
  } 

  ci_sock_lock(netif, &ts->s.b);
  ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);

  tcp_raddr_be32(tls) = 0u;
  tcp_rport_be16(tls) = 0u;

  ci_assert_equal(tls->s.tx_errno, EPIPE);



  ci_assert_equal(tls->s.rx_errno, ENOTCONN);

  /* Set up the listen timer.  Do it before the first return statement,
   * because __ci_tcp_listen_to_normal() will be called on the error path. */
  if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
    sp = TS_OFF(netif, tls);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid));
    ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq");
    tls->listenq_tid.param1 = S_SP(tls);
    tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN;
  }

  rc = ci_tcp_listen_init(netif, tls);
  ci_sock_unlock(netif, &ts->s.b);
  if( rc != 0 ) {
    CI_SET_ERROR(rc, -rc);
    goto listen_fail;
  }
  tls->acceptq_max = ul_backlog;

  CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats));

  /* Install all the filters needed for this connection
   *    - tcp_laddr_be32(ts) = 0 for IPADDR_ANY
   *
   *  TODO: handle BINDTODEVICE by setting the phys_port parameter to the
   *        correct physical L5 port index
   *  TODO: handle REUSEADDR by setting the last parameter to TRUE
   */
  if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
#ifdef ONLOAD_OFE
    if( netif->ofe != NULL ) {
      tls->s.ofe_code_start = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_LISTEN,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
      tls->ofe_promote = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
    }
#endif
    rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice,
                               OO_SP_NULL);
    if( rc == -EFILTERSSOME ) {
      if( CITP_OPTS.no_fail )
        rc = 0;
      else {
        ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
        rc = -ENOBUFS;
      }
    }
    ci_assert_nequal(rc, -EFILTERSSOME);
    VERB(ci_log("%s: set_filters  returned %d", __FUNCTION__, rc));
    if (rc < 0) {
      CI_SET_ERROR(rc, -rc);
      goto post_listen_fail;
    }
  }


  /*
   * A call to the system listen() is required for listen-any sockets,
   * local-host communication servers and multi-homed servers (so that we
   * accept connections to L5-assigned address(es) that arrive via other
   * interfaces).
   */
#ifdef __ci_driver__
  {
    rc = efab_tcp_helper_listen_os_sock( netif2tcp_helper_resource(netif),
					 S_SP(tls), backlog);
  }
#else
  rc = ci_tcp_helper_listen_os_sock(fd, backlog);
#endif
  if ( rc < 0 ) {
    /* clear the filter we've just set */
    ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
    goto post_listen_fail;
  }
  return 0;

 post_listen_fail:
  ci_tcp_listenq_drop_all(netif, tls);
 listen_fail:
  /* revert TCP state to a non-listening socket format */
  __ci_tcp_listen_to_normal(netif, tls);
  /* Above function sets orphan flag but we are attached to an FD. */
  ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
#ifdef __ci_driver__
  return rc;
#else
  return CI_SOCKET_ERROR;
#endif
}
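/* Illustrative sketch of the backlog clamping at the top of
 * ci_tcp_listen(): the comparison must be made on the signed value, since
 * an unsigned copy of a negative backlog can never test below zero.
 * min_backlog and max_ep_bufs stand in for the NI_OPTS fields. */
static unsigned clamp_backlog(int backlog, unsigned min_backlog,
                              unsigned max_ep_bufs)
{
  if( backlog < 0 )
    return max_ep_bufs;            /* negative means "as many as possible" */
  if( (unsigned) backlog < min_backlog )
    return min_backlog;
  return (unsigned) backlog;
}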