Example #1
static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts,
				   ci_uint32 dst_be32, unsigned dport_be16,
                                   int* fail_rc)
{
  ci_ip_pkt_fmt* pkt;
  int rc = 0;

  ci_assert(ts->s.pkt.mtu);

  /* Now that we know the outgoing route, set the MTU related values.
   * Note, even these values are speculative since the real MTU
   * could change between now and passing the packet to the lower layers
   */
  ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr);
#if CI_CFG_LIMIT_AMSS
  ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__);
#endif

  /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */
  ts->smss = CI_CFG_TCP_DEFAULT_MSS;

  /* set pmtu, eff_mss, snd_buf and adjust windows */
  ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu);
  ci_tcp_set_eff_mss(ni, ts);
  ci_tcp_set_initialcwnd(ni, ts);

  /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay
   * zero until the connection is established.
   */
  ts->so_sndbuf_pkts = 0;

  /*
   * 3. State and address are OK.  The address is routed through our NIC.
   *    Do connect().
   */
  ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY);

  if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) {
    ci_sock_cmn* s = &ts->s;
    ci_uint16 source_be16 = 0;

    if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND )
      rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    else 
      rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16);
    if(CI_LIKELY( rc == 0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u", 
                 LNT_PRI_ARGS(ni, ts),
                 ip_addr_str(INADDR_ANY),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      *fail_rc = rc;
      return CI_CONNECT_UL_FAIL;
    }
    if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) {
      CI_SET_ERROR(*fail_rc, EINVAL);
      return CI_CONNECT_UL_FAIL;
    }
  }

  ci_tcp_set_peer(ts, dst_be32, dport_be16);

  /* Make sure we can get a buffer before we change state. */
  pkt = ci_netif_pkt_tx_tcp_alloc(ni);
  if( CI_UNLIKELY(! pkt) ) {
    /* NB. We've already done a poll above. */
    rc = ci_netif_pkt_wait(ni, &ts->s, CI_SLEEP_NETIF_LOCKED|CI_SLEEP_NETIF_RQ);
    if( ci_netif_pkt_wait_was_interrupted(rc) ) {
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_LOCK_DROPPED;
    }
    /* OK, there are (probably) packets available - go try again.  Note we
     * jump back to the top of the function because someone may have
     * connected this socket in the mean-time, so we need to check the
     * state once more.
     */
    return CI_CONNECT_UL_START_AGAIN;
  }

#ifdef ONLOAD_OFE
    if( ni->ofe != NULL )
      ts->s.ofe_code_start = ofe_socktbl_find(
                        ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE,
                        tcp_laddr_be32(ts), tcp_raddr_be32(ts),
                        tcp_lport_be16(ts), tcp_rport_be16(ts));
#endif

  rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice,
                             OO_SP_NULL);
  if( rc < 0 ) {
    /* Perhaps we've run out of filters?  See if we can push a socket out
     * of timewait and steal its filter.
     */
    ci_assert_nequal(rc, -EFILTERSSOME);
    if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) ||
        (rc = ci_tcp_ep_set_filters(ni, S_SP(ts),
                                    ts->s.cp.so_bindtodevice,
                                    OO_SP_NULL)) < 0 ) {
      ci_assert_nequal(rc, -EFILTERSSOME);
      /* Either a different error, or our efforts to free a filter did not
       * work.
       */
      if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
        ts->s.pkt.ip.ip_saddr_be32 = 0;
        ts->s.cp.ip_laddr_be32 = 0;
      }
      ci_netif_pkt_release(ni, pkt);
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_FAIL;
    }
  }

  LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts),
	     ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
	     (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16),
	     ip_addr_str(ts->s.pkt.ip.ip_daddr_be32),
	     (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16)));

  /* We are going to send the SYN - set states appropriately */
  tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
    ci_tcp_initial_seqno(ni);
  ts->snd_max = tcp_snd_nxt(ts) + 1;

  /* Must be after initialising snd_una. */
  ci_tcp_clear_rtt_timing(ts);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN);
  ts->tcpflags &=~ CI_TCPT_FLAG_OPT_MASK;
  ts->tcpflags |= NI_OPTS(ni).syn_opts;

  if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) {
    ts->rcv_wscl = ci_tcp_wscl_by_buff(ni, ci_tcp_rcvbuf_established(ni, &ts->s));
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl);
  }
  else {
    ts->rcv_wscl = 0;
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0);
  }
  ci_tcp_set_rcvbuf(ni, ts);
  ci_tcp_init_rcv_wnd(ts, "CONNECT");

  /* outgoing_hdrs_len is initialised to include timestamp option. */
  if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) )
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr)+sizeof(ci_tcp_hdr);
  if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32,
			ts->s.pkt.ip.ip_daddr_be32) )
    ts->tcpflags |= CI_TCPT_FLAG_STRIPE;
  ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT);

  /* If the app tries to send data on a socket in SYN_SENT state
  ** then the data is queued for send until the SYN gets ACKed.
  ** (rfc793 p56)
  **
  ** Receive calls on the socket should block until data arrives
  ** (rfc793 p58)
  **
  ** Clearing tx_errno and rx_errno achieves this. The transmit window
  ** is set to 1 byte which ensures that only the SYN packet gets
  ** sent until the ACK is received with more window.
  */
  ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1);
  ts->s.rx_errno = 0;
  ts->s.tx_errno = 0; 
  ci_tcp_enqueue_no_data(ts, ni, pkt);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);  

  if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) {
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    LOG_TC(log( LNT_FMT "Non-blocking connect - return EINPROGRESS",
		LNT_PRI_ARGS(ni, ts)));
    CI_SET_ERROR(*fail_rc, EINPROGRESS);
    return CI_CONNECT_UL_FAIL;
  }

  return CI_CONNECT_UL_OK;
}
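
A quick sanity check of the amss arithmetic at the top of Example #1: the
advertised MSS is the route MTU minus the fixed IPv4 and TCP header sizes.
A minimal standalone sketch, assuming option-less 20-byte headers (the sizes
sizeof(ci_ip4_hdr) and sizeof(ci_tcp_hdr) are taken to stand for):

#include <stdio.h>

/* Illustration only: MSS derived from MTU as in ci_tcp_connect_ul_start(). */
int main(void)
{
  unsigned mtu = 1500;                 /* typical Ethernet MTU */
  unsigned ip_hdr = 20, tcp_hdr = 20;  /* assumed header sizes, no options */
  unsigned amss = mtu - tcp_hdr - ip_hdr;
  printf("amss = %u\n", amss);         /* prints 1460 */
  return 0;
}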
Example #2
void ci_ip_timer_state_assert_valid(ci_netif* ni, const char* file, int line)
{
  ci_ip_timer_state* ipts;
  ci_ip_timer* ts;
  ci_ni_dllist_t* bucket;
  ci_ni_dllist_link* l;
  ci_iptime_t stime, wheel_base, max_time, min_time;
  int a1, a2, a3, w, b, bit_shift;

  /* Shifting a 32-bit integer left or right by 32 bits has undefined
   * results (i.e. not the 0 that is required here), so an array of mask
   * values is used instead.
   */
  unsigned wheel_mask[CI_IPTIME_WHEELS] = 
                { WHEEL0_MASK, WHEEL1_MASK, WHEEL2_MASK, 0 };

  ipts = IPTIMER_STATE(ni);
  stime = ipts->sched_ticks;
  
  /* for each wheel */
  for(w=0; w < CI_IPTIME_WHEELS; w++) {

    /* base time of wheel */
    wheel_base = stime & wheel_mask[w];
    /* for each bucket in wheel */
    for (b=0; b < CI_IPTIME_BUCKETS; b++) {

      /* max and min relative times for this bucket */
      bit_shift = CI_IPTIME_BUCKETBITS*w;
      min_time = wheel_base + (b << bit_shift);
      max_time = min_time   + (1 << bit_shift);

      bucket = &ipts->warray[w*CI_IPTIME_BUCKETS + b];

      /* check list looks valid */
      if ( ci_ni_dllist_start(ni, bucket) == ci_ni_dllist_end(ni, bucket) ) {
        ci_assert( ci_ni_dllist_is_empty(ni, bucket) );
      }

      /* check buckets that should be empty are! */
      a3 = TIME_GT(min_time, stime) || ci_ni_dllist_is_empty(ni, bucket);

      /* run through timers in bucket */
      for (l = ci_ni_dllist_start(ni, bucket);
           l != ci_ni_dllist_end(ni, bucket);
           ci_ni_dllist_iter(ni, l) ) {

        ci_ni_dllist_link_assert_valid(ni, l);

        /* get timer */  
        ts = LINK2TIMER(l);

        /* must be in the future */
        a1 = TIME_GT(ts->time, stime);
        /* must be within time range of bucket */
        a2 = TIME_LT(ts->time, max_time) && TIME_GE(ts->time, min_time);

        /* if any of the checks fail then print out timer details */
        if (!a1 || !a2 || !a3) {
          ci_log("%s: [w=0x%x/b=0x%x] stime=0x%x", __FUNCTION__, w, b, stime);
          ci_log("    --> t=0x%x, min=0x%x, max=0x%x", ts->time, min_time, max_time);
          ci_log("    [%s line=%d]", file, line);
        }
        /* stop if assertion failed */
        ci_assert(a1 && a2 && a3);
      }
    }
  }
}
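
The wheel_mask[] comment above is worth a concrete illustration: shifting a
32-bit value by 32 bits is undefined behaviour in C, so the "no bits kept"
mask for the top wheel has to come from a precomputed table rather than a
shift. Below is a minimal sketch of the same bucket arithmetic; the mask
values and bucket width are hypothetical stand-ins for WHEEL0_MASK..WHEEL2_MASK
and CI_IPTIME_BUCKETBITS, assuming 8-bit buckets per wheel:

#include <stdint.h>
#include <stdio.h>

#define BUCKET_BITS 8   /* stand-in for CI_IPTIME_BUCKETBITS */
#define WHEELS      4   /* stand-in for CI_IPTIME_WHEELS */

int main(void)
{
  /* mask[w] keeps the bits above wheel w; the top wheel's mask must be 0.
   * Computing it as 0xffffffffu << (BUCKET_BITS * WHEELS) would shift a
   * 32-bit value by 32 bits -- undefined behaviour -- hence the table. */
  uint32_t mask[WHEELS] = { 0xffffff00u, 0xffff0000u, 0xff000000u, 0u };

  uint32_t now = 0x12345678u;   /* current tick, cf. sched_ticks */
  int w = 1, b = 0x55;          /* wheel and bucket to inspect */
  uint32_t base     = now & mask[w];
  uint32_t min_time = base + ((uint32_t) b << (BUCKET_BITS * w));
  uint32_t max_time = min_time + (1u << (BUCKET_BITS * w));

  printf("wheel %d bucket 0x%x covers [0x%x, 0x%x)\n",
         w, b, (unsigned) min_time, (unsigned) max_time);
  return 0;
}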
Example #3
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        ci_ip_pkt_fmt* pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i )
          ci_netif_pkt_release_check_keep(ni, (ci_ip_pkt_fmt*)bufs[i]);
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }

    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);

  return rc;
}
Example #4
/* Move priv file to the alien_ni stack.
 * Should be called with the priv stack and socket locked;
 * the function returns with this stack unlocked.
 * If rc=0, it returns with the alien_ni stack locked;
 * otherwise, both stacks are unlocked.
 * The socket is always unlocked on return. */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));
  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in epoll list should not be moved, because waitq is already
   * in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* do not copy old_s->b.bufid
   * and other fields in stack address space */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    new_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif
   
    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return  **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep, and registering the
   * new.  However, the notifications shouldn't be in use for sockets
   * that are in a state that can be moved, so this shouldn't be a problem.
   */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);


  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);
    int i;

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0)
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }


  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert old filters because hw filters
   * are already here (in case of accepted socket) or not needed.
   * We have not removed old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
fail2:
  ci_netif_unlock(alien_ni);
fail1:
  ci_netif_unlock(&old_thr->netif);
fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
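
The stack-locking dance near the top of this function (hold the old stack's
lock, try-lock the destination stack, back off by dropping and re-taking the
old lock, and give up after a bounded number of attempts) is a standard
deadlock-avoidance pattern when two locks have no fixed order. A generic
sketch of the same idea using plain pthreads rather than Onload's netif locks:

#include <pthread.h>
#include <errno.h>

/* Generic illustration: acquire 'src' and 'dst' without a global lock order
 * by try-locking 'dst' while holding 'src'; on contention drop 'src' so the
 * other side can make progress, then retry a bounded number of times.
 * Returns 0 with both locks held, or -EBUSY with neither held. */
static int lock_both(pthread_mutex_t* src, pthread_mutex_t* dst)
{
  int attempts = 0;

  if( pthread_mutex_lock(src) != 0 )
    return -EBUSY;
  while( pthread_mutex_trylock(dst) != 0 ) {
    pthread_mutex_unlock(src);
    if( ++attempts >= 1000 )
      return -EBUSY;
    if( pthread_mutex_lock(src) != 0 )
      return -EBUSY;
  }
  return 0;
}

efab_file_move_to_alien_stack() follows the same shape with
ci_netif_trylock()/ci_netif_lock(), except that the old stack's lock is
already held on entry and the failure paths must leave it in the state the
leading comment documents.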
Example #5
static int oo_epoll2_ctl(struct oo_epoll_private *priv, int op_kepfd,
                         int op_op, int op_fd, struct epoll_event *op_event)
{
  tcp_helper_resource_t *fd_thr;
  struct file *file;
  int rc;
  ci_uint32 fd_sock_id;
  citp_waitable *fd_w;

  /* We are interested in ADD only */
  if( op_op != EPOLL_CTL_ADD )
    return efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event);

  /* system poll() and friends use fget_light(), which is cheap.
   * But they do not export fget_light to us, so we have to use fget(). */
  file = fget(op_fd);
  if(unlikely( file == NULL ))
    return -EBADF;

  /* Check for a cycle: we must not add this epoll fd to itself. */
  if(unlikely( file->private_data == priv )) {
    fput(file);
    return -EINVAL;
  }

  /* Is op->fd ours and if yes, which netif it has? */
  /* Fixme: epoll fd - do we want to accelerate something? */
  if( file->f_op != &linux_tcp_helper_fops_udp &&
      file->f_op != &linux_tcp_helper_fops_tcp ) {
    int rc;
#ifdef OO_EPOLL_NEED_NEST_PROTECTION
    struct oo_epoll_busy_task t;
    t.task = current;
    spin_lock(&priv->lock);
    list_add(&t.link, &priv->p.p2.busy_tasks);
    spin_unlock(&priv->lock);
#endif

#if CI_CFG_USERSPACE_PIPE
    if( ( file->f_op == &linux_tcp_helper_fops_pipe_reader ||
          file->f_op == &linux_tcp_helper_fops_pipe_writer ) )
      priv->p.p2.do_spin = 1;
#endif
    fput(file);
    rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event);
#ifdef OO_EPOLL_NEED_NEST_PROTECTION
    spin_lock(&priv->lock);
    list_del(&t.link);
    spin_unlock(&priv->lock);
#endif
    return rc;
  }

  /* Onload socket here! */
  fd_thr = ((ci_private_t *)file->private_data)->thr;
  fd_sock_id = ((ci_private_t *)file->private_data)->sock_id;
  priv->p.p2.do_spin = 1;

  if(unlikely( ! oo_epoll_add_stack(priv, fd_thr) )) {
    static int printed;
    if( !printed )
      ci_log("Can't add stack %d to epoll set: consider "
             "increasing epoll_max_stacks module option", fd_thr->id);
    /* fall through to sys_epoll_ctl() without interrupt */
  }

  /* Let kernel add fd to the epoll set, but ask endpoint to avoid enabling
   * interrupts.
   * And we keep file ref while using fd_w to avoid nasty things. */
  fd_w = SP_TO_WAITABLE(&fd_thr->netif, fd_sock_id);
  ci_bit_set(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT);
  rc = efab_linux_sys_epoll_ctl(op_kepfd, op_op, op_fd, op_event);
  ci_bit_clear(&fd_w->sb_aflags, CI_SB_AFLAG_AVOID_INTERRUPTS_BIT);
  fput(file);

  return rc;
}
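
For context, the user-space operation this kernel handler backs is an
ordinary epoll_ctl() ADD; everything above is about deciding whether the fd
belongs to an Onload stack and whether interrupts should be avoided while it
sits in the set. A minimal sketch of the application-side call:

#include <sys/epoll.h>

/* Sketch: the call that reaches oo_epoll2_ctl() with op_op == EPOLL_CTL_ADD. */
int watch_fd(int epfd, int fd)
{
  struct epoll_event ev = { .events = EPOLLIN, .data = { .fd = fd } };
  return epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}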
Example #6
int onload_zc_release_buffers(int fd, onload_zc_handle* bufs, int bufs_len)
{
  int rc = 0, i, rx_pkt, released;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt* pkt;

  Log_CALL(ci_log("%s(%d, %p, %d)", __FUNCTION__, fd, bufs, bufs_len));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < bufs_len; ++i ) {
        pkt = (ci_ip_pkt_fmt*)bufs[i];
        if( pkt->stack_id != ni->state->stack_id ) {
          LOG_U(log("%s: attempt to free buffer from stack %d to stack %d",
                    __FUNCTION__, pkt->stack_id, ni->state->stack_id));
          rc = -EINVAL;
          break;
        }
      }
      if( rc == 0 ) {
        for( i = 0; i < bufs_len; ++i ) {
          pkt = (ci_ip_pkt_fmt*)bufs[i];
          /* If we are releasing a packet without the RX_FLAG then the user
           * allocated and then freed the packet (without using it).
           * We detect this to decrement n_asyn_pkts.
           * RX packets (kept via ONLOAD_ZC_KEEP) are counted differently
           * so don't decrement here.  (But may release)
           */
          rx_pkt = pkt->flags & CI_PKT_FLAG_RX;
          released = ci_netif_pkt_release_check_keep(ni, pkt);
          if ( ! rx_pkt ) {
            ci_assert(released == 1);
            (void) released;
            --ni->state->n_async_pkts;
          }
        }
      }
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }

    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);

  return rc;
}
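
Example #6 is the stricter variant of Example #3: it additionally fixes up
the n_async_pkts accounting for buffers that the application allocated and
then released without ever using. From the application's side the pairing is
straightforward; below is a minimal sketch of allocating zero-copy buffers
and handing them straight back, assuming the onload_zc_alloc_buffers() and
struct onload_zc_iovec declarations from onload/extensions_zc.h (treat the
exact signatures and flag names as assumptions, not a reference):

#include <onload/extensions_zc.h>

/* Sketch: grab a few zero-copy buffers on an Onload-accelerated socket and
 * release the unused ones.  On a non-Onload fd the calls fail (e.g. the
 * release path above returns -ESOCKTNOSUPPORT). */
static int alloc_and_release(int sock)
{
  struct onload_zc_iovec iov[4];
  onload_zc_handle handles[4];
  int i, rc;

  rc = onload_zc_alloc_buffers(sock, iov, 4, ONLOAD_ZC_BUFFER_HDR_NONE);
  if( rc != 0 )
    return rc;

  for( i = 0; i < 4; i++ )
    handles[i] = iov[i].buf;   /* opaque handle used for the release call */

  /* ...a real sender would fill iov[i].iov_base and pass the buffers to an
   * onload_zc_send()-style call instead of releasing them unused... */

  return onload_zc_release_buffers(sock, handles, 4);
}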
Example #7
static int
efab_tcp_helper_sock_attach(ci_private_t* priv, void *arg)
{
  oo_sock_attach_t* op = arg;
  tcp_helper_resource_t* trs = priv->thr;
  tcp_helper_endpoint_t* ep = NULL;
  citp_waitable_obj *wo;
  int rc, flags, type = op->type;

/* Both SOCK_CLOEXEC and SOCK_NONBLOCK have existed since Linux 2.6.27 */
#ifdef SOCK_TYPE_MASK
  BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
  flags = type & (SOCK_CLOEXEC | SOCK_NONBLOCK);
  type &= SOCK_TYPE_MASK;
# ifdef SOCK_NONBLOCK
    if( SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK) )
      flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
# endif
#else
  flags = 0;
#endif

  OO_DEBUG_TCPH(ci_log("%s: ep_id=%d", __FUNCTION__, op->ep_id));
  if( trs == NULL ) {
    LOG_E(ci_log("%s: ERROR: not attached to a stack", __FUNCTION__));
    return -EINVAL;
  }

  /* Validate and find the endpoint. */
  if( ! IS_VALID_SOCK_P(&trs->netif, op->ep_id) )
    return -EINVAL;
  ep = ci_trs_get_valid_ep(trs, op->ep_id);
  if( tcp_helper_endpoint_set_aflags(ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED )
    return -EBUSY;
  wo = SP_TO_WAITABLE_OBJ(&trs->netif, ep->id);

  /* create OS socket */
  if( op->domain != AF_UNSPEC ) {
    struct socket *sock;
    struct file *os_file;

    rc = sock_create(op->domain, type, 0, &sock);
    if( rc < 0 ) {
      LOG_E(ci_log("%s: ERROR: sock_create(%d, %d, 0) failed (%d)",
                   __FUNCTION__, op->domain, type, rc));
      tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED);
      return rc;
    }
    os_file = sock_alloc_file(sock, flags, NULL);
    if( IS_ERR(os_file) ) {
      LOG_E(ci_log("%s: ERROR: sock_alloc_file failed (%ld)",
                   __FUNCTION__, PTR_ERR(os_file)));
      sock_release(sock);
      tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED);
      return PTR_ERR(os_file);
    }
    rc = efab_attach_os_socket(ep, os_file);
    if( rc < 0 ) {
      LOG_E(ci_log("%s: ERROR: efab_attach_os_socket failed (%d)",
                   __FUNCTION__, rc));
      /* NB. efab_attach_os_socket() consumes [os_file] even on error. */
      tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED);
      return rc;
    }
    wo->sock.domain = op->domain;
    wo->sock.ino = ep->os_socket->file->f_dentry->d_inode->i_ino;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
    wo->sock.uid = ep->os_socket->file->f_dentry->d_inode->i_uid;
#else
    wo->sock.uid = __kuid_val(ep->os_socket->file->f_dentry->d_inode->i_uid);
#endif
  }

  /* Create a new file descriptor to attach the stack to. */
  ci_assert((wo->waitable.state & CI_TCP_STATE_TCP) ||
            wo->waitable.state == CI_TCP_STATE_UDP);
  rc = oo_create_fd(ep, flags,
                    (wo->waitable.state & CI_TCP_STATE_TCP) ?
                    CI_PRIV_TYPE_TCP_EP : CI_PRIV_TYPE_UDP_EP);
  if( rc < 0 ) {
    ci_irqlock_state_t lock_flags;
    struct oo_file_ref* os_socket;
    ci_irqlock_lock(&ep->thr->lock, &lock_flags);
    os_socket = ep->os_socket;
    ep->os_socket = NULL;
    ci_irqlock_unlock(&ep->thr->lock, &lock_flags);
    if( os_socket != NULL )
      oo_file_ref_drop(os_socket);
    tcp_helper_endpoint_clear_aflags(ep, OO_THR_EP_AFLAG_ATTACHED);
    return rc;
  }

  op->fd = rc;
#ifdef SOCK_NONBLOCK
  if( op->type & SOCK_NONBLOCK )
    ci_bit_mask_set(&wo->waitable.sb_aflags, CI_SB_AFLAG_O_NONBLOCK);
#endif

  /* Re-read the OS socket buffer size settings.  This ensures we'll use
   * up-to-date values for this new socket.
   */
  efab_get_os_settings(&NI_OPTS_TRS(trs));
  return 0;
}
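
The flag handling at the top of this handler mirrors what the application
passed to socket(2): since Linux 2.6.27 the type argument may carry
SOCK_NONBLOCK and SOCK_CLOEXEC, which the kernel side strips off and turns
into file flags. A user-space sketch of the call being serviced:

#define _GNU_SOURCE
#include <sys/socket.h>

/* Sketch: the application-side socket() call whose flags end up in op->type. */
int make_nonblocking_tcp_socket(void)
{
  return socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
}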
Example #8
File: udp_fd.c  Project: ido/openonload
static int citp_udp_socket(int domain, int type, int protocol)
{
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ef_driver_handle fd;
  int rc;
  ci_netif* ni;

  Log_V(log(LPF "socket(%d, %d, %d)", domain, type, protocol));

  epi = CI_ALLOC_OBJ(citp_sock_fdi);
  if( ! epi ) {
    Log_U(ci_log(LPF "socket: failed to allocate epi"));
    errno = ENOMEM;
    goto fail1;
  }
  fdi = &epi->fdinfo;
  citp_fdinfo_init(fdi, &citp_udp_protocol_impl);

  rc = citp_netif_alloc_and_init(&fd, &ni);
  if( rc != 0 ) {
    if( rc == CI_SOCKET_HANDOVER ) {
      /* This implies EF_DONT_ACCELERATE is set, so we handover
       * regardless of CITP_OPTS.no_fail */
      CI_FREE_OBJ(epi);
      return rc;
    }
    goto fail2;
  }

  /* Protect the fdtable entry until we're done initialising. */
  if( fdtable_strict() )  CITP_FDTABLE_LOCK();
  if((fd = ci_udp_ep_ctor(&epi->sock, ni, domain, type)) < 0) {
    /*! ?? \TODO unpick the ci_udp_ep_ctor according to how failed */
    Log_U(ci_log(LPF "socket: udp_ep_ctor failed"));
    errno = -fd;
    goto fail3;
  }

  citp_fdtable_new_fd_set(fd, fdip_busy, fdtable_strict());
  if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();

  CI_DEBUG(epi->sock.s->pid = getpid());

  /* We're ready.  Unleash us onto the world! */
  ci_assert(epi->sock.s->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi->sock.s->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  citp_fdtable_insert(fdi, fd, 0);

  Log_VSS(log(LPF "socket(%d, %d, %d) = "EF_FMT, domain, type, protocol,
              EF_PRI_ARGS(epi,fd)));
  return fd;

 fail3:
  if( CITP_OPTS.no_fail && errno != ELIBACC )
    CITP_STATS_NETIF(++ni->state->stats.udp_handover_socket);
  citp_netif_release_ref(ni, 0);
 fail2:
  CI_FREE_OBJ(epi);
 fail1:
  /* BUG1408: Graceful failure. We'll only fail outright if there's a
   * driver/library mismatch */
  if( CITP_OPTS.no_fail && errno != ELIBACC ) {
    Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__, errno));
    return CI_SOCKET_HANDOVER;
  }
  return -1;
}
Example #9
/* fixme kostik: this is partially copy-paste from citp_sock_fcntl */
static int citp_pipe_fcntl(citp_fdinfo* fdinfo, int cmd, long arg)
{
  int rc = 0;
  citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdinfo);
  struct oo_pipe* p = epi->pipe;

  switch ( cmd ) {
  case F_GETFL: {
    ci_uint32 flag_nonb = CI_PFD_AFLAG_NONBLOCK;
    if( ! fdi_is_reader(fdinfo) ) {
      rc = O_WRONLY;
      flag_nonb <<= CI_PFD_AFLAG_WRITER_SHIFT;
    }
    else
      flag_nonb <<= CI_PFD_AFLAG_READER_SHIFT;
    if ( p->aflags & flag_nonb ) rc |= O_NONBLOCK;
    break;
  }
  case F_SETFL: {
    ci_uint32 bit;

    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    if( rc < 0 )
      break;

    bit = CI_PFD_AFLAG_NONBLOCK <<
                (fdi_is_reader(fdinfo) ? CI_PFD_AFLAG_READER_SHIFT :
                 CI_PFD_AFLAG_WRITER_SHIFT);
    if( arg & (O_NONBLOCK | O_NDELAY) )
      ci_bit_mask_set(&p->aflags, bit);
    else
      ci_bit_mask_clear(&p->aflags, bit);
    break;
  }
  case F_DUPFD:
    rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup, arg);
    break;
#ifdef F_DUPFD_CLOEXEC
  case F_DUPFD_CLOEXEC:
    rc = citp_ep_dup(fdinfo->fd, citp_ep_dup_fcntl_dup_cloexec, arg);
    break;
#endif
  case F_GETFD:
  case F_SETFD:
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    break;
  case F_GETLK:
  case F_SETLK:
  case F_SETLKW:
    /* File locks not supported on sockets */
    Log_U(ci_log("%s: cmd %d not supported on sockets!",__FUNCTION__,
                 cmd));
    errno = ENOTSUP;
    rc = CI_SOCKET_ERROR;
    break;
  case F_GETOWN:
  case F_SETOWN:
#ifdef F_GETOWN_EX
  case F_GETOWN_EX:
#endif
#ifdef F_SETOWN_EX
  case F_SETOWN_EX:
#endif
    rc = ci_sys_fcntl(fdinfo->fd, cmd, arg);
    if( rc != 0 )
        break;
    p->b.sigown = arg;
    if( p->b.sigown && (p->b.sb_aflags & CI_SB_AFLAG_O_ASYNC) )
      ci_bit_set(&p->b.wake_request, CI_SB_FLAG_WAKE_RX_B);
    break;
#ifdef F_SETPIPE_SZ
  case F_SETPIPE_SZ:
    /* System pipe buf size is rounded up to power of two. We
     * cannot replicate this.
     */
    rc = ci_pipe_set_size(epi->ni, p, arg);
    if( rc < 0 ) {
        errno = EINVAL;
        rc = CI_SOCKET_ERROR;
        break;
    }
    rc = 0;
    break;
#endif
#ifdef F_GETPIPE_SZ
  case F_GETPIPE_SZ:
    rc = (p->bufs_max - 1) * OO_PIPE_BUF_MAX_SIZE;
    break;
#endif
  default:
    /* fixme kostik: logging should include some pipe identification */
    errno = ENOTSUP;
    rc = CI_SOCKET_ERROR;
  }

  Log_VSC(log("%s(%d, %d, %ld) = %d  (errno=%d)",
              __FUNCTION__, fdinfo->fd, cmd, arg, rc, errno));

  return rc;
}
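
For comparison with the F_SETPIPE_SZ / F_GETPIPE_SZ branches above: on an
ordinary kernel pipe the requested size is rounded up to a power of two,
which is exactly the behaviour the comment says this user-level pipe cannot
replicate. A small standalone sketch of the system calls involved:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

/* Sketch: ask for a 100000-byte pipe buffer and read back what the kernel
 * actually granted (rounded up to a power of two, e.g. 131072). */
int main(void)
{
  int fds[2];
  if( pipe(fds) != 0 )
    return 1;
  if( fcntl(fds[1], F_SETPIPE_SZ, 100000) < 0 )
    perror("F_SETPIPE_SZ");
  printf("pipe buffer is %d bytes\n", fcntl(fds[1], F_GETPIPE_SZ));
  close(fds[0]);
  close(fds[1]);
  return 0;
}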
Example #10
/*
** promote a synrecv structure to an established socket
**
** Assumes that the caller will handle a fail if we can't allocate a new
** tcp_state structure due to memory pressure or the like
*/
int ci_tcp_listenq_try_promote(ci_netif* netif, ci_tcp_socket_listen* tls,
                               ci_tcp_state_synrecv* tsr,
                               ci_ip_cached_hdrs* ipcache,
                               ci_tcp_state** ts_out)
{
  int rc = 0;
  
  ci_assert(netif);
  ci_assert(tls);
  ci_assert(tls->s.b.state == CI_TCP_LISTEN);
  ci_assert(tsr);

  if( (int) ci_tcp_acceptq_n(tls) < tls->acceptq_max ) {
    ci_tcp_state* ts;

    /* grab a tcp_state structure that will go onto the accept queue.  We take
     * from the cache of EPs if any are available
     */
    ts = get_ts_from_cache (netif, tsr, tls); 
    if( !ts ) {
      /* None on cache; try allocating a new ts */
      ts = ci_tcp_get_state_buf(netif);
#if CI_CFG_FD_CACHING
      if( ts == NULL ) {
        /* We've reaped.  Did this result in any being cached? */
        ts = get_ts_from_cache(netif, tsr, tls);
        if (ts == NULL ) {
          /* No -- try again to allocate. */
          ts = ci_tcp_get_state_buf(netif);
        }
        else {
          CITP_STATS_NETIF(++netif->state->stats.sockcache_hit_reap);
        }
      }
#endif
      if( ts == NULL ) {
        LOG_TV(ci_log("%s: [%d] out of socket buffers",
                      __FUNCTION__, NI_ID(netif)));
        CITP_STATS_TCP_LISTEN(++tls->stats.n_acceptq_no_sock);
        CI_SET_SO_ERROR(&tls->s, ENOMEM);
        citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX);
        return -ENOMEM;
      }


      ci_assert(ci_tcp_is_cached(ts) ||
                (ts->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN));
    }

#ifdef ONLOAD_OFE
    ts->s.ofe_code_start = tls->ofe_promote;
#endif

    if( ! ci_tcp_is_cached(ts) ) {
      /* Need to initialise address information for use when setting filters */
      ci_tcp_set_addr_on_promote(netif, ts, tsr, tls);

      /* "borrow" filter from listening socket.  For loopback socket, we
       * do not need filters, but we have to take a reference of the OS
       * socket. */
      rc = ci_tcp_ep_set_filters(netif, S_SP(ts), ts->s.cp.so_bindtodevice,
                                 S_SP(tls));
      if( rc < 0 ) {
        LOG_U(ci_log("%s: Unable to set filters %d", __FUNCTION__, rc));
        /* Either put this back on the list (at the head) or free it */
        ci_tcp_state_free(netif, ts);
        return rc;
      }
    }
#if CI_CFG_FD_CACHING
    else {
      /* Now set the s/w filter.  We leave the hw filter in place for cached
       * EPS. This will probably not have the correct raddr and rport, but as
       * it's sharing the listening socket's filter that's not a problem.  It
       * will be updated if this is still around when the listener is closed.
       */
      rc = ci_netif_filter_insert(netif, S_SP(ts), tsr->l_addr,
                                  sock_lport_be16(&tls->s), tsr->r_addr,
                                  tsr->r_port, tcp_protocol(ts));

      if (rc < 0) {
        /* Bung it back on the cache list */
        LOG_EP(ci_log("Unable to create s/w filter!"));
        ci_ni_dllist_push(netif, &tls->epcache.cache, &ts->epcache_link);
        return rc;
      }

      /* Need to initialise address information.  We do this after trying to
       * insert the sw filter, so we can push the tcp state back onto the
       * cache queue with as few changes as possible if we fail to add the
       * sw filter.
       */
      ci_tcp_set_addr_on_promote(netif, ts, tsr, tls);

      LOG_EP(ci_log("Cached fd %d from cached to connected", ts->cached_on_fd));
      ci_ni_dllist_push(netif, &tls->epcache_connected, &ts->epcache_link);
    }
#endif

    ci_assert(IS_VALID_SOCK_P(netif, S_SP(ts)));
    ci_assert(ts->s.b.state == CI_TCP_CLOSED);
    ts->s.domain = tls->s.domain;

    cicp_ip_cache_update_from(netif, &ts->s.pkt, ipcache);
    ci_pmtu_state_init(netif, &ts->s, &ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
    ci_pmtu_set(netif, &ts->pmtus,
                CI_MIN(ts->s.pkt.mtu,
                       tsr->tcpopts.smss + sizeof(ci_tcp_hdr)
                         + sizeof(ci_ip4_hdr)));

    /* If we've got SYN via local route, we can handle it */
    ci_assert_equiv(ts->s.pkt.status == retrrc_localroute,
                    OO_SP_NOT_NULL(tsr->local_peer));
    if( ts->s.pkt.status == retrrc_localroute )
      ts->s.pkt.flags |= CI_IP_CACHE_IS_LOCALROUTE;

    ts->amss = tsr->amss;

    /* options and flags */
    ts->tcpflags = 0;
    ts->tcpflags |= tsr->tcpopts.flags;
    ts->tcpflags |= CI_TCPT_FLAG_PASSIVE_OPENED;
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr) + sizeof(ci_tcp_hdr);
    if( ts->tcpflags & CI_TCPT_FLAG_WSCL ) {
      ts->snd_wscl = tsr->tcpopts.wscl_shft;
      ts->rcv_wscl = tsr->rcv_wscl;
    } else {
      ts->snd_wscl = ts->rcv_wscl = 0u;
    }
    CI_IP_SOCK_STATS_VAL_TXWSCL( ts, ts->snd_wscl);
    CI_IP_SOCK_STATS_VAL_RXWSCL( ts, ts->rcv_wscl);

    /* Send and receive sequence numbers */
    tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
      tsr->snd_isn + 1;
    ci_tcp_set_snd_max(ts, tsr->rcv_nxt, tcp_snd_una(ts), 0);
    ci_tcp_rx_set_isn(ts, tsr->rcv_nxt);
    tcp_rcv_up(ts) = SEQ_SUB(tcp_rcv_nxt(ts), 1);

    if( ts->tcpflags & CI_TCPT_FLAG_TSO ) {
      ts->incoming_tcp_hdr_len += 12;
      ts->outgoing_hdrs_len += 12;
      ts->tspaws = ci_tcp_time_now(netif);
      ts->tsrecent = tsr->tspeer;
      ts->tslastack = tsr->rcv_nxt;
    }
    else {
      /* Must be after initialising snd_una. */
      ci_tcp_clear_rtt_timing(ts);
      ts->timed_ts = tsr->timest;
    }
    /* SACK has nothing to be done. */

    /* ?? ECN */
    ci_tcp_set_hdr_len(ts, (ts->outgoing_hdrs_len - sizeof(ci_ip4_hdr)));

    ts->smss = tsr->tcpopts.smss;
    ts->c.user_mss = tls->c.user_mss;
    if (ts->c.user_mss && ts->c.user_mss < ts->smss)
      ts->smss = ts->c.user_mss;
#if CI_CFG_LIMIT_SMSS
    ts->smss = ci_tcp_limit_mss(ts->smss, netif, __FUNCTION__);
#endif
    ci_assert(ts->smss>0);
    ci_tcp_set_eff_mss(netif, ts);
    ci_tcp_set_initialcwnd(netif, ts);

    /* Copy socket options & related fields that should be inherited. 
     * Note: Windows does not inherit rcvbuf until the call to accept 
     * completes. The assumption here is that all options can be
     * inherited at the same time (most won't have an effect until there
     * is a socket available for use by the app.).
     */
    ci_tcp_inherit_accept_options(netif, tls, ts, "SYN RECV (LISTENQ PROMOTE)");

    /* NB. Must have already set peer (which we have). */
    ci_tcp_set_established_state(netif, ts);
    CITP_STATS_NETIF(++netif->state->stats.synrecv2established);
  
    ci_assert(ts->ka_probes == 0);
    ci_tcp_kalive_restart(netif, ts, ci_tcp_kalive_idle_get(ts));
    ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);

    /* Remove the synrecv structure from the listen queue, and free the
    ** buffer. */
    if( tsr->tcpopts.flags & CI_TCPT_FLAG_SYNCOOKIE )
      ci_free(tsr);
    else {
      ci_tcp_listenq_remove(netif, tls, tsr);
      ci_tcp_synrecv_free(netif, tsr);
    }

    ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_TCP_IN_ACCEPTQ_BIT);
    ci_tcp_acceptq_put(netif, tls, &ts->s.b);

    LOG_TC(log(LNT_FMT "new ts=%d SYN-RECV->ESTABLISHED flags=0x%x",
               LNT_PRI_ARGS(netif, tls), S_FMT(ts), ts->tcpflags);
           log(LNTS_FMT RCV_WND_FMT " snd=%08x-%08x-%08x enq=%08x",
               LNTS_PRI_ARGS(netif, ts), RCV_WND_ARGS(ts),
               tcp_snd_una(ts),
               tcp_snd_nxt(ts), ts->snd_max, tcp_enq_nxt(ts)));

    citp_waitable_wake(netif, &tls->s.b, CI_SB_FLAG_WAKE_RX);
    *ts_out = ts;
    return 0;
  }
Example #11
static int parse_cfg_opt(int argc, char** argv, const char* context)
{
  const ci_cfg_desc* a;
  const char* val = NULL;
  int result = 1;

  /* is it "-" ? */
  if( argv[0][1] == 0 )  bad_cla(context, argv[0], "- is not allowed");

  /* find the config descriptor */
  a = 0;
  if( cfg_opts )  a = find_cfg_desc(argv[0], cfg_opts, n_cfg_opts, &val);
  if( (!a) && ci_app_standard_opts) {
     a = find_cfg_desc(argv[0], std_opts, N_STD_OPTS, &val);
  }
  if( !a )    bad_cla(context, argv[0], "unknown option");

  /* the option value (if required) may be part of this arg or the next */
  if( !val || *val == 0 ) {
    if( a->type == CI_CFG_FLAG || a->type == CI_CFG_USAGE || argc == 1 ) {
      val = 0;
    } else {
      val = argv[1];
      result = 2;
    }
  }

  switch( a->type ) {
  case CI_CFG_FLAG:
    if( val ) {
      if( sscanf(val, "%d", (int*) a->value) != 1 )
	bad_cla(context, argv[0], "expected integer or nothing");
    }
    else
      ++(*(int*) a->value);
    break;
  case CI_CFG_INT:
    if( !val || sscanf(val, "%i", (int*) a->value) != 1 )
      bad_cla(context, argv[0], "expected integer");
    break;
  case CI_CFG_UINT:
    if( !val || sscanf(val, "%i", (int*) a->value) != 1 )
      bad_cla(context, argv[0], "expected unsigned integer");
    break;
  case CI_CFG_INT64:
    if( !val || sscanf(val, "%lli", (long long int*) a->value) != 1 )
      bad_cla(context, argv[0], "expected 64bit integer");
    break;
  case CI_CFG_UINT64:
    if( !val || sscanf(val, "%lli", (long long int*) a->value) != 1 )
      bad_cla(context, argv[0], "expected unsigned 64bit integer");
    break;
  case CI_CFG_STR:
    *(const char**) a->value = val ? val : "";
    break;
  case CI_CFG_USAGE:
    ci_app_usage(0);
    break;
  case CI_CFG_FN:
    ci_assert(a->fn);
    a->fn(val, a);
    break;
  case CI_CFG_IRANGE:
    {
      int *v;
      v = (int*) a->value;
      if( sscanf(val, " %i - %i", v, v + 1) != 2 ) {
	if( sscanf(val, " %i", v) == 1 )
	  v[1] = v[0];
	else
	  bad_cla(context, argv[0], "expected integer or range");
      }
    }
    break;
  default:
    ci_log("ci_app: unknown config option type %u", a->type);
    break;
  }

  return result;
}
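
The CI_CFG_IRANGE case above accepts either a single integer or a "lo - hi"
range by trying the two-field sscanf() first and falling back to a single
field. The same pattern in isolation, using only standard C:

#include <stdio.h>

/* Sketch: parse "3-7" or "3" into v[0]..v[1]; a single number becomes the
 * degenerate range where both ends are equal.  Returns 0 on success. */
static int parse_irange(const char* val, int v[2])
{
  if( sscanf(val, " %i - %i", &v[0], &v[1]) == 2 )
    return 0;
  if( sscanf(val, " %i", &v[0]) == 1 ) {
    v[1] = v[0];
    return 0;
  }
  return -1;   /* neither form matched */
}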
Example #12
/* Return 1 if the bucket is empty now */
static int
ci_tcp_listenq_bucket_remove(ci_netif* ni, ci_tcp_socket_listen* tls,
                             ci_tcp_listen_bucket* bucket,
                             ci_tcp_state_synrecv* tsr, int level)
{
  ci_ni_aux_mem* aux;
  int idx = ci_tcp_listenq_hash2idx(tsr->hash, level);
  oo_p tsr_p = ci_tcp_synrecv2p(ni, tsr);

  /* Fixme: we remove empty buckets only.  In theory, it may be useful to
   * remove a bucket with one non-empty list, but it makes the code more
   * complicated. */
  int empty = 0;
#ifdef __KERNEL__
  int i = 0;

  if( level > CI_LISTENQ_BUCKET_MAX_DEPTH(ni) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE, __FUNCTION__);
    return 0;
  }
#endif

  LOG_TV(ci_log("%s([%d] level=%d "TSR_FMT")", __func__,
                NI_ID(ni), level, TSR_ARGS(tsr)));
  ci_assert( OO_P_NOT_NULL(bucket->bucket[idx]) );
#ifdef __KERNEL__
  if( OO_P_IS_NULL(bucket->bucket[idx]) ) {
    ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                            __FUNCTION__);
    return 0;
  }
#endif

  level++;
  aux = ci_ni_aux_p2aux(ni, bucket->bucket[idx]);
  if( aux->type == CI_TCP_AUX_TYPE_BUCKET ) {
    empty = ci_tcp_listenq_bucket_remove(ni, tls, &aux->u.bucket, tsr, level);
    if( empty ) {
      bucket->bucket[idx] = OO_P_NULL;
      ci_ni_aux_free(ni, aux);
      tls->n_buckets--;
    }
  }
  else {
    if( bucket->bucket[idx] == tsr_p ) {
      bucket->bucket[idx] = tsr->bucket_link;
      empty = OO_P_IS_NULL(bucket->bucket[idx]);
    }
    else {
      ci_tcp_state_synrecv* prev = &aux->u.synrecv;
      while( prev->bucket_link != tsr_p ) {
        aux = ci_ni_aux_p2aux(ni, prev->bucket_link);
        prev = &aux->u.synrecv;
#ifdef __KERNEL__
        if( i++ > CI_LISTENQ_BUCKET_LIST_LIMIT(ni) ) {
          ci_netif_error_detected(ni, CI_NETIF_ERROR_SYNRECV_TABLE,
                                  __FUNCTION__);
          return 0;
        }
#endif
      }
      prev->bucket_link = tsr->bucket_link;
    }
  }

  if( empty ) {
    int i;
    for( i = 0; i < CI_TCP_LISTEN_BUCKET_SIZE; i++ )
      if( OO_P_NOT_NULL(bucket->bucket[i]) )
        return 0;
    return 1;
  }
  return 0;
}
Example #13
/* c_ni is assumed to be locked on entrance and is always unlocked on
 * exit. */
int ci_tcp_connect_lo_toconn(ci_netif *c_ni, oo_sp c_id, ci_uint32 dst,
                             ci_netif *l_ni, oo_sp l_id)
{
  ci_tcp_state *ts;
  ci_tcp_socket_listen *tls, *alien_tls;
  citp_waitable_obj *wo;
  citp_waitable *w;
  int rc;

  ci_assert(ci_netif_is_locked(c_ni));
  ci_assert(OO_SP_NOT_NULL(c_id));
  ci_assert(OO_SP_NOT_NULL(l_id));

  LOG_TC(log("%s: connect %d:%d to %d:%d", __FUNCTION__,
             c_ni->state->stack_id, OO_SP_TO_INT(c_id),
             l_ni->state->stack_id, OO_SP_TO_INT(l_id)));

  alien_tls = SP_TO_TCP_LISTEN(l_ni, l_id);
  if( (int)ci_tcp_acceptq_n(alien_tls) >= alien_tls->acceptq_max ) {
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }

  /* In c_ni, create shadow listening socket tls (copy l_id) */
  ts = ci_tcp_get_state_buf(c_ni);
  if( ts == NULL ) {
    ci_netif_unlock(c_ni);
    LOG_E(ci_log("%s: [%d] out of socket buffers", __FUNCTION__, NI_ID(c_ni)));
    return -ENOMEM;
  }

  /* init common tcp fields */
  ts->s.so = alien_tls->s.so;
  ts->s.cp.ip_ttl = alien_tls->s.cp.ip_ttl;
  S_TCP_HDR(&ts->s)->tcp_source_be16 =
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16;
  ts->s.domain = alien_tls->s.domain;
  ts->c = alien_tls->c;
  ts->c.tcp_defer_accept = OO_TCP_DEFER_ACCEPT_OFF;

  /* make sure nobody will ever connect to our "shadow" socket
   * except us */
  ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);

  ci_tcp_set_slow_state(c_ni, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);
  /* no timer: */
  tls->s.s_flags = alien_tls->s.s_flags | CI_SOCK_FLAG_BOUND_ALIEN;

  tls->acceptq_max = 1;
  rc = ci_tcp_listen_init(c_ni, tls);
  if( rc != 0 ) {
    citp_waitable_obj_free(c_ni, &tls->s.b);
    return rc;
  }

  /* Connect c_id to tls */
  ts = SP_TO_TCP(c_ni, c_id);
  rc = ci_tcp_connect_lo_samestack(c_ni, ts, tls->s.b.bufid);

  /* Accept as from tls */
  if( !ci_tcp_acceptq_not_empty(tls) ) {
    /* it is possible, for example, if ci_tcp_listenq_try_promote() failed
     * because there are no endpoints */
    ci_tcp_listenq_drop_all(c_ni, tls);
    citp_waitable_obj_free(c_ni, &tls->s.b);
    ci_netif_unlock(c_ni);
    return -EBUSY;
  }
  w = ci_tcp_acceptq_get(c_ni, tls);
  ci_assert(w);
  LOG_TV(ci_log("%s: %d:%d to %d:%d shadow %d:%d accepted %d:%d",
                __FUNCTION__,
                c_ni->state->stack_id, OO_SP_TO_INT(c_id),
                l_ni->state->stack_id, OO_SP_TO_INT(l_id),
                c_ni->state->stack_id, tls->s.b.bufid,
                c_ni->state->stack_id, w->bufid));

  ci_assert(w->state & CI_TCP_STATE_TCP);
  ci_assert(w->state != CI_TCP_LISTEN);

  /* Destroy tls.
   * NB: nobody could possibly connect to it, so no need to do proper
   * shutdown.
   */
  ci_assert_equal(ci_tcp_acceptq_n(tls), 0);
  ci_tcp_listenq_drop_all(c_ni, tls);
  citp_waitable_obj_free(c_ni, &tls->s.b);
  ci_netif_unlock(c_ni);

  /* Keep a port reference */
  {
    tcp_helper_endpoint_t *l_ep, *a_ep;
    struct oo_file_ref* os_sock_ref;
    ci_irqlock_state_t lock_flags;

    l_ep = ci_trs_ep_get(netif2tcp_helper_resource(l_ni), l_id);
    a_ep = ci_trs_ep_get(netif2tcp_helper_resource(c_ni), W_SP(w));
    ci_irqlock_lock(&l_ep->thr->lock, &lock_flags);
    os_sock_ref = l_ep->os_socket;
    ci_assert_equal(a_ep->os_port_keeper, NULL);
    if( os_sock_ref != NULL ) {
      os_sock_ref = oo_file_ref_add(os_sock_ref);
      os_sock_ref = oo_file_ref_xchg(&a_ep->os_port_keeper, os_sock_ref);
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      if( os_sock_ref != NULL )
        oo_file_ref_drop(os_sock_ref);
    }
    else {
      ci_irqlock_unlock(&l_ep->thr->lock, &lock_flags);
      goto cleanup;
    }
  }

  /* lock l_ni: Check that l_id is the same socket it used to be */
  /* create ref-sock in l_ni, put it into acc q */
  if( ci_netif_lock(l_ni) != 0 )
    goto cleanup;
  if( alien_tls->s.b.state != CI_TCP_LISTEN ||
      (alien_tls->s.b.sb_aflags & CI_SB_AFLAG_ORPHAN) ||
      S_TCP_HDR(&alien_tls->s)->tcp_source_be16 != TS_TCP(ts)->tcp_dest_be16 ||
      (alien_tls->s.pkt.ip.ip_saddr_be32 != INADDR_ANY &&
       alien_tls->s.pkt.ip.ip_saddr_be32 != ts->s.pkt.ip.ip_daddr_be32) ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }

  ci_bit_mask_set(&w->sb_aflags,
                  CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);

  wo = citp_waitable_obj_alloc(l_ni);
  if( wo == NULL ) {
    ci_netif_unlock(l_ni);
    goto cleanup;
  }
  wo->waitable.state = CI_TCP_CLOSED;
  wo->waitable.sb_aflags |= CI_SB_AFLAG_MOVED_AWAY;
  wo->waitable.moved_to_stack_id = c_ni->state->stack_id;
  wo->waitable.moved_to_sock_id = W_SP(w);
  LOG_TC(log("%s: put to acceptq %d:%d referencing %d:%d", __func__,
             l_ni->state->stack_id, OO_SP_TO_INT(W_SP(&wo->waitable)),
             c_ni->state->stack_id, OO_SP_TO_INT(W_SP(w))));

  ci_tcp_acceptq_put(l_ni, alien_tls, &wo->waitable);
  citp_waitable_wake_not_in_poll(l_ni, &alien_tls->s.b, CI_SB_FLAG_WAKE_RX);
  ci_netif_unlock(l_ni);

  return rc;

cleanup:
  ci_assert(w->sb_aflags & CI_SB_AFLAG_ORPHAN);
  ci_bit_mask_clear(&w->sb_aflags,
                    CI_SB_AFLAG_TCP_IN_ACCEPTQ | CI_SB_AFLAG_ORPHAN);
  efab_tcp_helper_close_endpoint(netif2tcp_helper_resource(c_ni), w->bufid);
  /* we can not guarantee c_ni lock, so we can't call
   * ci_tcp_drop(c_ni, ts).  So, we return error; UL will handover
   * and close ts endpoint. */
  return -EBUSY;
}
Example #14
/* Returns:
 *          0                  on success
 *          
 *          CI_SOCKET_ERROR (and errno set)
 *                             this is a normal error that is returned to
 *                             the application
 *
 *          CI_SOCKET_HANDOVER we tell the upper layers to handover, no need
 *                             to set errno since it isn't a real error
 */
int ci_tcp_connect(citp_socket* ep, const struct sockaddr* serv_addr,
		   socklen_t addrlen, ci_fd_t fd, int *p_moved)
{
  /* Address family is validated earlier. */
  struct sockaddr_in* inaddr = (struct sockaddr_in*) serv_addr;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = &SOCK_TO_WAITABLE_OBJ(s)->tcp;
  int rc = 0, crc;
  ci_uint32 dst_be32;

  if( NI_OPTS(ep->netif).tcp_connect_handover )
    return CI_SOCKET_HANDOVER;

  /* Make sure we're up-to-date. */
  ci_netif_lock(ep->netif);
  CHECK_TEP(ep);
  ci_netif_poll(ep->netif);

  /*
   * 1. Check if state of the socket is OK for connect operation.
   */

 start_again:

  if( (rc = ci_tcp_connect_handle_so_error(s)) != 0) {
    CI_SET_ERROR(rc, rc);
    goto unlock_out;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    /* see if progress can be made on this socket before
    ** determining status  (e.g. non-blocking connect and connect poll)*/
    if( s->b.state & CI_TCP_STATE_SYNCHRONISED ) {
      if( ts->tcpflags & CI_TCPT_FLAG_NONBLOCK_CONNECT ) {
        ts->tcpflags &= ~CI_TCPT_FLAG_NONBLOCK_CONNECT;
	rc = 0;
	goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC )
        LOG_E(ci_log("Onload does not support TCP disconnect via "

                     "connect(addr->sa_family==AF_UNSPEC)"));
      CI_SET_ERROR(rc, EISCONN);
    }
    else if( s->b.state == CI_TCP_LISTEN ) {
#if CI_CFG_POSIX_CONNECT_AFTER_LISTEN
      CI_SET_ERROR(rc, EOPNOTSUPP);
#else
      if( ci_tcp_validate_sa(s->domain, serv_addr, addrlen) ) {
        /* Request should be forwarded to OS */
        rc = CI_SOCKET_HANDOVER;
	goto unlock_out;
      }
      if( serv_addr->sa_family == AF_UNSPEC ) {
        /* Linux does listen shutdown on disconnect (AF_UNSPEC) */
        ci_netif_unlock(ep->netif);
        rc = ci_tcp_shutdown(ep, SHUT_RD, fd);
	goto out;
      } else {
        /* Linux has curious error reporting in this case */
        CI_SET_ERROR(rc, EISCONN);
      }
#endif
    }
    else {
      /* Socket is in SYN-SENT state. Let's block for receiving SYN-ACK */
      ci_assert_equal(s->b.state, CI_TCP_SYN_SENT);
      if( s->b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) )
        CI_SET_ERROR(rc, EALREADY);
      else
        goto syn_sent;
    }
    goto unlock_out;
  }

  /* Check if we've ever been connected. */
  if( ts->tcpflags & CI_TCPT_FLAG_WAS_ESTAB ) {
    CI_SET_ERROR(rc, EISCONN);
    goto unlock_out;
  }

  /*
   * 2. Check the address parameter; if it's inappropriate for a handover
   *    decision, or handover should be done, try to call the OS and
   *    do handover on success.
   */

  if (
    /* At first, check that the address family and length are OK. */
    ci_tcp_validate_sa(s->domain, serv_addr, addrlen)
    /* rfc793 p54 if the foreign socket is unspecified return          */
    /* "error: foreign socket unspecified" (EINVAL), but keep it to OS */
    || (dst_be32 = ci_get_ip4_addr(inaddr->sin_family, serv_addr)) == 0
    /* Zero destination port is tricky as well, keep it to OS */
    || inaddr->sin_port == 0 )
  {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }
  
  /* is this a socket that we can handle? */
  rc = ci_tcp_connect_check_dest(ep, dst_be32, inaddr->sin_port);
  if( rc )  goto unlock_out;

  if( (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) &&
      OO_SP_IS_NULL(ts->local_peer) ) {
    /* Try to connect to another stack; handover if can't */
    struct oo_op_loopback_connect op;
    op.dst_port = inaddr->sin_port;
    op.dst_addr = dst_be32;
    /* this operation unlocks netif */
    rc = oo_resource_op(fd, OO_IOC_TCP_LOOPBACK_CONNECT, &op);
    if( rc < 0)
      return CI_SOCKET_HANDOVER;
    if( op.out_moved )
      *p_moved = 1;
    if( op.out_rc == -EINPROGRESS )
      RET_WITH_ERRNO( EINPROGRESS );
    else if( op.out_rc == -EAGAIN )
      return -EAGAIN;
    else if( op.out_rc != 0 )
      return CI_SOCKET_HANDOVER;
    return 0;
  }

  /* filters can't handle alien source address */
  if( (s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN) &&
      ! (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) ) {
    rc = CI_SOCKET_HANDOVER;
    goto unlock_out;
  }

  crc = ci_tcp_connect_ul_start(ep->netif, ts, dst_be32, inaddr->sin_port, &rc);
  if( crc != CI_CONNECT_UL_OK ) {
    switch( crc ) {
    case CI_CONNECT_UL_FAIL:
      goto unlock_out;
    case CI_CONNECT_UL_LOCK_DROPPED:
      goto out;
    case CI_CONNECT_UL_START_AGAIN:
      goto start_again;
    }
  }
  CI_TCP_STATS_INC_ACTIVE_OPENS( ep->netif );

 syn_sent:
  rc = ci_tcp_connect_ul_syn_sent(ep->netif, ts);

 unlock_out:
  ci_netif_unlock(ep->netif);
 out:
  return rc;
}
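
The return-value contract documented at the top of this example (0 on
success, CI_SOCKET_ERROR with errno set, or CI_SOCKET_HANDOVER with errno
left alone) is what intercept-layer callers dispatch on. A purely
illustrative caller-side sketch; only the three return conventions come from
the comment above, and hand_socket_over_to_os() is a made-up placeholder:

/* Hypothetical dispatch on ci_tcp_connect()'s documented return values. */
static int my_connect_intercept(citp_socket* ep, const struct sockaddr* sa,
                                socklen_t salen, ci_fd_t fd, int* p_moved)
{
  int rc = ci_tcp_connect(ep, sa, salen, fd, p_moved);

  if( rc == CI_SOCKET_HANDOVER ) {
    /* Not a real error: fall back to the kernel stack; errno is untouched. */
    return hand_socket_over_to_os(fd);   /* hypothetical helper */
  }
  /* rc == 0 on success, or CI_SOCKET_ERROR with errno already set. */
  return rc;
}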
Example #15
int ef_onload_driver_open(ef_driver_handle* pfd,
                          enum oo_device_type dev_type,
                          int do_cloexec)
{
  int rc;
  int flags = 0;
  int saved_errno = errno;

#ifdef O_CLOEXEC
  if( do_cloexec )
    flags = O_CLOEXEC;
#endif

  ci_assert(pfd);
  rc = oo_open(pfd, dev_type, flags);
  if( rc != 0 && errno != EMFILE && fd_is_saved[dev_type] >= 0 ) {
    ci_clone_fd_t op;
    op.do_cloexec = do_cloexec;
    LOG_NV(ci_log("%s: open failed, but cloning from saved fd", __func__));
    rc = ci_sys_ioctl((ci_fd_t) saved_fd[dev_type],
                      clone_ioctl[dev_type], &op);
    if( rc < 0 )
      return rc;
    errno = saved_errno;
    *pfd = op.fd;
  }

  if( rc != 0 )
    return rc;

  /* Our internal driver handles are not visible to the application.  It may
   * make assumptions about the fd space available to it, and try to dup2/3
   * onto one of our driver fds.  To try and minimise this we allow the user
   * to specify a minimum value for us to use, to try and keep out of their
   * way.
   *
   * We have to be able to cope with them coming along and trying to dup onto
   * one of these fds anyway, as they may not have set the option up.  As such
   * we treat failure to shift the fd as acceptable, and just retain the old
   * one.
   */
  if( *pfd < CITP_OPTS.fd_base )
    if( ef_onload_handle_move_and_do_cloexec(pfd, do_cloexec) == 0 )
      return 0;
      
  if( do_cloexec ) {
#if defined(O_CLOEXEC)
    static int o_cloexec_fails = -1;
    if( o_cloexec_fails < 0 ) {
      int arg;
      rc = ci_sys_fcntl(*(int *)pfd, F_GETFD, &arg);
      if( rc == 0 && (arg & FD_CLOEXEC) )
        o_cloexec_fails = 0;
      else
        o_cloexec_fails = 1;
    }
#else
    static const int o_cloexec_fails = 1;
#endif
    if( o_cloexec_fails )
      CI_DEBUG_TRY(ci_sys_fcntl(*(int *)pfd, F_SETFD, FD_CLOEXEC));
  }

  return 0;
}
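
The long comment above explains why Onload tries to keep its internal driver fds above CITP_OPTS.fd_base, out of the application's way. Below is a minimal sketch of that idea using plain POSIX fcntl(F_DUPFD_CLOEXEC); it is an illustration only, not the actual ef_onload_handle_move_and_do_cloexec() implementation.

#include <fcntl.h>
#include <unistd.h>

/* Sketch: duplicate fd to the lowest free descriptor >= fd_floor, then close
 * the original.  If the shift fails, keep the old fd -- the same "failure to
 * shift is acceptable" policy as the function above. */
static int move_fd_above(int fd, int fd_floor, int do_cloexec)
{
  int newfd = fcntl(fd, do_cloexec ? F_DUPFD_CLOEXEC : F_DUPFD, fd_floor);
  if( newfd < 0 )
    return fd;
  close(fd);
  return newfd;
}
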
Example #16
/* we don't register protocol impl */
int citp_pipe_create(int fds[2], int flags)
{
  citp_pipe_fdi* epi_read;
  citp_pipe_fdi* epi_write;
  struct oo_pipe* p = NULL;         /* make compiler happy */
  ci_netif* ni;
  int rc = -1;
  ef_driver_handle fd = -1;

  Log_V(log(LPF "pipe()"));

  /* citp_netif_exists() does not need citp_ul_lock here */
  if( CITP_OPTS.ul_pipe == CI_UNIX_PIPE_ACCELERATE_IF_NETIF &&
      ! citp_netif_exists() ) {
    return CITP_NOT_HANDLED;
  }

  rc = citp_netif_alloc_and_init(&fd, &ni);
  if( rc != 0 ) {
    if( rc == CI_SOCKET_HANDOVER ) {
      /* This implies EF_DONT_ACCELERATE is set, so we handover
       * regardless of CITP_OPTS.no_fail */
      return CITP_NOT_HANDLED;
    }
    /* may be lib mismatch - errno will be ELIBACC */
    goto fail1;
  }
  rc = -1;

  CI_MAGIC_CHECK(ni, NETIF_MAGIC);

  /* add another reference as we have 2 fdis */
  citp_netif_add_ref(ni);

  epi_read = citp_pipe_epi_alloc(ni, O_RDONLY);
  if( epi_read == NULL )
    goto fail2;
  epi_write = citp_pipe_epi_alloc(ni, O_WRONLY);
  if( epi_write == NULL )
    goto fail3;

  /* oo_pipe init code */
  if( fdtable_strict() )  CITP_FDTABLE_LOCK();
  rc = oo_pipe_ctor(ni, &p, fds, flags);
  if( rc < 0 )
      goto fail4;
  citp_fdtable_new_fd_set(fds[0], fdip_busy, fdtable_strict());
  citp_fdtable_new_fd_set(fds[1], fdip_busy, fdtable_strict());
  if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();

  LOG_PIPE("%s: pipe=%p id=%d", __FUNCTION__, p, p->b.bufid);

  /* now that the pipe is created, attach it to both endpoints */
  epi_read->pipe = p;
  epi_write->pipe = p;

  /* We're ready.  Unleash us onto the world! */
  ci_assert(epi_read->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_assert(epi_write->pipe->b.sb_aflags & CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi_read->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  ci_atomic32_and(&epi_write->pipe->b.sb_aflags, ~CI_SB_AFLAG_NOT_READY);
  citp_fdtable_insert(&epi_read->fdinfo, fds[0], 0);
  citp_fdtable_insert(&epi_write->fdinfo, fds[1], 0);

  CI_MAGIC_CHECK(ni, NETIF_MAGIC);

  return 0;

fail4:
  if( fdtable_strict() )  CITP_FDTABLE_UNLOCK();
fail3:
  CI_FREE_OBJ(epi_write);
fail2:
  CI_FREE_OBJ(epi_read);
  citp_netif_release_ref(ni, 0);
  citp_netif_release_ref(ni, 0);
fail1:
  if( CITP_OPTS.no_fail && errno != ELIBACC ) {
    Log_U(ci_log("%s: failed (errno:%d) - PASSING TO OS", __FUNCTION__, errno));
    return CITP_NOT_HANDLED;
  }

  return rc;
}
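
citp_pipe_create() above uses the staged goto-unwind error path (fail1..fail4), releasing resources in reverse order of acquisition. A minimal, self-contained sketch of the idiom with hypothetical names:

#include <stdlib.h>

/* Sketch of the staged goto-unwind idiom: each failure label undoes exactly
 * the resources acquired before it, in reverse order. */
struct pair { void* a; void* b; };

static int pair_init(struct pair* p)
{
  if( (p->a = malloc(64)) == NULL )
    goto fail1;
  if( (p->b = malloc(64)) == NULL )
    goto fail2;
  return 0;

 fail2:
  free(p->a);
 fail1:
  return -1;
}
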
Example #17
File: epoll_fd.c  Project: ido/openonload
int citp_epoll_create(int size, int flags)
{
  citp_fdinfo    *fdi;
  citp_epoll_fdi *epi;
  struct citp_epoll_fd* ep;
  int            fd;

  if( (epi = CI_ALLOC_OBJ(citp_epoll_fdi)) == NULL )
    goto fail0;
  if( (ep = CI_ALLOC_OBJ(struct citp_epoll_fd)) == NULL )
    goto fail1;
  fdi = &epi->fdinfo;
  citp_fdinfo_init(fdi, &citp_epoll_protocol_impl);

  /* Create the epoll fd. */
  CITP_FDTABLE_LOCK();
  if( (fd = ci_sys_epoll_create_compat(size, flags, 0)) < 0 )
    goto fail2;
  citp_fdtable_new_fd_set(fd, fdip_busy, TRUE);

  /* Init epfd_os */
#ifdef O_CLOEXEC
  ep->epfd_os = ci_sys_open(OO_EPOLL_DEV, O_RDWR | O_CLOEXEC);
#else
  ep->epfd_os = ci_sys_open(OO_EPOLL_DEV, O_RDWR);
  if( ep->epfd_os >= 0 )
    ci_sys_fcntl(ep->epfd_os, F_SETFD, FD_CLOEXEC);
#endif
  if( ep->epfd_os < 0 ) {
    Log_E(ci_log("%s: ERROR: failed to open(%s) errno=%d",
                 __FUNCTION__, OO_EPOLL_DEV, errno));
    goto fail3;
  }
  __citp_fdtable_reserve(ep->epfd_os, 1);
  ep->shared = mmap(NULL, sizeof(*ep->shared), PROT_READ, MAP_SHARED,
                     ep->epfd_os, 0);
  if( ep->shared == MAP_FAILED ) {
    Log_E(ci_log("%s: ERROR: failed to mmap shared segment errno=%d",
                 __FUNCTION__, errno));
    goto fail4;
  }
  __citp_fdtable_reserve(ep->shared->epfd, 1);
  CITP_FDTABLE_UNLOCK();

  epi->epoll = ep;
  ep->size = size;
  oo_wqlock_init(&ep->lock);
  ep->not_mt_safe = ! CITP_OPTS.ul_epoll_mt_safe;
  ci_dllist_init(&ep->oo_sockets);
  ep->oo_sockets_n = 0;
  ci_dllist_init(&ep->dead_sockets);
  oo_atomic_set(&ep->refcount, 1);
  ep->epfd_syncs_needed = 0;
  ep->blocking = 0;
  citp_fdtable_insert(fdi, fd, 0);
  Log_POLL(ci_log("%s: fd=%d driver_fd=%d epfd=%d", __FUNCTION__,
                  fd, ep->epfd_os, (int) ep->shared->epfd));
  return fd;

 fail4:
  __citp_fdtable_reserve(ep->epfd_os, 0);
  ci_sys_close(ep->epfd_os);
 fail3:
  ci_sys_close(fd);
  citp_fdtable_busy_clear(fd, fdip_unknown, 1);
 fail2:
  CITP_FDTABLE_UNLOCK();
  CI_FREE_OBJ(ep);
 fail1:
  CI_FREE_OBJ(epi);
 fail0:
  return -1;
}
Example #18
static void citp_dump_opts(citp_opts_t *o)
{
  /* ?? TODO: should be using opts_cittp_def.h here */

# define DUMP_OPT_INT(envstr, name)		\
  ci_log("%s=%d", (envstr), (int) o->name)
# define DUMP_OPT_HEX(envstr, name)		\
  ci_log("%s=%x", (envstr), (unsigned) o->name)

  DUMP_OPT_HEX("EF_UNIX_LOG",		log_level);
  DUMP_OPT_INT("EF_PROBE",		probe);
  DUMP_OPT_INT("EF_TCP",		ul_tcp);
  DUMP_OPT_INT("EF_UDP",		ul_udp);
  DUMP_OPT_INT("EF_UL_SELECT",		ul_select);
  DUMP_OPT_INT("EF_SELECT_SPIN",	ul_select_spin);
  DUMP_OPT_INT("EF_SELECT_FAST",	ul_select_fast);
  DUMP_OPT_INT("EF_UL_POLL",		ul_poll);
  DUMP_OPT_INT("EF_POLL_SPIN",		ul_poll_spin);
  DUMP_OPT_INT("EF_POLL_FAST",		ul_poll_fast);
  DUMP_OPT_INT("EF_POLL_FAST_USEC",	ul_poll_fast_usec);
  DUMP_OPT_INT("EF_POLL_NONBLOCK_FAST_USEC", ul_poll_nonblock_fast_usec);
  DUMP_OPT_INT("EF_SELECT_FAST_USEC",	ul_select_fast_usec);
  DUMP_OPT_INT("EF_SELECT_NONBLOCK_FAST_USEC", ul_select_nonblock_fast_usec);
#if CI_CFG_UDP
  DUMP_OPT_INT("EF_UDP_RECV_SPIN",      udp_recv_spin);
  DUMP_OPT_INT("EF_UDP_SEND_SPIN",      udp_send_spin);
#endif
  DUMP_OPT_INT("EF_TCP_RECV_SPIN",      tcp_recv_spin);
  DUMP_OPT_INT("EF_TCP_SEND_SPIN",      tcp_send_spin);
  DUMP_OPT_INT("EF_TCP_ACCEPT_SPIN",    tcp_accept_spin);
  DUMP_OPT_INT("EF_TCP_CONNECT_SPIN",   tcp_connect_spin);
  DUMP_OPT_INT("EF_PKT_WAIT_SPIN",      pkt_wait_spin);
#if CI_CFG_USERSPACE_PIPE
  DUMP_OPT_INT("EF_PIPE_RECV_SPIN",     pipe_recv_spin);
  DUMP_OPT_INT("EF_PIPE_SEND_SPIN",     pipe_send_spin);
  DUMP_OPT_INT("EF_PIPE_SIZE",          pipe_size);
#endif
  DUMP_OPT_INT("EF_SOCK_LOCK_BUZZ",     sock_lock_buzz);
  DUMP_OPT_INT("EF_STACK_LOCK_BUZZ",    stack_lock_buzz);
  DUMP_OPT_INT("EF_SO_BUSY_POLL_SPIN",  so_busy_poll_spin);
#if CI_CFG_USERSPACE_EPOLL
  DUMP_OPT_INT("EF_UL_EPOLL",	        ul_epoll);
  DUMP_OPT_INT("EF_EPOLL_SPIN",	        ul_epoll_spin);
  DUMP_OPT_INT("EF_EPOLL_CTL_FAST",     ul_epoll_ctl_fast);
  DUMP_OPT_INT("EF_EPOLL_CTL_HANDOFF",  ul_epoll_ctl_handoff);
  DUMP_OPT_INT("EF_EPOLL_MT_SAFE",      ul_epoll_mt_safe);
#endif
  DUMP_OPT_INT("EF_FDTABLE_SIZE",	fdtable_size);
  DUMP_OPT_INT("EF_SPIN_USEC",		ul_spin_usec);
  DUMP_OPT_INT("EF_STACK_PER_THREAD",	stack_per_thread);
  DUMP_OPT_INT("EF_DONT_ACCELERATE",	dont_accelerate);
  DUMP_OPT_INT("EF_FDTABLE_STRICT",	fdtable_strict);
  DUMP_OPT_INT("EF_FDS_MT_SAFE",	fds_mt_safe);
  DUMP_OPT_INT("EF_FORK_NETIF",		fork_netif);
  DUMP_OPT_INT("EF_NETIF_DTOR",		netif_dtor);
  DUMP_OPT_INT("EF_NO_FAIL",		no_fail);
  DUMP_OPT_INT("EF_SA_ONSTACK_INTERCEPT",	sa_onstack_intercept);
  DUMP_OPT_INT("EF_ACCEPT_INHERIT_NONBLOCK", accept_force_inherit_nonblock);
#if CI_CFG_USERSPACE_PIPE
  DUMP_OPT_INT("EF_PIPE", ul_pipe);
#endif
  DUMP_OPT_HEX("EF_SIGNALS_NOPOSTPONE", signals_no_postpone);
  DUMP_OPT_INT("EF_CLUSTER_SIZE",  cluster_size);
  DUMP_OPT_INT("EF_CLUSTER_RESTART",  cluster_restart_opt);
  ci_log("EF_CLUSTER_NAME=%s", o->cluster_name);
  if( o->tcp_reuseports == 0 ) {
    DUMP_OPT_INT("EF_TCP_FORCE_REUSEPORT", tcp_reuseports);
  } else {
    struct ci_port_list *force_reuseport;
    CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link,
                        (ci_dllist*)(ci_uintptr_t)o->tcp_reuseports)
      ci_log("%s=%d", "EF_TCP_FORCE_REUSEPORT", ntohs(force_reuseport->port));
  }
  if( o->udp_reuseports == 0 ) {
    DUMP_OPT_INT("EF_UDP_FORCE_REUSEPORT", udp_reuseports);
  } else {
    struct ci_port_list *force_reuseport;
    CI_DLLIST_FOR_EACH2(struct ci_port_list, force_reuseport, link,
                        (ci_dllist*)(ci_uintptr_t)o->udp_reuseports)
      ci_log("%s=%d", "EF_UDP_FORCE_REUSEPORT", ntohs(force_reuseport->port));
  }
}
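
The TODO at the top of citp_dump_opts() hints that the dump should be generated from the option definition header rather than repeated by hand. Below is a minimal sketch of the X-macro technique that would achieve this; the names (MY_OPTS, my_opts) are hypothetical, not the real opts_citp_def.h contents, and ci_log() is assumed to be printf-like.

/* Define the option list once, then expand it wherever it is needed --
 * here, for the dump. */
#define MY_OPTS(X)                      \
  X("EF_UNIX_LOG", log_level, "%x")     \
  X("EF_PROBE",    probe,     "%d")

struct my_opts { unsigned log_level; int probe; };

#define MY_OPT_DUMP(envstr, name, fmt)  ci_log("%s=" fmt, envstr, o->name);

static void my_dump_opts(const struct my_opts* o)
{
  MY_OPTS(MY_OPT_DUMP)
}
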
Example #19
File: onloadfs.c  Project: ido/openonload
static int
onload_alloc_file(tcp_helper_resource_t *thr, oo_sp ep_id, int flags,
                  int fd_type)
{
  struct qstr name = { .name = "" };
#ifdef EFX_HAVE_STRUCT_PATH
  struct path path;
#define my_dentry path.dentry
#else
  struct dentry *dentry;
#define my_dentry dentry
#endif
  struct file *file;
  int fd;
  struct inode *inode;
  ci_private_t *priv;
  struct file_operations *fops;

  fops = oo_fops_by_type(fd_type);
  if( fops == NULL )
    return -EINVAL;
  ci_assert_equal(fops->owner, THIS_MODULE);

  inode = new_inode(onload_mnt->mnt_sb);
  if( inode == NULL )
    return -ENOMEM;
#ifdef EFX_FSTYPE_HAS_MOUNT
  inode->i_ino = get_next_ino();
#endif
  if( fd_type == CI_PRIV_TYPE_NETIF )
    inode->i_mode = S_IRWXUGO;
  if( fd_type == CI_PRIV_TYPE_TCP_EP || fd_type == CI_PRIV_TYPE_UDP_EP )
    inode->i_mode = 
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,21)
        /* in 2.6.18 this flag makes us "socket" and sendmsg crashes;
         * see sock_from_file() */
                    S_IFSOCK |
#endif
                    S_IRWXUGO;
  else
    inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
  inode->i_uid = current_fsuid();
  inode->i_gid = current_fsgid();
  priv = &container_of(inode, struct onload_inode, vfs_inode)->priv;
  priv->thr = thr;
  priv->sock_id = ep_id;
  priv->fd_type = fd_type;

  fd = get_unused_fd();
  if( fd < 0 ) {
    iput(inode);
    return fd;
  }
  /*ci_log("[%d]%s(%d:%d) return %d priv=%p", current->pid, __func__,
         thr->id, ep_id, fd, priv);*/

#ifdef EFX_FSTYPE_HAS_MOUNT
#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,37)
  path.dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name);
  if( path.dentry != NULL )
    path.dentry->d_op = &onloadfs_dentry_operations;
#else
  path.dentry = d_alloc_pseudo(onload_mnt->mnt_sb, &name);
#endif
#else /* EFX_FSTYPE_HAS_MOUNT */
#ifdef EFX_HAVE_D_DNAME
  my_dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name);
#else
  {
    char str[32];
    name.len = onloadfs_name(&container_of(inode, struct onload_inode,
                                           vfs_inode)->priv,
                             str, sizeof(str));
    name.name = str;
    name.hash = inode->i_ino;
    my_dentry = d_alloc(onload_mnt->mnt_sb->s_root, &name);
  }
#endif
#endif /* EFX_FSTYPE_HAS_MOUNT */

  if( my_dentry == NULL ) {
    put_unused_fd(fd);
    iput(inode);
    return -ENOMEM;
  }

#if !defined(EFX_FSTYPE_HAS_MOUNT) || defined(EFX_OLD_MOUNT_PSEUDO)
  my_dentry->d_op = &onloadfs_dentry_operations;
#if !defined(EFX_HAVE_STRUCT_PATH) && defined(EFX_HAVE_D_DNAME)
  my_dentry->d_flags &= ~DCACHE_UNHASHED;
#endif
#endif
  d_instantiate(my_dentry, inode);
#ifndef EFX_HAVE_D_DNAME
  d_rehash(my_dentry);
#endif
  inode->i_fop = fops;

#ifdef EFX_HAVE_STRUCT_PATH
  path.mnt = mntget(onload_mnt);
  file = alloc_file(&path, FMODE_READ | FMODE_WRITE, fops);
#else
  file = alloc_file(onload_mnt, dentry, FMODE_READ | FMODE_WRITE, fops);
#endif
  if( file == NULL) {
#ifdef EFX_HAVE_STRUCT_PATH
    path_put(&path);
#else
    dput(dentry);
    iput(inode);
#endif
    put_unused_fd(fd);
    return -ENFILE;
  }

  priv->_filp = file;
  file->f_flags = O_RDWR | (flags & O_NONBLOCK);
  file->f_pos = 0;
  file->private_data = priv;

  if( flags & O_CLOEXEC ) {
    struct files_struct *files = current->files;
    struct fdtable *fdt;
    spin_lock(&files->file_lock);
    fdt = files_fdtable(files);
    rcu_assign_pointer(fdt->fd[fd], file);
    efx_set_close_on_exec(fd, fdt);
    spin_unlock(&files->file_lock);
  } else
    fd_install(fd, file);
  try_module_get(THIS_MODULE);

  ci_assert_equal(file->f_op, fops);
  return fd;
}

void onload_priv_free(ci_private_t *priv)
{
  if( priv->_filp->f_vfsmnt != onload_mnt)
    ci_free(priv);
  /* inode will free the priv automatically */
}


int
oo_create_fd(tcp_helper_endpoint_t* ep, int flags, int fd_type)
{
  int fd;
  tcp_helper_resource_t *trs = ep->thr;
  citp_waitable_obj *wo = SP_TO_WAITABLE_OBJ(&trs->netif, ep->id);

  efab_thr_ref(trs);
  fd = onload_alloc_file(trs, ep->id, flags, fd_type);
  if( fd < 0 ) {
    efab_thr_release(trs);
    OO_DEBUG_ERR(ci_log("%s: onload_alloc_file failed (%d)", __FUNCTION__, fd));
    return fd;
  }
  ci_atomic32_and(&wo->waitable.sb_aflags,
                  ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ));

  return fd;
}
Example #20
static void citp_opts_getenv(citp_opts_t* opts)
{
  /* ?? TODO: would like to use opts_citp_def.h here */

  const char* s;
  unsigned v;

  opts->log_via_ioctl = 3;
  /* TODO: Old name.  Keep reading it 'til 2011, then purge. */
  GET_ENV_OPT_HEX("EF_Log_VIA_IOCTL",	log_via_ioctl);
  GET_ENV_OPT_INT("EF_LOG_VIA_IOCTL",	log_via_ioctl);

  if( (s = getenv("EF_LOG_FILE")) && opts->log_via_ioctl == 3) {
    opts->log_via_ioctl = 0;
    citp_log_to_file(s);
  } else if( opts->log_via_ioctl == 3 ) {
    /* citp_setup_logging_early() has already detected whether stderr is a
     * tty, so just trust it. */
    if( ci_log_fn == citp_log_fn_drv )
      opts->log_via_ioctl = 1;
    else
      opts->log_via_ioctl = 0;
  }

  if( opts->log_via_ioctl ) {
    ci_log_options &=~ CI_LOG_PID;
    citp_setup_logging_change(citp_log_fn_drv);
  } else {
    if( getenv("EF_LOG_TIMESTAMPS") )
      ci_log_options |= CI_LOG_TIME;
    citp_setup_logging_change(citp_log_fn_ul);
  }

  if( getenv("EF_POLL_NONBLOCK_FAST_LOOPS") &&
      ! getenv("EF_POLL_NONBLOCK_FAST_USEC") )
    log("ERROR: EF_POLL_NONBLOCK_FAST_LOOPS is deprecated, use"
        " EF_POLL_NONBLOCK_FAST_USEC instead");

  if( getenv("EF_POLL_FAST_LOOPS") && ! getenv("EF_POLL_FAST_USEC") )
    log("ERROR: EF_POLL_FAST_LOOPS is deprecated, use"
        " EF_POLL_FAST_USEC instead");

  if( (s = getenv("EF_POLL_USEC")) && atoi(s) ) {
    GET_ENV_OPT_INT("EF_POLL_USEC", ul_spin_usec);
    opts->ul_select_spin = 1;
    opts->ul_poll_spin = 1;
#if CI_CFG_USERSPACE_EPOLL
    opts->ul_epoll_spin = 1;
#endif
#if CI_CFG_UDP
    opts->udp_recv_spin = 1;
    opts->udp_send_spin = 1;
#endif
    opts->tcp_recv_spin = 1;
    opts->tcp_send_spin = 1;
    opts->pkt_wait_spin = 1;
    opts->sock_lock_buzz = 1;
    opts->stack_lock_buzz = 1;
  }

  if( (s = getenv("EF_BUZZ_USEC")) && atoi(s) ) {
    opts->sock_lock_buzz = 1;
    opts->stack_lock_buzz = 1;
  }

  GET_ENV_OPT_HEX("EF_UNIX_LOG",	log_level);
  GET_ENV_OPT_INT("EF_PROBE",		probe);
  GET_ENV_OPT_INT("EF_TCP",		ul_tcp);
  GET_ENV_OPT_INT("EF_UDP",		ul_udp);
  GET_ENV_OPT_INT("EF_UL_SELECT",	ul_select);
  GET_ENV_OPT_INT("EF_SELECT_SPIN",	ul_select_spin);
  GET_ENV_OPT_INT("EF_SELECT_FAST",	ul_select_fast);
  GET_ENV_OPT_INT("EF_UL_POLL",		ul_poll);
  GET_ENV_OPT_INT("EF_POLL_SPIN",	ul_poll_spin);
  GET_ENV_OPT_INT("EF_POLL_FAST",	ul_poll_fast);
  GET_ENV_OPT_INT("EF_POLL_FAST_USEC",  ul_poll_fast_usec);
  GET_ENV_OPT_INT("EF_POLL_NONBLOCK_FAST_USEC", ul_poll_nonblock_fast_usec);
  GET_ENV_OPT_INT("EF_SELECT_FAST_USEC",  ul_select_fast_usec);
  GET_ENV_OPT_INT("EF_SELECT_NONBLOCK_FAST_USEC", ul_select_nonblock_fast_usec);
#if CI_CFG_UDP
  GET_ENV_OPT_INT("EF_UDP_RECV_SPIN",   udp_recv_spin);
  GET_ENV_OPT_INT("EF_UDP_SEND_SPIN",   udp_send_spin);
#endif
  GET_ENV_OPT_INT("EF_TCP_RECV_SPIN",   tcp_recv_spin);
  GET_ENV_OPT_INT("EF_TCP_SEND_SPIN",   tcp_send_spin);
  GET_ENV_OPT_INT("EF_TCP_ACCEPT_SPIN", tcp_accept_spin);
  GET_ENV_OPT_INT("EF_TCP_CONNECT_SPIN",tcp_connect_spin);
  GET_ENV_OPT_INT("EF_PKT_WAIT_SPIN",   pkt_wait_spin);
#if CI_CFG_USERSPACE_PIPE
  GET_ENV_OPT_INT("EF_PIPE_RECV_SPIN",  pipe_recv_spin);
  GET_ENV_OPT_INT("EF_PIPE_SEND_SPIN",  pipe_send_spin);
  GET_ENV_OPT_INT("EF_PIPE_SIZE",       pipe_size);
#endif
  GET_ENV_OPT_INT("EF_SOCK_LOCK_BUZZ",  sock_lock_buzz);
  GET_ENV_OPT_INT("EF_STACK_LOCK_BUZZ", stack_lock_buzz);
  GET_ENV_OPT_INT("EF_SO_BUSY_POLL_SPIN", so_busy_poll_spin);
#if CI_CFG_USERSPACE_EPOLL
  GET_ENV_OPT_INT("EF_UL_EPOLL",        ul_epoll);
  if( opts->ul_epoll == 0 && ci_cfg_opts.netif_opts.int_driven == 0 ) {
    ci_log("EF_INT_DRIVEN=0 and EF_UL_EPOLL=0 are not compatible.  "
           "EF_INT_DRIVEN can be set to 0 implicitly, because of non-zero "
           "EF_POLL_USEC.  If you need both spinning and EF_UL_EPOLL=0, "
           "please set EF_INT_DRIVEN=1 explicitly.");
  }
  GET_ENV_OPT_INT("EF_EPOLL_SPIN",      ul_epoll_spin);
  GET_ENV_OPT_INT("EF_EPOLL_CTL_FAST",  ul_epoll_ctl_fast);
  GET_ENV_OPT_INT("EF_EPOLL_CTL_HANDOFF",ul_epoll_ctl_handoff);
  GET_ENV_OPT_INT("EF_EPOLL_MT_SAFE",   ul_epoll_mt_safe);
#endif
  GET_ENV_OPT_INT("EF_FDTABLE_SIZE",	fdtable_size);
  GET_ENV_OPT_INT("EF_SPIN_USEC",	ul_spin_usec);
  GET_ENV_OPT_INT("EF_STACK_PER_THREAD",stack_per_thread);
  GET_ENV_OPT_INT("EF_DONT_ACCELERATE",	dont_accelerate);
  GET_ENV_OPT_INT("EF_FDTABLE_STRICT",	fdtable_strict);
  GET_ENV_OPT_INT("EF_FDS_MT_SAFE",	fds_mt_safe);
  GET_ENV_OPT_INT("EF_NO_FAIL",		no_fail);
  GET_ENV_OPT_INT("EF_SA_ONSTACK_INTERCEPT",	sa_onstack_intercept);
  GET_ENV_OPT_INT("EF_ACCEPT_INHERIT_NONBLOCK",	accept_force_inherit_nonblock);
  GET_ENV_OPT_INT("EF_VFORK_MODE",	vfork_mode);
#if CI_CFG_USERSPACE_PIPE
  GET_ENV_OPT_INT("EF_PIPE",        ul_pipe);
#endif

  if( (s = getenv("EF_FORK_NETIF")) && sscanf(s, "%x", &v) == 1 ) {
    opts->fork_netif = CI_MIN(v, CI_UNIX_FORK_NETIF_BOTH);
  }
  if( (s = getenv("EF_NETIF_DTOR")) && sscanf(s, "%x", &v) == 1 ) {
    opts->netif_dtor = CI_MIN(v, CITP_NETIF_DTOR_ALL);
  }

  if( (s = getenv("EF_SIGNALS_NOPOSTPONE")) ) {
    opts->signals_no_postpone = 0;
    while( sscanf(s, "%u", &v) == 1 ) {
      opts->signals_no_postpone |= (1 << (v-1));
      s = strchr(s, ',');
      if( s == NULL )
        break;
      s++;
    }
  }

  if( (s = getenv("EF_CLUSTER_NAME")) ) {
    strncpy(opts->cluster_name, s, CI_CFG_CLUSTER_NAME_LEN);
    opts->cluster_name[CI_CFG_CLUSTER_NAME_LEN] = '\0';
  }
  else {
    opts->cluster_name[0] = '\0';
  }
  GET_ENV_OPT_INT("EF_CLUSTER_SIZE",	cluster_size);
  if( opts->cluster_size < 2 )
    log("ERROR: cluster_size < 2 are not supported");
  GET_ENV_OPT_INT("EF_CLUSTER_RESTART",	cluster_restart_opt);
  get_env_opt_port_list(&opts->tcp_reuseports, "EF_TCP_FORCE_REUSEPORT");
  get_env_opt_port_list(&opts->udp_reuseports, "EF_UDP_FORCE_REUSEPORT");

#if CI_CFG_FD_CACHING
  get_env_opt_port_list(&opts->sock_cache_ports, "EF_SOCKET_CACHE_PORTS");
#endif
}
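
The EF_SIGNALS_NOPOSTPONE parsing above turns a comma-separated list of signal numbers into a bitmask with bit (signum - 1) set for each listed signal. A standalone sketch of the same parse, with a range check added:

#include <stdio.h>
#include <string.h>

/* Parse e.g. "2,15" into a bitmask with bits (2-1) and (15-1) set
 * (SIGINT and SIGTERM on Linux).  Out-of-range numbers are ignored. */
static unsigned parse_signal_mask(const char* s)
{
  unsigned mask = 0, v;
  while( s != NULL && sscanf(s, "%u", &v) == 1 ) {
    if( v >= 1 && v <= 32 )
      mask |= 1u << (v - 1);
    s = strchr(s, ',');
    if( s != NULL )
      ++s;                      /* step past the comma to the next number */
  }
  return mask;
}
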
Example #21
static int
efab_tcp_helper_move_state(ci_private_t* priv, void *arg)
{
  oo_tcp_move_state_t *op = arg;
  tcp_helper_endpoint_t *new_ep;
  tcp_helper_resource_t * new_trs = NULL;
  ci_netif* ni, *new_ni;
  ci_tcp_state * ts, *new_ts;
  tcp_helper_endpoint_t* ep;
  int rc = efab_ioctl_get_ep(priv, op->ep_id, &ep);
  if (rc != 0)
    return rc;

  OO_DEBUG_TCPH(ci_log("%s: (trs=%p (%u), priv=%p, ep_id=%u, new_trs_id=%u, "
                       "new_ep_id=%u", __FUNCTION__, priv->thr, priv->thr->id,
                       priv, OO_SP_FMT(op->ep_id), op->new_trs_id,
                       OO_SP_FMT(op->new_ep_id)));

  do {
    /* check that the existing id is valid */
    ni = &priv->thr->netif;
    ts = SP_TO_TCP(ni, ep->id);

    /* TODO: check this endpoint belongs to the tcp helper resource of priv and not
     * somewhere else */
    
    /* this function does not change fd_type or fd ops, so it is not able
     * to cope with changing the socket type. We think this only makes sense
     * for TCP, so assert we are taking a TCP endpoint.
     */
    ci_assert_equal(ts->s.pkt.ip.ip_protocol, IPPROTO_TCP);
    ci_assert_equal(priv->fd_type, CI_PRIV_TYPE_TCP_EP);

    /* get pointer to resource from handle - increments ref count */
    rc = efab_thr_table_lookup(NULL, op->new_trs_id,
                               EFAB_THR_TABLE_LOOKUP_CHECK_USER, &new_trs);
    if (rc < 0) {
      OO_DEBUG_ERR( ci_log("%s: invalid new resource handle", __FUNCTION__) );
      break;
    }
    ci_assert(new_trs != NULL);
    /* check valid endpoint in new netif */
    new_ni = &new_trs->netif;
    new_ep = ci_netif_get_valid_ep(new_ni, op->new_ep_id);
    new_ts = SP_TO_TCP(new_ni, new_ep->id);

    /* check the two endpoint states look valid */
    if( (ts->s.pkt.ip.ip_protocol != new_ts->s.pkt.ip.ip_protocol) ||
        (ts->s.b.state != CI_TCP_CLOSED) ||
        (ep->oofilter.sf_local_port != NULL) ) {
      efab_thr_release(new_trs);
      rc = -EINVAL;
      OO_DEBUG_ERR(ci_log("%s: invalid endpoint states", __FUNCTION__));
      break;
    }

    /* should be fine to complete */
    ci_assert(new_trs);
    {
      tcp_helper_resource_t *old_trs;
    again:
      old_trs = priv->thr;
      if (ci_cas_uintptr_fail((ci_uintptr_t *)&priv->thr,
                              (ci_uintptr_t)old_trs, (ci_uintptr_t)new_trs))
        goto again;
      efab_thr_release(old_trs);
    }

    /* move file to hold details of new resource, new endpoint */
    ci_assert(OO_SP_EQ(priv->sock_id, op->ep_id));
    priv->sock_id = new_ep->id;

    OO_DEBUG_TCPH(ci_log("%s: set epid %u", __FUNCTION__,
                         OO_SP_FMT(priv->sock_id)));
    
    /* copy across any necessary state */

    ci_assert_equal(new_ep->os_socket, NULL);
    new_ep->os_socket = ep->os_socket;
    ep->os_socket = NULL;

    /* set ORPHAN flag in current as not attached to an FD */
    ci_bit_set(&ts->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
    /* remove ORPHAN flag in new TCP state */
    ci_atomic32_and(&new_ts->s.b.sb_aflags,
		    ~(CI_SB_AFLAG_ORPHAN | CI_SB_AFLAG_TCP_IN_ACCEPTQ));

    return 0;

  } while (0);

  return rc;

}
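
The priv->thr swap just above relies on a compare-and-swap retry loop (ci_cas_uintptr_fail plus a goto). Here is a minimal sketch of the same idiom using the GCC/Clang __atomic builtin rather than the ci_cas_* wrappers; it is illustrative, not the driver's code.

/* Atomically replace *slot with new_val and return the value that was there.
 * The caller then drops its reference on the old value, as
 * efab_thr_release(old_trs) does above. */
static void* swap_pointer(void* volatile* slot, void* new_val)
{
  void* old_val = *slot;
  while( !__atomic_compare_exchange_n(slot, &old_val, new_val,
                                      0 /* strong */, __ATOMIC_ACQ_REL,
                                      __ATOMIC_ACQUIRE) )
    ;  /* on failure old_val is refreshed with the current contents; retry */
  return old_val;
}
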
static int setup_trampoline(struct pt_regs *regs, 
                            int opcode, int arg, 
                            int bits)
{
    struct mm_hash *p;
    ci_uintptr_t trampoline_entry = 0, trampoline_exclude = 0,
        trampoline_toc = 0, trampoline_fixup = 0;
    int rc = -EBADF;
    
    read_lock(&oo_mm_tbl_lock);
    p = oo_mm_tbl_lookup(current->mm);
    if (p)
    {
        trampoline_entry = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_entry);
        trampoline_exclude = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_exclude);
        trampoline_toc = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_toc);
        trampoline_fixup = (ci_uintptr_t) CI_USER_PTR_GET(p->trampoline_user_fixup);
    }
    read_unlock(&oo_mm_tbl_lock);
    
    TRAMP_DEBUG("%s: trampoline_entry = %p \n", __func__, (void *)trampoline_entry);

    /* OK. We have the entry - set up a trampoline to user space */
    if (trampoline_entry)
    {
        if (!access_ok(VERIFY_READ, trampoline_entry, 1))
        {
            /* Can't read this address. Fail! */
            ci_log("Pid %d (mm=%p) has bad trampoline entry: %p",
                   current->tgid, current->mm, (void *)trampoline_entry);
            return -EBADF;
        }

        /* Check for the excluded address */
        if (regs->nip == trampoline_exclude)
        {
            TRAMP_DEBUG("Ignoring call from excluded address 0x%08lx",
                        (unsigned long)trampoline_exclude);
            return -EBUSY;
        }

        TRAMP_DEBUG("%s: bits = %d; set up trampoline. \n", __func__, bits);
        if (bits == TRAMPOLINE_BITS_64)
        {
            setup_trampoline64(regs, opcode, arg, 
                               (void *)trampoline_entry, (void *)trampoline_toc,
                               (void *)trampoline_fixup);
        }
#ifdef CONFIG_COMPAT
        else
        {
            setup_trampoline32(regs, opcode, arg,
                               (void *)trampoline_entry, (void *)trampoline_toc,
                               (void *)trampoline_fixup);
        }
#endif
        rc = 0;
    }
    else
    {
        OO_DEBUG_VERB(ci_log("Error -- attempt to trampoline for unknown process"));
        rc = -ENOENT;
    }
    return rc;
}
int efab_file_move_to_alien_stack_rsop(ci_private_t *stack_priv, void *arg)
{
  ci_fixed_descriptor_t sock_fd = *(ci_fixed_descriptor_t *)arg;
  struct file *sock_file = fget(sock_fd);
  ci_private_t *sock_priv;
  tcp_helper_resource_t *old_thr;
  tcp_helper_resource_t *new_thr;
  citp_waitable *w;
  int rc;

  if( sock_file == NULL )
    return -EINVAL;
  if( !FILE_IS_ENDPOINT_SOCK(sock_file) ||
      stack_priv->fd_type != CI_PRIV_TYPE_NETIF ) {
    fput(sock_file);
    return -EINVAL;
  }
  sock_priv = sock_file->private_data;
  ci_assert(sock_priv->fd_type == CI_PRIV_TYPE_TCP_EP ||
            sock_priv->fd_type == CI_PRIV_TYPE_UDP_EP);

  old_thr = sock_priv->thr;
  new_thr = stack_priv->thr;
  ci_assert(old_thr);
  ci_assert(new_thr);

  if( old_thr == new_thr ) {
    fput(sock_file);
    return 0;
  }

  if( tcp_helper_cluster_from_cluster(old_thr) != 0 ) {
    LOG_S(ci_log("%s: move_fd() not permitted on clustered stacks", __func__));
    fput(sock_file);
    return -EINVAL;
  }

  w = SP_TO_WAITABLE(&old_thr->netif, sock_priv->sock_id);
  rc = ci_sock_lock(&old_thr->netif, w);
  if( rc != 0 ) {
    fput(sock_file);
    return rc;
  }

  rc = ci_netif_lock(&old_thr->netif);
  if( rc != 0 ) {
    ci_sock_unlock(&old_thr->netif, w);
    fput(sock_file);
    return rc;
  }

  efab_thr_ref(new_thr);
  rc = efab_file_move_to_alien_stack(sock_priv, &stack_priv->thr->netif);
  fput(sock_file);

  if( rc != 0 )
    efab_thr_release(new_thr);
  else
    ci_netif_unlock(&new_thr->netif);

  return rc;
}
Example #24
File: onload.c  Project: majek/openonload
/* Maps the shared memory regions that are used as the interface between the
 * control plane and its clients.  On failure, this function will clean up any
 * partially-initialised state. */
int
oo_cp_create(int fd, struct oo_cplane_handle* cp, enum cp_sync_mode mode)
{
  struct cp_mibs* mibs = cp->mib;
  int rc;
  void* mem;
  ci_uint32 op = mode;

  /* Check user-kernel interface version. */
  rc = cp_ioctl(fd, OO_IOC_CP_CHECK_VERSION, &oo_cplane_api_version);
  if( rc != 0 )
    return rc;

  /* Wait for the control plane server to start if necessary.  This ioctl does
   * an interruptible sleep while waiting.  If a non-fatal signal is received
   * while we're asleep, the ioctl will fail with EINTR, and we want to try
   * again. */
  do {
    rc = cp_ioctl(fd, OO_IOC_CP_WAIT_FOR_SERVER, &op);
  } while( rc == -EINTR );
  if( rc != 0 )
    return rc;

  /* Find out the MIB size */
  rc = cp_ioctl(fd, OO_IOC_CP_MIB_SIZE, &cp->bytes);
  if( rc != 0 )
    return rc;

  ci_assert(cp->bytes);
  ci_assert_equal(cp->bytes & (CI_PAGE_SIZE - 1), 0);

  /* Mmap MIBs */
  mem = mmap(NULL, cp->bytes, PROT_READ, MAP_SHARED, fd,
             OO_MMAP_TYPE_CPLANE << OO_MMAP_TYPE_SHIFT);
  if( mem == MAP_FAILED ) {
    ci_log("ERROR: failed to mmap cplane MIBs: %s", strerror(errno));
    return -errno;
  }

  /* Build MIBs */
  mibs[1].dim = mibs[0].dim = mem;
  cp_init_mibs(mem, mibs);

  /* Mmap rw memory */
  mibs[1].fwd_rw = mibs[0].fwd_rw = mmap(
          NULL,
          CI_ROUND_UP((mibs[0].dim->fwd_mask + 1) * sizeof(mibs[0].fwd_rw[0]),
                      CI_PAGE_SIZE),
          PROT_READ | PROT_WRITE, MAP_SHARED, fd,
#ifdef CP_SYSUNIT
          /* see server.c init_memory() */
          CI_ROUND_UP(cp->bytes, CI_PAGE_SIZE) +
#endif
          ((OO_MMAP_TYPE_CPLANE << OO_MMAP_TYPE_SHIFT) |
           (OO_MMAP_CPLANE_ID_FWD_RW << OO_MMAP_ID_SHIFT)));
  if( mibs[0].fwd_rw == MAP_FAILED ) {
    ci_log("ERROR: failed to mmap rw part of Control Plane memory: %s",
           strerror(errno));
    rc = -errno;
    munmap(mem, cp->bytes);
    return rc;
  }

  cp->fd = fd;

  return 0;
}
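
oo_cp_create() above maps two regions from the same driver fd, with the mmap offset encoding which region is wanted, and unwinds the first mapping if the second one fails. A generic sketch of that pattern follows; the offsets here are plain parameters, not the real OO_MMAP_* encoding.

#include <sys/types.h>
#include <sys/mman.h>
#include <errno.h>

static int map_two_regions(int fd, size_t ro_bytes, off_t ro_off,
                           size_t rw_bytes, off_t rw_off,
                           void** ro_out, void** rw_out)
{
  void* ro = mmap(NULL, ro_bytes, PROT_READ, MAP_SHARED, fd, ro_off);
  if( ro == MAP_FAILED )
    return -errno;
  void* rw = mmap(NULL, rw_bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
                  fd, rw_off);
  if( rw == MAP_FAILED ) {
    int rc = -errno;
    munmap(ro, ro_bytes);       /* unwind the first mapping on failure */
    return rc;
  }
  *ro_out = ro;
  *rw_out = rw;
  return 0;
}
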
Example #25
/* Insert for either TCP or UDP */
int ci_netif_filter_insert(ci_netif* netif, oo_sp tcp_id,
			   unsigned laddr, unsigned lport,
			   unsigned raddr, unsigned rport, unsigned protocol)
{
  ci_netif_filter_table_entry* entry;
  unsigned hash1, hash2;
  ci_netif_filter_table* tbl;
#if !defined(NDEBUG) || CI_CFG_STATS_NETIF
  unsigned hops = 1;
#endif
  unsigned first;

  ci_assert(netif);
  ci_assert(ci_netif_is_locked(netif));
  ci_assert(netif->filter_table);
  tbl = netif->filter_table;

  hash1 = tcp_hash1(tbl, laddr, lport, raddr, rport, protocol);
  hash2 = tcp_hash2(tbl, laddr, lport, raddr, rport, protocol);
  first = hash1;

  /* Find a free slot. */
  while( 1 ) {
    entry = &tbl->table[hash1];
    if( entry->id < 0 )  break;

    ++entry->route_count;
#if !defined(NDEBUG) || CI_CFG_STATS_NETIF
    ++hops;
#endif

    /* A socket can only have multiple entries in the filter table if each
     * entry has a different [laddr].
     */
    ci_assert(
      !((entry->id == OO_SP_TO_INT(tcp_id)) && (laddr == entry->laddr)) );

    hash1 = (hash1 + hash2) & tbl->table_size_mask;

    if( hash1 == first ) {
      ci_sock_cmn *s = SP_TO_SOCK_CMN(netif, tcp_id);
      if( ! (s->s_flags & CI_SOCK_FLAG_SW_FILTER_FULL) ) {
        LOG_E(ci_log(FN_FMT "%d FULL %s %s:%u->%s:%u hops=%u",
                     FN_PRI_ARGS(netif),
                     OO_SP_FMT(tcp_id), CI_IP_PROTOCOL_STR(protocol),
                     ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
                     ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
                     hops));
        s->s_flags |= CI_SOCK_FLAG_SW_FILTER_FULL;
      }

      CITP_STATS_NETIF_INC(netif, sw_filter_insert_table_full);
      return -ENOBUFS;
    }
  }

  /* Now insert the new entry. */
  LOG_TC(ci_log(FN_FMT "%d INSERT %s %s:%u->%s:%u hash=%u:%u at=%u "
		"over=%d hops=%u", FN_PRI_ARGS(netif), OO_SP_FMT(tcp_id),
                CI_IP_PROTOCOL_STR(protocol),
		ip_addr_str(laddr), (unsigned) CI_BSWAP_BE16(lport),
		ip_addr_str(raddr), (unsigned) CI_BSWAP_BE16(rport),
		first, hash2, hash1, entry->id, hops));

#if CI_CFG_STATS_NETIF
  if( hops > netif->state->stats.table_max_hops )
    netif->state->stats.table_max_hops = hops;
  /* Keep a rolling average of the number of hops per entry. */
  if( netif->state->stats.table_mean_hops == 0 )
    netif->state->stats.table_mean_hops = 1;
  netif->state->stats.table_mean_hops =
    (netif->state->stats.table_mean_hops * 9 + hops) / 10;

  if( entry->id == EMPTY )
    ++netif->state->stats.table_n_slots;
  ++netif->state->stats.table_n_entries;
#endif

  entry->id = OO_SP_TO_INT(tcp_id);
  entry->laddr = laddr;
  return 0;
}
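
ci_netif_filter_insert() above probes the software filter table with double hashing: start at hash1, step by hash2 (the table size is a power of two, so hash2 should be odd for the probe sequence to visit every slot), and give up if the probe wraps back to the starting slot. A stripped-down sketch of that insert:

#define SLOT_EMPTY  -1

struct slot { int id; unsigned key; };

/* Insert [id,key] using double hashing.  [mask] is table_size - 1 (table
 * size a power of two); [h2] should be odd so every slot can be reached. */
static int table_insert(struct slot* tbl, unsigned mask,
                        unsigned h1, unsigned h2, int id, unsigned key)
{
  unsigned first = h1 & mask;
  h1 = first;
  while( tbl[h1].id != SLOT_EMPTY ) {
    h1 = (h1 + h2) & mask;      /* next slot in the probe sequence */
    if( h1 == first )
      return -1;                /* wrapped around: table is full */
  }
  tbl[h1].id = id;
  tbl[h1].key = key;
  return (int) h1;
}
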
Example #26
/* Convert the substituted sigaction back to the structure the user actually
 * installed.  If sa is provided, copy the user's sigaction data into it so it
 * can be passed back to the user.  If sa==NULL, substitute in place. */
static int
efab_signal_report_sigaction(int sig, struct sigaction *sa,
                             struct mm_signal_data *tramp_data)
{
  struct oo_sigaction *signal_data = &(tramp_data->signal_data[sig - 1]);
  ci_int32 type;
#define MAX_TRIES_BUSY 1000
  int tried_busy = 0;
  int tried_changed = 0;
  int sa_provided = (sa != NULL);

re_read_data:
  do {
    tried_busy++;
    type = signal_data->type;
  } while( (type & OO_SIGHANGLER_TYPE_MASK) == OO_SIGHANGLER_BUSY &&
           tried_busy <= MAX_TRIES_BUSY );
  if( tried_busy > MAX_TRIES_BUSY ) {
    ci_log("%s(%d): pid %d signal() or sigaction() runs for too long",
           __func__, sig, current->pid);
    return -EBUSY;
  }

report:
  spin_lock_irq(&current->sighand->siglock);
  if( sa_provided )
    *sa = current->sighand->action[sig - 1].sa;
  else
    sa = &current->sighand->action[sig - 1].sa;

  if( sa->sa_handler != tramp_data->handler_postpone ) {
    spin_unlock_irq(&current->sighand->siglock);
    return 0;
  }

  OO_DEBUG_SIGNAL(ci_log("%s: %d process sig=%d type %d handler %p "
                         "flags %lx restorer %p", __func__, current->pid,
                         sig, type & OO_SIGHANGLER_TYPE_MASK, sa->sa_handler,
                         sa->sa_flags, sa->sa_restorer));
  if( (signal_data->type & OO_SIGHANGLER_TYPE_MASK) == OO_SIGHANGLER_USER) {
    sa->sa_handler = CI_USER_PTR_GET(signal_data->handler);
    if( ! (signal_data->flags & SA_SIGINFO) )
      sa->sa_flags &= ~SA_SIGINFO;
  }
  else if( ! (signal_data->type & OO_SIGHANGLER_IGN_BIT) ) {
    sa->sa_handler = SIG_DFL;
    sa->sa_flags &= ~SA_RESTORER;
    if( ! (signal_data->flags & SA_SIGINFO) )
      sa->sa_flags &= ~SA_SIGINFO;
    sa->sa_restorer = NULL;
  }
  OO_DEBUG_SIGNAL(ci_log("%s: %d to user sig=%d handler %p flags %lx "
                         "restorer %p", __func__,
                         current->pid, sig, sa->sa_handler,
                         sa->sa_flags, sa->sa_restorer));

  spin_unlock_irq(&current->sighand->siglock);

  /* Re-check that UL have not changed signal_data. */
  if( type != signal_data->type ) {
    tried_changed++;
    if( tried_changed > MAX_TRIES_BUSY ) {
      ci_log("%s: signal() or sigaction() called too fast", __func__);
      return -EBUSY;
    }
    if( (signal_data->type & OO_SIGHANGLER_TYPE_MASK) == OO_SIGHANGLER_BUSY ) {
      tried_busy = 0;
      goto re_read_data;
    }
    else
      goto report;
  }

  return 0;
}
Example #27
/* NOTE: in the kernel version [fd] is unused and, if it's a ptr, [arg] will
 * be in user-space and may need to be fetched into kernel memory. */
static int ci_tcp_ioctl_lk(citp_socket* ep, ci_fd_t fd, int request,
                           void* arg)
{
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  ci_tcp_state* ts = NULL;
  int rc = 0;
  int os_socket_exists = s->b.sb_aflags & CI_SB_AFLAG_OS_BACKED;

  if( s->b.state != CI_TCP_LISTEN )
    ts = SOCK_TO_TCP(s);

  /* Keep the os socket in sync.  If this is a "get" request then the
   * return will be based on our support, not the os's (except for EFAULT
   * handling which we get for free).
   * Exceptions:
   *  - FIONBIO is applied just in time on handover if needed (listening
   *    sockets always have a non-blocking OS socket)
   *  - FIONREAD, TIOCOUTQ, SIOCOUTQNSD and SIOCATMARK are useless on OS
   *    socket, let's avoid syscall.
   */
  if( os_socket_exists && request != FIONREAD && request != SIOCATMARK &&
      request != FIOASYNC && request != TIOCOUTQ && request != SIOCOUTQNSD &&
      request != (int) FIONBIO ) {
    rc = oo_os_sock_ioctl(netif, s->b.bufid, request, arg, NULL);
    if( rc < 0 )
      return rc;
  }

  /* ioctl defines are listed in `man ioctl_list` and the CI equivalent
   * CI defines are in include/ci/net/ioctls.h  */
  LOG_TV( ci_log("%s:  request = %d, arg = %ld", __FUNCTION__, request, 
		 (long)arg));

  switch( request ) {
  case FIONBIO:
    if( CI_IOCTL_ARG_OK(int, arg) ) {
      CI_CMN_IOCTL_FIONBIO(ep->s, arg);
      rc = 0;
      break;
    }
    goto fail_fault;
  case FIONREAD: /* synonym of SIOCINQ */
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state == CI_TCP_SYN_SENT ) {
      CI_IOCTL_SETARG((int*)arg, 0);
    } else {
      /* In inline mode, return the total number of bytes in the receive queue.
         If SO_OOBINLINE isn't set then return the number of bytes up to the
         mark but without counting the mark */
      int bytes_in_rxq = tcp_rcv_usr(ts);
      if (bytes_in_rxq && ! (ts->s.s_flags & CI_SOCK_FLAG_OOBINLINE)) {

        if (tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID) {
          /*! \TODO: what if FIN has been received? */
          unsigned int readnxt = tcp_rcv_nxt(ts) - bytes_in_rxq;
          if (SEQ_LT(readnxt, tcp_rcv_up(ts))) {
            bytes_in_rxq = tcp_rcv_up(ts) - readnxt;
          } else if (SEQ_EQ(readnxt, tcp_rcv_up(ts))) {
            bytes_in_rxq--;
          }
        }

      }
      CI_IOCTL_SETARG((int*)arg, bytes_in_rxq);
    }
    break;

  case TIOCOUTQ: /* synonym of SIOCOUTQ */
  case SIOCOUTQNSD:
    {
    CI_BUILD_ASSERT(TIOCOUTQ == SIOCOUTQ);
    int outq_bytes = 0;

    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    if( s->b.state == CI_TCP_LISTEN )
      goto fail_inval;

    if( s->b.state != CI_TCP_SYN_SENT ) {

      /* TIOCOUTQ counts all unacknowledged data, so includes retrans queue. */
      if( request == TIOCOUTQ )
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_una(ts));
      else
        outq_bytes = SEQ_SUB(tcp_enq_nxt(ts), tcp_snd_nxt(ts));
    }
    CI_IOCTL_SETARG((int*)arg, outq_bytes);
    }
    break;

  case SIOCATMARK:
    {
     if( !CI_IOCTL_ARG_OK(int, arg) )
       goto fail_fault;

      /* return true, if we are at the out-of-band byte */
      CI_IOCTL_SETARG((int*)arg, 0);
      if( s->b.state != CI_TCP_LISTEN ) {
	int readnxt;

        readnxt = SEQ_SUB(tcp_rcv_nxt(ts), tcp_rcv_usr(ts));
        if( ~ts->s.b.state & CI_TCP_STATE_ACCEPT_DATA )
          readnxt = SEQ_SUB(readnxt, 1);
        if( tcp_urg_data(ts) & CI_TCP_URG_PTR_VALID )
          CI_IOCTL_SETARG((int*)arg, readnxt == tcp_rcv_up(ts));
        LOG_URG(log(NTS_FMT "SIOCATMARK atmark=%d  readnxt=%u rcv_up=%u%s",
                    NTS_PRI_ARGS(ep->netif, ts), readnxt == tcp_rcv_up(ts), 
		    readnxt,  tcp_rcv_up(SOCK_TO_TCP(ep->s)),
                    (tcp_urg_data(ts)&CI_TCP_URG_PTR_VALID)?"":" (invalid)"));
      }
      break;
    }

#ifndef __KERNEL__
  case FIOASYNC:
    /* Need to apply this to [fd] so that our fasync file-op will be
     * invoked.
     */
    rc = ci_sys_ioctl(fd, request, arg);
    break;

  case SIOCSPGRP:
    if( !CI_IOCTL_ARG_OK(int, arg) )
      goto fail_fault;
    /* Need to apply this to [fd] to get signal delivery to work.  However,
     * SIOCSPGRP is only supported on sockets, so we need to convert to
     * fcntl().
     */
    rc = ci_sys_fcntl(fd, F_SETOWN, CI_IOCTL_GETARG(int, arg));
    if( rc == 0 ) {
      rc = ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
    }
    else {
      CI_SET_ERROR(rc, -rc);
    }
    break;
#endif

  default:
    return ci_cmn_ioctl(netif, ep->s, request, arg, rc, os_socket_exists);
  }
Example #28
/* Substitute the signal handler with our variant. */
static int
efab_signal_substitute(int sig, struct sigaction *new_act,
                       struct mm_signal_data *tramp_data)
{
  int rc;
  __sighandler_t handler;
  struct k_sigaction *k;
  int type;
  __user struct oo_sigaction *user_data;
  struct oo_sigaction *signal_data = &(tramp_data->signal_data[sig - 1]);
  ci_int32 old_type;
  ci_int32 seq;

  user_data = &(((struct oo_sigaction *)
                 (CI_USER_PTR_GET(tramp_data->user_data)))[sig - 1]);
  if( !access_ok(VERIFY_WRITE, user_data, sizeof(struct oo_sigaction) ) )
    return -EFAULT;

  do {
    old_type = signal_data->type;
    seq = (old_type & OO_SIGHANGLER_SEQ_MASK) + (1 << OO_SIGHANGLER_SEQ_SHIFT);
  } while( ci_cas32_fail(&signal_data->type, old_type,
                         OO_SIGHANGLER_BUSY | seq) );

  /* We are going to change signal handler: UL should wait until we've
   * finished */
  rc = __put_user(signal_data->type, &user_data->type);
  if( rc != 0 ) {
    signal_data->type = old_type;
    return -EFAULT;
  }

  spin_lock_irq(&current->sighand->siglock);
  k = &current->sighand->action[sig - 1];
  if( new_act )
    k->sa = *new_act;
  type = efab_signal_handler_type(sig, k->sa.sa_handler);
  handler = type <= OO_SIGHANGLER_DFL_MAX ? tramp_data->handlers[type] : NULL;
  BUILD_BUG_ON(SIG_DFL != NULL);

  /* We do not handle this signal: */
  if( type != OO_SIGHANGLER_USER && handler == NULL ) {
    spin_unlock_irq(&current->sighand->siglock);
    signal_data->type = old_type | OO_SIGHANGLER_IGN_BIT | seq;
    ci_verify(__put_user(signal_data->type,
                         &user_data->type) == 0);

    return 0;
  }

  OO_DEBUG_SIGNAL(ci_log("%s: %d change sig=%d handler %p flags %lx "
                         "restorer %p type %d", __func__,
                         current->pid, sig, k->sa.sa_handler,
                         k->sa.sa_flags, k->sa.sa_restorer, type));
  signal_data->flags = k->sa.sa_flags;
  k->sa.sa_flags |= SA_SIGINFO;
  if( type == OO_SIGHANGLER_USER )
    CI_USER_PTR_SET(signal_data->handler, k->sa.sa_handler);
  else {
    CI_USER_PTR_SET(signal_data->handler, handler);
    if( tramp_data->sarestorer ) {
      k->sa.sa_flags |= SA_RESTORER;
      k->sa.sa_restorer = tramp_data->sarestorer;
    }
  }
  k->sa.sa_handler = tramp_data->handler_postpone;
  spin_unlock_irq(&current->sighand->siglock);

  OO_DEBUG_SIGNAL(ci_log("%s: %d set sig=%d handler %p flags %lx restorer %p",
                         __func__, current->pid, sig, k->sa.sa_handler,
                         k->sa.sa_flags, k->sa.sa_restorer));

  /* Copy signal_data to UL; type BUSY */
  rc = __copy_to_user(user_data, signal_data, sizeof(*signal_data));
  signal_data->type = type | seq;
  if( rc != 0 )
    return -EFAULT;
  /* Fill in the real type */
  ci_verify(__put_user(signal_data->type, &user_data->type) == 0);

  return 0;
}
Example #29
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs,
                            int iovecs_len, 
                            enum onload_zc_buffer_type_flags flags)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt *pkt;
  unsigned max_len;

  Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs,
                  iovecs_len, flags));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < iovecs_len; ++i ) {
        max_len = CI_CFG_PKT_BUF_SIZE;
        pkt = ci_netif_pkt_tx_tcp_alloc(ni);
        if( pkt == NULL ) {
          while( --i >= 0 )
            ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf);
          rc = -ENOMEM;
          ci_netif_unlock(ni);
          goto out;
        }
        /* Make sure this is clear as it affects behaviour when freeing */
        pkt->pf.udp.rx_flags = 0;
        iovecs[i].buf = (struct oo_zc_buf *)pkt;
        if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) {
          if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) &&
              (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) {
            ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s);
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) + 
              ts->outgoing_hdrs_len;
            max_len = tcp_eff_mss(ts);
          } 
          else {
            /* Best guess.  We can fix it up later.  Magic 12 leaves
             * space for time stamp option (common case)
             */
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base =
              (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_tcp_hdr) + 12;
          }
        }
        else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) {
          oo_tx_pkt_layout_init(pkt);
          iovecs[i].iov_base =
            (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_udp_hdr);
        }
        else 
          iovecs[i].iov_base = PKT_START(pkt);
        iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE - 
          ((char *)iovecs[i].iov_base - (char *)pkt);
        if( iovecs[i].iov_len > max_len )
          iovecs[i].iov_len = max_len;
      }
      ni->state->n_async_pkts += iovecs_len;
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    case CITP_PASSTHROUGH_FD:
      rc = -ESOCKTNOSUPPORT;
      break;
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

 out:
  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
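
For completeness, here is a hedged application-side sketch of the allocator above. It assumes the Onload extensions header <onload/extensions_zc.h> and the companion call onload_zc_release_buffers(); check the shipped header for the exact names and flags before relying on this.

#include <onload/extensions_zc.h>

/* Allocate four zero-copy buffers on an accelerated TCP socket, then hand
 * them straight back.  Real code would fill iov_base/iov_len and send the
 * buffers instead of releasing them. */
static int zc_alloc_demo(int sock_fd)
{
  struct onload_zc_iovec iov[4];
  onload_zc_handle bufs[4];
  int i, rc;

  rc = onload_zc_alloc_buffers(sock_fd, iov, 4, ONLOAD_ZC_BUFFER_HDR_TCP);
  if( rc != 0 )
    return rc;              /* e.g. -ESOCKTNOSUPPORT for a non-Onload fd */

  for( i = 0; i < 4; ++i )
    bufs[i] = iov[i].buf;   /* the opaque handle lives in iov[i].buf */

  return onload_zc_release_buffers(sock_fd, bufs, 4);
}
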
Example #30
int ci_tcp_listen(citp_socket* ep, ci_fd_t fd, int backlog)
{
  /* 
  ** ?? error handling on possible fails not handled robustly...
  ** ?? Need to check port number is valid TODO
  */

  /*! \todo If not bound then we have to be listening on all interfaces.
   * It's likely that we won't be coming through here as we have to
   * listen on the OS socket too! */
  ci_tcp_state* ts;
  ci_tcp_socket_listen* tls;
  ci_netif* netif = ep->netif;
  ci_sock_cmn* s = ep->s;
  unsigned ul_backlog = backlog;
  int rc;
  oo_p sp;

  LOG_TC(log("%s "SK_FMT" listen backlog=%d", __FUNCTION__, SK_PRI_ARGS(ep), 
             backlog));
  CHECK_TEP(ep);

  if( NI_OPTS(netif).tcp_listen_handover )
    return CI_SOCKET_HANDOVER;
  if( !NI_OPTS(netif).tcp_server_loopback) {
    /* We should handover if the socket is bound to alien address. */
    if( s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN )
      return CI_SOCKET_HANDOVER;
  }

  if( ul_backlog < 0 )
    ul_backlog = NI_OPTS(netif).max_ep_bufs;
  else if( ul_backlog < NI_OPTS(netif).acceptq_min_backlog )
    ul_backlog = NI_OPTS(netif).acceptq_min_backlog;

  if( s->b.state == CI_TCP_LISTEN ) {
    tls = SOCK_TO_TCP_LISTEN(s);
    tls->acceptq_max = ul_backlog;
    ci_tcp_helper_listen_os_sock(fd, ul_backlog);
    return 0;
  }

  if( s->b.state != CI_TCP_CLOSED ) {
    CI_SET_ERROR(rc, EINVAL);
    return rc;
  }


  ts = SOCK_TO_TCP(s);

  /* Bug 3376: if socket used for a previous, failed, connect then the error
   * numbers will not be as expected.  Only seen when not using listening
   * netifs (as moving the EP to the new netif resets them). 
   */

  ts->s.tx_errno = EPIPE;



  ts->s.rx_errno = ENOTCONN;

  /* fill in address/ports and all TCP state */
  if( !(ts->s.s_flags & CI_SOCK_FLAG_BOUND) ) {
    ci_uint16 source_be16;

    /* They haven't previously done a bind, so we need to choose 
     * a port.  As we haven't been given a hint we let the OS choose. */

    source_be16 = 0;
    rc = __ci_bind(ep->netif, ep->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    if (CI_LIKELY( rc==0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "listen: our bind returned %s:%u", 
                 LNT_PRI_ARGS(ep->netif, ts),
                 ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));

    } else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      return rc;
    }
  } 

  ci_sock_lock(netif, &ts->s.b);
  ci_tcp_set_slow_state(netif, ts, CI_TCP_LISTEN);
  tls = SOCK_TO_TCP_LISTEN(&ts->s);

  tcp_raddr_be32(tls) = 0u;
  tcp_rport_be16(tls) = 0u;

  ci_assert_equal(tls->s.tx_errno, EPIPE);



  ci_assert_equal(tls->s.rx_errno, ENOTCONN);

  /* setup listen timer - do it before the first return statement,
   * because __ci_tcp_listen_to_normal() will be called on error path. */
  if( ~tls->s.s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
    sp = TS_OFF(netif, tls);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_socket_listen, listenq_tid));
    ci_ip_timer_init(netif, &tls->listenq_tid, sp, "lstq");
    tls->listenq_tid.param1 = S_SP(tls);
    tls->listenq_tid.fn = CI_IP_TIMER_TCP_LISTEN;
  }

  rc = ci_tcp_listen_init(netif, tls);
  ci_sock_unlock(netif, &ts->s.b);
  if( rc != 0 ) {
    CI_SET_ERROR(rc, -rc);
    goto listen_fail;
  }
  tls->acceptq_max = ul_backlog;

  CITP_STATS_TCP_LISTEN(CI_ZERO(&tls->stats));

  /* install all the filters needed for this connection 
   *    - tcp_laddr_be32(ts) = 0 for IPADDR_ANY
   *
   *  TODO: handle BINDTODEVICE by setting the phys_port parameter to the
   *        correct physical L5 port index
   *  TODO: handle REUSEADDR by setting the last parameter to TRUE
   */
  if( ~s->s_flags & CI_SOCK_FLAG_BOUND_ALIEN ) {
#ifdef ONLOAD_OFE
    if( netif->ofe != NULL ) {
      tls->s.ofe_code_start = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_LISTEN,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
      tls->ofe_promote = ofe_socktbl_find(
                        netif->ofe, OFE_SOCKTYPE_TCP_PASSIVE,
                        tcp_laddr_be32(tls), INADDR_ANY,
                        tcp_lport_be16(ts), 0);
    }
#endif
    rc = ci_tcp_ep_set_filters(netif, S_SP(tls), tls->s.cp.so_bindtodevice,
                               OO_SP_NULL);
    if( rc == -EFILTERSSOME ) {
      if( CITP_OPTS.no_fail )
        rc = 0;
      else {
        ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
        rc = -ENOBUFS;
      }
    }
    ci_assert_nequal(rc, -EFILTERSSOME);
    VERB(ci_log("%s: set_filters  returned %d", __FUNCTION__, rc));
    if (rc < 0) {
      CI_SET_ERROR(rc, -rc);
      goto post_listen_fail;
    }
  }


  /*
   * Calling the system listen() is required for listen-any sockets, local
   * host (loopback) servers and multi-homed servers: it lets us accept
   * connections to L5-assigned addresses that arrive via other interfaces.
   */
#ifdef __ci_driver__
  {
    rc = efab_tcp_helper_listen_os_sock( netif2tcp_helper_resource(netif),
					 S_SP(tls), backlog);
  }
#else
  rc = ci_tcp_helper_listen_os_sock(fd, backlog);
#endif
  if ( rc < 0 ) {
    /* clear the filter we've just set */
    ci_tcp_ep_clear_filters(netif, S_SP(tls), 0);
    goto post_listen_fail;
  }
  return 0;

 post_listen_fail:
  ci_tcp_listenq_drop_all(netif, tls);
 listen_fail:
  /* revert TCP state to a non-listening socket format */
  __ci_tcp_listen_to_normal(netif, tls);
  /* Above function sets orphan flag but we are attached to an FD. */
  ci_bit_clear(&tls->s.b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
#ifdef __ci_driver__
  return rc;
#else
  return CI_SOCKET_ERROR;
#endif
}