Esempio n. 1
0
int onload_zc_alloc_buffers(int fd, struct onload_zc_iovec* iovecs,
                            int iovecs_len, 
                            enum onload_zc_buffer_type_flags flags)
{
  int rc = 0, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi;
  citp_sock_fdi* epi;
  ci_netif* ni;
  ci_ip_pkt_fmt *pkt;
  unsigned max_len;

  Log_CALL(ci_log("%s(%d, %p, %d, %x)", __FUNCTION__, fd, iovecs,
                  iovecs_len, flags));

  citp_enter_lib(&lib_context);

  if( (fdi = citp_fdtable_lookup(fd)) != NULL ) {
    switch( citp_fdinfo_get_type(fdi) ) {
    case CITP_UDP_SOCKET:
    case CITP_TCP_SOCKET:
      epi = fdi_to_sock_fdi(fdi);
      ni = epi->sock.netif;
      ci_netif_lock(ni);
      for( i = 0; i < iovecs_len; ++i ) {
        max_len = CI_CFG_PKT_BUF_SIZE;
        pkt = ci_netif_pkt_tx_tcp_alloc(ni);
        if( pkt == NULL ) {
          while( --i >= 0 )
            ci_netif_pkt_release(ni, (ci_ip_pkt_fmt*)iovecs[i].buf);
          rc = -ENOMEM;
          ci_netif_unlock(ni);
          goto out;
        }
        /* Make sure this is clear as it affects behaviour when freeing */
        pkt->pf.udp.rx_flags = 0;
        iovecs[i].buf = (struct oo_zc_buf *)pkt;
        if( flags & ONLOAD_ZC_BUFFER_HDR_TCP ) {
          if( (citp_fdinfo_get_type(fdi) == CITP_TCP_SOCKET) &&
              (epi->sock.s->b.state & CI_TCP_STATE_TCP_CONN) ) {
            ci_tcp_state* ts = SOCK_TO_TCP(epi->sock.s);
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base = ((char *)oo_tx_ip_hdr(pkt)) + 
              ts->outgoing_hdrs_len;
            max_len = tcp_eff_mss(ts);
          } 
          else {
            /* Best guess.  We can fix it up later.  Magic 12 leaves
             * space for time stamp option (common case)
             */
            oo_tx_pkt_layout_init(pkt);
            iovecs[i].iov_base =
              (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_tcp_hdr) + 12;
          }
        }
        else if( flags & ONLOAD_ZC_BUFFER_HDR_UDP ) {
          oo_tx_pkt_layout_init(pkt);
          iovecs[i].iov_base =
            (uint8_t*) oo_tx_ip_data(pkt) + sizeof(ci_udp_hdr);
        }
        else 
          iovecs[i].iov_base = PKT_START(pkt);
        iovecs[i].iov_len = CI_CFG_PKT_BUF_SIZE - 
          ((char *)iovecs[i].iov_base - (char *)pkt);
        if( iovecs[i].iov_len > max_len )
          iovecs[i].iov_len = max_len;
      }
      ni->state->n_async_pkts += iovecs_len;
      ci_netif_unlock(ni);
      break;
#if CI_CFG_USERSPACE_EPOLL
    case CITP_EPOLL_FD:
      rc = -ENOTSOCK;
      break;
#endif
#if CI_CFG_USERSPACE_PIPE
    case CITP_PIPE_FD:
      rc = -ENOTSOCK;
      break;
#endif
    case CITP_PASSTHROUGH_FD:
      rc = -ESOCKTNOSUPPORT;
      break;
    default:
      LOG_U(log("%s: unknown fdinfo type %d", __FUNCTION__, 
                citp_fdinfo_get_type(fdi)));
      rc = -EINVAL;
    }
    citp_fdinfo_release_ref(fdi, 0);
  } 
  else {
    /* Not onload socket */
    rc = -ESOCKTNOSUPPORT;
  }

 out:
  citp_exit_lib(&lib_context, TRUE);
  Log_CALL_RESULT(rc);
  return rc;
}
Esempio n. 2
0
static int ci_tcp_connect_ul_start(ci_netif *ni, ci_tcp_state* ts,
				   ci_uint32 dst_be32, unsigned dport_be16,
                                   int* fail_rc)
{
  ci_ip_pkt_fmt* pkt;
  int rc = 0;

  ci_assert(ts->s.pkt.mtu);

  /* Now that we know the outgoing route, set the MTU related values.
   * Note, even these values are speculative since the real MTU
   * could change between now and passing the packet to the lower layers
   */
  ts->amss = ts->s.pkt.mtu - sizeof(ci_tcp_hdr) - sizeof(ci_ip4_hdr);
#if CI_CFG_LIMIT_AMSS
  ts->amss = ci_tcp_limit_mss(ts->amss, ni, __FUNCTION__);
#endif

  /* Default smss until discovered by MSS option in SYN - RFC1122 4.2.2.6 */
  ts->smss = CI_CFG_TCP_DEFAULT_MSS;

  /* set pmtu, eff_mss, snd_buf and adjust windows */
  ci_pmtu_set(ni, &ts->pmtus, ts->s.pkt.mtu);
  ci_tcp_set_eff_mss(ni, ts);
  ci_tcp_set_initialcwnd(ni, ts);

  /* Send buffer adjusted by ci_tcp_set_eff_mss(), but we want it to stay
   * zero until the connection is established.
   */
  ts->so_sndbuf_pkts = 0;

  /* 
   * 3. State and address are OK. It's address routed through our NIC.
   *    Do connect().
   */
  ci_assert_nequal(ts->s.pkt.ip.ip_saddr_be32, INADDR_ANY);

  if( ts->s.s_flags & CI_SOCK_FLAG_CONNECT_MUST_BIND ) {
    ci_sock_cmn* s = &ts->s;
    ci_uint16 source_be16 = 0;

    if( s->s_flags & CI_SOCK_FLAG_ADDR_BOUND )
      rc = __ci_bind(ni, &ts->s, ts->s.pkt.ip.ip_saddr_be32, &source_be16);
    else 
      rc = __ci_bind(ni, &ts->s, INADDR_ANY, &source_be16);
    if(CI_LIKELY( rc == 0 )) {
      TS_TCP(ts)->tcp_source_be16 = source_be16;
      ts->s.cp.lport_be16 = source_be16;
      LOG_TC(log(LNT_FMT "connect: our bind returned %s:%u", 
                 LNT_PRI_ARGS(ni, ts),
                 ip_addr_str(INADDR_ANY),
                 (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16)));
    }
    else {
      LOG_U(ci_log("__ci_bind returned %d at %s:%d", CI_GET_ERROR(rc),
                   __FILE__, __LINE__));
      *fail_rc = rc;
      return CI_CONNECT_UL_FAIL;
    }
    if(CI_UNLIKELY( ts->s.pkt.ip.ip_saddr_be32 == 0 )) {
      CI_SET_ERROR(*fail_rc, EINVAL);
      return CI_CONNECT_UL_FAIL;
    }
  }

  ci_tcp_set_peer(ts, dst_be32, dport_be16);

  /* Make sure we can get a buffer before we change state. */
  pkt = ci_netif_pkt_tx_tcp_alloc(ni);
  if( CI_UNLIKELY(! pkt) ) {
    /* NB. We've already done a poll above. */
    rc = ci_netif_pkt_wait(ni, &ts->s, CI_SLEEP_NETIF_LOCKED|CI_SLEEP_NETIF_RQ);
    if( ci_netif_pkt_wait_was_interrupted(rc) ) {
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_LOCK_DROPPED;
    }
    /* OK, there are (probably) packets available - go try again.  Note we
     * jump back to the top of the function because someone may have
     * connected this socket in the mean-time, so we need to check the
     * state once more.
     */
    return CI_CONNECT_UL_START_AGAIN;
  }

#ifdef ONLOAD_OFE
    if( ni->ofe != NULL )
      ts->s.ofe_code_start = ofe_socktbl_find(
                        ni->ofe, OFE_SOCKTYPE_TCP_ACTIVE,
                        tcp_laddr_be32(ts), tcp_raddr_be32(ts),
                        tcp_lport_be16(ts), tcp_rport_be16(ts));
#endif

  rc = ci_tcp_ep_set_filters(ni, S_SP(ts), ts->s.cp.so_bindtodevice,
                             OO_SP_NULL);
  if( rc < 0 ) {
    /* Perhaps we've run out of filters?  See if we can push a socket out
     * of timewait and steal its filter.
     */
    ci_assert_nequal(rc, -EFILTERSSOME);
    if( rc != -EBUSY || ! ci_netif_timewait_try_to_free_filter(ni) ||
        (rc = ci_tcp_ep_set_filters(ni, S_SP(ts),
                                    ts->s.cp.so_bindtodevice,
                                    OO_SP_NULL)) < 0 ) {
      ci_assert_nequal(rc, -EFILTERSSOME);
      /* Either a different error, or our efforts to free a filter did not
       * work.
       */
      if( ! (ts->s.s_flags & CI_SOCK_FLAG_ADDR_BOUND) ) {
        ts->s.pkt.ip.ip_saddr_be32 = 0;
        ts->s.cp.ip_laddr_be32 = 0;
      }
      ci_netif_pkt_release(ni, pkt);
      CI_SET_ERROR(*fail_rc, -rc);
      return CI_CONNECT_UL_FAIL;
    }
  }

  LOG_TC(log(LNT_FMT "CONNECT %s:%u->%s:%u", LNT_PRI_ARGS(ni, ts),
	     ip_addr_str(ts->s.pkt.ip.ip_saddr_be32),
	     (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_source_be16),
	     ip_addr_str(ts->s.pkt.ip.ip_daddr_be32),
	     (unsigned) CI_BSWAP_BE16(TS_TCP(ts)->tcp_dest_be16)));

  /* We are going to send the SYN - set states appropriately */
  tcp_snd_una(ts) = tcp_snd_nxt(ts) = tcp_enq_nxt(ts) = tcp_snd_up(ts) =
    ci_tcp_initial_seqno(ni);
  ts->snd_max = tcp_snd_nxt(ts) + 1;

  /* Must be after initialising snd_una. */
  ci_tcp_clear_rtt_timing(ts);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_SYN);
  ts->tcpflags &=~ CI_TCPT_FLAG_OPT_MASK;
  ts->tcpflags |= NI_OPTS(ni).syn_opts;

  if( (ts->tcpflags & CI_TCPT_FLAG_WSCL) ) {
    ts->rcv_wscl = ci_tcp_wscl_by_buff(ni, ci_tcp_rcvbuf_established(ni, &ts->s));
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, ts->rcv_wscl);
  }
  else {
    ts->rcv_wscl = 0;
    CI_IP_SOCK_STATS_VAL_RXWSCL(ts, 0);
  }
  ci_tcp_set_rcvbuf(ni, ts);
  ci_tcp_init_rcv_wnd(ts, "CONNECT");

  /* outgoing_hdrs_len is initialised to include timestamp option. */
  if( ! (ts->tcpflags & CI_TCPT_FLAG_TSO) )
    ts->outgoing_hdrs_len = sizeof(ci_ip4_hdr)+sizeof(ci_tcp_hdr);
  if( ci_tcp_can_stripe(ni, ts->s.pkt.ip.ip_saddr_be32,
			ts->s.pkt.ip.ip_daddr_be32) )
    ts->tcpflags |= CI_TCPT_FLAG_STRIPE;
  ci_tcp_set_slow_state(ni, ts, CI_TCP_SYN_SENT);

  /* If the app trys to send data on a socket in SYN_SENT state
  ** then the data is queued for send until the SYN gets ACKed.
  ** (rfc793 p56)
  **
  ** Receive calls on the socket should block until data arrives
  ** (rfc793 p58)
  **
  ** Clearing tx_errno and rx_errno acheive this. The transmit window
  ** is set to 1 byte which ensures that only the SYN packet gets
  ** sent until the ACK is received with more window. 
  */
  ci_assert(ts->snd_max == tcp_snd_nxt(ts) + 1);
  ts->s.rx_errno = 0;
  ts->s.tx_errno = 0; 
  ci_tcp_enqueue_no_data(ts, ni, pkt);
  ci_tcp_set_flags(ts, CI_TCP_FLAG_ACK);  

  if( ts->s.b.sb_aflags & (CI_SB_AFLAG_O_NONBLOCK | CI_SB_AFLAG_O_NDELAY) ) {
    ts->tcpflags |= CI_TCPT_FLAG_NONBLOCK_CONNECT;
    LOG_TC(log( LNT_FMT "Non-blocking connect - return EINPROGRESS",
		LNT_PRI_ARGS(ni, ts)));
    CI_SET_ERROR(*fail_rc, EINPROGRESS);
    return CI_CONNECT_UL_FAIL;
  }

  return CI_CONNECT_UL_OK;
}