Example #1
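/* Map the doorbell (DMA I/O) page for a VI instance from the NIC's BAR
 * into the caller's address space.  At most one page is consumed from
 * *bytes; the assertions below check that the TX and RX doorbells for
 * this instance share that single page. */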
static int
efab_vi_rm_mmap_io(struct efrm_vi *virs,
                   unsigned long *bytes, void *opaque,
                   int *map_num, unsigned long *offset)
{
  int rc;
  int len;
  int instance;
  int base;
  unsigned vi_stride;
  struct efhw_nic *nic;

  nic = efrm_client_get_nic(virs->rs.rs_client);

  instance = virs->rs.rs_instance;

  len = CI_MIN(*bytes, CI_PAGE_SIZE);
  *bytes -= len;

  /* Make sure we can get away with a single page here. */
  switch (nic->devtype.arch) {
  case EFHW_ARCH_FALCON:
    ci_assert_lt(falcon_tx_dma_page_offset(instance), CI_PAGE_SIZE);
    ci_assert_lt(falcon_rx_dma_page_offset(instance), CI_PAGE_SIZE);
    ci_assert_equal(falcon_tx_dma_page_base(instance),
                    falcon_rx_dma_page_base(instance));
    base = falcon_tx_dma_page_base(instance);
    break;

  case EFHW_ARCH_EF10:
    vi_stride = nic->vi_stride;
    ci_assert_lt(ef10_tx_dma_page_offset(vi_stride, instance), CI_PAGE_SIZE);
    ci_assert_lt(ef10_rx_dma_page_offset(vi_stride, instance), CI_PAGE_SIZE);
    ci_assert_equal(ef10_tx_dma_page_base(vi_stride, instance),
                    ef10_rx_dma_page_base(vi_stride, instance));
    base = ef10_tx_dma_page_base(vi_stride, instance);
    break;

  default:
    EFCH_ERR("%s: ERROR: unknown nic type (%d)", __FUNCTION__,
	     nic->devtype.arch);
    base = 0; /* To quiet the compiler */
    BUG();
  }

  rc = ci_mmap_bar(nic, base, len, opaque, map_num, offset, 0);
  if (rc < 0) {
    EFCH_ERR("%s: ERROR: ci_mmap_bar failed rc=%d", __FUNCTION__, rc);
    return rc;
  }

  return 0;
}
Example #2
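/* Mark [fd] as reserved in the fdtable (protect != 0), or return a
 * reserved entry to the "unknown" state (protect == 0).  The caller must
 * hold the fdtable lock. */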
void __citp_fdtable_reserve(int fd, int protect)
{
  /* Must be holding the lock. */
  CITP_FDTABLE_ASSERT_LOCKED(1);
  ci_assert_lt((unsigned) fd, citp_fdtable.size);

  if( protect )  citp_fdtable_new_fd_set(fd, fdip_reserved, 1);
  else           fdtable_swap(fd, fdip_reserved, fdip_unknown, 1);
}
Example #3
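/* Called when the reference count of [fdi] reaches zero.  Performs the
 * deferred action recorded in fdi->on_ref_count_zero: closing the fd,
 * completing a dup2, handing the fd over to the kernel, or destroying
 * state after a move. */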
void __citp_fdinfo_ref_count_zero(citp_fdinfo* fdi, int fdt_locked)
{
  Log_V(log("%s: fd=%d on_rcz=%d", __FUNCTION__, fdi->fd,
	    fdi->on_ref_count_zero));

  citp_fdinfo_assert_valid(fdi);
  ci_assert(oo_atomic_read(&fdi->ref_count) == 0);
  ci_assert_ge(fdi->fd, 0);
  ci_assert_lt(fdi->fd, citp_fdtable.inited_count);
  ci_assert_nequal(fdi_to_fdip(fdi), citp_fdtable.table[fdi->fd].fdip);

  switch( fdi->on_ref_count_zero ) {
  case FDI_ON_RCZ_CLOSE:
#if CI_CFG_FD_CACHING
    if( citp_fdinfo_get_ops(fdi)->cache(fdi) == 1 ) {
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_LOCK();
      fdtable_swap(fdi->fd, fdip_closing, fdip_unknown,
                   fdt_locked | fdtable_strict());
      citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked | fdtable_strict());
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_UNLOCK();
      citp_fdinfo_free(fdi);
      break;
    }
    else 
#endif
    {
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_LOCK();
      ci_tcp_helper_close_no_trampoline(fdi->fd);
      /* The swap must occur after the close, otherwise another thread could
       * cause a probe of the old endpoint info, which is about to be freed.
       */
      fdtable_swap(fdi->fd, fdip_closing, fdip_unknown,
		   fdt_locked | fdtable_strict());
      citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked | fdtable_strict());
      if( ! fdt_locked && fdtable_strict() )  CITP_FDTABLE_UNLOCK();
      citp_fdinfo_free(fdi);
      break;
    }
  case FDI_ON_RCZ_DUP2:
    dup2_complete(fdi, fdi_to_fdip(fdi), fdt_locked);
    break;
  case FDI_ON_RCZ_HANDOVER:
    citp_fdinfo_do_handover(fdi, fdt_locked);
    break;
  case FDI_ON_RCZ_MOVED:
    citp_fdinfo_get_ops(fdi)->dtor(fdi, fdt_locked);
    citp_fdinfo_free(fdi);
    break;
  default:
    CI_DEBUG(ci_log("%s: fd=%d on_ref_count_zero=%d", __FUNCTION__,
		    fdi->fd, fdi->on_ref_count_zero));
    ci_assert(0);
  }
}
Example #4
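/* Install [new_fdip] in the fdtable entry for [fd], extending the table
 * if necessary and waiting for any transient (busy, closing or reserved)
 * state to resolve.  Returns the previous state of the entry. */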
citp_fdinfo_p
citp_fdtable_new_fd_set(unsigned fd, citp_fdinfo_p new_fdip, int fdt_locked)
{
  volatile citp_fdinfo_p* p_fdip;
  citp_fdinfo_p prev;

  if( fd >= citp_fdtable.inited_count ) {
    ci_assert_lt(fd, citp_fdtable.size);
    if( ! fdt_locked )  CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(fd);
    if( ! fdt_locked )  CITP_FDTABLE_UNLOCK();
  }

  p_fdip = &citp_fdtable.table[fd].fdip;

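  /* Atomically install the new state, retrying while the entry is in a
   * transient (busy, closing or reserved) state. */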
  do {
    prev = *p_fdip;

    /* Busy?  Perhaps just closed, but not yet marked unknown.  Or perhaps it
    ** is being probed. */
    if( fdip_is_busy(prev) )
      prev = citp_fdtable_busy_wait(fd, fdt_locked);

    /* There is a close in progress, so we wait until it is resolved. */
    if( fdip_is_closing(prev) )
      prev = citp_fdtable_closing_wait(fd, fdt_locked);

    /* Reserved?  Perhaps it was a netif fd that has just been closed.  So it
    ** should be about to be unreserved. */
  } while( fdip_is_reserved(prev) || fdip_cas_fail(p_fdip, prev, new_fdip) );

  if( fdip_is_normal(prev) ) {
    /* We can get here if close-trampolining fails.  So for release
    ** builds we accept that the user-level state got out-of-sync, and
    ** leak [fdi] since it seems like a suitably cautious thing to do.
    */
    ci_log("%s: ERROR: Orphaned entry in user-level fd-table", __FUNCTION__);
  }
  else
    /* We (at time of writing) only register a trampoline handler when we
    ** create a netif, so we can miss the closing of pass-through
    ** descriptors.
    */
    ci_assert(fdip_is_unknown(prev) || fdip_is_passthru(prev));

  return prev;
}
Example #5
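/* Initialise the user-level fdtable: size it from RLIMIT_NOFILE
 * (optionally overridden by EF_FDTABLE_SIZE), allocate the table and its
 * lock, and install a do-nothing handler for SIGONLOAD. */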
int citp_fdtable_ctor()
{
  struct rlimit rlim;
  int rc;

  Log_S(log("%s:", __FUNCTION__));

  /* How big should our fdtable be by default?  It's pretty arbitrary, but
   * we have seen a few apps that use setrlimit to set the fdtable to 4096
   * entries on start-up (see bugs 3253 and 3373), so we choose that.
   * (Note: we can't grow the table if the app later does setrlimit, and
   * unused entries consume virtual space only, so it's worth allocating a
   * table of reasonable size.)
   */
  citp_fdtable.size = 4096;

  if( getrlimit(RLIMIT_NOFILE, &rlim) == 0 ) {
    citp_fdtable.size = rlim.rlim_max;
    if( CITP_OPTS.fdtable_size != 0 &&
        CITP_OPTS.fdtable_size != rlim.rlim_max ) {
      Log_S(ci_log("Setting the limit on the number of open files "
                   "to EF_FDTABLE_SIZE=%u.",
                   CITP_OPTS.fdtable_size));
      rlim.rlim_max = CITP_OPTS.fdtable_size;
      if( rlim.rlim_cur > rlim.rlim_max )
        rlim.rlim_cur = rlim.rlim_max;
      if( ci_sys_setrlimit(RLIMIT_NOFILE, &rlim) == 0 )
        citp_fdtable.size = rlim.rlim_max;
      else {
        /* Most probably, we've got EPERM */
        ci_assert_lt(citp_fdtable.size, CITP_OPTS.fdtable_size);
        ci_log("Can't set EF_FDTABLE_SIZE=%u; using %u",
               CITP_OPTS.fdtable_size, citp_fdtable.size);
        rlim.rlim_max = rlim.rlim_cur = citp_fdtable.size;
        CI_TRY(ci_sys_setrlimit(RLIMIT_NOFILE, &rlim));
      }
    }
  }
  else
    Log_S(ci_log("Assume EF_FDTABLE_SIZE=%u", citp_fdtable.size));

  citp_fdtable.inited_count = 0;

  citp_fdtable.table = ci_libc_malloc(sizeof (citp_fdtable_entry) *
                                      citp_fdtable.size);
  if( ! citp_fdtable.table ) {
    Log_U(log("%s: failed to allocate fdtable (0x%x)", __FUNCTION__,
              citp_fdtable.size));
    return -1;
  }

  /* The whole table is not initialised at start-of-day, but is initialised
  ** on demand.  citp_fdtable.inited_count counts the number of initialised
  ** entries.
  */

  if( (rc = oo_rwlock_ctor(&citp_ul_lock)) != 0 ) {
    Log_E(log("%s: oo_rwlock_ctor %d", __FUNCTION__, rc));
    return -1;
  }

  /* Install SIGONLOAD handler */
  {
    struct sigaction sa;
    memset(&sa, 0, sizeof(sa)); /* sa_flags and sa_mask = 0 */
    sa.sa_handler = sighandler_do_nothing;
    sigaction(SIGONLOAD, &sa, NULL);
  }

  return 0;
}
Example #6
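/* Run the application's zero-copy receive filter over a datagram received
 * via the kernel.  Returns 0 if the callback asked for the packet to be
 * dropped, else 1; *bytes is updated if the callback shortened the
 * message. */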
static int ci_udp_filter_kernel_pkt(ci_netif* ni, ci_udp_state* us, 
                                    struct msghdr* msg, int *bytes)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  unsigned cb_flags = 0;
  int i = 0, bytes_remaining = *bytes;

  if( msg->msg_iovlen > CI_UDP_ZC_IOVEC_MAX ) {
    LOG_U(log("%s: too many fragments (%d), passing packet unfiltered", 
              __FUNCTION__, (int)msg->msg_iovlen));
    return 1;
  }

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr = *msg;
  zc_msg.msghdr.msg_iov = NULL;

  ci_assert_gt(msg->msg_iovlen, 0);

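  /* Copy the caller's iovec into the zero-copy iovec array, truncating the
   * total length to the bytes available in the datagram. */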
  do { 
    zc_msg.iov[i].iov_base = msg->msg_iov[i].iov_base;
    zc_msg.iov[i].iov_len = msg->msg_iov[i].iov_len > bytes_remaining ? 
      bytes_remaining : msg->msg_iov[i].iov_len;
    zc_msg.iov[i].buf = ONLOAD_ZC_HANDLE_NONZC;
    zc_msg.iov[i].iov_flags = 0;
    bytes_remaining -= zc_msg.iov[i].iov_len;
  } while(++i < msg->msg_iovlen && bytes_remaining);

  zc_msg.msghdr.msg_iovlen = i;

  rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
    (&zc_msg, (void *)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

  ci_assert(!(rc & ONLOAD_ZC_KEEP));

  if( rc & ONLOAD_ZC_TERMINATE ) 
    return 0;
  else {
    if( rc & ONLOAD_ZC_MODIFIED ) {
      int new_len = 0;
#ifndef NDEBUG
      int found_shortened_iov = 0;
#endif
      for( i = 0; i < zc_msg.msghdr.msg_iovlen; ++i ) {
        new_len += zc_msg.iov[i].iov_len;
#ifndef NDEBUG
        if( found_shortened_iov )
          ci_assert_equal(zc_msg.iov[i].iov_len, 0);
        ci_assert_equal(zc_msg.iov[i].iov_base, msg->msg_iov[i].iov_base);
        if( zc_msg.iov[i].iov_len != msg->msg_iov[i].iov_len ) {
          ci_assert_lt(zc_msg.iov[i].iov_len, msg->msg_iov[i].iov_len);
          found_shortened_iov = 1;
        }
#endif
      }
#ifndef NDEBUG
      if( found_shortened_iov )
        ci_assert_lt(new_len, *bytes);
      else
        ci_assert_equal(new_len, *bytes);
#endif
      *bytes = new_len;
    }
  }
  return 1;
}
Example #7
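/* Run the application's zero-copy receive filter over packets on the
 * receive queue that have not yet been filtered, marking each as passed
 * or dropped.  Returns non-zero if a filter-passed packet is available
 * for delivery. */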
int ci_udp_filter_recved_pkts(ci_netif* ni, ci_udp_state* us)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  ci_ip_pkt_fmt* pkt;
  unsigned cb_flags;
  int dropped_bytes;

  ci_assert(ci_sock_is_locked(ni, &us->s.b));

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr.msg_controllen = 0;
  zc_msg.msghdr.msg_flags = 0;

  while( us->recv_q.pkts_added != 
         us->recv_q.pkts_filter_passed + us->recv_q.pkts_filter_dropped ) {
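    /* Read barrier: see the packet contents only after observing the
     * updated pkts_added count. */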
    ci_rmb();
    pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
    if( pkt->pf.udp.rx_flags & 
        (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
         CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) ) {
      /* We know this can't go past tail because of the while loop condition */
      us->recv_q.filter = pkt->next;
      pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
      ci_assert( !(pkt->pf.udp.rx_flags & 
                   (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                    CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) );
    }

    ci_udp_pkt_to_zc_msg(ni, pkt, &zc_msg);

    cb_flags = CI_IP_IS_MULTICAST(oo_ip_hdr(pkt)->ip_daddr_be32) ? 
      ONLOAD_ZC_MSG_SHARED : 0;
    rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
      (&zc_msg, (void *)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

    ci_assert(!(rc & ONLOAD_ZC_KEEP));

    if( rc & ONLOAD_ZC_TERMINATE ) {
      us->recv_q.bytes_filter_dropped += pkt->pf.udp.pay_len;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED;
      ++us->recv_q.pkts_filter_dropped;
    }
    else {
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      ++us->recv_q.pkts_filter_passed;
      if( rc & ONLOAD_ZC_MODIFIED ) {
        ci_assert(!(cb_flags & ONLOAD_ZC_MSG_SHARED));
        dropped_bytes = ci_zc_msg_to_udp_pkt(ni, &zc_msg, pkt);
        ci_assert_gt(dropped_bytes, 0);
        ci_assert_lt(dropped_bytes, pkt->pf.udp.pay_len);
        pkt->pf.udp.pay_len -= dropped_bytes;
        us->recv_q.bytes_filter_dropped += dropped_bytes;
      }
      us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
      return 1;
    }
  }

  return us->recv_q.pkts_filter_passed != us->recv_q.pkts_delivered;
}
Example #8
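/* Apply the modifications the filter callback made to [zc_msg] back onto
 * the packet's fragment chain: trim each fragment to its iovec and
 * release any fragments beyond msg_iovlen.  Returns the number of bytes
 * dropped. */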
static int ci_zc_msg_to_udp_pkt(ci_netif* ni, 
                                struct onload_zc_msg* zc_msg,
                                ci_ip_pkt_fmt* pkt)
{
  int i, n_buffers = pkt->n_buffers, dropped_bytes = 0;
  ci_ip_pkt_fmt* frag;
  ci_ip_pkt_fmt* prev_frag = NULL;
  frag = pkt;
  i = 0;
  ci_assert_nequal(zc_msg->iov, NULL);

  /* Ignore first frag if zero length and there is another frag */
  if( oo_offbuf_left(&frag->buf) == 0 && OO_PP_NOT_NULL(frag->frag_next) ) {
    frag = PKT_CHK_NNL(ni, frag->frag_next);
    --n_buffers;
  }

  CI_TEST(zc_msg->msghdr.msg_iovlen <= n_buffers);
  CI_TEST(zc_msg->msghdr.msg_iovlen > 0);

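  /* Walk the fragment chain in step with the iovec array, trimming each
   * fragment to its (possibly shortened) iovec and discarding any
   * fragments beyond msg_iovlen. */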
  do {
    CI_TEST(zc_msg->iov[i].buf == (onload_zc_handle)frag);
    CI_TEST(zc_msg->iov[i].iov_len != 0);
    if( i < zc_msg->msghdr.msg_iovlen ) {
      if( zc_msg->iov[i].iov_base != oo_offbuf_ptr(&frag->buf) ) {
        ci_assert_gt((char*)zc_msg->iov[i].iov_base, 
                     oo_offbuf_ptr(&frag->buf));
        dropped_bytes += ((char*)zc_msg->iov[i].iov_base - 
                          oo_offbuf_ptr(&frag->buf) );
        oo_offbuf_set_start(&frag->buf, (char*)zc_msg->iov[i].iov_base);
      }
      if( zc_msg->iov[i].iov_len != oo_offbuf_left(&frag->buf) ) {
        ci_assert_lt(zc_msg->iov[i].iov_len, oo_offbuf_left(&frag->buf));
        dropped_bytes += (oo_offbuf_left(&frag->buf) - zc_msg->iov[i].iov_len);
        oo_offbuf_set_len(&frag->buf, zc_msg->iov[i].iov_len);
      }
    }
    else {
      /* All remaining fragments should be discarded.  Should not get
       * here on first frag as msg_iovlen > 0
       */
      ci_assert(prev_frag != NULL);
      prev_frag->frag_next = OO_PP_NULL;
      /* remember frag so we can release it after counting dropped bytes */
      prev_frag = frag;
      do {
        dropped_bytes += oo_offbuf_left(&frag->buf);
        if( ++i == n_buffers )
          break;
        frag = PKT_CHK_NNL(ni, frag->frag_next);
      } while( 1 );
      ci_netif_pkt_release(ni, prev_frag);
      pkt->n_buffers -= (n_buffers - zc_msg->msghdr.msg_iovlen);
      return dropped_bytes;
    }

    ci_assert_lt(oo_offbuf_offset(&frag->buf) + oo_offbuf_left(&frag->buf),
                 CI_CFG_PKT_BUF_SIZE);

    if( ++i == n_buffers )
      break;
    prev_frag = frag;
    frag = PKT_CHK_NNL(ni, frag->frag_next);
  } while( 1 );

  return dropped_bytes;
}
Example #9
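/* Slow-path TCP transmit, used when the socket's ipcache is not valid.
 * Re-resolves the route via the control plane and sends on success;
 * otherwise defers the send (no MAC yet) or drops the connection,
 * depending on the lookup status. */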
void ci_ip_send_tcp_slow(ci_netif* ni, ci_tcp_state* ts, ci_ip_pkt_fmt* pkt)
{
  /* We're here because the ipcache is not valid. */
  int rc, prev_mtu = ts->s.pkt.mtu;

  cicp_user_retrieve(ni, &ts->s.pkt, &ts->s.cp);

  if( ts->s.pkt.status == retrrc_success ) {
    if( ts->s.pkt.mtu != prev_mtu )
      CI_PMTU_TIMER_NOW(ni, &ts->pmtus);
    ci_ip_set_mac_and_port(ni, &ts->s.pkt, pkt);
    ci_netif_send(ni, pkt);
    return;
  }
  else if( ts->s.pkt.status == retrrc_localroute &&
           (ts->s.pkt.flags & CI_IP_CACHE_IS_LOCALROUTE) )
    ci_ip_local_send(ni, pkt, &ts->s, OO_SP_NULL);

  /* For TCP, we want the ipcache to only be valid when onloadable. */
  ci_ip_cache_invalidate(&ts->s.pkt);

  switch( ts->s.pkt.status ) {
  case retrrc_nomac:
    rc = 0;
    /* If we are resending the SYN and there is no MAC, it means ARP
     * failed, and connect() should return EHOSTUNREACH.  We check twice:
     * on the first and the second retransmit.  Very hackish.
     */
    if( ts->s.b.state == CI_TCP_SYN_SENT ) {
      if( ts->retransmits == 1 )
        ts->tcpflags |= CI_TCPT_FLAG_NO_ARP;
      else if( (ts->tcpflags & CI_TCPT_FLAG_NO_ARP) &&
               ts->retransmits == 2 ) {
        ci_tcp_drop(ni, ts, EHOSTUNREACH);
        return;
      }
    }
    cicp_user_defer_send(ni, retrrc_nomac, &rc, OO_PKT_P(pkt), 
                         ts->s.pkt.ifindex);
    ++ts->stats.tx_nomac_defer;
    return;
  case retrrc_noroute:
    rc = -EHOSTUNREACH;
    break;
  case retrrc_alienroute:
  case retrrc_localroute:
    /* ?? TODO: inc some stat */
    return;
  default:
    ci_assert_lt(ts->s.pkt.status, 0);
    if( ts->s.pkt.status < 0 )
      rc = ts->s.pkt.status;
    else
      /* belt and braces... */
      rc = 0;
  }

  ci_assert_le(rc, 0);

  /* In most cases we should ignore the return code: the packet will be
   * resent later because of the RTO.  However, in SYN-SENT we should pass
   * errors to the user.  At the same time, we should not pass ENOBUFS to
   * the user - it is an internal problem of the cplane, so we should try
   * again.  Possibly there are other internal problems, such as ENOMEM.
   *
   * Also, do not break the connection when the first SYN fails:
   * - Linux does not do it;
   * - the cplane has some latency, so we get false positives here;
   * - ci_tcp_connect() does not expect it.
   */
  if( ts->s.b.state == CI_TCP_SYN_SENT && rc < 0 && ts->retransmits > 0 &&
      (rc == -EHOSTUNREACH || rc == -ENETUNREACH || rc == -ENETDOWN) )
    ci_tcp_drop(ni, ts, -rc);
}