Example #1
static int 
efab_vi_rm_mmap_mem(struct efrm_vi *virs,
                    unsigned long *bytes, void *opaque,
                    int *map_num, unsigned long *offset)
{
  int queue_type;
  uint32_t len;

  /* Map the event queue first, if this VI has one. */
  if( virs->q[EFHW_EVQ].capacity != 0 ) {
    len = efhw_iopages_size(&virs->q[EFHW_EVQ].pages);
    len = CI_MIN(len, *bytes);
    ci_assert_gt(len, 0);
    ci_mmap_iopages(&virs->q[EFHW_EVQ].pages, 0,
                    len, bytes, opaque, map_num, offset);
    if(*bytes == 0)
      return 0;
  }

  /* Then map the DMA queues, highest queue type first. */
  for( queue_type = EFRM_VI_RM_DMA_QUEUE_COUNT - 1;
       queue_type >= 0;
       queue_type-- ) {
    if( virs->q[queue_type].capacity != 0 ) {
      len = efhw_iopages_size(&virs->q[queue_type].pages);
      len = CI_MIN(len, *bytes);
      ci_assert_gt(len, 0);
      ci_mmap_iopages(&virs->q[queue_type].pages, 0,
                      len, bytes, opaque, map_num, offset);
      if(*bytes == 0)
        return 0;
    }
  }

  return 0;
}
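
/* Below: a standalone sketch (hypothetical names, not the efrm API) of the
 * pattern used above: walk a set of regions, clamp each mapping to the
 * remaining byte budget, and stop as soon as the budget is consumed.
 * map_region() stands in for ci_mmap_iopages(), which maps [len] bytes
 * and subtracts them from [*bytes]. */
#include <stddef.h>

struct region { size_t size; };

static void map_region(struct region* r, size_t len, size_t* bytes)
{
  (void) r;         /* a real implementation would set up the mapping */
  *bytes -= len;
}

static void map_all(struct region* rs, int n, size_t* bytes)
{
  int i;
  for( i = n - 1; i >= 0; --i ) {     /* highest-numbered region first */
    size_t len;
    if( rs[i].size == 0 )
      continue;                       /* skip regions with no capacity */
    len = rs[i].size < *bytes ? rs[i].size : *bytes;
    map_region(&rs[i], len, bytes);
    if( *bytes == 0 )
      return;                         /* budget consumed: all done */
  }
}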
Example #2
/* initialise the iptimer scheduler */
void ci_ip_timer_state_init(ci_netif* netif, unsigned cpu_khz)
{
  ci_ip_timer_state* ipts = IPTIMER_STATE(netif);
  int i;
  int us2isn;

  /* initialise the cycle to tick constants */
  ipts->khz = cpu_khz;
  ipts->ci_ip_time_frc2tick = shift_for_gran(CI_IP_TIME_APP_GRANULARITY, ipts->khz);
  ipts->ci_ip_time_frc2us = shift_for_gran(1, ipts->khz);

  /* The Linux kernel ticks the initial sequence number that it would use
   * for a given tuple every 64 ns.  Onload does the same when using
   * EF_TCP_ISN_MODE=clocked.  In EF_TCP_ISN_MODE=clocked+cache, however,
   * our use of the clock-driven ISN is slightly different, as we remember
   * old sequence numbers in the case where the clock-driven ISN is not
   * known to be safe.  As such, we don't need it to tick so fast, and so
   * we let it tick at most every 256 ns.  This means that it takes more
   * than eight minutes to wrap by half, while four minutes is our assumed
   * maximum peer-MSL.  In practice this reduces the cases in which we
   * have to remember old sequence numbers. */
  us2isn = NI_OPTS(netif).tcp_isn_mode != 0 ? 2 : 4;
  ipts->ci_ip_time_frc2isn = ipts->ci_ip_time_frc2us > us2isn ?
                             ipts->ci_ip_time_frc2us - us2isn : 0;

  ci_ip_time_initial_sync(ipts);
  ipts->sched_ticks = ci_ip_time_now(netif);
  ipts->closest_timer = ipts->sched_ticks + IPTIME_INFINITY;

  /* To convert ms to ticks we use fixed-point arithmetic.  Calculate the
   * conversion factor, which is expected to be in the range (0.5, 1].
   */
  ipts->ci_ip_time_ms2tick_fxp =
    (((ci_uint64)ipts->khz) << 32) /
    (1u << ipts->ci_ip_time_frc2tick);
  ci_assert_gt(ipts->ci_ip_time_ms2tick_fxp, 1ull<<31);
  ci_assert_le(ipts->ci_ip_time_ms2tick_fxp, 1ull<<32);

  /* set module specific time constants dependent on frc2tick */
  ci_tcp_timer_init(netif);

  ci_ni_dllist_init(netif, &ipts->fire_list,
		    oo_ptr_to_statep(netif, &ipts->fire_list),
                    "fire");
  
  /* Initialise the wheel lists. */
  for( i=0; i < CI_IPTIME_WHEELSIZE; i++)
    ci_ni_dllist_init(netif, &ipts->warray[i],
		      oo_ptr_to_statep(netif, &ipts->warray[i]),
                      "timw");
}
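
/* Below: a standalone sketch (not Onload code) of how a 32.32 fixed-point
 * factor like ci_ip_time_ms2tick_fxp can be used.  It assumes the factor
 * is applied as ticks = (ms * fxp) >> 32, which follows from the formula
 * above: khz is the number of CPU cycles per millisecond, and a tick is
 * 2^frc2tick cycles. */
#include <stdint.h>
#include <stdio.h>

static uint64_t ms2tick(uint64_t ms, uint64_t fxp)
{
  return (ms * fxp) >> 32;    /* multiply by (fxp / 2^32) ticks per ms */
}

int main(void)
{
  uint64_t khz = 2500000;     /* a 2.5 GHz CPU */
  unsigned frc2tick = 22;     /* 2^22 cycles per tick: khz/2^22 ~ 0.6 */
  uint64_t fxp = (khz << 32) >> frc2tick;

  /* fxp = 2560000000 here, inside the asserted range (2^31, 2^32]. */
  printf("100 ms = %llu ticks\n", (unsigned long long)ms2tick(100, fxp));
  return 0;
}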
Example #3
void ci_netif_filter_init(ci_netif_filter_table* tbl, int size_lg2)
{
  unsigned i;
  unsigned size = ci_pow2(size_lg2);

  ci_assert(tbl);
  ci_assert_gt(size_lg2, 0);
  ci_assert_le(size_lg2, 32);

  /* size is a power of two, so (hash & table_size_mask) is a valid index. */
  tbl->table_size_mask = size - 1;

  for( i = 0; i < size; ++i ) {
    tbl->table[i].id = EMPTY;
    tbl->table[i].route_count = 0;
    tbl->table[i].laddr = 0;
  }
}
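
/* Below: a standalone sketch of why the table size must be a power of two:
 * with size == 2^size_lg2, "hash & table_size_mask" replaces the slower
 * "hash % size" and always yields an in-range index.  Names here are
 * hypothetical. */
#include <stdint.h>

#define SIZE_LG2 13u
#define SIZE     (1u << SIZE_LG2)
#define MASK     (SIZE - 1u)

static unsigned bucket_of(uint32_t hash)
{
  return hash & MASK;         /* equivalent to hash % SIZE */
}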
Example #4
int onload_zc_send(struct onload_zc_mmsg* msgs, int mlen, int flags)
{
  int done = 0, last_fd = -1, i;
  citp_lib_context_t lib_context;
  citp_fdinfo* fdi = NULL;

  Log_CALL(ci_log("%s(%p, %d, %x)", __FUNCTION__, msgs, mlen, flags));

  citp_enter_lib(&lib_context);

  for( i = 0; i < mlen; ++i ) {
    if( msgs[i].fd != last_fd ) {
      if( fdi != NULL )
        citp_fdinfo_release_ref(fdi, 0);
      fdi = citp_fdtable_lookup(msgs[i].fd);
      if( fdi == NULL ) {
        msgs[i].rc = -ESOCKTNOSUPPORT;
        ++done;
        goto out;
      }
      last_fd = msgs[i].fd;
    }

    CI_TRY_EQ( citp_fdinfo_get_ops(fdi)->zc_send(fdi, &msgs[i], flags), 1);
    /* If we got an error, return the number of msgs that have had their
     * rc set and exit.  The fd op should have updated msgs[i].rc
     * appropriately.
     */
    ++done;
    if( msgs[i].rc < 0 )
      goto out;
  }

 out:

  if( fdi != NULL )
    citp_fdinfo_release_ref(fdi, 0);

  citp_exit_lib(&lib_context, TRUE);

  ci_assert_gt(done, 0);
  ci_assert_le(done, mlen);

  Log_CALL_RESULT(done);
  return done;
}
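
/* Below: a standalone sketch (hypothetical names) of the error-reporting
 * convention onload_zc_send() implements: process a batch, record a
 * per-message rc, and return the number of messages whose rc was set,
 * never a single global error once work has started. */
struct mmsg { int fd; int rc; };

static void process_one(struct mmsg* m)
{
  m->rc = 0;                  /* stand-in for the real zc_send fd op */
}

static int send_batch(struct mmsg* msgs, int mlen)
{
  int done = 0, i;
  for( i = 0; i < mlen; ++i ) {
    process_one(&msgs[i]);
    ++done;                   /* rc is now set, so this message counts */
    if( msgs[i].rc < 0 )
      break;                  /* stop early; caller inspects msgs[i].rc */
  }
  return done;                /* in (0, mlen] whenever mlen > 0 */
}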
Example #5
ssize_t linux_tcp_helper_fop_sendpage(struct file* filp, struct page* page, 
                                      int offset, size_t size,
                                      loff_t* ppos, int flags)
{
  ci_private_t* priv = filp->private_data;
  tcp_helper_resource_t* trs = efab_priv_to_thr(priv);
  ci_sock_cmn* s;

  OO_DEBUG_VERB(ci_log("%s: %d:%d offset=%d size=%d flags=%x", __FUNCTION__,
                       NI_ID(&trs->netif), OO_SP_FMT(priv->sock_id), offset,
                       (int) size, flags));

  ci_assert(page);
  ci_assert_ge(offset, 0);
  ci_assert_gt(size, 0);
  ci_assert_le(offset + size, CI_PAGE_SIZE);

#ifndef MSG_SENDPAGE_NOTLAST
  /* "flags" is really "more".  Convert it. */
  if( flags )
    flags = MSG_MORE;

  /* [more] is sometimes true even for the last page.  We get a little
  ** closer to the truth by spotting that we're not reading to the end of
  ** the page (seen on 2.6.18, but not on 2.6.26 or later).
  */
  if( offset + size < CI_PAGE_SIZE && flags )
    flags = 0;
#endif

  s = SP_TO_SOCK(&trs->netif, priv->sock_id);
  if(CI_LIKELY( s->b.state & CI_TCP_STATE_TCP_CONN ))
    return sendpage_copy(&trs->netif,SOCK_TO_TCP(s),page,offset,size,flags);
  else
    /* Closed or listening.  Return -EPIPE.  Do not send SIGPIPE, because
    ** Linux will do it for us. */
    return -s->tx_errno;
}
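
/* Below: a standalone sketch of the legacy-kernel workaround above: when
 * the flags argument is really a boolean "more", translate it to MSG_MORE,
 * and drop it again when the transfer ends short of a page boundary, which
 * strongly suggests this is in fact the last page.  PAGE_SZ stands in for
 * CI_PAGE_SIZE. */
#include <sys/socket.h>       /* MSG_MORE */
#include <stddef.h>

#define PAGE_SZ 4096

static int sendpage_flags(int offset, size_t size, int more)
{
  int flags = more ? MSG_MORE : 0;
  if( offset + size < PAGE_SZ )
    flags = 0;                /* not reading to the end of the page */
  return flags;
}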
Example #6
citp_fdinfo*
citp_fdtable_lookup_fast(citp_lib_context_t* ctx, unsigned fd)
{
  /* Note that if we haven't yet initialised this module, then
  ** [inited_count] will be zero, and the following test will fail.  So the
  ** test for initialisation is done further down...
  **
  ** This is highly performance critical.  DO NOT add any code between here
  ** and the first [return] statement.
  */
  citp_fdinfo* fdi;

  /* Try to avoid entering lib. */
  ctx->thread = NULL;

  if( fd < citp_fdtable.inited_count ) {
    volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
    citp_fdinfo_p fdip;

  again:
    fdip = *p_fdip;
    if( fdip_is_normal(fdip) ) {

      citp_enter_lib_if(ctx);
      if( citp_fdtable_is_mt_safe() ) {
	/* No need to use atomic ops or add a ref to the fdi when MT-safe.
         * The definition of "fds_mt_safe" is that the app does not change
         * the meaning of a file descriptor in one thread when it is being
         * used in another thread.
         */
        fdi = fdip_to_fdi(fdip);
        if( ! citp_fdinfo_is_consistent(fdi) )
          fdi = citp_reprobe_moved(fdi, CI_TRUE, CI_FALSE);

	return fdi;
      }
      else {
        /* Swap in the busy marker. */
	if( fdip_cas_succeed(p_fdip, fdip, fdip_busy) ) {
	  fdi = fdip_to_fdi(fdip);

	  ci_assert(fdi);
	  ci_assert_gt(oo_atomic_read(&fdi->ref_count), 0);
	  ci_assert(fdip_is_closing(fdip) || fdip_is_reserved(fdip) ||
		    fdi->fd == fd);
	  /* Bump the reference count. */
	  citp_fdinfo_ref(fdi);

          if( ! citp_fdinfo_is_consistent(fdi) )
            fdi = citp_reprobe_moved(fdi, CI_FALSE, CI_TRUE);
          else {
            /* Swap the busy marker out again. */
            citp_fdtable_busy_clear(fd, fdip, 0);
          }
	  return fdi;
	}
	goto again;
      }
    }

    /* Not normal! */
    if( fdip_is_passthru(fdip) )
      return NULL;

    citp_enter_lib_if(ctx);
    if( fdip_is_busy(fdip) ) {
      citp_fdtable_busy_wait(fd, 0);
      goto again;
    }

    ci_assert(fdip_is_unknown(fdip));
    goto probe;
  }

  if( citp.init_level < CITP_INIT_FDTABLE ) {
    if( _citp_do_init_inprogress == 0 )
      CI_TRY(citp_do_init(CITP_INIT_ALL));
    else
      CI_TRY(citp_do_init(CITP_INIT_FDTABLE)); /* get what we need */
  }

  if( fd >= citp_fdtable.size )
    return NULL;

 probe:
  citp_enter_lib_if(ctx);
  fdi = citp_fdtable_probe(fd);
  if( fdi && citp_fdtable_is_mt_safe() )
    citp_fdinfo_release_ref(fdi, 0);
  return fdi;
}
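
/* Below: a standalone C11 sketch (not the citp implementation) of the
 * busy-marker protocol above: claim a table entry by CAS-ing in a
 * distinguished BUSY token, take the reference under that protection,
 * then swap the original value back in.  The real code sleeps in
 * citp_fdtable_busy_wait() instead of spinning. */
#include <stdatomic.h>
#include <stdint.h>

#define FDIP_BUSY ((uintptr_t)1)        /* never a valid pointer value */

struct entry { _Atomic uintptr_t fdip; };

static void* claim(struct entry* e)
{
  uintptr_t fdip;
 again:
  fdip = atomic_load(&e->fdip);
  if( fdip == FDIP_BUSY )
    goto again;                         /* another thread owns the entry */
  if( ! atomic_compare_exchange_strong(&e->fdip, &fdip, FDIP_BUSY) )
    goto again;                         /* lost the race: retry */
  /* ...safe to bump the refcount of the object [fdip] points at... */
  atomic_store(&e->fdip, fdip);         /* swap the busy marker out again */
  return (void*)fdip;
}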
Example #7
citp_fdinfo *
citp_fdtable_lookup(unsigned fd)
{
  /* Note that if we haven't yet initialised this module, then
  ** [inited_count] will be zero, and the following test will fail.  So the
  ** test for initialisation is done further down...
  **
  ** This is highly performance critical.  DO NOT add any code between here
  ** and the first [return] statement.
  */
  citp_fdinfo* fdi;

  /* In some cases, we'll lock fdtable.  Assert that it is possible: */
  ci_assert(oo_per_thread_get()->sig.inside_lib);

  if( fd < citp_fdtable.inited_count ) {

    volatile citp_fdinfo_p* p_fdip = &citp_fdtable.table[fd].fdip;
    citp_fdinfo_p fdip;

  again:
    /* Swap in the busy marker. */
    fdip = *p_fdip;

    if( fdip_is_normal(fdip) ) {
      if( citp_fdtable_not_mt_safe() ) {
	if( fdip_cas_succeed(p_fdip, fdip, fdip_busy) ) {
	  fdi = fdip_to_fdi(fdip);
	  ci_assert(fdi);
	  ci_assert_gt(oo_atomic_read(&fdi->ref_count), 0);
	  ci_assert(fdip_is_closing(fdip) || fdip_is_reserved(fdip) ||
		    fdi->fd == fd);
	  /* Bump the reference count. */
	  citp_fdinfo_ref(fdi);

          if( ! citp_fdinfo_is_consistent(fdi) ) {
            /* Something is wrong.  Re-probe. */
            fdi = citp_reprobe_moved(fdi, CI_FALSE, CI_TRUE);
          }
          else {
            /* Swap the busy marker out again. */
            citp_fdtable_busy_clear(fd, fdip, 0);
          }
	  return fdi;
	}
	goto again;
      }
      else {
	/* No need to use atomic ops when single-threaded.  The definition
         * of "fds_mt_safe" is that the app does not change the meaning of
         * a file descriptor in one thread when it is being used in another
         * thread.  In that case I'm hoping this should be safe, but at
         * time of writing I'm really not confident.  (FIXME).
         */
	fdi = fdip_to_fdi(fdip);
        if( ci_is_multithreaded() )
	  citp_fdinfo_ref(fdi);
        else
          ++fdi->ref_count.n;

        if( ! citp_fdinfo_is_consistent(fdi) )
          fdi = citp_reprobe_moved(fdi, CI_FALSE, CI_FALSE);

	return fdi;
      }
    }

    /* Not normal! */
    if( fdip_is_passthru(fdip) )  return NULL;

    if( fdip_is_busy(fdip) ) {
      citp_fdtable_busy_wait(fd, 0);
      goto again;
    }

    ci_assert(fdip_is_unknown(fdip));
    goto probe;
  }

  if( citp.init_level < CITP_INIT_FDTABLE ) {
    if( _citp_do_init_inprogress == 0 )
      CI_TRY(citp_do_init(CITP_INIT_ALL));
    else
      CI_TRY(citp_do_init(CITP_INIT_FDTABLE)); /* get what we need */
  }

  if( fd >= citp_fdtable.size )  return NULL;

 probe:
  fdi = citp_fdtable_probe(fd);

  return fdi;
}
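
/* Below: a standalone C11 sketch of the refcount fast path above: pay for
 * an atomic read-modify-write only when the process really is
 * multi-threaded.  is_multithreaded() is a hypothetical stand-in for
 * ci_is_multithreaded(). */
#include <stdatomic.h>

struct obj { _Atomic int ref_count; };

static int is_multithreaded(void) { return 0; }  /* demo: single thread */

static void take_ref(struct obj* o)
{
  if( is_multithreaded() )
    atomic_fetch_add(&o->ref_count, 1);
  else {
    /* No other thread can race with us, so a plain load/store will do. */
    int n = atomic_load_explicit(&o->ref_count, memory_order_relaxed);
    atomic_store_explicit(&o->ref_count, n + 1, memory_order_relaxed);
  }
}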
Example #8
static int 
ci_udp_recvmsg_block(ci_udp_iomsg_args* a, ci_netif* ni, ci_udp_state* us,
                     int timeout)
{
  int rc;

#ifndef __KERNEL__
  {
    citp_signal_info* si;
    struct pollfd pfd;
#if !CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    int inside_lib;
#endif
    pfd.fd = a->fd;
    pfd.events = POLLIN;

    if( timeout == 0 )
      timeout = -1;

    /* Ideally, we should do the same as in citp_tcp_accept(), but since
     * lib_context and citp_exit_lib() are not available outside the unix/
     * subdirectory, we replicate their contents here. */
    si = citp_signal_get_specific_inited();
  continue_to_block:
#if !CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    inside_lib = si->inside_lib;
    ci_assert_gt(inside_lib, 0);
#endif
    si->inside_lib = 0;
    ci_compiler_barrier();
    if(CI_UNLIKELY( si->aflags & OO_SIGNAL_FLAG_HAVE_PENDING ))
      citp_signal_run_pending(si);

    rc = ci_sys_poll(&pfd, 1, timeout);

#if CI_CFG_CITP_INSIDE_LIB_IS_FLAG
    si->inside_lib = 1;
#else
    si->inside_lib = inside_lib;
#endif

    if( rc > 0 )
      return 0;
    else if( rc == 0 )
      rc = -EAGAIN;
    else if( errno == EINTR && (si->aflags & OO_SIGNAL_FLAG_NEED_RESTART) &&
             timeout == -1 ) {
      /* Blocking recv() should only be restarted if there is no timeout. */
      goto continue_to_block;
    } else 
      rc = -errno;

    return rc;
  }
#else  /* __KERNEL__ */
  {
    int mask;
    s64 t;

    if( timeout == 0 )
      t = -1;
    else
      t = msecs_to_jiffies(timeout);

    mask = POLLIN;
    rc = efab_tcp_helper_poll_udp(a->filp, &mask, &t);
    if( rc == 0 ) {
      if( mask ) {
        return 0;
      }
      else
        rc = -EAGAIN;
    }
    else if( rc == -ERESTARTSYS &&  us->s.so.rcvtimeo_msec )
      rc = -EINTR;
  }
  return rc;
#endif /* __KERNEL__ */
}
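
/* Below: a standalone sketch of the restart policy above: a blocking
 * poll() is transparently restarted after EINTR only when the caller did
 * not ask for a timeout.  The real code additionally requires
 * OO_SIGNAL_FLAG_NEED_RESTART to be set. */
#include <errno.h>
#include <poll.h>

static int poll_restartable(struct pollfd* pfd, int timeout_ms)
{
  for( ; ; ) {
    int rc = poll(pfd, 1, timeout_ms);
    if( rc >= 0 )
      return rc;
    if( errno == EINTR && timeout_ms == -1 )
      continue;               /* no timeout: safe to restart */
    return -errno;            /* with a timeout, EINTR reaches the caller */
  }
}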
Example #9
static int ci_udp_filter_kernel_pkt(ci_netif* ni, ci_udp_state* us, 
                                    struct msghdr* msg, int *bytes)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  unsigned cb_flags = 0;
  int i = 0, bytes_remaining = *bytes;

  if( msg->msg_iovlen > CI_UDP_ZC_IOVEC_MAX ) {
    LOG_U(log("%s: too many fragments (%d), passing packet unfiltered", 
              __FUNCTION__, (int)msg->msg_iovlen));
    return 1;
  }

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr = *msg;
  zc_msg.msghdr.msg_iov = NULL;

  ci_assert_gt(msg->msg_iovlen, 0);

  do { 
    zc_msg.iov[i].iov_base = msg->msg_iov[i].iov_base;
    zc_msg.iov[i].iov_len = msg->msg_iov[i].iov_len > bytes_remaining ? 
      bytes_remaining : msg->msg_iov[i].iov_len;
    zc_msg.iov[i].buf = ONLOAD_ZC_HANDLE_NONZC;
    zc_msg.iov[i].iov_flags = 0;
    bytes_remaining -= zc_msg.iov[i].iov_len;
  } while(++i < msg->msg_iovlen && bytes_remaining);

  zc_msg.msghdr.msg_iovlen = i;

  rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
    (&zc_msg, (void *)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

  ci_assert(!(rc & ONLOAD_ZC_KEEP));

  if( rc & ONLOAD_ZC_TERMINATE ) 
    return 0;
  else {
    if( rc & ONLOAD_ZC_MODIFIED ) {
      int new_len = 0;
#ifndef NDEBUG
      int found_shortened_iov = 0;
#endif
      for( i = 0; i < zc_msg.msghdr.msg_iovlen; ++i ) {
        new_len += zc_msg.iov[i].iov_len;
#ifndef NDEBUG
        if( found_shortened_iov )
          ci_assert_equal(zc_msg.iov[i].iov_len, 0);
        ci_assert_equal(zc_msg.iov[i].iov_base, msg->msg_iov[i].iov_base);
        if( zc_msg.iov[i].iov_len != msg->msg_iov[i].iov_len ) {
          ci_assert_lt(zc_msg.iov[i].iov_len, msg->msg_iov[i].iov_len);
          found_shortened_iov = 1;
        }
#endif
      }
#ifndef NDEBUG
      if( found_shortened_iov )
        ci_assert_lt(new_len, *bytes);
      else
        ci_assert_equal(new_len, *bytes);
#endif
      *bytes = new_len;
    }
  }
  return 1;
}
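
/* Below: a standalone sketch of the iovec-clamping loop above: copy at
 * most [bytes] worth of the source array, shortening the element that
 * crosses the budget and dropping everything after it.  Like the code
 * above, it assumes n > 0. */
#include <stddef.h>
#include <sys/uio.h>

static int clamp_iov(struct iovec* dst, const struct iovec* src,
                     int n, size_t bytes)
{
  int i = 0;
  do {
    dst[i].iov_base = src[i].iov_base;
    dst[i].iov_len = src[i].iov_len > bytes ? bytes : src[i].iov_len;
    bytes -= dst[i].iov_len;
  } while( ++i < n && bytes );
  return i;                   /* number of iovecs actually populated */
}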
Example #10
int ci_udp_filter_recved_pkts(ci_netif* ni, ci_udp_state* us)
{
  enum onload_zc_callback_rc rc;
  struct onload_zc_msg zc_msg;
  struct onload_zc_iovec zc_iovec[CI_UDP_ZC_IOVEC_MAX];
  ci_ip_pkt_fmt* pkt;
  unsigned cb_flags;
  int dropped_bytes;

  ci_assert(ci_sock_is_locked(ni, &us->s.b));

  zc_msg.iov = zc_iovec;
  zc_msg.msghdr.msg_controllen = 0;
  zc_msg.msghdr.msg_flags = 0;

  while( us->recv_q.pkts_added != 
         us->recv_q.pkts_filter_passed + us->recv_q.pkts_filter_dropped ) {
    ci_rmb();
    pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
    if( pkt->pf.udp.rx_flags & 
        (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
         CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED) ) {
      /* We know this can't go past tail because of the while loop condition */
      us->recv_q.filter = pkt->next;
      pkt = PKT_CHK_NNL(ni, us->recv_q.filter);
      ci_assert( !(pkt->pf.udp.rx_flags & 
                   (CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED |
                    CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED)) );
    }

    ci_udp_pkt_to_zc_msg(ni, pkt, &zc_msg);

    cb_flags = CI_IP_IS_MULTICAST(oo_ip_hdr(pkt)->ip_daddr_be32) ? 
      ONLOAD_ZC_MSG_SHARED : 0;
    rc = (*(onload_zc_recv_filter_callback)((ci_uintptr_t)us->recv_q_filter))
      (&zc_msg, (void *)((ci_uintptr_t)us->recv_q_filter_arg), cb_flags);

    ci_assert(!(rc & ONLOAD_ZC_KEEP));

    if( rc & ONLOAD_ZC_TERMINATE ) {
      us->recv_q.bytes_filter_dropped += pkt->pf.udp.pay_len;
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_DROPPED;
      ++us->recv_q.pkts_filter_dropped;
    }
    else {
      pkt->pf.udp.rx_flags |= CI_IP_PKT_FMT_PREFIX_UDP_RX_FILTER_PASSED;
      ++us->recv_q.pkts_filter_passed;
      if( rc & ONLOAD_ZC_MODIFIED ) {
        ci_assert(!(cb_flags & ONLOAD_ZC_MSG_SHARED));
        dropped_bytes = ci_zc_msg_to_udp_pkt(ni, &zc_msg, pkt);
        ci_assert_gt(dropped_bytes, 0);
        ci_assert_lt(dropped_bytes, pkt->pf.udp.pay_len);
        pkt->pf.udp.pay_len -= dropped_bytes;
        us->recv_q.bytes_filter_dropped += dropped_bytes;
      }
      us->recv_q.bytes_filter_passed += pkt->pf.udp.pay_len;
      return 1;
    }
  }

  return us->recv_q.pkts_filter_passed != us->recv_q.pkts_delivered;
}
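
/* Below: a standalone C11 sketch of the counter scheme above: the
 * filtering consumer compares its private pass/drop counts against the
 * producer's "added" count, and an acquire load plays the role of
 * ci_rmb() so the item contents are visible before they are read. */
#include <stdatomic.h>

struct counted_q {
  _Atomic unsigned added;     /* advanced by the producer */
  unsigned passed, dropped;   /* owned by the filtering consumer */
};

static void drain(struct counted_q* q)
{
  while( q->passed + q->dropped !=
         atomic_load_explicit(&q->added, memory_order_acquire) ) {
    /* ...run the filter on the next item... */
    ++q->passed;              /* or ++q->dropped if the filter rejects */
  }
}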
Example #11
static int ci_zc_msg_to_udp_pkt(ci_netif* ni, 
                                struct onload_zc_msg* zc_msg,
                                ci_ip_pkt_fmt* pkt)
{
  int i, n_buffers = pkt->n_buffers, dropped_bytes = 0;
  ci_ip_pkt_fmt* frag;
  ci_ip_pkt_fmt* prev_frag = NULL;
  frag = pkt;
  i = 0;
  ci_assert_nequal(zc_msg->iov, NULL);

  /* Ignore first frag if zero length and there is another frag */
  if( oo_offbuf_left(&frag->buf) == 0 && OO_PP_NOT_NULL(frag->frag_next) ) {
    frag = PKT_CHK_NNL(ni, frag->frag_next);
    --n_buffers;
  }

  CI_TEST(zc_msg->msghdr.msg_iovlen <= n_buffers);
  CI_TEST(zc_msg->msghdr.msg_iovlen > 0);

  do {
    CI_TEST(zc_msg->iov[i].buf == (onload_zc_handle)frag);
    CI_TEST(zc_msg->iov[i].iov_len != 0);
    if( i < zc_msg->msghdr.msg_iovlen ) {
      if( zc_msg->iov[i].iov_base != oo_offbuf_ptr(&frag->buf) ) {
        ci_assert_gt((char*)zc_msg->iov[i].iov_base, 
                     oo_offbuf_ptr(&frag->buf));
        dropped_bytes += ((char*)zc_msg->iov[i].iov_base - 
                          oo_offbuf_ptr(&frag->buf) );
        oo_offbuf_set_start(&frag->buf, (char*)zc_msg->iov[i].iov_base);
      }
      if( zc_msg->iov[i].iov_len != oo_offbuf_left(&frag->buf) ) {
        ci_assert_lt(zc_msg->iov[i].iov_len, oo_offbuf_left(&frag->buf));
        dropped_bytes += (oo_offbuf_left(&frag->buf) - zc_msg->iov[i].iov_len);
        oo_offbuf_set_len(&frag->buf, zc_msg->iov[i].iov_len);
      }
    }
    else {
      /* All remaining fragments should be discarded.  We cannot get here
       * on the first frag, as msg_iovlen > 0.
       */
      ci_assert(prev_frag != NULL);
      prev_frag->frag_next = OO_PP_NULL;
      /* remember frag so we can release it after counting dropped bytes */
      prev_frag = frag;
      do {
        dropped_bytes += oo_offbuf_left(&frag->buf);
        if( ++i == n_buffers )
          break;
        frag = PKT_CHK_NNL(ni, frag->frag_next);
      } while( 1 );
      ci_netif_pkt_release(ni, prev_frag);
      pkt->n_buffers -= (n_buffers - zc_msg->msghdr.msg_iovlen);
      return dropped_bytes;
    }

    ci_assert_lt(oo_offbuf_offset(&frag->buf) + oo_offbuf_left(&frag->buf),
                 CI_CFG_PKT_BUF_SIZE);

    if( ++i == n_buffers )
      break;
    prev_frag = frag;
    frag = PKT_CHK_NNL(ni, frag->frag_next);
  } while( 1 );

  return dropped_bytes;
}
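
/* Below: a standalone sketch (hypothetical list type) of the
 * tail-truncation step above: once the iovec list is exhausted, unlink
 * the rest of the fragment chain, count the payload bytes being
 * discarded, and release the whole tail with a single call, as
 * ci_netif_pkt_release() does for packet buffers. */
#include <stddef.h>

struct frag { size_t left; struct frag* next; };

static void release_chain(struct frag* f) { (void) f; /* free the tail */ }

static size_t drop_tail(struct frag* prev)
{
  struct frag* tail = prev->next;
  struct frag* f;
  size_t dropped = 0;

  prev->next = NULL;          /* detach the tail from the kept prefix */
  for( f = tail; f != NULL; f = f->next )
    dropped += f->left;       /* bytes that will never be delivered */
  release_chain(tail);
  return dropped;
}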
Example #12
int citp_pipe_splice_write(citp_fdinfo* fdi, int alien_fd, loff_t* alien_off,
                           size_t olen, int flags,
                           citp_lib_context_t* lib_context)
{
  citp_pipe_fdi* epi = fdi_to_pipe_fdi(fdi);
  int len_in_bufs = OO_PIPE_SIZE_TO_BUFS(olen);
  struct iovec iov_on_stack[CITP_PIPE_SPLICE_WRITE_STACK_IOV_LEN];
  struct iovec* iov = iov_on_stack;
  int want_buf_count;
  int rc;
  int bytes_to_read;
  int len = olen;
  int no_more = 1; /* for now we only run a single loop */
  int written_total = 0;
  int non_block = (flags & SPLICE_F_NONBLOCK) || (epi->pipe->aflags &
      (CI_PFD_AFLAG_NONBLOCK << CI_PFD_AFLAG_WRITER_SHIFT));
  if( fdi_is_reader(fdi) ) {
    errno = EINVAL;
    return -1;
  }
  if( alien_off ) {
    /* TODO support this */
    errno = ENOTSUP;
    return -1;
  }
  do {
    int count;
    int iov_num;
    int bytes_to_write;
    struct ci_pipe_pkt_list pkts = {};
    struct ci_pipe_pkt_list pkts2;
    want_buf_count = len_in_bufs;
    /* We might need to wait for buffers here on the first iteration */
    rc = ci_pipe_zc_alloc_buffers(epi->ni, epi->pipe, want_buf_count,
                                  MSG_NOSIGNAL | (non_block || written_total ?
                                  MSG_DONTWAIT : 0),
                                  &pkts);
    if( rc < 0 && written_total ) {
      /* Whatever the error, we must report the bytes already written. */
      rc = written_total;
      break;
    }
    else if( rc < 0 )
      break;
    else if( pkts.count == 0 && non_block ) {
      errno = EAGAIN;
      rc = -1;
      break;
    }
    else
      ci_assert_gt(pkts.count, 0);
    count = pkts.count;

    if( count > CITP_PIPE_SPLICE_WRITE_STACK_IOV_LEN ) {
      void* niov = realloc(iov == iov_on_stack ? NULL : iov,
                           sizeof(*iov) * len_in_bufs);
      if( niov == NULL )
        /* allocation failed: we can still move quite a few pkts */
        count = CITP_PIPE_SPLICE_WRITE_STACK_IOV_LEN;
      else
        iov = niov;
    }

    ci_assert_ge(count, 1);

    iov_num = count;
    pkts2 = pkts;
    bytes_to_read = ci_pipe_list_to_iovec(epi->ni, epi->pipe, iov, &iov_num,
                                          &pkts2, len);

    citp_exit_lib_if(lib_context, TRUE);
    /* Note: the following call might be non-blocking as well as blocking */
    rc = readv(alien_fd, iov, count);
    citp_reenter_lib(lib_context);

    if( rc > 0 ) {
      bytes_to_write = rc;
      written_total += bytes_to_write;
      len -= bytes_to_write;
      no_more |= bytes_to_write < bytes_to_read;
    }
    else {
      bytes_to_write = 0;
      no_more = 1;
    }

    {
      /* pipe zc_write will write non_empty buffers and release the empty
       * ones */
      int rc2 = ci_pipe_zc_write(epi->ni, epi->pipe, &pkts, bytes_to_write,
                  CI_PIPE_ZC_WRITE_FLAG_FORCE | MSG_DONTWAIT | MSG_NOSIGNAL);
      (void) rc2;
      ci_assert_equal(rc2, bytes_to_write);
    }
    /* For now we do not attempt a second iteration.  To allow for that
     * we would need a guarantee that the read will not block, e.g.
     * insight into the type of fd and a non-blocking operation
     * (to name a valid case: a socket and recvmsg). */
  } while( ! no_more );

  if( iov != iov_on_stack )
    free(iov);
  if( rc > 0 )
    return written_total;
  if( rc < 0 && errno == EPIPE && ! (flags & MSG_NOSIGNAL) ) {
    ci_sys_ioctl(ci_netif_get_driver_handle(epi->ni),
                 OO_IOC_KILL_SELF_SIGPIPE, NULL);
  }
  return rc;
}
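
/* Below: a standalone sketch of the grow-or-degrade pattern above,
 * mirroring the corrected assignment of the realloc result: try to grow
 * the iovec array off the stack; if allocation fails, keep the stack
 * array and simply move fewer packets per iteration. */
#include <stdlib.h>
#include <sys/uio.h>

#define STACK_IOV_LEN 64

static struct iovec* grow_iov(struct iovec* iov, struct iovec* iov_on_stack,
                              int wanted, int* count)
{
  if( *count > STACK_IOV_LEN ) {
    /* realloc(NULL, n) acts as malloc(n); never realloc the stack array. */
    void* niov = realloc(iov == iov_on_stack ? NULL : iov,
                         sizeof(*iov) * wanted);
    if( niov == NULL )
      *count = STACK_IOV_LEN; /* degrade: still move a decent batch */
    else
      iov = niov;             /* grew: use (and later free) the heap copy */
  }
  return iov;
}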