Example no. 1
void citp_epinfo_init(citp_epinfo* epinfo, citp_protocol_impl* protocol)
{
  ci_assert(epinfo);
  ci_assert(protocol);

  epinfo->protocol = protocol;

  /* Start at zero.  It will be increased whenever an endpoint is inserted
  ** into the fdtable.
  */
  oo_atomic_set(&epinfo->ref_count, 0);
}
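The comment above pins down the lifetime rule for epinfo->ref_count: it starts at zero and is raised each time an endpoint using this epinfo is inserted into the fdtable. As a minimal, self-contained sketch of that init-at-zero reference-counting pattern (using C11 atomics and hypothetical names, not the oo_atomic API above):

#include <stdatomic.h>
#include <stdlib.h>

/* Hypothetical illustration only; none of these names belong to the
 * library shown above. */
struct epinfo_sketch {
  atomic_int ref_count;
};

static void epinfo_sketch_init(struct epinfo_sketch* e)
{
  atomic_init(&e->ref_count, 0);        /* no references yet */
}

static void epinfo_sketch_on_insert(struct epinfo_sketch* e)
{
  atomic_fetch_add(&e->ref_count, 1);   /* fdtable insertion takes a reference */
}

static void epinfo_sketch_release(struct epinfo_sketch* e)
{
  if( atomic_fetch_sub(&e->ref_count, 1) == 1 )
    free(e);                            /* last reference dropped */
}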
Example no. 2
/* Initialise all the fields that we can in the UDP state structure.
** There are no IP options, no destination addresses and no ports. */
static void ci_udp_state_init(ci_netif* netif, ci_udp_state* us)
{
  ci_sock_cmn_init(netif, &us->s, 1);

  /* IP_MULTICAST_LOOP is 1 by default, so we should not send multicast
   * unless specially permitted */
  if( ! NI_OPTS(netif).force_send_multicast )
    us->s.cp.sock_cp_flags |= OO_SCP_NO_MULTICAST;

  /* Poison. */
  CI_DEBUG(memset(&us->s + 1, 0xf0, (char*) (us + 1) - (char*) (&us->s + 1)));

  /*! \todo This should be part of sock_cmn reinit, but the comment on that
   * function suggests it may not be a good idea to move it there. */

#if CI_CFG_TIMESTAMPING
  ci_udp_recv_q_init(&us->timestamp_q);
#endif

  /*! \todo These two should really be handled in ci_sock_cmn_init() */

  /* Make sure we don't hit any state assertions. Can use
   *  UDP_STATE_FROM_SOCKET_EPINFO() after this. */
  us->s.b.state = CI_TCP_STATE_UDP;

  us->s.so.sndbuf = NI_OPTS(netif).udp_sndbuf_def;
  us->s.so.rcvbuf = NI_OPTS(netif).udp_rcvbuf_def;

  /* Init the ip-caches (packet header templates). */
  ci_udp_hdrs_init(&us->s.pkt);
  ci_ip_cache_init(&us->ephemeral_pkt);
  ci_udp_hdrs_init(&us->ephemeral_pkt);
  udp_lport_be16(us) = 0;
  udp_rport_be16(us) = 0;

#if CI_CFG_ZC_RECV_FILTER
  us->recv_q_filter = 0;
  us->recv_q_filter_arg = 0;
#endif
  ci_udp_recv_q_init(&us->recv_q);
  us->zc_kernel_datagram = OO_PP_NULL;
  us->zc_kernel_datagram_count = 0;
  us->tx_async_q = CI_ILL_END;
  oo_atomic_set(&us->tx_async_q_level, 0);
  us->tx_count = 0;
  us->udpflags = CI_UDPF_MCAST_LOOP;
  us->ip_pktinfo_cache.intf_i = -1;
  us->stamp = 0;
  memset(&us->stats, 0, sizeof(us->stats));
}
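The CI_DEBUG memset above is a "poison" fill: in debug builds everything after the embedded ci_sock_cmn is overwritten with 0xf0, so any field the rest of the initialiser forgets to set shows up as an obvious bit pattern. A stand-alone sketch of the same idiom, assuming hypothetical structure names rather than the ones above:

#include <string.h>

struct sock_common { int state; };
struct udp_state_sketch { struct sock_common s; int lport, rport; };

#ifndef NDEBUG
# define DEBUG_ONLY(x) x       /* roughly what CI_DEBUG() does */
#else
# define DEBUG_ONLY(x)
#endif

static void udp_state_sketch_poison(struct udp_state_sketch* us)
{
  /* Fill the region from just past the embedded common part up to the end
   * of the enclosing structure with a recognisable poison byte. */
  DEBUG_ONLY(memset(&us->s + 1, 0xf0,
                    (char*) (us + 1) - (char*) (&us->s + 1)));
}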
Example no. 3
int citp_ep_dup3(unsigned fromfd, unsigned tofd, int flags)
{
  volatile citp_fdinfo_p* p_tofdip;
  citp_fdinfo_p tofdip;
  unsigned max;

  Log_V(log("%s(%d, %d)", __FUNCTION__, fromfd, tofd));

  /* Must be checked by callers. */
  ci_assert(fromfd != tofd);

  /* Hack: if [tofd] is the fd we're using for logging, we'd better choose
  ** a different one!
  */
  if( tofd == citp.log_fd )  citp_log_change_fd();

  ci_assert(citp.init_level >= CITP_INIT_FDTABLE);

  max = CI_MAX(fromfd, tofd);
  if( max >= citp_fdtable.inited_count ) {
    ci_assert(max < citp_fdtable.size);
    CITP_FDTABLE_LOCK();
    __citp_fdtable_extend(max);
    CITP_FDTABLE_UNLOCK();
  }

  /* Bug1151: Concurrent threads doing dup2(x,y) and dup2(y,x) can deadlock
  ** against one another.  So we take out a fat lock to prevent concurrent
  ** dup2()s.
  */
  /* Lock tofd.  We need to interlock against select and poll etc, so we
  ** also grab the exclusive lock.  Also grab the bug1151 lock.
  */
  pthread_mutex_lock(&citp_dup_lock);
  CITP_FDTABLE_LOCK();
  p_tofdip = &citp_fdtable.table[tofd].fdip;
 lock_tofdip_again:
  tofdip = *p_tofdip;
  if( fdip_is_busy(tofdip) )
    tofdip = citp_fdtable_busy_wait(tofd, 1);
  if( fdip_is_closing(tofdip) )
    tofdip = citp_fdtable_closing_wait(tofd, 1);
  if( fdip_is_reserved(tofdip) ) {
    /* ?? FIXME: we can't cope with this at the moment */
    CITP_FDTABLE_UNLOCK();
    Log_U(log("%s(%d, %d): target is reserved", __FUNCTION__, fromfd, tofd));
    errno = EBUSY;
    tofd = -1;
    goto out;
  }
  if( fdip_cas_fail(p_tofdip, tofdip, fdip_busy) )
    goto lock_tofdip_again;
  CITP_FDTABLE_UNLOCK();
  ci_assert(fdip_is_normal(tofdip) | fdip_is_passthru(tofdip) |
            fdip_is_unknown(tofdip));

  if( fdip_is_normal(tofdip) ) {
    /* We're duping onto a user-level socket. */
    citp_fdinfo* tofdi = fdip_to_fdi(tofdip);
    if( tofdi->epoll_fd >= 0 ) {
      citp_fdinfo* epoll_fdi = citp_epoll_fdi_from_member(tofdi, 0);
      if( epoll_fdi ) {
        if( epoll_fdi->protocol->type == CITP_EPOLL_FD )
          citp_epoll_on_close(epoll_fdi, tofdi, 0);
        citp_fdinfo_release_ref(epoll_fdi, 0);
      }
    }
    ci_assert_equal(tofdi->on_ref_count_zero, FDI_ON_RCZ_NONE);
    tofdi->on_ref_count_zero = FDI_ON_RCZ_DUP2;
    tofdi->on_rcz.dup3_args.fd = fromfd;
    tofdi->on_rcz.dup3_args.flags = flags;
    citp_fdinfo_release_ref(tofdi, 0);
    {
      int i = 0;
      /* We need to free this fdi.  If someone is using it right now,
       * we are in trouble.  So, we spin for a while and interrupt the
       * user.  See bug 28123. */
      while( tofdi->on_ref_count_zero != FDI_ON_RCZ_DONE ) {
        if( ci_is_multithreaded() && i % 10000 == 9999 ) {
          pthread_t pth = tofdi->thread_id;
          if( pth !=  pthread_self() && pth != PTHREAD_NULL ) {
            pthread_kill(pth, SIGONLOAD);
            sleep(1);
          }
        }
        ci_spinloop_pause();
        i++;
      }
      ci_rmb();
    }
    if( tofdi->on_rcz.dup2_result < 0 ) {
      errno = -tofdi->on_rcz.dup2_result;
      /* Need to re-insert [tofdi] into the table. */
      ci_assert_equal(oo_atomic_read(&tofdi->ref_count), 0);
      oo_atomic_set(&tofdi->ref_count, 1);
      CI_DEBUG(tofdi->on_ref_count_zero = FDI_ON_RCZ_NONE);
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else {
      ci_assert(tofdi->on_rcz.dup2_result == tofd);
      citp_fdinfo_get_ops(tofdi)->dtor(tofdi, 0);
      citp_fdinfo_free(tofdi);
    }
    goto out;
  }

  ci_assert(fdip_is_passthru(tofdip) | fdip_is_unknown(tofdip));

  { /* We're duping onto an O/S descriptor, or it may be closed.  Create a
    ** dummy [citp_fdinfo], just so we can share code with the case above.
    */
    citp_fdinfo fdi;
    fdi.fd = tofd;
    fdi.on_rcz.dup3_args.fd = fromfd;
    fdi.on_rcz.dup3_args.flags = flags;
    dup2_complete(&fdi, tofdip, 0);
    if( fdi.on_rcz.dup2_result < 0 ) {
      errno = -fdi.on_rcz.dup2_result;
      citp_fdtable_busy_clear(tofd, tofdip, 0);
      tofd = -1;
    }
    else
      ci_assert(fdi.on_rcz.dup2_result == tofd);
  }

 out:
  pthread_mutex_unlock(&citp_dup_lock);
  return tofd;
}
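Before the fd can be reused, the dup path above must wait for every other user of [tofdi] to drop its reference; if the process is multithreaded it periodically signals the thread recorded in tofdi->thread_id, so a thread blocked inside the library gets interrupted (see bug 28123 in the comment). A stripped-down sketch of that spin-then-signal pattern, using a hypothetical flag and thread id in place of the fdinfo fields:

#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include <stdatomic.h>

/* Hypothetical stand-ins for tofdi->on_ref_count_zero and tofdi->thread_id. */
static atomic_int done_flag;
static pthread_t  owner_thread;

static void wait_for_done(int wake_signal)
{
  int i = 0;
  while( ! atomic_load(&done_flag) ) {
    if( i % 10000 == 9999 ) {
      pthread_t pth = owner_thread;
      /* Poke the thread that still holds the reference, then back off. */
      if( ! pthread_equal(pth, pthread_self()) )
        pthread_kill(pth, wake_signal);
      sleep(1);
    }
    ++i;
  }
}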
Example no. 4
/* Move priv file to the alien_ni stack.
 * Should be called with the priv stack and socket locked;
 * the function returns with the priv stack unlocked.
 * If rc=0, it returns with the alien_ni stack locked;
 * otherwise both stacks are unlocked.
 * The socket is always unlocked on return. */
int efab_file_move_to_alien_stack(ci_private_t *priv, ci_netif *alien_ni)
{
  tcp_helper_resource_t *old_thr = priv->thr;
  tcp_helper_resource_t *new_thr = netif2tcp_helper_resource(alien_ni);
  ci_sock_cmn *old_s = SP_TO_SOCK(&old_thr->netif, priv->sock_id);
  ci_sock_cmn *new_s;
  ci_sock_cmn *mid_s;
  tcp_helper_endpoint_t *old_ep, *new_ep;
  int rc, i;
  int pollwait_register = 0;
#if CI_CFG_FD_CACHING
  oo_p sp;
#endif

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d", __func__,
                       old_thr->id, priv->sock_id, new_thr->id));
  /* Poll the old stack - deliver all data to our socket */
  ci_netif_poll(&old_thr->netif);

  /* Endpoints in an epoll list should not be moved, because the waitq is
   * already in the epoll internal structures (bug 41152). */
  if( !list_empty(&priv->_filp->f_ep_links) ) {
    rc = -EBUSY;
    goto fail1;
  }

  if( !efab_file_move_supported(&old_thr->netif, old_s) ) {
    rc = -EINVAL;
    goto fail1;
  }

  /* Lock the second stack */
  i = 0;
  while( ! ci_netif_trylock(alien_ni) ) {
    ci_netif_unlock(&old_thr->netif);
    if( i++ >= 1000 ) {
      rc = -EBUSY;
      goto fail1_ni_unlocked;
    }
    rc = ci_netif_lock(&old_thr->netif);
    if( rc != 0 )
      goto fail1_ni_unlocked;
  }

  /* Allocate a new socket in the alien_ni stack */
  rc = -ENOMEM;
  if( old_s->b.state == CI_TCP_STATE_UDP ) {
    ci_udp_state *new_us = ci_udp_get_state_buf(alien_ni);
    if( new_us == NULL )
      goto fail2;
    new_s = &new_us->s;
  }
  else {
    ci_tcp_state *new_ts = ci_tcp_get_state_buf(alien_ni);
    if( new_ts == NULL )
      goto fail2;
    new_s = &new_ts->s;
  }

  /* Allocate an intermediate "socket" outside of everything */
  mid_s = ci_alloc(CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));
  if( mid_s == NULL )
    goto fail3;

  OO_DEBUG_TCPH(ci_log("%s: move %d:%d to %d:%d", __func__,
                       old_thr->id, priv->sock_id,
                       new_thr->id, new_s->b.bufid));

  /* Copy TCP/UDP state */
  memcpy(mid_s, old_s, CI_MAX(sizeof(ci_tcp_state), sizeof(ci_udp_state)));

  /* Do not copy old_s->b.bufid and other fields that live in the
   * stack's address space. */
  mid_s->b.sb_aflags |= CI_SB_AFLAG_ORPHAN;
  mid_s->b.bufid = new_s->b.bufid;
  mid_s->b.post_poll_link = new_s->b.post_poll_link;
  mid_s->b.ready_link = new_s->b.ready_link;
  mid_s->reap_link = new_s->reap_link;

  if( old_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *mid_ts = SOCK_TO_TCP(mid_s);

    mid_ts->timeout_q_link = new_ts->timeout_q_link;
    mid_ts->tx_ready_link = new_ts->tx_ready_link;
    mid_ts->rto_tid = new_ts->rto_tid;
    mid_ts->delack_tid = new_ts->delack_tid;
    mid_ts->zwin_tid = new_ts->zwin_tid;
    mid_ts->kalive_tid = new_ts->kalive_tid;
    mid_ts->cork_tid = new_ts->cork_tid;
    ci_ip_queue_init(&mid_ts->recv1);
    ci_ip_queue_init(&mid_ts->recv2);
    ci_ip_queue_init(&mid_ts->send);
    ci_ip_queue_init(&mid_ts->retrans);
    mid_ts->send_prequeue = OO_PP_ID_NULL;
    mid_ts->retrans_ptr = OO_PP_NULL;
    mid_ts->tmpl_head = OO_PP_NULL;
    oo_atomic_set(&mid_ts->send_prequeue_in, 0);

    *new_ts = *mid_ts;
    ci_pmtu_state_init(alien_ni, &new_ts->s, &new_ts->pmtus,
                       CI_IP_TIMER_PMTU_DISCOVER);
#if CI_CFG_FD_CACHING
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_link, sp, "epch");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_link);
    sp = TS_OFF(alien_ni, new_ts);
    OO_P_ADD(sp, CI_MEMBER_OFFSET(ci_tcp_state, epcache_fd_link));
    ci_ni_dllist_link_init(alien_ni, &new_ts->epcache_fd_link, sp, "ecfd");
    ci_ni_dllist_self_link(alien_ni, &new_ts->epcache_fd_link);
#endif
   
    /* free temporary mid_ts storage */
    CI_FREE_OBJ(mid_ts);
  }
  else {
    ci_udp_state *mid_us = SOCK_TO_UDP(mid_s);

    *SOCK_TO_UDP(new_s) = *mid_us;
    CI_FREE_OBJ(mid_us);
  }

  /* Move the filter */
  old_ep = ci_trs_ep_get(old_thr, priv->sock_id);
  new_ep = ci_trs_ep_get(new_thr, new_s->b.bufid);
  rc = tcp_helper_endpoint_move_filters_pre(old_ep, new_ep);
  if( rc != 0 ) {
    rc = -EINVAL;
    goto fail3;
  }

  /* Allocate a new file for the new endpoint */
  rc = onload_alloc_file(new_thr, new_s->b.bufid, priv->_filp->f_flags,
                         priv->fd_type, &old_ep->alien_ref);
  if( rc != 0 )
    goto fail4;
  ci_assert(old_ep->alien_ref);

  /* Copy F_SETOWN_EX, F_SETSIG to the new file */
#ifdef F_SETOWN_EX
  rcu_read_lock();
  __f_setown(old_ep->alien_ref->_filp, priv->_filp->f_owner.pid,
             priv->_filp->f_owner.pid_type, 1);
  rcu_read_unlock();
#endif
  old_ep->alien_ref->_filp->f_owner.signum = priv->_filp->f_owner.signum;
  old_ep->alien_ref->_filp->f_flags |= priv->_filp->f_flags & O_NONBLOCK;

  /* Move os_socket from one ep to another */
  if( tcp_helper_endpoint_set_aflags(new_ep, OO_THR_EP_AFLAG_ATTACHED) &
      OO_THR_EP_AFLAG_ATTACHED ) {
    fput(old_ep->alien_ref->_filp);
    rc = -EBUSY;
    goto fail2; /* state & filters are cleared by fput() */
  }

  /********* Point of no return  **********/
  ci_wmb();
  priv->fd_type = CI_PRIV_TYPE_ALIEN_EP;
  priv->_filp->f_op = &linux_tcp_helper_fops_alien;
  ci_wmb();
  oo_file_moved(priv);

  /* Read all already-arrived packets after the filters move but before
   * copying of the receive queue. */
  ci_netif_poll(&old_thr->netif);
  tcp_helper_endpoint_move_filters_post(old_ep, new_ep);
  ci_assert( efab_file_move_supported(&old_thr->netif, old_s));

  /* There's a gap between un-registering the old ep and registering the
   * new one.  However, the notifications shouldn't be in use for sockets
   * that are in a state that can be moved, so this shouldn't be a problem.
   */
  if( old_ep->os_sock_pt.whead ) {
    pollwait_register = 1;
    efab_tcp_helper_os_pollwait_unregister(old_ep);
  }
  ci_assert_equal(new_ep->os_socket, NULL);
  new_ep->os_socket = oo_file_ref_xchg(&old_ep->os_socket, NULL);
  ci_assert_equal(old_ep->os_socket, NULL);
  if( pollwait_register )
    efab_tcp_helper_os_pollwait_register(new_ep);

  ci_bit_clear(&new_s->b.sb_aflags, CI_SB_AFLAG_ORPHAN_BIT);
  if( new_s->b.state == CI_TCP_ESTABLISHED )
    CI_TCP_STATS_INC_CURR_ESTAB(alien_ni);


  /* Copy recv queue */
  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    ci_tcp_state *old_ts = SOCK_TO_TCP(old_s);
    int i;

    /* Stop timers */
    ci_ip_timer_clear(&old_thr->netif, &old_ts->kalive_tid);
    ci_ip_timer_clear(&old_thr->netif, &old_ts->delack_tid);

    efab_ip_queue_copy(alien_ni, &new_ts->recv1,
                       &old_thr->netif, &old_ts->recv1);
    efab_ip_queue_copy(alien_ni, &new_ts->recv2,
                       &old_thr->netif, &old_ts->recv2);
    new_ts->recv1_extract = new_ts->recv1.head;

    /* Drop reorder buffer */
    ci_ip_queue_init(&new_ts->rob);
    new_ts->dsack_block = OO_PP_INVALID;
    new_ts->dsack_start = new_ts->dsack_end = 0;
    for( i = 0; i <= CI_TCP_SACK_MAX_BLOCKS; i++ )
      new_ts->last_sack[i] = OO_PP_NULL;
  }
  else {
    /* There should not be any recv q, but drop it to be sure */
    ci_udp_recv_q_init(&SOCK_TO_UDP(new_s)->recv_q);
  }

  /* Old stack can be unlocked */
  old_s->b.sb_flags |= CI_SB_FLAG_MOVED;
  ci_netif_unlock(&old_thr->netif);

  ci_assert( efab_file_move_supported(alien_ni, new_s) );

  /* Move done: poll for any new data. */
  ci_netif_poll(alien_ni);

  if( new_s->b.state & CI_TCP_STATE_TCP ) {
    ci_tcp_state *new_ts = SOCK_TO_TCP(new_s);
    /* Timers setup: delack, keepalive */
    if( (new_ts->acks_pending & CI_TCP_ACKS_PENDING_MASK) > 0)
      ci_tcp_timeout_delack(alien_ni, new_ts);
    ci_tcp_kalive_reset(alien_ni, new_ts);
  }


  /* Old ep: we are done. */
  ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_BIT);
  old_s->b.moved_to_stack_id = alien_ni->state->stack_id;
  old_s->b.moved_to_sock_id = new_s->b.bufid;
  if( ! list_empty(&priv->_filp->f_ep_links) )
    ci_bit_set(&old_s->b.sb_aflags, CI_SB_AFLAG_MOVED_AWAY_IN_EPOLL_BIT);

  ci_sock_unlock(&old_thr->netif, &old_s->b);
  ci_sock_unlock(alien_ni, &new_s->b);
  ci_assert(ci_netif_is_locked(alien_ni));
  OO_DEBUG_TCPH(ci_log("%s: -> [%d:%d] %s", __func__,
                       new_thr->id, new_s->b.bufid,
                       ci_tcp_state_str(new_s->b.state)));
  return 0;

fail4:
  /* We clear the filters from the new ep.
   * For now, we do not need to re-insert the old filters because the hw
   * filters are already in place (in the case of an accepted socket) or are
   * not needed.  We have not removed the old sw filters yet. */
  tcp_helper_endpoint_move_filters_undo(old_ep, new_ep);
fail3:
  if( new_s->b.state & CI_TCP_STATE_TCP )
    ci_tcp_state_free(alien_ni, SOCK_TO_TCP(new_s));
  else
    ci_udp_state_free(alien_ni, SOCK_TO_UDP(new_s));
fail2:
  ci_netif_unlock(alien_ni);
fail1:
  ci_netif_unlock(&old_thr->netif);
fail1_ni_unlocked:
  ci_sock_unlock(&old_thr->netif, &old_s->b);
  OO_DEBUG_TCPH(ci_log("%s: rc=%d", __func__, rc));
  return rc;
}
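Locking the second stack in the function above uses a bounded trylock loop: the caller already holds the old stack's lock, so when the alien stack cannot be taken immediately the old lock is dropped, re-acquired, and the attempt repeated up to a limit, which avoids a lock-ordering deadlock between two threads moving sockets in opposite directions. A minimal sketch of the same idea with plain pthread mutexes (hypothetical, not the ci_netif lock API):

#include <pthread.h>
#include <errno.h>

/* Try to end up holding both 'held' (already locked by the caller) and
 * 'other' without imposing a global lock order.  Returns 0 with both locks
 * held, or -EBUSY with 'held' unlocked if we give up. */
static int lock_second(pthread_mutex_t* held, pthread_mutex_t* other)
{
  int i = 0;
  while( pthread_mutex_trylock(other) != 0 ) {
    pthread_mutex_unlock(held);   /* back off so the peer can make progress */
    if( i++ >= 1000 )
      return -EBUSY;
    if( pthread_mutex_lock(held) != 0 )
      return -EBUSY;
  }
  return 0;
}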
Example no. 5
int citp_epoll_create(int size, int flags)
{
  citp_fdinfo    *fdi;
  citp_epoll_fdi *epi;
  struct citp_epoll_fd* ep;
  int            fd;

  if( (epi = CI_ALLOC_OBJ(citp_epoll_fdi)) == NULL )
    goto fail0;
  if( (ep = CI_ALLOC_OBJ(struct citp_epoll_fd)) == NULL )
    goto fail1;
  fdi = &epi->fdinfo;
  citp_fdinfo_init(fdi, &citp_epoll_protocol_impl);

  /* Create the epoll fd. */
  CITP_FDTABLE_LOCK();
  if( (fd = ci_sys_epoll_create_compat(size, flags, 0)) < 0 )
    goto fail2;
  citp_fdtable_new_fd_set(fd, fdip_busy, TRUE);

  /* Init epfd_os */
#ifdef O_CLOEXEC
  ep->epfd_os = ci_sys_open(OO_EPOLL_DEV, O_RDWR | O_CLOEXEC);
#else
  ep->epfd_os = ci_sys_open(OO_EPOLL_DEV, O_RDWR);
  if( ep->epfd_os >= 0 )
    ci_sys_fcntl(ep->epfd_os, F_SETFD, FD_CLOEXEC);
#endif
  if( ep->epfd_os < 0 ) {
    Log_E(ci_log("%s: ERROR: failed to open(%s) errno=%d",
                 __FUNCTION__, OO_EPOLL_DEV, errno));
    goto fail3;
  }
  __citp_fdtable_reserve(ep->epfd_os, 1);
  ep->shared = mmap(NULL, sizeof(*ep->shared), PROT_READ, MAP_SHARED,
                     ep->epfd_os, 0);
  if( ep->shared == MAP_FAILED ) {
    Log_E(ci_log("%s: ERROR: failed to mmap shared segment errno=%d",
                 __FUNCTION__, errno));
    goto fail4;
  }
  __citp_fdtable_reserve(ep->shared->epfd, 1);
  CITP_FDTABLE_UNLOCK();

  epi->epoll = ep;
  ep->size = size;
  oo_wqlock_init(&ep->lock);
  ep->not_mt_safe = ! CITP_OPTS.ul_epoll_mt_safe;
  ci_dllist_init(&ep->oo_sockets);
  ep->oo_sockets_n = 0;
  ci_dllist_init(&ep->dead_sockets);
  oo_atomic_set(&ep->refcount, 1);
  ep->epfd_syncs_needed = 0;
  ep->blocking = 0;
  citp_fdtable_insert(fdi, fd, 0);
  Log_POLL(ci_log("%s: fd=%d driver_fd=%d epfd=%d", __FUNCTION__,
                  fd, ep->epfd_os, (int) ep->shared->epfd));
  return fd;

 fail4:
  __citp_fdtable_reserve(ep->epfd_os, 0);
  ci_sys_close(ep->epfd_os);
 fail3:
  ci_sys_close(fd);
  citp_fdtable_busy_clear(fd, fdip_unknown, 1);
 fail2:
  CITP_FDTABLE_UNLOCK();
  CI_FREE_OBJ(ep);
 fail1:
  CI_FREE_OBJ(epi);
 fail0:
  return -1;
}
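Opening the epoll helper device above follows the usual close-on-exec fallback: pass O_CLOEXEC atomically at open() where the headers provide it, otherwise set FD_CLOEXEC with fcntl() immediately afterwards, accepting the small window in which another thread could fork and exec. A generic sketch of the pattern with an ordinary open() call (the path argument is a placeholder, not OO_EPOLL_DEV):

#include <fcntl.h>
#include <unistd.h>

static int open_cloexec(const char* path)
{
#ifdef O_CLOEXEC
  return open(path, O_RDWR | O_CLOEXEC);     /* atomic where available */
#else
  int fd = open(path, O_RDWR);
  if( fd >= 0 )
    fcntl(fd, F_SETFD, FD_CLOEXEC);          /* non-atomic fallback */
  return fd;
#endif
}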
Example no. 6
int
oo_iobufset_resource_alloc(struct oo_buffer_pages * pages, struct efrm_pd *pd,
                           struct oo_iobufset **iobrs_out, uint64_t *hw_addrs,
                           int reset_pending)
{
  struct oo_iobufset *iobrs;
  int rc;
  int gfp_flag = (in_atomic() || in_interrupt()) ? GFP_ATOMIC : GFP_KERNEL;
  int size = sizeof(struct oo_iobufset) + pages->n_bufs * sizeof(dma_addr_t);
  int nic_order;
  void **addrs;
  unsigned int i;

  ci_assert(iobrs_out);
  ci_assert(pd);

  if( size <= PAGE_SIZE ) {
    iobrs = kmalloc(size, gfp_flag);
    if( iobrs == NULL )
      return -ENOMEM;
    iobrs->dma_addrs = (void *)(iobrs + 1);
  }
  else {
    /* Avoid multi-page allocations */
    iobrs = kmalloc(sizeof(struct oo_iobufset), gfp_flag);
    if( iobrs == NULL )
      return -ENOMEM;
    ci_assert_le(pages->n_bufs * sizeof(dma_addr_t), PAGE_SIZE);
    iobrs->dma_addrs = kmalloc(pages->n_bufs * sizeof(dma_addr_t), gfp_flag);
    if( iobrs->dma_addrs == NULL ) {
      kfree(iobrs);
      return -ENOMEM;
    }

  }

  oo_atomic_set(&iobrs->ref_count, 1);
  iobrs->pd = pd;
  iobrs->pages = pages;

  nic_order = EFHW_GFP_ORDER_TO_NIC_ORDER(compound_order(pages->pages[0]));

  ci_assert_le(sizeof(void *) * pages->n_bufs, PAGE_SIZE);
  addrs = kmalloc(sizeof(void *) * pages->n_bufs, gfp_flag);
  if( addrs == NULL ) {
    rc = -ENOMEM;
    goto fail;
  }

  for (i = 0; i < pages->n_bufs; i++) {
    addrs[i] = page_address(pages->pages[i]);
  }

  rc = efrm_pd_dma_map(iobrs->pd, pages->n_bufs,
		       nic_order,
		       addrs, sizeof(addrs[0]),
		       &iobrs->dma_addrs[0], sizeof(iobrs->dma_addrs[0]),
		       hw_addrs, sizeof(hw_addrs[0]),
		       put_user_fake, &iobrs->buf_tbl_alloc, reset_pending);
  kfree(addrs);

  if( rc < 0 )
    goto fail;

  OO_DEBUG_VERB(ci_log("%s: [%p] %d pages", __FUNCTION__,
                       iobrs, iobrs->pages->n_bufs));

  efrm_resource_ref(efrm_pd_to_resource(pd));
  oo_atomic_inc(&pages->ref_count);
  *iobrs_out = iobrs;
  return 0;

fail:
  oo_iobufset_free_memory(iobrs);
  return rc;
}
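This allocator and the one in the next example share the same layout trick: when the fixed header plus the per-buffer array fits in one page, a single kmalloc() serves both and the array starts right after the header; otherwise the array is allocated separately so that no single allocation has to span multiple pages. A user-space sketch of the idiom with malloc() (hypothetical names, assuming a 4096-byte page):

#include <stdlib.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096

struct buf_set {
  int       n_bufs;
  uint64_t* addrs;        /* either the tail of this allocation or separate */
};

static struct buf_set* buf_set_alloc(int n_bufs)
{
  size_t size = sizeof(struct buf_set) + n_bufs * sizeof(uint64_t);
  struct buf_set* bs;

  if( size <= SKETCH_PAGE_SIZE ) {
    bs = malloc(size);                        /* header and array together */
    if( bs == NULL )
      return NULL;
    bs->addrs = (uint64_t*) (bs + 1);
  }
  else {
    bs = malloc(sizeof(struct buf_set));      /* keep each allocation small */
    if( bs == NULL )
      return NULL;
    bs->addrs = malloc(n_bufs * sizeof(uint64_t));
    if( bs->addrs == NULL ) {
      free(bs);
      return NULL;
    }
  }
  bs->n_bufs = n_bufs;
  return bs;
}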
Example no. 7
static int oo_bufpage_alloc(struct oo_buffer_pages **pages_out,
                            int user_order, int low_order,
                            int *flags, int gfp_flag)
{
  int i;
  struct oo_buffer_pages *pages;
  int n_bufs = 1 << (user_order - low_order);
  int size = sizeof(struct oo_buffer_pages) + n_bufs * sizeof(struct page *);

  if( size < PAGE_SIZE ) {
    pages = kmalloc(size, gfp_flag);
    if( pages == NULL )
      return -ENOMEM;
    pages->pages = (void *)(pages + 1);
  }
  else {
    /* Avoid multi-page allocations */
    pages = kmalloc(sizeof(struct oo_buffer_pages), gfp_flag);
    if( pages == NULL )
      return -ENOMEM;
    ci_assert_le(n_bufs * sizeof(struct page *), PAGE_SIZE);
    pages->pages = kmalloc(n_bufs * sizeof(struct page *), gfp_flag);
    if( pages->pages == NULL ) {
      kfree(pages);
      return -ENOMEM;
    }
  }

  pages->n_bufs = n_bufs;
  oo_atomic_set(&pages->ref_count, 1);

#ifdef OO_DO_HUGE_PAGES
  if( (*flags & (OO_IOBUFSET_FLAG_HUGE_PAGE_TRY |
                 OO_IOBUFSET_FLAG_HUGE_PAGE_FORCE)) &&
      gfp_flag == GFP_KERNEL &&
      low_order == HPAGE_SHIFT - PAGE_SHIFT ) {
    if (oo_bufpage_huge_alloc(pages, flags) == 0) {
      *pages_out = pages;
      return 0;
    }
  }
  pages->shmid = -1;
  if( *flags & OO_IOBUFSET_FLAG_HUGE_PAGE_FORCE ) {
    ci_assert_equal(low_order, HPAGE_SHIFT - PAGE_SHIFT);
    /* Huge pages were mandatory but could not be allocated: release the
     * descriptor (no normal pages have been allocated yet). */
    pages->n_bufs = 0;
    oo_iobufset_free_pages(pages);
    return -ENOMEM;
  }
#endif

  if( low_order > 0 ) {
#ifdef OO_HAVE_COMPOUND_PAGES
    /* __GFP_COMP hint stolen from http://samirdas.blog.com/
     * __GFP_NOWARN is necessary because we properly handle high-order page
     * allocation failure by allocating pages one-by-one. */
    gfp_flag |= __GFP_COMP | __GFP_NOWARN;
#else
    return -EINVAL;
#endif
  }

  for( i = 0; i < n_bufs; ++i ) {
    pages->pages[i] = alloc_pages_node(numa_node_id(), gfp_flag, low_order);
    if( pages->pages[i] == NULL ) {
      OO_DEBUG_VERB(ci_log("%s: failed to allocate page (i=%u) "
                           "user_order=%d page_order=%d",
                           __FUNCTION__, i, user_order, low_order));
      pages->n_bufs = i;
      oo_iobufset_free_pages(pages);
      return -ENOMEM;
    }
    memset(page_address(pages->pages[i]), 0, PAGE_SIZE << low_order);
  }
  
  *pages_out = pages;
  return 0;
}
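The chunk count above follows directly from the two orders: a buffer of 2^user_order pages built out of compound chunks of 2^low_order pages needs n_bufs = 1 << (user_order - low_order) chunks. For example, with user_order = 9 and low_order = 2 on a 4 KiB-page system (hypothetical numbers), each chunk covers 4 pages, n_bufs = 1 << 7 = 128, and each memset in the loop clears PAGE_SIZE << low_order = 16384 bytes.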