Example #1
static UCS_F_ALWAYS_INLINE ucs_status_t 
uct_rc_verbs_iface_poll_rx(uct_rc_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret > 0) {
        for (i = 0; i < ret; ++i) {
            if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
                ucs_fatal("Receive completion with error: %s", ibv_wc_status_str(wc[i].status));
            }

            UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

            desc = (void*)wc[i].wr_id;
            uct_ib_iface_desc_received(&iface->super.super, desc, wc[i].byte_len, 1);

            hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
            uct_ib_log_recv_completion(IBV_QPT_RC, &wc[i], hdr, uct_rc_ep_am_packet_dump);

            uct_rc_iface_invoke_am(&iface->super, hdr, wc[i].byte_len, desc);
        }

        iface->super.rx.available += ret;
        return UCS_OK;
    } else if (ret == 0) {
        uct_rc_verbs_iface_post_recv(iface, 0);
        return UCS_ERR_NO_PROGRESS;
    } else {
        ucs_fatal("Failed to poll receive CQ");
    }
}
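
Every example on this page follows the same basic pattern: drain the completion queue with ibv_poll_cq() and translate any non-success status through ibv_wc_status_str(). A minimal self-contained sketch of that pattern is shown below; the helper name and batch size are illustrative, not taken from the UCX code above.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Minimal sketch: drain up to 16 completions from a CQ and report errors.
 * Returns the number of successful completions, or -1 on a poll failure. */
static int poll_cq_once(struct ibv_cq *cq)
{
    struct ibv_wc wc[16];
    int n, i, good = 0;

    n = ibv_poll_cq(cq, 16, wc);   /* non-blocking; 0 means the CQ is empty */
    if (n < 0)
        return -1;                 /* device error while polling */

    for (i = 0; i < n; i++) {
        if (wc[i].status != IBV_WC_SUCCESS) {
            fprintf(stderr, "wr_id=%llu failed: %s\n",
                    (unsigned long long)wc[i].wr_id,
                    ibv_wc_status_str(wc[i].status));
            continue;
        }
        good++;                    /* byte_len, opcode, etc. are valid here */
    }
    return good;
}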
Example #2
    /// The InfiniBand completion notification handler.
    int poll_completion()
    {
        const int ne_max = 10;

        struct ibv_wc wc[ne_max];
        int ne;
        int ne_total = 0;

        while ((ne = ibv_poll_cq(cq_, ne_max, wc))) {
            if (ne < 0)
                throw InfinibandException("ibv_poll_cq failed");

            ne_total += ne;
            for (int i = 0; i < ne; ++i) {
                if (wc[i].status != IBV_WC_SUCCESS) {
                    std::ostringstream s;
                    s << ibv_wc_status_str(wc[i].status) << " for wr_id "
                      << static_cast<int>(wc[i].wr_id);
                    L_(error) << s.str();

                    continue;
                }

                on_completion(wc[i]);
            }
        }

        return ne_total;
    }
Example #3
static inline ucs_status_t uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;
    char *packet;


    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret == 0) {
        return UCS_ERR_NO_PROGRESS;
    } 
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll receive CQ");
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Receive completion with error: %s", ibv_wc_status_str(wc[i].status));
        }

        desc = (void*)wc[i].wr_id;
        ucs_trace_data("pkt rcvd: buf=%p len=%d", desc, wc[i].byte_len);
        packet = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
        VALGRIND_MAKE_MEM_DEFINED(packet, wc[i].byte_len);

        uct_ud_ep_process_rx(&iface->super, 
                             (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN),
                             wc[i].byte_len - UCT_IB_GRH_LEN,
                             (uct_ud_recv_skb_t *)desc); 
    }
    iface->super.rx.available += ret;
    uct_ud_verbs_iface_post_recv(iface);
    return UCS_OK;
}
Example #4
gaspi_return_t
pgaspi_dev_wait (const gaspi_queue_id_t queue,
		 int * counter,
		 const gaspi_timeout_t timeout_ms)
{

  int ne = 0, i;
  struct ibv_wc wc;

  const int nr = *counter;
  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  for (i = 0; i < nr; i++)
    {
      do
	{
	  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
	  *counter -= ne;
	  
	  if (ne == 0)
	    {
	      const gaspi_cycles_t s1 = gaspi_get_cycles ();
	      const gaspi_cycles_t tdelta = s1 - s0;

	      const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
	      if (ms > timeout_ms)
		{
		  return GASPI_TIMEOUT;
		}
	    }
	}
      while (ne == 0);


      if ((ne < 0) || (wc.status != IBV_WC_SUCCESS))
	{
	  gaspi_print_error("Failed request to %lu. Queue %d might be broken %s",
			    wc.wr_id, queue, ibv_wc_status_str(wc.status) );

	  glb_gaspi_ctx.qp_state_vec[queue][wc.wr_id] = GASPI_STATE_CORRUPT;

	  return GASPI_ERROR;
	}
    }
#ifdef GPI2_CUDA 
  int j,k;
  for(k = 0;k < glb_gaspi_ctx.gpu_count; k++)
    {
      for(j = 0; j < GASPI_CUDA_EVENTS; j++)
	gpus[k].events[queue][j].ib_use = 0;
    }
  
#endif

  return GASPI_SUCCESS;
}
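
Example #4 implements a bounded wait by spinning on ibv_poll_cq() and converting elapsed CPU cycles to milliseconds. A portable sketch of the same idea using clock_gettime() instead of raw cycle counters (illustrative, not the GPI-2 implementation):

#include <time.h>
#include <infiniband/verbs.h>

/* Spin on the CQ until one completion arrives or timeout_ms elapses.
 * Returns 1 on completion, 0 on timeout, -1 on a poll error. */
static int poll_with_timeout(struct ibv_cq *cq, struct ibv_wc *wc, long timeout_ms)
{
    struct timespec t0, t1;
    long elapsed_ms;
    int n;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (;;) {
        n = ibv_poll_cq(cq, 1, wc);
        if (n != 0)
            return (n < 0) ? -1 : 1;
        clock_gettime(CLOCK_MONOTONIC, &t1);
        elapsed_ms = (t1.tv_sec - t0.tv_sec) * 1000
                   + (t1.tv_nsec - t0.tv_nsec) / 1000000;
        if (elapsed_ms > timeout_ms)
            return 0;
    }
}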
Example #5
const char *ibv_wc_status_string(int status)
{
    return ibv_wc_status_str(status);

    /* Note: the switch below is unreachable because of the return above;
     * it is presumably kept as a manual enum-to-string fallback. */
    switch (status) {
    case IBV_WC_SUCCESS:
        return "IBV_WC_SUCCESS";
    case IBV_WC_LOC_LEN_ERR:
        return "IBV_WC_LOC_LEN_ERR";
    case IBV_WC_LOC_QP_OP_ERR:
        return "IBV_WC_LOC_QP_OP_ERR";
    case IBV_WC_LOC_EEC_OP_ERR:
        return "IBV_WC_LOC_EEC_OP_ERR";
    case IBV_WC_LOC_PROT_ERR:
        return "IBV_WC_LOC_PROT_ERR";
    case IBV_WC_WR_FLUSH_ERR:
        return "IBV_WC_WR_FLUSH_ERR";
    case IBV_WC_MW_BIND_ERR:
        return "IBV_WC_MW_BIND_ERR";
    case IBV_WC_BAD_RESP_ERR:
        return "IBV_WC_BAD_RESP_ERR";
    case IBV_WC_LOC_ACCESS_ERR:
        return "IBV_WC_LOC_ACCESS_ERR";
    case IBV_WC_REM_INV_REQ_ERR:
        return "IBV_WC_REM_INV_REQ_ERR";
    case IBV_WC_REM_ACCESS_ERR:
        return "IBV_WC_REM_ACCESS_ERR";
    case IBV_WC_REM_OP_ERR:
        return "IBV_WC_REM_OP_ERR";
    case IBV_WC_RETRY_EXC_ERR:
        return "IBV_WC_RETRY_EXC_ERR";
    case IBV_WC_RNR_RETRY_EXC_ERR:
        return "IBV_WC_RNR_RETRY_EXC_ERR";
    case IBV_WC_LOC_RDD_VIOL_ERR:
        return "IBV_WC_LOC_RDD_VIOL_ERR";
    case IBV_WC_REM_INV_RD_REQ_ERR:
        return "IBV_WC_REM_INV_RD_REQ_ERR";
    case IBV_WC_REM_ABORT_ERR:
        return "IBV_WC_REM_ABORT_ERR";
    case IBV_WC_INV_EECN_ERR:
        return "IBV_WC_INV_EECN_ERR";
    case IBV_WC_INV_EEC_STATE_ERR:
        return "IBV_WC_INV_EEC_STATE_ERR";
    case IBV_WC_FATAL_ERR:
        return "IBV_WC_FATAL_ERR";
    case IBV_WC_RESP_TIMEOUT_ERR:
        return "IBV_WC_RESP_TIMEOUT_ERR";
    case IBV_WC_GENERAL_ERR:
        return "IBV_WC_GENERAL_ERR";
    default:
        return "unknown-status";
    }
}
Example #6
/**
 * Polling for events on an inner thread allows processing of management messages
 * like buffer connection immediately, even if the user is not polling.
 * Otherwise buffer constructors would block indefinitely.
 *
 * Deep learning workloads are about sending small numbers of large messages,
 * in which case this model works great. If the library were to be used to
 * exchange large numbers of short messages, it would be useful to split
 * management and data messages over two different queue pairs. User threads
 * could then wait or poll on the data queue pair directly.
 */
void RDMAAdapter::InternalThreadEntry() {
  while (!must_stop()) {
    ibv_cq* cq;
    void* cq_context;
    CHECK(!ibv_get_cq_event(channel_, &cq, &cq_context));
    CHECK(cq == cq_);
    ibv_ack_cq_events(cq, 1);
    CHECK(!ibv_req_notify_cq(cq_, 0));

    int ne = ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2,
      static_cast<ibv_wc*>(wc_));
    CHECK_GE(ne, 0);

    for (int i = 0; i < ne; ++i) {
      CHECK(wc_[i].status == IBV_WC_SUCCESS) << "Failed status \n"
                                             << ibv_wc_status_str(wc_[i].status)
                                             << " " << wc_[i].status << " "
                                             << static_cast<int>(wc_[i].wr_id)
                                             << " "<< wc_[i].vendor_err;

      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
        // Data message, add it to user received queue
        RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
        channel->recv();
        int id = wc_[i].imm_data;
        if (id >= CTRL_ID_OFFSET) {
        // ctrl signal
          ctrl_received_.push(channel->buffers_[id - CTRL_ID_OFFSET]);
        } else {
        // data
          received_.push(channel->buffers_[id]);
        }
      } else {
        if (wc_[i].opcode & IBV_WC_RECV) {
          // Buffer connection message
          RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
          int id = wc_[i].imm_data;
          channel->memory_regions_queue_.push(channel->memory_regions_[id]);
          CHECK(id == channel->memory_regions_received_++);
          CHECK(!ibv_dereg_mr(channel->region_regions_[id]));
        }
      }
    }
  }
}
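
The comment above suggests splitting management and data messages over two queue pairs so that user threads can poll the data queue pair directly. A minimal sketch of what such a user-side loop might look like, assuming a dedicated data CQ; the function and callback names are illustrative, not part of the adapter above.

#include <infiniband/verbs.h>

/* Hypothetical user-side loop over a dedicated data CQ: busy-poll instead of
 * sleeping on a completion channel, and hand each good WC to a callback. */
static void user_poll_data_cq(struct ibv_cq *data_cq,
                              void (*on_data)(struct ibv_wc *wc))
{
    struct ibv_wc wc;
    int n;

    for (;;) {
        n = ibv_poll_cq(data_cq, 1, &wc);  /* non-blocking */
        if (n == 0)
            continue;                      /* CQ empty: keep spinning (or yield) */
        if (n < 0 || wc.status != IBV_WC_SUCCESS)
            break;                         /* poll failure or bad completion */
        on_data(&wc);
    }
}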
Example #7
static UCS_F_ALWAYS_INLINE void 
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s", ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num), uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* Number of sends this WC completes as a batch (wr_id unsignaled + 1 signaled) */
        ep->super.available         += count;
        ep->tx.completion_count     += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
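
Example #7 relies on send moderation: only every Nth send carries IBV_SEND_SIGNALED, and the signaled WR's wr_id records how many unsignaled sends it retires (hence count = wc[i].wr_id + 1 above). A minimal sketch of the posting side under an assumed fixed moderation factor; the names are illustrative:

#include <string.h>
#include <infiniband/verbs.h>

#define TX_MODERATION 64  /* assumed: request one completion per 64 sends */

/* Post one send; signal only every TX_MODERATION-th WR and store the number
 * of preceding unsignaled sends in wr_id, mirroring the batch size decoded
 * in Example #7. */
static int post_moderated_send(struct ibv_qp *qp, struct ibv_sge *sge,
                               unsigned *unsignaled /* in/out counter */)
{
    struct ibv_send_wr wr, *bad_wr;

    memset(&wr, 0, sizeof(wr));
    wr.sg_list = sge;
    wr.num_sge = 1;
    wr.opcode  = IBV_WR_SEND;

    if (++(*unsignaled) >= TX_MODERATION) {
        wr.send_flags = IBV_SEND_SIGNALED;
        wr.wr_id      = *unsignaled - 1;  /* sends batched behind this WC */
        *unsignaled   = 0;
    }
    return ibv_post_send(qp, &wr, &bad_wr);
}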
Example #8
static void UNUSED dump_wc(struct ibv_wc *wc)
{
	if (!Debug) {
		return;
	}

	DEBUG("\nibv_wc:\n");
	DEBUG("\twc->wr_id                   :%lx\n", wc->wr_id);
	DEBUG("\twc->status                  :%s\n", ibv_wc_status_str(wc->status));
	DEBUG("\twc->opcode                  :%x\n", wc->opcode);
	DEBUG("\twc->vendor_err              :%x\n", wc->vendor_err);
	DEBUG("\twc->byte_len                :%x\n", wc->byte_len);
	DEBUG("\twc->imm_data                :%x\n", wc->imm_data);
	DEBUG("\twc->qp_num                  :%x\n", wc->qp_num);
	DEBUG("\twc->src_qp                  :%x\n", wc->src_qp);
	DEBUG("\twc->wc_flags                :%x\n", wc->wc_flags);
	DEBUG("\twc->pkey_index              :%x\n", wc->pkey_index);
	DEBUG("\twc->slid                    :%x\n", wc->slid);
	DEBUG("\twc->sl                      :%x\n", wc->sl);
	DEBUG("\twc->dlid_path_bits          :%x\n", wc->dlid_path_bits);

	return;
}
Example #9
static UCS_F_ALWAYS_INLINE void 
uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface)
{
    struct ibv_wc wc;
    int ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, 1, &wc);
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll send CQ");
        return;
    }

    if (ret == 0) {
        return;
    }

    if (ucs_unlikely(wc.status != IBV_WC_SUCCESS)) {
        ucs_fatal("Send completion (wr_id=0x%0X with error: %s ",
                  (unsigned)wc.wr_id, ibv_wc_status_str(wc.status));
        return;
    }

    iface->super.tx.available += UCT_UD_TX_MODERATION + 1;
}
Example #10
/**
 * DPDK callback for RX.
 *
 * The following function is the same as mlx5_rx_burst_sp(), except it doesn't
 * manage scattered packets. Improves performance when MRU is lower than the
 * size of the first segment.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	struct ibv_sge sges[pkts_n];
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(rxq->sp))
		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt *elt = &(*elts)[elts_head];
		unsigned int len;
		struct rte_mbuf *seg = elt->buf;
		struct rte_mbuf *rep;
		uint32_t flags;
		uint16_t vlan_tci;

		/* Sanity checks. */
		assert(seg != NULL);
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		/*
		 * Fetch initial bytes of packet descriptor into a
		 * cacheline while allocating rep.
		 */
		rte_prefetch0(seg);
		rte_prefetch0(&seg->cacheline1);
		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				/* Add SGE to array for repost. */
				sges[i] = elt->sge;
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		assert(ret >= (rxq->crc_present << 2));
		len = ret - (rxq->crc_present << 2);
		rep = __rte_mbuf_raw_alloc(rxq->mp);
		if (unlikely(rep == NULL)) {
			/*
			 * Unable to allocate a replacement mbuf,
			 * repost WR.
			 */
			DEBUG("rxq=%p: can't allocate a new mbuf",
			      (void *)rxq);
			/* Increment out of memory counters. */
			++rxq->stats.rx_nombuf;
			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
			goto repost;
		}

		/* Reconfigure sge to use rep instead of seg. */
		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
		assert(elt->sge.lkey == rxq->mr->lkey);
		elt->buf = rep;

		/* Add SGE to array for repost. */
		sges[i] = elt->sge;

		/* Update seg information. */
		SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM);
		NB_SEGS(seg) = 1;
		PORT(seg) = rxq->port_id;
		NEXT(seg) = NULL;
		PKT_LEN(seg) = len;
		DATA_LEN(seg) = len;
		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
			seg->packet_type = rxq_cq_to_pkt_type(flags);
			seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
				seg->ol_flags |= PKT_RX_VLAN_PKT;
				seg->vlan_tci = vlan_tci;
			}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		}
		/* Return packet. */
		*(pkts++) = seg;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += len;
#endif
repost:
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	/* Repost WRs. */
#ifdef DEBUG_RECV
	DEBUG("%p: reposting %u WRs", (void *)rxq, i);
#endif
	ret = rxq->recv(rxq->wq, sges, i);
	if (unlikely(ret)) {
		/* Inability to repost WRs is fatal. */
		DEBUG("%p: recv_burst(): failed (ret=%d)",
		      (void *)rxq->priv,
		      ret);
		abort();
	}
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}
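
Applications do not call mlx5_rx_burst() directly; DPDK dispatches to it through rte_eth_rx_burst() once the PMD has registered it as the RX handler. A minimal sketch of the calling side, assuming the port and queue have already been configured and started:

#include <rte_ethdev.h>
#include <rte_mbuf.h>

/* Poll one RX queue forever, processing and freeing each received mbuf. */
static void rx_loop(uint16_t port_id, uint16_t queue_id)
{
    struct rte_mbuf *pkts[32];
    uint16_t i, n;

    for (;;) {
        n = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
        for (i = 0; i < n; i++) {
            /* ... process pkts[i] ... */
            rte_pktmbuf_free(pkts[i]); /* return the mbuf to its pool */
        }
    }
}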
Example #11
static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode)
{
	struct rdmaio_data *rd = td->io_ops->data;
	struct ibv_wc wc;
	struct rdma_io_u_data *r_io_u_d;
	int ret;
	int compevnum = 0;
	int i;

	while ((ret = ibv_poll_cq(rd->cq, 1, &wc)) == 1) {
		ret = 0;
		compevnum++;

		if (wc.status) {
			log_err("fio: cq completion status %d(%s)\n",
				wc.status, ibv_wc_status_str(wc.status));
			return -1;
		}

		switch (wc.opcode) {

		case IBV_WC_RECV:
			if (rd->is_client == 1)
				ret = client_recv(td, &wc);
			else
				ret = server_recv(td, &wc);

			if (ret)
				return -1;

			if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
				break;

			for (i = 0; i < rd->io_u_flight_nr; i++) {
				r_io_u_d = rd->io_us_flight[i]->engine_data;

				if (wc.wr_id == r_io_u_d->rq_wr.wr_id) {
					rd->io_us_flight[i]->resid =
					    rd->io_us_flight[i]->buflen
					    - wc.byte_len;

					rd->io_us_flight[i]->error = 0;

					rd->io_us_completed[rd->
							    io_u_completed_nr]
					    = rd->io_us_flight[i];
					rd->io_u_completed_nr++;
					break;
				}
			}
			if (i == rd->io_u_flight_nr)
				log_err("fio: recv wr %" PRId64 " not found\n",
					wc.wr_id);
			else {
				/* put the last one into middle of the list */
				rd->io_us_flight[i] =
				    rd->io_us_flight[rd->io_u_flight_nr - 1];
				rd->io_u_flight_nr--;
			}

			break;

		case IBV_WC_SEND:
		case IBV_WC_RDMA_WRITE:
		case IBV_WC_RDMA_READ:
			if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
				break;

			for (i = 0; i < rd->io_u_flight_nr; i++) {
				r_io_u_d = rd->io_us_flight[i]->engine_data;

				if (wc.wr_id == r_io_u_d->sq_wr.wr_id) {
					rd->io_us_completed[rd->
							    io_u_completed_nr]
					    = rd->io_us_flight[i];
					rd->io_u_completed_nr++;
					break;
				}
			}
			if (i == rd->io_u_flight_nr)
				log_err("fio: send wr %" PRId64 " not found\n",
					wc.wr_id);
			else {
				/* put the last one into middle of the list */
				rd->io_us_flight[i] =
				    rd->io_us_flight[rd->io_u_flight_nr - 1];
				rd->io_u_flight_nr--;
			}

			break;

		default:
			log_info("fio: unknown completion event %d\n",
				 wc.opcode);
			return -1;
		}
		rd->cq_event_num++;
	}

	if (ret) {
		log_err("fio: poll error %d\n", ret);
		return 1;
	}

	return compevnum;
}
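
Example #11 removes a completed element from the in-flight array by overwriting it with the last element and shrinking the count (the "put the last one into middle of the list" comments). A generic sketch of that O(1) unordered-removal idiom:

#include <stddef.h>

/* Remove arr[i] from an unordered array of n pointers in O(1) by moving the
 * last element into its slot. Returns the new element count. */
static size_t swap_remove(void **arr, size_t n, size_t i)
{
    arr[i] = arr[n - 1];
    return n - 1;
}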
Example #12
int main(int argc, char *argv[])
{
    struct ibv_pd		       *pd1, *pd2;
    struct ibv_comp_channel	       *comp_chan1, *comp_chan2;
    struct ibv_cq		       *cq1, *cq2;
    struct ibv_cq		       *evt_cq = NULL;
    struct ibv_mr		       *mr1, *mr2;
    struct ibv_qp_init_attr		qp_attr1 = { }, qp_attr2 = {};
    struct ibv_sge			sge;
    struct ibv_send_wr		send_wr = { };
    struct ibv_send_wr	       *bad_send_wr = NULL;
    struct ibv_wc			wc;
    struct ibv_qp			*qp1, *qp2;
    void			       *cq_context = NULL;
    union ibv_gid			gid1, gid2;

    int				n;

    uint8_t			       *buf1, *buf2;

    int				err;
    int 				num_devices;
    struct ibv_context	*	verbs1, *verbs2;
    struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
    struct ibv_device_attr		dev_attr;
    int use = 0;
    int port = 1;
    int x = 0;
    unsigned long mb = 0;
    unsigned long bytes = 0;
    unsigned long save_diff = 0;
    struct timeval start, stop, diff;
    int iterations = 0;

    struct rusage usage;
    struct timeval ustart, uend;
    struct timeval sstart, send;
    struct timeval tstart, tend;

    DPRINTF("There are %d devices\n", num_devices);

    for(x = 0; x < num_devices; x++) {
        printf("Device: %d, %s\n", x, ibv_get_device_name(dev_list[use]));
    }

    if(num_devices == 0 || dev_list == NULL) {
        printf("No devices found\n");
        return 1;
    }

    if(argc < 2) {
        printf("Which RDMA device to use? 0, 1, 2, 3...\n");
        return 1;
    }

    use = atoi(argv[1]);

    DPRINTF("Using device %d\n", use);

    verbs1 = ibv_open_device(dev_list[use]);

    if(verbs1 == NULL) {
        printf("Failed to open device!\n");
        return 1;
    }

    DPRINTF("Device open %s\n", ibv_get_device_name(dev_list[use]));

    verbs2 = ibv_open_device(dev_list[use]);

    if(verbs2 == NULL) {
        printf("Failed to open device again!\n");
        return 1;
    }

    if(ibv_query_device(verbs1, &dev_attr)) {
        printf("Failed to query device attributes.\n");
        return 1;
    }

    printf("Device open: %d, %s which has %d ports\n", x, ibv_get_device_name(dev_list[use]), dev_attr.phys_port_cnt);

    if(argc < 3) {
        printf("Which port on the device to use? 1, 2, 3...\n");
        return 1;
    }

    port = atoi(argv[2]);

    if(port <= 0) {
        printf("Port #%d invalid, must start with 1, 2, 3, ...\n", port);
        return 1;
    }

    printf("Using port %d\n", port);

    if(argc < 4) {
        printf("How many iterations to perform?\n");
        return 1;
    }

    iterations = atoi(argv[3]);
    printf("Will perform %d iterations\n", iterations);

    pd1 = ibv_alloc_pd(verbs1);
    if (!pd1)
        return 1;

    if(argc < 5) {
        printf("How many megabytes to allocate? (This will be allocated twice. Once for source, once for destination.)\n");
        return 1;
    }

    mb = atoi(argv[4]);

    if(mb <= 0) {
        printf("Megabytes %lu invalid\n", mb);
        return 1;
    }

    DPRINTF("protection domain1 allocated\n");

    pd2 = ibv_alloc_pd(verbs2);
    if (!pd2)
        return 1;

    DPRINTF("protection domain2 allocated\n");

    comp_chan1 = ibv_create_comp_channel(verbs1);
    if (!comp_chan1)
        return 1;

    DPRINTF("completion chan1 created\n");

    comp_chan2 = ibv_create_comp_channel(verbs2);
    if (!comp_chan2)
        return 1;

    DPRINTF("completion chan2 created\n");

    cq1 = ibv_create_cq(verbs1, 2, NULL, comp_chan1, 0);
    if (!cq1)
        return 1;

    DPRINTF("CQ1 created\n");

    cq2 = ibv_create_cq(verbs2, 2, NULL, comp_chan2, 0);
    if (!cq2)
        return 1;

    DPRINTF("CQ2 created\n");

    bytes = mb * 1024UL * 1024UL;

    buf1 = malloc(bytes);
    if (!buf1)
        return 1;

    buf2 = malloc(bytes);
    if (!buf2)
        return 1;

    printf("Populating %lu MB memory.\n", mb * 2);

    for(x = 0; x < bytes; x++) {
        buf1[x] = 123;
    }

    buf1[bytes - 1] = 123;

    mr1 = ibv_reg_mr(pd1, buf1, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
    if (!mr1) {
        printf("Failed to register memory.\n");
        return 1;
    }

    mr2 = ibv_reg_mr(pd2, buf2, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
    if (!mr2) {
        printf("Failed to register memory.\n");
        return 1;
    }

    DPRINTF("memory registered.\n");

    qp_attr1.cap.max_send_wr	 = 10;
    qp_attr1.cap.max_send_sge = 10;
    qp_attr1.cap.max_recv_wr	 = 10;
    qp_attr1.cap.max_recv_sge = 10;
    qp_attr1.sq_sig_all = 1;

    qp_attr1.send_cq		 = cq1;
    qp_attr1.recv_cq		 = cq1;

    qp_attr1.qp_type		 = IBV_QPT_RC;

    qp1 = ibv_create_qp(pd1, &qp_attr1);
    if (!qp1) {
        printf("failed to create queue pair #1\n");
        return 1;
    }

    DPRINTF("queue pair1 created\n");

    qp_attr2.cap.max_send_wr	 = 10;
    qp_attr2.cap.max_send_sge = 10;
    qp_attr2.cap.max_recv_wr	 = 10;
    qp_attr2.cap.max_recv_sge = 10;
    qp_attr2.sq_sig_all = 1;

    qp_attr2.send_cq		 = cq2;
    qp_attr2.recv_cq		 = cq2;

    qp_attr2.qp_type		 = IBV_QPT_RC;


    qp2 = ibv_create_qp(pd2, &qp_attr2);
    if (!qp2) {
        printf("failed to create queue pair #2\n");
        return 1;
    }

    DPRINTF("queue pair2 created\n");

    struct ibv_qp_attr attr1 = {
        .qp_state = IBV_QPS_INIT,
        .pkey_index = 0,
        .port_num = port,
        .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE,
    };

    if(ibv_modify_qp(qp1, &attr1,
                     IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) {
        printf("verbs 1 Failed to go to init\n");
        return 1;
    }

    DPRINTF("verbs1 to init\n");

    struct ibv_qp_attr attr2 = {
        .qp_state = IBV_QPS_INIT,
        .pkey_index = 0,
        .port_num = port,
        .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE,
    };

    if(ibv_modify_qp(qp2, &attr2,
                     IBV_QP_STATE |
                     IBV_QP_PKEY_INDEX |
                     IBV_QP_PORT |
                     IBV_QP_ACCESS_FLAGS)) {
        printf("verbs 2 Failed to go to init\n");
        return 1;
    }

    DPRINTF("verbs2 to init\n");

    //struct ibv_gid gid1, gid2;
    struct ibv_port_attr port1, port2;
    uint64_t psn1 = lrand48() & 0xffffff;
    uint64_t psn2 = lrand48() & 0xffffff;

    if(ibv_query_port(verbs1, port, &port1))
        return 1;

    DPRINTF("got port1 information\n");

    if(ibv_query_port(verbs2, port, &port2))
        return 1;

    DPRINTF("got port2 information\n");

    if(ibv_query_gid(verbs1, 1, 0, &gid1))
        return 1;
    DPRINTF("got gid1 information\n");

    if(ibv_query_gid(verbs2, 1, 0, &gid2))
        return 1;

    DPRINTF("got gid2 information\n");

    struct ibv_qp_attr next2 = {
        .qp_state = IBV_QPS_RTR,
        .path_mtu = IBV_MTU_1024,
        .dest_qp_num = qp2->qp_num,
        .rq_psn = psn2,
        .max_dest_rd_atomic = 5,
        .min_rnr_timer = 12,
        .ah_attr = {
            .is_global = 0,
            .dlid = port2.lid,
            .sl = 0,
            .src_path_bits = 0,
            .port_num = port,
        }
    };

    if(gid2.global.interface_id) {
        next2.ah_attr.is_global = 1;
        next2.ah_attr.grh.hop_limit = 1;
        next2.ah_attr.grh.dgid = gid2;
        next2.ah_attr.grh.sgid_index = 0;
    }

    struct ibv_qp_attr next1 = {
        .qp_state = IBV_QPS_RTR,
        .path_mtu = IBV_MTU_1024,
        .dest_qp_num = qp1->qp_num,
        .rq_psn = psn1,
        .max_dest_rd_atomic = 1,
        .min_rnr_timer = 12,
        .ah_attr = {
            .is_global = 0,
            .dlid = port1.lid,
            .sl = 0,
            .src_path_bits = 0,
            .port_num = port,
        }
    };

    if(gid1.global.interface_id) {
        next1.ah_attr.is_global = 1;
        next1.ah_attr.grh.hop_limit = 1;
        next1.ah_attr.grh.dgid = gid1;
        next1.ah_attr.grh.sgid_index = 0;
    }

    if(ibv_modify_qp(qp2, &next1,
                     IBV_QP_STATE |
                     IBV_QP_AV |
                     IBV_QP_PATH_MTU |
                     IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN |
                     IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER)) {
        printf("Failed to modify verbs2 to ready\n");
        return 1;
    }

    DPRINTF("verbs2 RTR\n");

    if(ibv_modify_qp(qp1, &next2,
                     IBV_QP_STATE |
                     IBV_QP_AV |
                     IBV_QP_PATH_MTU |
                     IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN |
                     IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER)) {
        printf("Failed to modify verbs1 to ready\n");
        return 1;
    }

    DPRINTF("verbs1 RTR\n");

    next2.qp_state = IBV_QPS_RTS;
    next2.timeout = 14;
    next2.retry_cnt = 7;
    next2.rnr_retry = 7;
    next2.sq_psn = psn1;
    next2.max_rd_atomic = 1;

    if(ibv_modify_qp(qp1, &next2,
                     IBV_QP_STATE |
                     IBV_QP_TIMEOUT |
                     IBV_QP_RETRY_CNT |
                     IBV_QP_RNR_RETRY |
                     IBV_QP_SQ_PSN |
                     IBV_QP_MAX_QP_RD_ATOMIC)) {
        printf("Failed again to modify verbs1 to ready\n");
        return 1;
    }

    DPRINTF("verbs1 RTS\n");

    next1.qp_state = IBV_QPS_RTS;
    next1.timeout = 14;
    next1.retry_cnt = 7;
    next1.rnr_retry = 7;
    next1.sq_psn = psn2;
    next1.max_rd_atomic = 1;

    if(ibv_modify_qp(qp2, &next1,
                     IBV_QP_STATE |
                     IBV_QP_TIMEOUT |
                     IBV_QP_RETRY_CNT |
                     IBV_QP_RNR_RETRY |
                     IBV_QP_SQ_PSN |
                     IBV_QP_MAX_QP_RD_ATOMIC)) {
        printf("Failed again to modify verbs2 to ready\n");
        return 1;
    }

    DPRINTF("verbs2 RTS\n");

    printf("Performing RDMA first.\n");
    iterations = atoi(argv[3]);

    getrusage(RUSAGE_SELF, &usage);
    ustart = usage.ru_utime;
    sstart = usage.ru_stime;

    gettimeofday(&tstart, NULL);

    while(iterations-- > 0) {
        sge.addr   = (uintptr_t) buf1;
        sge.length = bytes;
        sge.lkey   = mr1->lkey;

        send_wr.wr_id		    = 1;
        send_wr.opcode		    = IBV_WR_RDMA_WRITE;
        send_wr.sg_list		    = &sge;
        send_wr.num_sge		    = 1;
        send_wr.send_flags          = IBV_SEND_SIGNALED;
        send_wr.wr.rdma.rkey 	    = mr2->rkey;
        send_wr.wr.rdma.remote_addr = (uint64_t) buf2;

        DPRINTF("Iterations left: %d\n", iterations);
        if (ibv_req_notify_cq(cq1, 0))
            return 1;

        DPRINTF("Submitting local RDMA\n");
        gettimeofday(&start, NULL);
        if (ibv_post_send(qp1, &send_wr, &bad_send_wr))
            return 1;

        DPRINTF("RDMA posted %p %p\n", &send_wr, bad_send_wr);

        DPRINTF("blocking...\n");
        if(ibv_get_cq_event(comp_chan1, &evt_cq, &cq_context)) {
            printf("failed to get CQ event\n");
            return 1;
        }
        gettimeofday(&stop, NULL);
        timersub(&stop, &start, &diff);

        DPRINTF("RDMA took: %lu us\n", diff.tv_usec);

        ibv_ack_cq_events(evt_cq, 1);

        DPRINTF("got event\n");

        n = ibv_poll_cq(cq1, 1, &wc);
        if (n > 0) {
            DPRINTF("return from poll: %lu\n", wc.wr_id);
            if (wc.status != IBV_WC_SUCCESS) {
                printf("poll failed %s\n", ibv_wc_status_str(wc.status));
                return 1;
            }

            if (wc.wr_id == 1) {
                DPRINTF("Finished %d bytes %d %d\n", n, buf1[bytes - 1], buf2[bytes - 1]);
            } else {
                printf("didn't find completion\n");
            }
        }

        if (n < 0) {
            printf("poll returned error\n");
            return 1;
        }

        DPRINTF("Poll returned %d bytes %d %d\n", n, buf1[0], buf2[0]);

    }

    gettimeofday(&tend, NULL);

    getrusage(RUSAGE_SELF, &usage);
    uend = usage.ru_utime;
    send = usage.ru_stime;

    save_diff = 0;
    timersub(&uend, &ustart, &diff);
    save_diff += diff.tv_usec;
    printf("User CPU time: %lu us\n", diff.tv_usec);
    timersub(&send, &sstart, &diff);
    save_diff += diff.tv_usec;
    printf("System CPU time: %lu us\n", diff.tv_usec);
    timersub(&tend, &tstart, &diff);
    printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff);
    printf("Wall clock CPU time: %lu us\n", diff.tv_usec);

    iterations = atoi(argv[3]);

    printf("Now using the CPU instead....\n");

    getrusage(RUSAGE_SELF, &usage);
    ustart = usage.ru_utime;
    sstart = usage.ru_stime;

    gettimeofday(&tstart, NULL);

    while(iterations-- > 0) {
        DPRINTF("Repeating without RDMA...\n");

        gettimeofday(&start, NULL);

        memcpy(buf2, buf1, bytes);

        gettimeofday(&stop, NULL);
        timersub(&stop, &start, &diff);
        DPRINTF("Regular copy too took: %lu us\n", diff.tv_usec);
    }

    gettimeofday(&tend, NULL);

    getrusage(RUSAGE_SELF, &usage);
    uend = usage.ru_utime;
    send = usage.ru_stime;

    save_diff = 0;
    timersub(&uend, &ustart, &diff);
    save_diff += diff.tv_usec;
    printf("User CPU time: %lu us\n", diff.tv_usec);
    timersub(&send, &sstart, &diff);
    save_diff += diff.tv_usec;
    printf("System CPU time: %lu us\n", diff.tv_usec);
    timersub(&tend, &tstart, &diff);
    printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff);
    printf("Wall clock CPU time: %lu us\n", diff.tv_usec);
    return 0;
}
Example #13
static void uct_cm_iface_event_handler(void *arg)
{
    uct_cm_iface_t *iface = arg;
    struct ib_cm_event *event;
    struct ib_cm_id *id;
    int destroy_id;
    int ret;

    ucs_trace_func("");

    for (;;) {
        /* Fetch all events */
        ret = ib_cm_get_event(iface->cmdev, &event);
        if (ret) {
            if (errno != EAGAIN) {
                ucs_warn("ib_cm_get_event() failed: %m");
            }
            return;
        }

        id  = event->cm_id;

        /* Handle the event */
        switch (event->event) {
        case IB_CM_SIDR_REQ_ERROR:
            ucs_error("SIDR request error, status: %s",
                      ibv_wc_status_str(event->param.send_status));
            destroy_id = 1;
            break;
        case IB_CM_SIDR_REQ_RECEIVED:
            uct_cm_iface_handle_sidr_req(iface, event);
            destroy_id = 1; /* Destroy the ID created by the driver */
            break;
        case IB_CM_SIDR_REP_RECEIVED:
            ucs_trace_data("RX: SIDR_REP [id %p{%u}]", id, id->handle);
            uct_cm_iface_outstanding_remove(iface, id);
            destroy_id = 1; /* Destroy the ID which was used for sending */
            break;
        default:
            ucs_warn("Unexpected CM event: %d", event->event);
            destroy_id = 0;
            break;
        }

        /* Acknowledge the CM event; the id was saved above in case we need to destroy it */
        ret = ib_cm_ack_event(event);
        if (ret) {
            ucs_warn("ib_cm_ack_event() failed: %m");
        }

        /* If there is an id which should be destroyed, do it now, after
         * acknowledging all events.
         */
        if (destroy_id) {
            ret = ib_cm_destroy_id(id);
            if (ret) {
                ucs_error("ib_cm_destroy_id() failed: %m");
            }
        }

        uct_cm_iface_notify(iface);
    }
}
Example #14
struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
                                     int rx_depth, int port, int use_event,
                                     enum pp_wr_calc_op   calc_op,
                                     enum pp_wr_data_type calc_data_type,
                                     char *calc_operands_str)
{
    struct pingpong_context *ctx;
    int rc;

    ctx = malloc(sizeof *ctx);
    if (!ctx)
        return NULL;
    memset(ctx, 0, sizeof *ctx);

    ctx->size	= size;
    ctx->rx_depth	= rx_depth;

    ctx->calc_op.opcode	= IBV_EXP_CALC_OP_NUMBER;
    ctx->calc_op.data_type	= IBV_EXP_CALC_DATA_TYPE_NUMBER;
    ctx->calc_op.data_size	= IBV_EXP_CALC_DATA_SIZE_NUMBER;

    ctx->buf = memalign(page_size, size);
    if (!ctx->buf) {
        fprintf(stderr, "Couldn't allocate work buf.\n");
        goto clean_ctx;
    }

    memset(ctx->buf, 0, size);

    ctx->net_buf = memalign(page_size, size);
    if (!ctx->net_buf) {
        fprintf(stderr, "Couldn't allocate work buf.\n");
        goto clean_buffer;
    }
    memset(ctx->net_buf, 0, size);

    ctx->context = ibv_open_device(ib_dev);
    if (!ctx->context) {
        fprintf(stderr, "Couldn't get context for %s\n",
                ibv_get_device_name(ib_dev));
        goto clean_net_buf;
    }

    if (use_event) {
        ctx->channel = ibv_create_comp_channel(ctx->context);
        if (!ctx->channel) {
            fprintf(stderr, "Couldn't create completion channel\n");
            goto clean_device;
        }
    } else
        ctx->channel = NULL;

    ctx->pd = ibv_alloc_pd(ctx->context);
    if (!ctx->pd) {
        fprintf(stderr, "Couldn't allocate PD\n");
        goto clean_comp_channel;
    }

    ctx->mr = ibv_reg_mr(ctx->pd, ctx->net_buf, size, IBV_ACCESS_LOCAL_WRITE);
    if (!ctx->mr) {
        fprintf(stderr, "Couldn't register MR\n");
        goto clean_pd;
    }

    if (calc_op != PP_CALC_INVALID) {
        int op_per_gather, num_op, max_num_op;

        ctx->calc_op.opcode	= IBV_EXP_CALC_OP_NUMBER;
        ctx->calc_op.data_type	= IBV_EXP_CALC_DATA_TYPE_NUMBER;
        ctx->calc_op.data_size	= IBV_EXP_CALC_DATA_SIZE_NUMBER;

        num_op = pp_parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type,
                                         &ctx->calc_op, ctx->context, ctx->buf, ctx->net_buf);
        if (num_op < 0) {
            fprintf(stderr, "-E- failed parsing calc operators\n");
            goto clean_mr;
        }

        rc = pp_query_calc_cap(ctx->context,
                               ctx->calc_op.opcode,
                               ctx->calc_op.data_type,
                               ctx->calc_op.data_size,
                               &op_per_gather, &max_num_op);
        if (rc) {
            fprintf(stderr, "-E- operation not supported on %s. valid ops are:\n",
                    ibv_get_device_name(ib_dev));

            pp_print_dev_calc_ops(ctx->context);
            goto clean_mr;
        }

        if (pp_prepare_sg_list(op_per_gather, num_op, ctx->mr->lkey, &ctx->calc_op, ctx->net_buf)) {
            fprintf(stderr, "-failed to prepare the sg list\n");
            goto clean_mr;
        }
    }

    ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL,
                            ctx->channel, 0);
    if (!ctx->cq) {
        fprintf(stderr, "Couldn't create CQ\n");
        goto clean_mr;
    }

    {
        struct ibv_exp_qp_init_attr attr = {
            .send_cq = ctx->cq,
            .recv_cq = ctx->cq,
            .cap	 = {
                .max_send_wr  = 16,
                .max_recv_wr  = rx_depth,
                .max_send_sge = 16,
                .max_recv_sge = 16
            },
            .qp_type = IBV_QPT_RC,
            .pd = ctx->pd
        };

        attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD;
        attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL;

        ctx->qp = ibv_exp_create_qp(ctx->context, &attr);
        if (!ctx->qp)  {
            fprintf(stderr, "Couldn't create QP\n");
            goto clean_cq;
        }
    }

    {
        struct ibv_qp_attr attr = {
            .qp_state		= IBV_QPS_INIT,
            .pkey_index		= 0,
            .port_num		= port,
            .qp_access_flags	= 0
        };

        if (ibv_modify_qp(ctx->qp, &attr,
                          IBV_QP_STATE		|
                          IBV_QP_PKEY_INDEX	|
                          IBV_QP_PORT		|
                          IBV_QP_ACCESS_FLAGS)) {
            fprintf(stderr, "Failed to modify QP to INIT\n");
            goto clean_qp;
        }

    }

    ctx->mcq = ibv_create_cq(ctx->context, rx_depth + 1, NULL,
                             ctx->channel, 0);
    if (!ctx->mcq) {
        fprintf(stderr, "Couldn't create CQ for MQP\n");
        goto clean_qp;
    }

    {
        struct ibv_exp_qp_init_attr mattr = {
            .send_cq = ctx->mcq,
            .recv_cq = ctx->mcq,
            .cap	 = {
                .max_send_wr  = 1,
                .max_recv_wr  = rx_depth,
                .max_send_sge = 16,
                .max_recv_sge = 16
            },
            .qp_type = IBV_QPT_RC,
            .pd = ctx->pd
        };

        mattr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD;
        mattr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL;

        ctx->mqp = ibv_exp_create_qp(ctx->context, &mattr);
        if (!ctx->qp)  {
            fprintf(stderr, "Couldn't create MQP\n");
            goto clean_mcq;
        }
    }

    {
        struct ibv_qp_attr mattr = {
            .qp_state		= IBV_QPS_INIT,
            .pkey_index		= 0,
            .port_num		= port,
            .qp_access_flags	= 0
        };

        if (ibv_modify_qp(ctx->mqp, &mattr,
                          IBV_QP_STATE		|
                          IBV_QP_PKEY_INDEX	|
                          IBV_QP_PORT		|
                          IBV_QP_ACCESS_FLAGS)) {
            fprintf(stderr, "Failed to modify MQP to INIT\n");
            goto clean_mqp;
        }
    }

    return ctx;

clean_mqp:
    ibv_destroy_qp(ctx->mqp);

clean_mcq:
    ibv_destroy_cq(ctx->mcq);

clean_qp:
    ibv_destroy_qp(ctx->qp);

clean_cq:
    ibv_destroy_cq(ctx->cq);

clean_mr:
    ibv_dereg_mr(ctx->mr);

clean_pd:
    ibv_dealloc_pd(ctx->pd);

clean_comp_channel:
    if (ctx->channel)
        ibv_destroy_comp_channel(ctx->channel);

clean_device:
    ibv_close_device(ctx->context);

clean_net_buf:
    free(ctx->net_buf);

clean_buffer:
    free(ctx->buf);

clean_ctx:
    free(ctx);

    return NULL;
}

int pp_close_ctx(struct pingpong_context *ctx)
{
    if (ibv_destroy_qp(ctx->qp)) {
        fprintf(stderr, "Couldn't destroy QP\n");
        return 1;
    }


    if (ibv_destroy_qp(ctx->mqp)) {
        fprintf(stderr, "Couldn't destroy MQP\n");
        return 1;
    }


    if (ibv_destroy_cq(ctx->cq)) {
        fprintf(stderr, "Couldn't destroy CQ\n");
        return 1;
    }

    if (ibv_destroy_cq(ctx->mcq)) {
        fprintf(stderr, "Couldn't destroy MCQ\n");
        return 1;
    }

    if (ibv_dereg_mr(ctx->mr)) {
        fprintf(stderr, "Couldn't deregister MR\n");
        return 1;
    }

    if (ibv_dealloc_pd(ctx->pd)) {
        fprintf(stderr, "Couldn't deallocate PD\n");
        return 1;
    }

    if (ctx->channel) {
        if (ibv_destroy_comp_channel(ctx->channel)) {
            fprintf(stderr, "Couldn't destroy completion channel\n");
            return 1;
        }
    }

    if (ibv_close_device(ctx->context)) {
        fprintf(stderr, "Couldn't release context\n");
        return 1;
    }

    free(ctx->buf);
    free(ctx->net_buf);
    free(ctx);

    return 0;
}

static int pp_post_recv(struct pingpong_context *ctx, int n)
{
    int rc;

    struct ibv_sge list = {
        .addr	= (uintptr_t) ctx->net_buf,
        .length = ctx->size,
        .lkey	= ctx->mr->lkey
    };
    struct ibv_recv_wr wr = {
        .wr_id		= PP_RECV_WRID,
        .sg_list	= &list,
        .num_sge	= 1,
    };
    struct ibv_recv_wr *bad_wr;
    int i;

    for (i = 0; i < n; ++i) {
        rc = ibv_post_recv(ctx->qp, &wr, &bad_wr);
        if (rc)
            return rc;
    }

    return i;
}

static int pp_post_send(struct pingpong_context *ctx)
{
    int ret;

    struct ibv_sge list = {
        .addr	= (uintptr_t) ctx->net_buf,
        .length = ctx->size,
        .lkey	= ctx->mr->lkey
    };
    struct ibv_exp_send_wr wr = {
        .wr_id		= PP_SEND_WRID,
        .sg_list	= &list,
        .num_sge	= 1,
        .exp_opcode	= IBV_EXP_WR_SEND,
        .exp_send_flags	= IBV_EXP_SEND_SIGNALED,
    };
    struct ibv_exp_send_wr *bad_wr;
    /* If this is a calc operation - set the required params in the wr */
    if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) {
        wr.exp_opcode  = IBV_EXP_WR_SEND;
        wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC;
        wr.sg_list = ctx->calc_op.gather_list;
        wr.num_sge = ctx->calc_op.gather_list_size;

        wr.op.calc.calc_op   = ctx->calc_op.opcode;
        wr.op.calc.data_type = ctx->calc_op.data_type;
        wr.op.calc.data_size = ctx->calc_op.data_size;

    }

    ret = ibv_exp_post_send(ctx->qp, &wr, &bad_wr);

    return ret;
}

int pp_post_ext_wqe(struct pingpong_context *ctx, enum ibv_exp_wr_opcode op)
{
    int ret;
    struct ibv_exp_send_wr wr = {
        .wr_id		= PP_CQE_WAIT,
        .sg_list	= NULL,
        .num_sge	= 0,
        .exp_opcode	= op,
        .exp_send_flags	= IBV_EXP_SEND_SIGNALED,
    };
    struct ibv_exp_send_wr *bad_wr;

    switch (op) {
    case IBV_EXP_WR_RECV_ENABLE:
    case IBV_EXP_WR_SEND_ENABLE:

        wr.task.wqe_enable.qp = ctx->qp;
        wr.task.wqe_enable.wqe_count = 0;

        wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST;

        break;

    case IBV_EXP_WR_CQE_WAIT:
        wr.task.cqe_wait.cq = ctx->cq;
        wr.task.cqe_wait.cq_count = 1;

        wr.exp_send_flags |=  IBV_EXP_SEND_WAIT_EN_LAST;

        break;

    default:
        fprintf(stderr, "-E- unsupported m_wqe opcode %d\n", op);
        return -1;
    }

    ret = ibv_exp_post_send(ctx->mqp, &wr, &bad_wr);

    return ret;
}

int pp_poll_mcq(struct ibv_cq *cq, int num_cqe)
{
    int ne;
    int i;
    struct ibv_wc wc[2];

    if (num_cqe > 2) {
        fprintf(stderr, "-E- max num cqe exceeded\n");
        return -1;
    }

    do {
        ne = ibv_poll_cq(cq, num_cqe, wc);
        if (ne < 0) {
            fprintf(stderr, "poll CQ failed %d\n", ne);
            return 1;
        }
    } while (ne < 1);

    for (i = 0; i < ne; ++i) {
        if (wc[i].status != IBV_WC_SUCCESS) {
            fprintf(stderr, "Failed %s status %s (%d)\n",
                    wr_id_str[(int)wc[i].wr_id],
                    ibv_wc_status_str(wc[i].status),
                    wc[i].status);
            return 1;
        }

        if ((int) wc[i].wr_id != PP_CQE_WAIT) {
            fprintf(stderr, "invalid wr_id %" PRIx64 "\n", wc[i].wr_id);
            return -1;
        }
    }

    return 0;
}

static int pp_calc_verify(struct pingpong_context *ctx,
                          enum pp_wr_data_type calc_data_type,
                          enum pp_wr_calc_op calc_opcode)
{
    uint64_t *op1 = &(ctx->last_result);
    uint64_t *op2 = (uint64_t *)ctx->buf + 2;
    uint64_t *res = (uint64_t *)ctx->buf;

    return !EXEC_VERIFY(calc_data_type, calc_opcode, 1, op1, op2, res);
}

static int pp_update_last_result(struct pingpong_context *ctx,
                                 enum pp_wr_data_type calc_data_type,
                                 enum pp_wr_calc_op calc_opcode)
{
    /* EXEC_VERIFY dereferences its result parameter, so give it real storage */
    uint64_t dummy_storage;
    uint64_t *dummy = &dummy_storage;

    uint64_t *op1 = (uint64_t *)ctx->buf;
    uint64_t *op2 = (uint64_t *)ctx->buf + 2;
    uint64_t res = (uint64_t)EXEC_VERIFY(calc_data_type, calc_opcode, 0, op1, op2, dummy);

    ctx->last_result = res;
    return 0;
}


static void usage(const char *argv0)
{
    printf("Usage:\n");
    printf("  %s				start a server and wait for connection\n", argv0);
    printf("  %s <host>			connect to server at <host>\n", argv0);
    printf("\n");
    printf("Options:\n");
    printf("  -p, --port=<port>		listen on/connect to port <port> (default 18515)\n");
    printf("  -d, --ib-dev=<dev>		use IB device <dev> (default first device found)\n");
    printf("  -i, --ib-port=<port>		use port <port> of IB device (default 1)\n");
    printf("  -s, --size=<size>		size of message to exchange (default 4096 minimum 16)\n");
    printf("  -m, --mtu=<size>		path MTU (default 1024)\n");
    printf("  -r, --rx-depth=<dep>		number of receives to post at a time (default 500)\n");
    printf("  -n, --iters=<iters>		number of exchanges (default 1000)\n");
    printf("  -l, --sl=<sl>			service level value\n");
    printf("  -e, --events			sleep on CQ events (default poll)\n");
    printf("  -c, --calc=<operation>	calc operation\n");
    printf("  -t, --op_type=<type>		calc operands type\n");
    printf("  -o, --operands=<o1,o2,...>	comma separated list of operands\n");
    printf("  -w, --wait_cq=cqn		wait for entries on cq\n");
    printf("  -v, --verbose			print verbose information\n");
    printf("  -V, --verify			verify calc operations\n");
}
Example #15
gaspi_return_t
pgaspi_dev_atomic_compare_swap (const gaspi_segment_id_t segment_id,
				const gaspi_offset_t offset,
				const gaspi_rank_t rank,
				const gaspi_atomic_value_t comparator,
				const gaspi_atomic_value_t val_new)
{ 
  struct ibv_send_wr *bad_wr;
  struct ibv_sge slist;
  struct ibv_send_wr swr;
  int i;

  slist.addr = (uintptr_t) (glb_gaspi_ctx.nsrc.buf + NOTIFY_OFFSET);
  slist.length = sizeof(gaspi_atomic_value_t);
  slist.lkey = ((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey;
  
  swr.wr.atomic.remote_addr =
    glb_gaspi_ctx.rrmd[segment_id][rank].addr + NOTIFY_OFFSET + offset;

  swr.wr.atomic.rkey = glb_gaspi_ctx.rrmd[segment_id][rank].rkey;
  swr.wr.atomic.compare_add = comparator;
  swr.wr.atomic.swap = val_new;

  swr.wr_id = rank;
  swr.sg_list = &slist;
  swr.num_sge = 1;
  swr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP;
  swr.send_flags = IBV_SEND_SIGNALED;
  swr.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpGroups[rank], &swr, &bad_wr))
    {
      glb_gaspi_ctx.qp_state_vec[GASPI_COLL_QP][rank] = GASPI_STATE_CORRUPT;

      return GASPI_ERROR;
    }

  glb_gaspi_ctx.ne_count_grp++;

  int ne = 0;
  for (i = 0; i < glb_gaspi_ctx.ne_count_grp; i++)
    {
      do
	{
	  ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqGroups, 1,
			    glb_gaspi_ctx_ib.wc_grp_send);
	}
      while (ne == 0);

      if ((ne < 0) || (glb_gaspi_ctx_ib.wc_grp_send[i].status != IBV_WC_SUCCESS))
	{
	  glb_gaspi_ctx.qp_state_vec[GASPI_COLL_QP][glb_gaspi_ctx_ib.wc_grp_send[i].wr_id] = GASPI_STATE_CORRUPT;

 	  gaspi_print_error("Failed request to %lu : %s",
			    glb_gaspi_ctx_ib.wc_grp_send[i].wr_id, 
			    ibv_wc_status_str(glb_gaspi_ctx_ib.wc_grp_send[i].status));

	  return GASPI_ERROR;
	}
    }

  glb_gaspi_ctx.ne_count_grp = 0;
  
  return GASPI_SUCCESS;
}
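
In verbs, the response to IBV_WR_ATOMIC_CMP_AND_SWP writes the remote word's original value into the local buffer named by the SGE, so once the completion above has been reaped the caller can tell whether the swap actually took effect. A one-line check, assuming old_val points at that local buffer:

#include <stdint.h>

/* The swap was applied only if the fetched original value matched the
 * comparator. */
static int swap_succeeded(const uint64_t *old_val, uint64_t comparator)
{
    return *old_val == comparator;
}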
Example #16
static UCS_F_MAYBE_UNUSED
struct ibv_mr *uct_ib_md_create_umr(uct_ib_md_t *md, struct ibv_mr *mr)
{
#if HAVE_EXP_UMR
    struct ibv_exp_mem_region mem_reg;
    struct ibv_exp_send_wr wr, *bad_wr;
    struct ibv_exp_create_mr_in mrin;
    struct ibv_mr *umr;
    struct ibv_wc wc;
    int ret;
    size_t offset;

    if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) {
        return NULL;
    }

    offset = uct_ib_md_umr_offset(uct_ib_md_umr_id(md));
    /* Create memory key */
    memset(&mrin, 0, sizeof(mrin));
    mrin.pd                       = md->pd;

#ifdef HAVE_EXP_UMR_NEW_API
    mrin.attr.create_flags        = IBV_EXP_MR_INDIRECT_KLMS;
    mrin.attr.exp_access_flags    = UCT_IB_MEM_ACCESS_FLAGS;
    mrin.attr.max_klm_list_size   = 1;
#else
    mrin.attr.create_flags        = IBV_MR_NONCONTIG_MEM;
    mrin.attr.access_flags        = UCT_IB_MEM_ACCESS_FLAGS;
    mrin.attr.max_reg_descriptors = 1;
#endif

    umr = ibv_exp_create_mr(&mrin);
    if (!umr) {
        ucs_error("Failed to create modified_mr: %m");
        goto err;
    }

    /* Fill memory list and UMR */
    memset(&wr, 0, sizeof(wr));
    memset(&mem_reg, 0, sizeof(mem_reg));

    mem_reg.base_addr                              = (uintptr_t) mr->addr;
    mem_reg.length                                 = mr->length;

#ifdef HAVE_EXP_UMR_NEW_API
    mem_reg.mr                                     = mr;

    wr.ext_op.umr.umr_type                         = IBV_EXP_UMR_MR_LIST;
    wr.ext_op.umr.mem_list.mem_reg_list            = &mem_reg;
    wr.ext_op.umr.exp_access                       = UCT_IB_MEM_ACCESS_FLAGS;
    wr.ext_op.umr.modified_mr                      = umr;
    wr.ext_op.umr.base_addr                        = (uint64_t) (uintptr_t) mr->addr + offset;

    wr.ext_op.umr.num_mrs                          = 1;
#else
    mem_reg.m_key                                  = mr;

    wr.ext_op.umr.memory_key.mkey_type             = IBV_EXP_UMR_MEM_LAYOUT_NONCONTIG;
    wr.ext_op.umr.memory_key.mem_list.mem_reg_list = &mem_reg;
    wr.ext_op.umr.memory_key.access                = UCT_IB_MEM_ACCESS_FLAGS;
    wr.ext_op.umr.memory_key.modified_mr           = umr;
    wr.ext_op.umr.memory_key.region_base_addr      = mr->addr + offset;

    wr.num_sge                                     = 1;
#endif

    wr.exp_opcode                                  = IBV_EXP_WR_UMR_FILL;
    wr.exp_send_flags                              = IBV_EXP_SEND_INLINE | IBV_EXP_SEND_SIGNALED;

    /* Post UMR */
    ret = ibv_exp_post_send(md->umr_qp, &wr, &bad_wr);
    if (ret) {
        ucs_error("ibv_exp_post_send(UMR_FILL) failed: %m");
        goto err_free_umr;
    }

    /* Wait for send UMR completion */
    for (;;) {
        ret = ibv_poll_cq(md->umr_cq, 1, &wc);
        if (ret < 0) {
            ucs_error("ibv_exp_poll_cq(umr_cq) failed: %m");
            goto err_free_umr;
        }
        if (ret == 1) {
            if (wc.status != IBV_WC_SUCCESS) {
                ucs_error("UMR_FILL completed with error: %s vendor_err %d",
                          ibv_wc_status_str(wc.status), wc.vendor_err);
                goto err_free_umr;
            }
            break;
        }
    }

    ucs_trace("UMR registered memory %p..%p offset 0x%x on %s lkey 0x%x rkey 0x%x",
              mr->addr, mr->addr + mr->length, (unsigned)offset, uct_ib_device_name(&md->dev), umr->lkey,
              umr->rkey);
    return umr;

err_free_umr:
    ibv_dereg_mr(umr);
err:
#endif
    return NULL;
}
Example #17
static int
rdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user)
{
	struct pcap_rdmasniff *priv = handle->priv;
	struct ibv_cq *ev_cq;
	void *ev_ctx;
	struct ibv_wc wc;
	struct pcap_pkthdr pkth;
	u_char *pktd;
	int count = 0;

	if (!priv->cq_event) {
		while (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) {
			if (errno != EINTR) {
				return PCAP_ERROR;
			}
			if (handle->break_loop) {
				handle->break_loop = 0;
				return PCAP_ERROR_BREAK;
			}
		}
		ibv_ack_cq_events(priv->cq, 1);
		ibv_req_notify_cq(priv->cq, 0);
		priv->cq_event = 1;
	}

	while (count < max_packets || PACKET_COUNT_IS_UNLIMITED(max_packets)) {
		if (ibv_poll_cq(priv->cq, 1, &wc) != 1) {
			priv->cq_event = 0;
			break;
		}

		if (wc.status != IBV_WC_SUCCESS) {
			fprintf(stderr, "failed WC wr_id %lld status %d/%s\n",
				(unsigned long long) wc.wr_id,
				wc.status, ibv_wc_status_str(wc.status));
			continue;
		}

		pkth.len = wc.byte_len;
		pkth.caplen = min(pkth.len, (u_int)handle->snapshot);
		gettimeofday(&pkth.ts, NULL);

		pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE;

		if (handle->fcode.bf_insns == NULL ||
		    pcap_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) {
			callback(user, &pkth, pktd);
			++priv->packets_recv;
			++count;
		}

		rdmasniff_post_recv(handle, wc.wr_id);

		if (handle->break_loop) {
			handle->break_loop = 0;
			return PCAP_ERROR_BREAK;
		}
	}

	return count;
}
Example #18
void CompletionContext::processWorkComplete(struct ibv_wc* wc) {
    LOG_TRACE("Processing WC with ID %1% on queue %2% with status %3% %4%", wc->wr_id, wc->qp_num, wc->status,
            ibv_wc_status_str(wc->status));

    WorkRequestId workId(wc->wr_id);
    std::error_code ec;

    auto i = mSocketMap.find(wc->qp_num);
    if (i == mSocketMap.end()) {
        LOG_ERROR("No matching socket for qp_num %1%", wc->qp_num);

        // In the case that we have no socket associated with the qp_num we just repost the buffer to the shared receive
        // queue or release the buffer in the case of send
        switch (workId.workType()) {

        // In the case the work request was a receive, we try to repost the shared receive buffer
        case WorkType::RECEIVE: {
            mDevice->postReceiveBuffer(workId.bufferId());
        } break;

        // In the case the work request was a send we just release the send buffer
        case WorkType::SEND: {
            releaseSendBuffer(workId.bufferId());
        } break;

        default:
            break;
        }

        return;
    }
    InfinibandSocketImpl* socket = i->second.get();

    if (wc->status != IBV_WC_SUCCESS) {
        ec = std::error_code(wc->status, error::get_work_completion_category());
    } else {
        assert(workId.workType() != WorkType::RECEIVE || wc->opcode & IBV_WC_RECV);
        assert(workId.workType() != WorkType::SEND || wc->opcode == IBV_WC_SEND);
        assert(workId.workType() != WorkType::READ || wc->opcode == IBV_WC_RDMA_READ);
        assert(workId.workType() != WorkType::WRITE || wc->opcode == IBV_WC_RDMA_WRITE);
    }

    switch (workId.workType()) {
    case WorkType::RECEIVE: {
        LOG_TRACE("Executing receive event of buffer %1%", workId.bufferId());
        auto buffer = mDevice->acquireReceiveBuffer(workId.bufferId());
        if (!buffer.valid()) {
            socket->onReceive(nullptr, 0x0u, error::invalid_buffer);
            break;
        }

        if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
            socket->onImmediate(ntohl(wc->imm_data));
        } else {
            socket->onReceive(buffer.data(), wc->byte_len, ec);
        }
        mDevice->postReceiveBuffer(buffer);
    } break;

    case WorkType::SEND: {
        LOG_TRACE("Executing send event of buffer %1%", workId.bufferId());
        socket->onSend(workId.userId(), ec);
        releaseSendBuffer(workId.bufferId());
    } break;

    case WorkType::READ: {
        LOG_TRACE("Executing read event of buffer %1%", workId.bufferId());
        socket->onRead(workId.userId(), workId.bufferId(), ec);
    } break;

    case WorkType::WRITE: {
        LOG_TRACE("Executing write event of buffer %1%", workId.bufferId());
        socket->onWrite(workId.userId(), workId.bufferId(), ec);
    } break;

    default: {
        LOG_TRACE("Unknown work type");
    } break;
    }
}
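
Example #18 depends on WorkRequestId packing a work type, buffer id, and user id into the 64-bit wr_id so the completion handler can route each WC. The class itself is not shown here; below is a minimal sketch of one possible encoding, with an assumed field layout that is not necessarily the real WorkRequestId format:

#include <stdint.h>

/* Assumed layout: bits 0-15 work type, bits 16-31 buffer id, bits 32-63 user id. */
enum work_type { WT_RECEIVE = 0, WT_SEND = 1, WT_READ = 2, WT_WRITE = 3 };

static uint64_t pack_wr_id(enum work_type type, uint16_t buffer_id, uint32_t user_id)
{
    return ((uint64_t)user_id << 32) | ((uint64_t)buffer_id << 16) | (uint64_t)type;
}

static enum work_type wr_id_type(uint64_t wr_id)   { return (enum work_type)(wr_id & 0xffff); }
static uint16_t       wr_id_buffer(uint64_t wr_id) { return (uint16_t)(wr_id >> 16); }
static uint32_t       wr_id_user(uint64_t wr_id)   { return (uint32_t)(wr_id >> 32); }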
Example #19
/**
 * DPDK callback for RX with scattered packets support.
 *
 * @param dpdk_rxq
 *   Generic pointer to RX queue structure.
 * @param[out] pkts
 *   Array to store received packets.
 * @param pkts_n
 *   Maximum number of packets in array.
 *
 * @return
 *   Number of packets successfully received (<= pkts_n).
 */
uint16_t
mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	struct rxq *rxq = (struct rxq *)dpdk_rxq;
	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
	const unsigned int elts_n = rxq->elts_n;
	unsigned int elts_head = rxq->elts_head;
	unsigned int i;
	unsigned int pkts_ret = 0;
	int ret;

	if (unlikely(!rxq->sp))
		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
		return 0;
	for (i = 0; (i != pkts_n); ++i) {
		struct rxq_elt_sp *elt = &(*elts)[elts_head];
		unsigned int len;
		unsigned int pkt_buf_len;
		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
		struct rte_mbuf **pkt_buf_next = &pkt_buf;
		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
		unsigned int j = 0;
		uint32_t flags;
		uint16_t vlan_tci;

		/* Sanity checks. */
		assert(elts_head < rxq->elts_n);
		assert(rxq->elts_head < rxq->elts_n);
		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
		if (unlikely(ret < 0)) {
			struct ibv_wc wc;
			int wcs_n;

			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
			      (void *)rxq, ret);
			/* ibv_poll_cq() must be used in case of failure. */
			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
			if (unlikely(wcs_n == 0))
				break;
			if (unlikely(wcs_n < 0)) {
				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
				      (void *)rxq, wcs_n);
				break;
			}
			assert(wcs_n == 1);
			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
				/* Whatever, just repost the offending WR. */
				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
				      " completion status (%d): %s",
				      (void *)rxq, wc.wr_id, wc.status,
				      ibv_wc_status_str(wc.status));
#ifdef MLX5_PMD_SOFT_COUNTERS
				/* Increment dropped packets counter. */
				++rxq->stats.idropped;
#endif
				goto repost;
			}
			ret = wc.byte_len;
		}
		if (ret == 0)
			break;
		assert(ret >= (rxq->crc_present << 2));
		len = ret - (rxq->crc_present << 2);
		pkt_buf_len = len;
		/*
		 * Replace spent segments with new ones, concatenate and
		 * return them as pkt_buf.
		 */
		while (1) {
			struct ibv_sge *sge = &elt->sges[j];
			struct rte_mbuf *seg = elt->bufs[j];
			struct rte_mbuf *rep;
			unsigned int seg_tailroom;

			assert(seg != NULL);
			/*
			 * Fetch initial bytes of packet descriptor into a
			 * cacheline while allocating rep.
			 */
			rte_prefetch0(seg);
			rep = __rte_mbuf_raw_alloc(rxq->mp);
			if (unlikely(rep == NULL)) {
				/*
				 * Unable to allocate a replacement mbuf,
				 * repost WR.
				 */
				DEBUG("rxq=%p: can't allocate a new mbuf",
				      (void *)rxq);
				if (pkt_buf != NULL) {
					*pkt_buf_next = NULL;
					rte_pktmbuf_free(pkt_buf);
				}
				/* Increment out of memory counters. */
				++rxq->stats.rx_nombuf;
				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
				goto repost;
			}
#ifndef NDEBUG
			/* Poison user-modifiable fields in rep. */
			NEXT(rep) = (void *)((uintptr_t)-1);
			SET_DATA_OFF(rep, 0xdead);
			DATA_LEN(rep) = 0xd00d;
			PKT_LEN(rep) = 0xdeadd00d;
			NB_SEGS(rep) = 0x2a;
			PORT(rep) = 0x2a;
			rep->ol_flags = -1;
#endif
			assert(rep->buf_len == seg->buf_len);
			assert(rep->buf_len == rxq->mb_len);
			/* Reconfigure sge to use rep instead of seg. */
			assert(sge->lkey == rxq->mr->lkey);
			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
			elt->bufs[j] = rep;
			++j;
			/* Update pkt_buf if it's the first segment, or link
			 * seg to the previous one and update pkt_buf_next. */
			*pkt_buf_next = seg;
			pkt_buf_next = &NEXT(seg);
			/* Update seg information. */
			seg_tailroom = (seg->buf_len - seg_headroom);
			assert(sge->length == seg_tailroom);
			SET_DATA_OFF(seg, seg_headroom);
			if (likely(len <= seg_tailroom)) {
				/* Last segment. */
				DATA_LEN(seg) = len;
				PKT_LEN(seg) = len;
				/* Sanity check. */
				assert(rte_pktmbuf_headroom(seg) ==
				       seg_headroom);
				assert(rte_pktmbuf_tailroom(seg) ==
				       (seg_tailroom - len));
				break;
			}
			DATA_LEN(seg) = seg_tailroom;
			PKT_LEN(seg) = seg_tailroom;
			/* Sanity check. */
			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
			assert(rte_pktmbuf_tailroom(seg) == 0);
			/* Fix len and clear headroom for next segments. */
			len -= seg_tailroom;
			seg_headroom = 0;
		}
		/* Update head and tail segments. */
		*pkt_buf_next = NULL;
		assert(pkt_buf != NULL);
		assert(j != 0);
		NB_SEGS(pkt_buf) = j;
		PORT(pkt_buf) = rxq->port_id;
		PKT_LEN(pkt_buf) = pkt_buf_len;
		if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) {
			pkt_buf->packet_type = rxq_cq_to_pkt_type(flags);
			pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags);
#ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS
			if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) {
				pkt_buf->ol_flags |= PKT_RX_VLAN_PKT;
				pkt_buf->vlan_tci = vlan_tci;
			}
#endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */
		}

		/* Return packet. */
		*(pkts++) = pkt_buf;
		++pkts_ret;
#ifdef MLX5_PMD_SOFT_COUNTERS
		/* Increment bytes counter. */
		rxq->stats.ibytes += pkt_buf_len;
#endif
repost:
		ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges));
		if (unlikely(ret)) {
			/* Inability to repost WRs is fatal. */
			DEBUG("%p: recv_sg_list(): failed (ret=%d)",
			      (void *)rxq->priv,
			      ret);
			abort();
		}
		if (++elts_head >= elts_n)
			elts_head = 0;
		continue;
	}
	if (unlikely(i == 0))
		return 0;
	rxq->elts_head = elts_head;
#ifdef MLX5_PMD_SOFT_COUNTERS
	/* Increment packets counter. */
	rxq->stats.ipackets += pkts_ret;
#endif
	return pkts_ret;
}