static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_verbs_iface_poll_rx(uct_rc_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret > 0) {
        for (i = 0; i < ret; ++i) {
            if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
                ucs_fatal("Receive completion with error: %s",
                          ibv_wc_status_str(wc[i].status));
            }

            UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                     UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

            desc = (void*)wc[i].wr_id;
            uct_ib_iface_desc_received(&iface->super.super, desc, wc[i].byte_len, 1);

            hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
            uct_ib_log_recv_completion(IBV_QPT_RC, &wc[i], hdr,
                                       uct_rc_ep_am_packet_dump);
            uct_rc_iface_invoke_am(&iface->super, hdr, wc[i].byte_len, desc);
        }
        iface->super.rx.available += ret;
        return UCS_OK;
    } else if (ret == 0) {
        uct_rc_verbs_iface_post_recv(iface, 0);
        return UCS_ERR_NO_PROGRESS;
    } else {
        ucs_fatal("Failed to poll receive CQ");
    }
}
/// The InfiniBand completion notification handler.
int poll_completion()
{
    const int ne_max = 10;
    struct ibv_wc wc[ne_max];
    int ne;
    int ne_total = 0;

    while ((ne = ibv_poll_cq(cq_, ne_max, wc))) {
        if (ne < 0)
            throw InfinibandException("ibv_poll_cq failed");
        ne_total += ne;
        for (int i = 0; i < ne; ++i) {
            if (wc[i].status != IBV_WC_SUCCESS) {
                std::ostringstream s;
                s << ibv_wc_status_str(wc[i].status) << " for wr_id "
                  << static_cast<int>(wc[i].wr_id);
                L_(error) << s.str();
                continue;
            }
            on_completion(wc[i]);
        }
    }

    return ne_total;
}
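/*
 * Hedged sketch, not taken from any of the projects quoted here: one common
 * way to drive a drain routine such as poll_completion() without busy-waiting
 * is to sleep on the CQ's completion channel, acknowledge the event, re-arm
 * the CQ and then poll it dry.  The parameter names "channel" and "cq" are
 * placeholders standing in for the channel_ / cq_ members used above, and the
 * CQ is assumed to have been armed once with ibv_req_notify_cq() before the
 * first wait; error handling is reduced to the minimum.
 */
#include <infiniband/verbs.h>

static int wait_for_cq_event(struct ibv_comp_channel *channel, struct ibv_cq *cq)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    /* Block until the device signals the completion channel. */
    if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx))
        return -1;
    if (ev_cq != cq)    /* only one CQ is expected on this channel */
        return -1;

    /* Events must be acknowledged, or destroying the CQ later will hang. */
    ibv_ack_cq_events(cq, 1);

    /* Re-arm the CQ so the next completion raises another channel event,
     * then let the caller poll the CQ dry (e.g. via poll_completion()). */
    return ibv_req_notify_cq(cq, 0) ? -1 : 0;
}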
static inline ucs_status_t
uct_ud_verbs_iface_poll_rx(uct_ud_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;
    char *packet;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret == 0) {
        return UCS_ERR_NO_PROGRESS;
    }
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll receive CQ");
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Receive completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        desc = (void*)wc[i].wr_id;
        ucs_trace_data("pkt rcvd: buf=%p len=%d", desc, wc[i].byte_len);
        packet = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
        VALGRIND_MAKE_MEM_DEFINED(packet, wc[i].byte_len);

        uct_ud_ep_process_rx(&iface->super,
                             (uct_ud_neth_t *)(packet + UCT_IB_GRH_LEN),
                             wc[i].byte_len - UCT_IB_GRH_LEN,
                             (uct_ud_recv_skb_t *)desc);
    }
    iface->super.rx.available += ret;
    uct_ud_verbs_iface_post_recv(iface);
    return UCS_OK;
}
gaspi_return_t
pgaspi_dev_wait (const gaspi_queue_id_t queue, int * counter,
                 const gaspi_timeout_t timeout_ms)
{
  int ne = 0, i;
  struct ibv_wc wc;

  const int nr = *counter;
  const gaspi_cycles_t s0 = gaspi_get_cycles ();

  for (i = 0; i < nr; i++)
    {
      do
        {
          ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqC[queue], 1, &wc);
          *counter -= ne;

          if (ne == 0)
            {
              const gaspi_cycles_t s1 = gaspi_get_cycles ();
              const gaspi_cycles_t tdelta = s1 - s0;

              const float ms = (float) tdelta * glb_gaspi_ctx.cycles_to_msecs;
              if (ms > timeout_ms)
                {
                  return GASPI_TIMEOUT;
                }
            }
        }
      while (ne == 0);

      if ((ne < 0) || (wc.status != IBV_WC_SUCCESS))
        {
          gaspi_print_error("Failed request to %lu. Queue %d might be broken %s",
                            wc.wr_id, queue, ibv_wc_status_str(wc.status));

          glb_gaspi_ctx.qp_state_vec[queue][wc.wr_id] = GASPI_STATE_CORRUPT;
          return GASPI_ERROR;
        }
    }

#ifdef GPI2_CUDA
  int j, k;
  for(k = 0; k < glb_gaspi_ctx.gpu_count; k++)
    {
      for(j = 0; j < GASPI_CUDA_EVENTS; j++)
        gpus[k].events[queue][j].ib_use = 0;
    }
#endif

  return GASPI_SUCCESS;
}
const char *ibv_wc_status_string(int status)
{
    /* Open-coded equivalent of ibv_wc_status_str(). */
    switch (status) {
    case IBV_WC_SUCCESS:            return "IBV_WC_SUCCESS";
    case IBV_WC_LOC_LEN_ERR:        return "IBV_WC_LOC_LEN_ERR";
    case IBV_WC_LOC_QP_OP_ERR:      return "IBV_WC_LOC_QP_OP_ERR";
    case IBV_WC_LOC_EEC_OP_ERR:     return "IBV_WC_LOC_EEC_OP_ERR";
    case IBV_WC_LOC_PROT_ERR:       return "IBV_WC_LOC_PROT_ERR";
    case IBV_WC_WR_FLUSH_ERR:       return "IBV_WC_WR_FLUSH_ERR";
    case IBV_WC_MW_BIND_ERR:        return "IBV_WC_MW_BIND_ERR";
    case IBV_WC_BAD_RESP_ERR:       return "IBV_WC_BAD_RESP_ERR";
    case IBV_WC_LOC_ACCESS_ERR:     return "IBV_WC_LOC_ACCESS_ERR";
    case IBV_WC_REM_INV_REQ_ERR:    return "IBV_WC_REM_INV_REQ_ERR";
    case IBV_WC_REM_ACCESS_ERR:     return "IBV_WC_REM_ACCESS_ERR";
    case IBV_WC_REM_OP_ERR:         return "IBV_WC_REM_OP_ERR";
    case IBV_WC_RETRY_EXC_ERR:      return "IBV_WC_RETRY_EXC_ERR";
    case IBV_WC_RNR_RETRY_EXC_ERR:  return "IBV_WC_RNR_RETRY_EXC_ERR";
    case IBV_WC_LOC_RDD_VIOL_ERR:   return "IBV_WC_LOC_RDD_VIOL_ERR";
    case IBV_WC_REM_INV_RD_REQ_ERR: return "IBV_WC_REM_INV_RD_REQ_ERR";
    case IBV_WC_REM_ABORT_ERR:      return "IBV_WC_REM_ABORT_ERR";
    case IBV_WC_INV_EECN_ERR:       return "IBV_WC_INV_EECN_ERR";
    case IBV_WC_INV_EEC_STATE_ERR:  return "IBV_WC_INV_EEC_STATE_ERR";
    case IBV_WC_FATAL_ERR:          return "IBV_WC_FATAL_ERR";
    case IBV_WC_RESP_TIMEOUT_ERR:   return "IBV_WC_RESP_TIMEOUT_ERR";
    case IBV_WC_GENERAL_ERR:        return "IBV_WC_GENERAL_ERR";
    default:                        return "unknown-status";
    }
}
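/*
 * Hedged usage sketch, not part of the example above: the library helper
 * ibv_wc_status_str(), which the open-coded table above mirrors, is normally
 * combined with the other ibv_wc fields when reporting a failed completion.
 * report_failed_wc() is a hypothetical helper named here only to illustrate
 * the pattern the surrounding examples follow.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static void report_failed_wc(const struct ibv_wc *wc)
{
    if (wc->status == IBV_WC_SUCCESS)
        return;

    /* ibv_wc_status_str() turns the numeric status into a readable name;
     * wr_id and vendor_err usually identify which request went wrong. */
    fprintf(stderr, "work completion failed: %s (%d), wr_id=0x%llx, vendor_err=0x%x\n",
            ibv_wc_status_str(wc->status), (int)wc->status,
            (unsigned long long)wc->wr_id, (unsigned)wc->vendor_err);
}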
/**
 * Polling for events on an inner thread allows processing of management
 * messages like buffer connection immediately, even if the user is not
 * polling. Otherwise buffer constructors would block indefinitely.
 *
 * Deep learning workloads are about sending small numbers of large messages,
 * in which case this model works great. If the library were to be used to
 * exchange large numbers of short messages, it would be useful to split
 * management and data messages over two different queue pairs. User threads
 * could then wait or poll on the data queue pair directly.
 */
void RDMAAdapter::InternalThreadEntry() {
  while (!must_stop()) {
    ibv_cq* cq;
    void* cq_context;
    CHECK(!ibv_get_cq_event(channel_, &cq, &cq_context));
    CHECK(cq == cq_);
    ibv_ack_cq_events(cq, 1);
    CHECK(!ibv_req_notify_cq(cq_, 0));

    int ne = ibv_poll_cq(cq_, MAX_CONCURRENT_WRITES * 2,
                         static_cast<ibv_wc*>(wc_));
    CHECK_GE(ne, 0);

    for (int i = 0; i < ne; ++i) {
      CHECK(wc_[i].status == IBV_WC_SUCCESS)
          << "Failed status \n"
          << ibv_wc_status_str(wc_[i].status) << " " << wc_[i].status << " "
          << static_cast<int>(wc_[i].wr_id) << " " << wc_[i].vendor_err;

      if (wc_[i].opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
        // Data message, add it to user received queue
        RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
        channel->recv();
        int id = wc_[i].imm_data;
        if (id >= CTRL_ID_OFFSET) {
          // ctrl signal
          ctrl_received_.push(channel->buffers_[id - CTRL_ID_OFFSET]);
        } else {
          // data
          received_.push(channel->buffers_[id]);
        }
      } else {
        if (wc_[i].opcode & IBV_WC_RECV) {
          // Buffer connection message
          RDMAChannel* channel = reinterpret_cast<RDMAChannel*>(wc_[i].wr_id);
          int id = wc_[i].imm_data;
          channel->memory_regions_queue_.push(channel->memory_regions_[id]);
          CHECK(id == channel->memory_regions_received_++);
          CHECK(!ibv_dereg_mr(channel->region_regions_[id]));
        }
      }
    }
  }
}
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                 UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num),
                            uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* Number of sends with WC completes in batch */
        ep->super.available     += count;
        ep->tx.completion_count += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
static void UNUSED dump_wc(struct ibv_wc *wc)
{
    if (!Debug) {
        return;
    }

    DEBUG("\nibv_wc:\n");
    DEBUG("\twc->wr_id :%lx\n", wc->wr_id);
    DEBUG("\twc->status :%s\n", ibv_wc_status_str(wc->status));
    DEBUG("\twc->opcode :%x\n", wc->opcode);
    DEBUG("\twc->vendor_err :%x\n", wc->vendor_err);
    DEBUG("\twc->byte_len :%x\n", wc->byte_len);
    DEBUG("\twc->imm_data :%x\n", wc->imm_data);
    DEBUG("\twc->qp_num :%x\n", wc->qp_num);
    DEBUG("\twc->src_qp :%x\n", wc->src_qp);
    DEBUG("\twc->wc_flags :%x\n", wc->wc_flags);
    DEBUG("\twc->pkey_index :%x\n", wc->pkey_index);
    DEBUG("\twc->slid :%x\n", wc->slid);
    DEBUG("\twc->sl :%x\n", wc->sl);
    DEBUG("\twc->dlid_path_bits :%x\n", wc->dlid_path_bits);

    return;
}
static UCS_F_ALWAYS_INLINE void
uct_ud_verbs_iface_poll_tx(uct_ud_verbs_iface_t *iface)
{
    struct ibv_wc wc;
    int ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, 1, &wc);
    if (ucs_unlikely(ret < 0)) {
        ucs_fatal("Failed to poll send CQ");
        return;
    }

    if (ret == 0) {
        return;
    }

    if (ucs_unlikely(wc.status != IBV_WC_SUCCESS)) {
        ucs_fatal("Send completion (wr_id=0x%0X) with error: %s",
                  (unsigned)wc.wr_id, ibv_wc_status_str(wc.status));
        return;
    }

    iface->super.tx.available += UCT_UD_TX_MODERATION + 1;
}
/** * DPDK callback for RX. * * The following function is the same as mlx5_rx_burst_sp(), except it doesn't * manage scattered packets. Improves performance when MRU is lower than the * size of the first segment. * * @param dpdk_rxq * Generic pointer to RX queue structure. * @param[out] pkts * Array to store received packets. * @param pkts_n * Maximum number of packets in array. * * @return * Number of packets successfully received (<= pkts_n). */ uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct rxq *rxq = (struct rxq *)dpdk_rxq; struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp; const unsigned int elts_n = rxq->elts_n; unsigned int elts_head = rxq->elts_head; struct ibv_sge sges[pkts_n]; unsigned int i; unsigned int pkts_ret = 0; int ret; if (unlikely(rxq->sp)) return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n); for (i = 0; (i != pkts_n); ++i) { struct rxq_elt *elt = &(*elts)[elts_head]; unsigned int len; struct rte_mbuf *seg = elt->buf; struct rte_mbuf *rep; uint32_t flags; uint16_t vlan_tci; /* Sanity checks. */ assert(seg != NULL); assert(elts_head < rxq->elts_n); assert(rxq->elts_head < rxq->elts_n); /* * Fetch initial bytes of packet descriptor into a * cacheline while allocating rep. */ rte_prefetch0(seg); rte_prefetch0(&seg->cacheline1); ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); if (unlikely(ret < 0)) { struct ibv_wc wc; int wcs_n; DEBUG("rxq=%p, poll_length() failed (ret=%d)", (void *)rxq, ret); /* ibv_poll_cq() must be used in case of failure. */ wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); if (unlikely(wcs_n == 0)) break; if (unlikely(wcs_n < 0)) { DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", (void *)rxq, wcs_n); break; } assert(wcs_n == 1); if (unlikely(wc.status != IBV_WC_SUCCESS)) { /* Whatever, just repost the offending WR. */ DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" " completion status (%d): %s", (void *)rxq, wc.wr_id, wc.status, ibv_wc_status_str(wc.status)); #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment dropped packets counter. */ ++rxq->stats.idropped; #endif /* Add SGE to array for repost. */ sges[i] = elt->sge; goto repost; } ret = wc.byte_len; } if (ret == 0) break; assert(ret >= (rxq->crc_present << 2)); len = ret - (rxq->crc_present << 2); rep = __rte_mbuf_raw_alloc(rxq->mp); if (unlikely(rep == NULL)) { /* * Unable to allocate a replacement mbuf, * repost WR. */ DEBUG("rxq=%p: can't allocate a new mbuf", (void *)rxq); /* Increment out of memory counters. */ ++rxq->stats.rx_nombuf; ++rxq->priv->dev->data->rx_mbuf_alloc_failed; goto repost; } /* Reconfigure sge to use rep instead of seg. */ elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM; assert(elt->sge.lkey == rxq->mr->lkey); elt->buf = rep; /* Add SGE to array for repost. */ sges[i] = elt->sge; /* Update seg information. */ SET_DATA_OFF(seg, RTE_PKTMBUF_HEADROOM); NB_SEGS(seg) = 1; PORT(seg) = rxq->port_id; NEXT(seg) = NULL; PKT_LEN(seg) = len; DATA_LEN(seg) = len; if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { seg->packet_type = rxq_cq_to_pkt_type(flags); seg->ol_flags = rxq_cq_to_ol_flags(rxq, flags); #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { seg->ol_flags |= PKT_RX_VLAN_PKT; seg->vlan_tci = vlan_tci; } #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ } /* Return packet. */ *(pkts++) = seg; ++pkts_ret; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment bytes counter. 
*/ rxq->stats.ibytes += len; #endif repost: if (++elts_head >= elts_n) elts_head = 0; continue; } if (unlikely(i == 0)) return 0; /* Repost WRs. */ #ifdef DEBUG_RECV DEBUG("%p: reposting %u WRs", (void *)rxq, i); #endif ret = rxq->recv(rxq->wq, sges, i); if (unlikely(ret)) { /* Inability to repost WRs is fatal. */ DEBUG("%p: recv_burst(): failed (ret=%d)", (void *)rxq->priv, ret); abort(); } rxq->elts_head = elts_head; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment packets counter. */ rxq->stats.ipackets += pkts_ret; #endif return pkts_ret; }
static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode) { struct rdmaio_data *rd = td->io_ops->data; struct ibv_wc wc; struct rdma_io_u_data *r_io_u_d; int ret; int compevnum = 0; int i; while ((ret = ibv_poll_cq(rd->cq, 1, &wc)) == 1) { ret = 0; compevnum++; if (wc.status) { log_err("fio: cq completion status %d(%s)\n", wc.status, ibv_wc_status_str(wc.status)); return -1; } switch (wc.opcode) { case IBV_WC_RECV: if (rd->is_client == 1) ret = client_recv(td, &wc); else ret = server_recv(td, &wc); if (ret) return -1; if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH) break; for (i = 0; i < rd->io_u_flight_nr; i++) { r_io_u_d = rd->io_us_flight[i]->engine_data; if (wc.wr_id == r_io_u_d->rq_wr.wr_id) { rd->io_us_flight[i]->resid = rd->io_us_flight[i]->buflen - wc.byte_len; rd->io_us_flight[i]->error = 0; rd->io_us_completed[rd-> io_u_completed_nr] = rd->io_us_flight[i]; rd->io_u_completed_nr++; break; } } if (i == rd->io_u_flight_nr) log_err("fio: recv wr %" PRId64 " not found\n", wc.wr_id); else { /* put the last one into middle of the list */ rd->io_us_flight[i] = rd->io_us_flight[rd->io_u_flight_nr - 1]; rd->io_u_flight_nr--; } break; case IBV_WC_SEND: case IBV_WC_RDMA_WRITE: case IBV_WC_RDMA_READ: if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH) break; for (i = 0; i < rd->io_u_flight_nr; i++) { r_io_u_d = rd->io_us_flight[i]->engine_data; if (wc.wr_id == r_io_u_d->sq_wr.wr_id) { rd->io_us_completed[rd-> io_u_completed_nr] = rd->io_us_flight[i]; rd->io_u_completed_nr++; break; } } if (i == rd->io_u_flight_nr) log_err("fio: send wr %" PRId64 " not found\n", wc.wr_id); else { /* put the last one into middle of the list */ rd->io_us_flight[i] = rd->io_us_flight[rd->io_u_flight_nr - 1]; rd->io_u_flight_nr--; } break; default: log_info("fio: unknown completion event %d\n", wc.opcode); return -1; } rd->cq_event_num++; } if (ret) { log_err("fio: poll error %d\n", ret); return 1; } return compevnum; }
int main(int argc, char *argv[]) { struct ibv_pd *pd1, *pd2; struct ibv_comp_channel *comp_chan1, *comp_chan2; struct ibv_cq *cq1, *cq2; struct ibv_cq *evt_cq = NULL; struct ibv_mr *mr1, *mr2; struct ibv_qp_init_attr qp_attr1 = { }, qp_attr2 = {}; struct ibv_sge sge; struct ibv_send_wr send_wr = { }; struct ibv_send_wr *bad_send_wr = NULL; struct ibv_wc wc; struct ibv_qp *qp1, *qp2; void *cq_context = NULL; union ibv_gid gid1, gid2; int n; uint8_t *buf1, *buf2; int err; int num_devices; struct ibv_context * verbs1, *verbs2; struct ibv_device ** dev_list = ibv_get_device_list(&num_devices); struct ibv_device_attr dev_attr; int use = 0; int port = 1; int x = 0; unsigned long mb = 0; unsigned long bytes = 0; unsigned long save_diff = 0; struct timeval start, stop, diff; int iterations = 0; struct rusage usage; struct timeval ustart, uend; struct timeval sstart, send; struct timeval tstart, tend; DPRINTF("There are %d devices\n", num_devices); for(x = 0; x < num_devices; x++) { printf("Device: %d, %s\n", x, ibv_get_device_name(dev_list[use])); } if(num_devices == 0 || dev_list == NULL) { printf("No devices found\n"); return 1; } if(argc < 2) { printf("Which RDMA device to use? 0, 1, 2, 3...\n"); return 1; } use = atoi(argv[1]); DPRINTF("Using device %d\n", use); verbs1 = ibv_open_device(dev_list[use]); if(verbs1 == NULL) { printf("Failed to open device!\n"); return 1; } DPRINTF("Device open %s\n", ibv_get_device_name(dev_list[use])); verbs2 = ibv_open_device(dev_list[use]); if(verbs2 == NULL) { printf("Failed to open device again!\n"); return 1; } if(ibv_query_device(verbs1, &dev_attr)) { printf("Failed to query device attributes.\n"); return 1; } printf("Device open: %d, %s which has %d ports\n", x, ibv_get_device_name(dev_list[use]), dev_attr.phys_port_cnt); if(argc < 3) { printf("Which port on the device to use? 1, 2, 3...\n"); return 1; } port = atoi(argv[2]); if(port <= 0) { printf("Port #%d invalid, must start with 1, 2, 3, ...\n", port); return 1; } printf("Using port %d\n", port); if(argc < 4) { printf("How many iterations to perform?\n"); return 1; } iterations = atoi(argv[3]); printf("Will perform %d iterations\n", iterations); pd1 = ibv_alloc_pd(verbs1); if (!pd1) return 1; if(argc < 5) { printf("How many megabytes to allocate? (This will be allocated twice. 
Once for source, once for destination.)\n"); return 1; } mb = atoi(argv[4]); if(mb <= 0) { printf("Megabytes %lu invalid\n", mb); return 1; } DPRINTF("protection domain1 allocated\n"); pd2 = ibv_alloc_pd(verbs2); if (!pd2) return 1; DPRINTF("protection domain2 allocated\n"); comp_chan1 = ibv_create_comp_channel(verbs1); if (!comp_chan1) return 1; DPRINTF("completion chan1 created\n"); comp_chan2 = ibv_create_comp_channel(verbs2); if (!comp_chan2) return 1; DPRINTF("completion chan2 created\n"); cq1 = ibv_create_cq(verbs1, 2, NULL, comp_chan1, 0); if (!cq1) return 1; DPRINTF("CQ1 created\n"); cq2 = ibv_create_cq(verbs2, 2, NULL, comp_chan2, 0); if (!cq2) return 1; DPRINTF("CQ2 created\n"); bytes = mb * 1024UL * 1024UL; buf1 = malloc(bytes); if (!buf1) return 1; buf2 = malloc(bytes); if (!buf2) return 1; printf("Populating %lu MB memory.\n", mb * 2); for(x = 0; x < bytes; x++) { buf1[x] = 123; } buf1[bytes - 1] = 123; mr1 = ibv_reg_mr(pd1, buf1, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr1) { printf("Failed to register memory.\n"); return 1; } mr2 = ibv_reg_mr(pd2, buf2, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr2) { printf("Failed to register memory.\n"); return 1; } DPRINTF("memory registered.\n"); qp_attr1.cap.max_send_wr = 10; qp_attr1.cap.max_send_sge = 10; qp_attr1.cap.max_recv_wr = 10; qp_attr1.cap.max_recv_sge = 10; qp_attr1.sq_sig_all = 1; qp_attr1.send_cq = cq1; qp_attr1.recv_cq = cq1; qp_attr1.qp_type = IBV_QPT_RC; qp1 = ibv_create_qp(pd1, &qp_attr1); if (!qp1) { printf("failed to create queue pair #1\n"); return 1; } DPRINTF("queue pair1 created\n"); qp_attr2.cap.max_send_wr = 10; qp_attr2.cap.max_send_sge = 10; qp_attr2.cap.max_recv_wr = 10; qp_attr2.cap.max_recv_sge = 10; qp_attr2.sq_sig_all = 1; qp_attr2.send_cq = cq2; qp_attr2.recv_cq = cq2; qp_attr2.qp_type = IBV_QPT_RC; qp2 = ibv_create_qp(pd2, &qp_attr2); if (!qp2) { printf("failed to create queue pair #2\n"); return 1; } DPRINTF("queue pair2 created\n"); struct ibv_qp_attr attr1 = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE, }; if(ibv_modify_qp(qp1, &attr1, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { printf("verbs 1 Failed to go to init\n"); return 1; } DPRINTF("verbs1 to init\n"); struct ibv_qp_attr attr2 = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE, }; if(ibv_modify_qp(qp2, &attr2, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { printf("verbs 2 Failed to go to init\n"); return 1; } DPRINTF("verbs2 to init\n"); //struct ibv_gid gid1, gid2; struct ibv_port_attr port1, port2; uint64_t psn1 = lrand48() & 0xffffff; uint64_t psn2 = lrand48() & 0xffffff; if(ibv_query_port(verbs1, port, &port1)) return 1; DPRINTF("got port1 information\n"); if(ibv_query_port(verbs2, port, &port2)) return 1; DPRINTF("got port2 information\n"); if(ibv_query_gid(verbs1, 1, 0, &gid1)) return 1; DPRINTF("got gid1 information\n"); if(ibv_query_gid(verbs2, 1, 0, &gid2)) return 1; DPRINTF("got gid2 information\n"); struct ibv_qp_attr next2 = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_1024, .dest_qp_num = qp2->qp_num, .rq_psn = psn2, .max_dest_rd_atomic = 5, .min_rnr_timer = 12, .ah_attr = { .is_global = 0, .dlid = port2.lid, .sl = 0, .src_path_bits = 0, .port_num = port, } }; 
if(gid2.global.interface_id) { next2.ah_attr.is_global = 1; next2.ah_attr.grh.hop_limit = 1; next2.ah_attr.grh.dgid = gid2; next2.ah_attr.grh.sgid_index = 0; } struct ibv_qp_attr next1 = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_1024, .dest_qp_num = qp1->qp_num, .rq_psn = psn1, .max_dest_rd_atomic = 1, .min_rnr_timer = 12, .ah_attr = { .is_global = 0, .dlid = port1.lid, .sl = 0, .src_path_bits = 0, .port_num = port, } }; if(gid1.global.interface_id) { next1.ah_attr.is_global = 1; next1.ah_attr.grh.hop_limit = 1; next1.ah_attr.grh.dgid = gid1; next1.ah_attr.grh.sgid_index = 0; } if(ibv_modify_qp(qp2, &next1, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { printf("Failed to modify verbs2 to ready\n"); return 1; } DPRINTF("verbs2 RTR\n"); if(ibv_modify_qp(qp1, &next2, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { printf("Failed to modify verbs1 to ready\n"); return 1; } DPRINTF("verbs1 RTR\n"); next2.qp_state = IBV_QPS_RTS; next2.timeout = 14; next2.retry_cnt = 7; next2.rnr_retry = 7; next2.sq_psn = psn1; next2.max_rd_atomic = 1; if(ibv_modify_qp(qp1, &next2, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { printf("Failed again to modify verbs1 to ready\n"); return 1; } DPRINTF("verbs1 RTS\n"); next1.qp_state = IBV_QPS_RTS; next1.timeout = 14; next1.retry_cnt = 7; next1.rnr_retry = 7; next1.sq_psn = psn2; next1.max_rd_atomic = 1; if(ibv_modify_qp(qp2, &next1, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { printf("Failed again to modify verbs2 to ready\n"); return 1; } DPRINTF("verbs2 RTS\n"); printf("Performing RDMA first.\n"); iterations = atoi(argv[3]); getrusage(RUSAGE_SELF, &usage); ustart = usage.ru_utime; sstart = usage.ru_stime; gettimeofday(&tstart, NULL); while(iterations-- > 0) { sge.addr = (uintptr_t) buf1; sge.length = bytes; sge.lkey = mr1->lkey; send_wr.wr_id = 1; send_wr.opcode = IBV_WR_RDMA_WRITE; send_wr.sg_list = &sge; send_wr.num_sge = 1; send_wr.send_flags = IBV_SEND_SIGNALED; send_wr.wr.rdma.rkey = mr2->rkey; send_wr.wr.rdma.remote_addr = (uint64_t) buf2; DPRINTF("Iterations left: %d\n", iterations); if (ibv_req_notify_cq(cq1, 0)) return 1; DPRINTF("Submitting local RDMA\n"); gettimeofday(&start, NULL); if (ibv_post_send(qp1, &send_wr, &bad_send_wr)) return 1; DPRINTF("RDMA posted %p %p\n", &send_wr, bad_send_wr); DPRINTF("blocking...\n"); if(ibv_get_cq_event(comp_chan1, &evt_cq, &cq_context)) { printf("failed to get CQ event\n"); return 1; } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); DPRINTF("RDMA took: %lu us\n", diff.tv_usec); ibv_ack_cq_events(evt_cq, 1); DPRINTF("got event\n"); n = ibv_poll_cq(cq1, 1, &wc); if (n > 0) { DPRINTF("return from poll: %lu\n", wc.wr_id); if (wc.status != IBV_WC_SUCCESS) { printf("poll failed %s\n", ibv_wc_status_str(wc.status)); return 1; } if (wc.wr_id == 1) { DPRINTF("Finished %d bytes %d %d\n", n, buf1[bytes - 1], buf2[bytes - 1]); } else { printf("didn't find completion\n"); } } if (n < 0) { printf("poll returned error\n"); return 1; } DPRINTF("Poll returned %d bytes %d %d\n", n, buf1[0], buf2[0]); } gettimeofday(&tend, NULL); getrusage(RUSAGE_SELF, &usage); uend = usage.ru_utime; send = usage.ru_stime; save_diff = 0; timersub(&uend, &ustart, &diff); save_diff += diff.tv_usec; printf("User CPU time: %lu us\n", 
diff.tv_usec); timersub(&send, &sstart, &diff); save_diff += diff.tv_usec; printf("System CPU time: %lu us\n", diff.tv_usec); timersub(&tend, &tstart, &diff); printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff); printf("Wall clock CPU time: %lu us\n", diff.tv_usec); iterations = atoi(argv[3]); printf("Now using the CPU instead....\n"); getrusage(RUSAGE_SELF, &usage); ustart = usage.ru_utime; sstart = usage.ru_stime; gettimeofday(&tstart, NULL); while(iterations-- > 0) { DPRINTF("Repeating without RDMA...\n"); gettimeofday(&start, NULL); memcpy(buf2, buf1, bytes); gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); DPRINTF("Regular copy too took: %lu us\n", diff.tv_usec); } gettimeofday(&tend, NULL); getrusage(RUSAGE_SELF, &usage); uend = usage.ru_utime; send = usage.ru_stime; save_diff = 0; timersub(&uend, &ustart, &diff); save_diff += diff.tv_usec; printf("User CPU time: %lu us\n", diff.tv_usec); timersub(&send, &sstart, &diff); save_diff += diff.tv_usec; printf("System CPU time: %lu us\n", diff.tv_usec); timersub(&tend, &tstart, &diff); printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff); printf("Wall clock CPU time: %lu us\n", diff.tv_usec); return 0; }
static void uct_cm_iface_event_handler(void *arg)
{
    uct_cm_iface_t *iface = arg;
    struct ib_cm_event *event;
    struct ib_cm_id *id;
    int destroy_id;
    int ret;

    ucs_trace_func("");

    for (;;) {
        /* Fetch all events */
        ret = ib_cm_get_event(iface->cmdev, &event);
        if (ret) {
            if (errno != EAGAIN) {
                ucs_warn("ib_cm_get_event() failed: %m");
            }
            return;
        }

        id = event->cm_id;

        /* Handle the event */
        switch (event->event) {
        case IB_CM_SIDR_REQ_ERROR:
            ucs_error("SIDR request error, status: %s",
                      ibv_wc_status_str(event->param.send_status));
            destroy_id = 1;
            break;
        case IB_CM_SIDR_REQ_RECEIVED:
            uct_cm_iface_handle_sidr_req(iface, event);
            destroy_id = 1; /* Destroy the ID created by the driver */
            break;
        case IB_CM_SIDR_REP_RECEIVED:
            ucs_trace_data("RX: SIDR_REP [id %p{%u}]", id, id->handle);
            uct_cm_iface_outstanding_remove(iface, id);
            destroy_id = 1; /* Destroy the ID which was used for sending */
            break;
        default:
            ucs_warn("Unexpected CM event: %d", event->event);
            destroy_id = 0;
            break;
        }

        /* Acknowledge CM event, remember the id, in case we would destroy it */
        ret = ib_cm_ack_event(event);
        if (ret) {
            ucs_warn("ib_cm_ack_event() failed: %m");
        }

        /* If there is an id which should be destroyed, do it now, after
         * acknowledging all events. */
        if (destroy_id) {
            ret = ib_cm_destroy_id(id);
            if (ret) {
                ucs_error("ib_cm_destroy_id() failed: %m");
            }
        }

        uct_cm_iface_notify(iface);
    }
}
struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, int use_event, enum pp_wr_calc_op calc_op, enum pp_wr_data_type calc_data_type, char *calc_operands_str) { struct pingpong_context *ctx; int rc; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; memset(ctx, 0, sizeof *ctx); ctx->size = size; ctx->rx_depth = rx_depth; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; ctx->buf = memalign(page_size, size); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_ctx; } memset(ctx->buf, 0, size); ctx->net_buf = memalign(page_size, size); if (!ctx->net_buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_buffer; } memset(ctx->net_buf, 0, size); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); goto clean_net_buf; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); goto clean_device; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->net_buf, size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); goto clean_pd; } if (calc_op != PP_CALC_INVALID) { int op_per_gather, num_op, max_num_op; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; num_op = pp_parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type, &ctx->calc_op, ctx->context, ctx->buf, ctx->net_buf); if (num_op < 0) { fprintf(stderr, "-E- failed parsing calc operators\n"); goto clean_mr; } rc = pp_query_calc_cap(ctx->context, ctx->calc_op.opcode, ctx->calc_op.data_type, ctx->calc_op.data_size, &op_per_gather, &max_num_op); if (rc) { fprintf(stderr, "-E- operation not supported on %s. 
valid ops are:\n", ibv_get_device_name(ib_dev)); pp_print_dev_calc_ops(ctx->context); goto clean_mr; } if (pp_prepare_sg_list(op_per_gather, num_op, ctx->mr->lkey, &ctx->calc_op, ctx->net_buf)) { fprintf(stderr, "-failed to prepare the sg list\n"); goto clean_mr; } } ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); goto clean_mr; } { struct ibv_exp_qp_init_attr attr = { .send_cq = ctx->cq, .recv_cq = ctx->cq, .cap = { .max_send_wr = 16, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->qp = ibv_exp_create_qp(ctx->context, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); goto clean_cq; } } { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); goto clean_qp; } } ctx->mcq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->mcq) { fprintf(stderr, "Couldn't create CQ for MQP\n"); goto clean_qp; } { struct ibv_exp_qp_init_attr mattr = { .send_cq = ctx->mcq, .recv_cq = ctx->mcq, .cap = { .max_send_wr = 1, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; mattr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; mattr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->mqp = ibv_exp_create_qp(ctx->context, &mattr); if (!ctx->qp) { fprintf(stderr, "Couldn't create MQP\n"); goto clean_mcq; } } { struct ibv_qp_attr mattr = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->mqp, &mattr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify MQP to INIT\n"); goto clean_mqp; } } return ctx; clean_mqp: ibv_destroy_qp(ctx->mqp); clean_mcq: ibv_destroy_cq(ctx->mcq); clean_qp: ibv_destroy_qp(ctx->qp); clean_cq: ibv_destroy_cq(ctx->cq); clean_mr: ibv_dereg_mr(ctx->mr); clean_pd: ibv_dealloc_pd(ctx->pd); clean_comp_channel: if (ctx->channel) ibv_destroy_comp_channel(ctx->channel); clean_device: ibv_close_device(ctx->context); clean_net_buf: free(ctx->net_buf); clean_buffer: free(ctx->buf); clean_ctx: free(ctx); return NULL; } int pp_close_ctx(struct pingpong_context *ctx) { if (ibv_destroy_qp(ctx->qp)) { fprintf(stderr, "Couldn't destroy QP\n"); return 1; } if (ibv_destroy_qp(ctx->mqp)) { fprintf(stderr, "Couldn't destroy MQP\n"); return 1; } if (ibv_destroy_cq(ctx->cq)) { fprintf(stderr, "Couldn't destroy CQ\n"); return 1; } if (ibv_destroy_cq(ctx->mcq)) { fprintf(stderr, "Couldn't destroy MCQ\n"); return 1; } if (ibv_dereg_mr(ctx->mr)) { fprintf(stderr, "Couldn't deregister MR\n"); return 1; } if (ibv_dealloc_pd(ctx->pd)) { fprintf(stderr, "Couldn't deallocate PD\n"); return 1; } if (ctx->channel) { if (ibv_destroy_comp_channel(ctx->channel)) { fprintf(stderr, "Couldn't destroy completion channel\n"); return 1; } } if (ibv_close_device(ctx->context)) { fprintf(stderr, "Couldn't release context\n"); return 1; } free(ctx->buf); free(ctx->net_buf); free(ctx); return 0; } static int pp_post_recv(struct pingpong_context *ctx, int n) { int rc; struct ibv_sge list = { .addr = (uintptr_t) 
ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_recv_wr wr = { .wr_id = PP_RECV_WRID, .sg_list = &list, .num_sge = 1, }; struct ibv_recv_wr *bad_wr; int i; for (i = 0; i < n; ++i) { rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); if (rc) return rc; } return i; } static int pp_post_send(struct pingpong_context *ctx) { int ret; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_exp_send_wr wr = { .wr_id = PP_SEND_WRID, .sg_list = &list, .num_sge = 1, .exp_opcode = IBV_EXP_WR_SEND, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; /* If this is a calc operation - set the required params in the wr */ if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) { wr.exp_opcode = IBV_EXP_WR_SEND; wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC; wr.sg_list = ctx->calc_op.gather_list; wr.num_sge = ctx->calc_op.gather_list_size; wr.op.calc.calc_op = ctx->calc_op.opcode; wr.op.calc.data_type = ctx->calc_op.data_type; wr.op.calc.data_size = ctx->calc_op.data_size; } ret = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); return ret; } int pp_post_ext_wqe(struct pingpong_context *ctx, enum ibv_exp_wr_opcode op) { int ret; struct ibv_exp_send_wr wr = { .wr_id = PP_CQE_WAIT, .sg_list = NULL, .num_sge = 0, .exp_opcode = op, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; switch (op) { case IBV_EXP_WR_RECV_ENABLE: case IBV_EXP_WR_SEND_ENABLE: wr.task.wqe_enable.qp = ctx->qp; wr.task.wqe_enable.wqe_count = 0; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; case IBV_EXP_WR_CQE_WAIT: wr.task.cqe_wait.cq = ctx->cq; wr.task.cqe_wait.cq_count = 1; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; default: fprintf(stderr, "-E- unsupported m_wqe opcode %d\n", op); return -1; } ret = ibv_exp_post_send(ctx->mqp, &wr, &bad_wr); return ret; } int pp_poll_mcq(struct ibv_cq *cq, int num_cqe) { int ne; int i; struct ibv_wc wc[2]; if (num_cqe > 2) { fprintf(stderr, "-E- max num cqe exceeded\n"); return -1; } do { ne = ibv_poll_cq(cq, num_cqe, wc); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } } while (ne < 1); for (i = 0; i < ne; ++i) { if (wc[i].status != IBV_WC_SUCCESS) { fprintf(stderr, "Failed %s status %s (%d)\n", wr_id_str[(int)wc[i].wr_id], ibv_wc_status_str(wc[i].status), wc[i].status); return 1; } if ((int) wc[i].wr_id != PP_CQE_WAIT) { fprintf(stderr, "invalid wr_id %" PRIx64 "\n", wc[i].wr_id); return -1; } } return 0; } static int pp_calc_verify(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { uint64_t *op1 = &(ctx->last_result); uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t *res = (uint64_t *)ctx->buf; return !EXEC_VERIFY(calc_data_type, calc_opcode, 1, op1, op2, res); } static int pp_update_last_result(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { /* EXEC_VERIFY derefence result parameter */ uint64_t *dummy; uint64_t *op1 = (uint64_t *)ctx->buf; uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t res = (uint64_t)EXEC_VERIFY(calc_data_type, calc_opcode, 0, op1, op2, dummy); ctx->last_result = res; return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s <host> connect to server at <host>\n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); printf(" -d, --ib-dev=<dev> use IB device <dev> 
(default first device found)\n"); printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); printf(" -s, --size=<size> size of message to exchange (default 4096 minimum 16)\n"); printf(" -m, --mtu=<size> path MTU (default 1024)\n"); printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); printf(" -n, --iters=<iters> number of exchanges (default 1000)\n"); printf(" -l, --sl=<sl> service level value\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -c, --calc=<operation> calc operation\n"); printf(" -t, --op_type=<type> calc operands type\n"); printf(" -o, --operands=<o1,o2,...> comma separated list of operands\n"); printf(" -w, --wait_cq=cqn wait for entries on cq\n"); printf(" -v, --verbose print verbose information\n"); printf(" -V, --verify verify calc operations\n"); }
gaspi_return_t
pgaspi_dev_atomic_compare_swap (const gaspi_segment_id_t segment_id,
                                const gaspi_offset_t offset,
                                const gaspi_rank_t rank,
                                const gaspi_atomic_value_t comparator,
                                const gaspi_atomic_value_t val_new)
{
  struct ibv_send_wr *bad_wr;
  struct ibv_sge slist;
  struct ibv_send_wr swr;
  int i;

  slist.addr = (uintptr_t) (glb_gaspi_ctx.nsrc.buf + NOTIFY_OFFSET);
  slist.length = sizeof(gaspi_atomic_value_t);
  slist.lkey = ((struct ibv_mr *) glb_gaspi_ctx.nsrc.mr)->lkey;

  swr.wr.atomic.remote_addr =
    glb_gaspi_ctx.rrmd[segment_id][rank].addr + NOTIFY_OFFSET + offset;
  swr.wr.atomic.rkey = glb_gaspi_ctx.rrmd[segment_id][rank].rkey;
  swr.wr.atomic.compare_add = comparator;
  swr.wr.atomic.swap = val_new;

  swr.wr_id = rank;
  swr.sg_list = &slist;
  swr.num_sge = 1;
  swr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP;
  swr.send_flags = IBV_SEND_SIGNALED;
  swr.next = NULL;

  if (ibv_post_send (glb_gaspi_ctx_ib.qpGroups[rank], &swr, &bad_wr))
    {
      glb_gaspi_ctx.qp_state_vec[GASPI_COLL_QP][rank] = GASPI_STATE_CORRUPT;
      return GASPI_ERROR;
    }

  glb_gaspi_ctx.ne_count_grp++;

  int ne = 0;
  for (i = 0; i < glb_gaspi_ctx.ne_count_grp; i++)
    {
      do
        {
          ne = ibv_poll_cq (glb_gaspi_ctx_ib.scqGroups, 1,
                            glb_gaspi_ctx_ib.wc_grp_send);
        }
      while (ne == 0);

      if ((ne < 0) || (glb_gaspi_ctx_ib.wc_grp_send[i].status != IBV_WC_SUCCESS))
        {
          glb_gaspi_ctx.qp_state_vec[GASPI_COLL_QP][glb_gaspi_ctx_ib.wc_grp_send[i].wr_id] =
            GASPI_STATE_CORRUPT;

          gaspi_print_error("Failed request to %lu : %s",
                            glb_gaspi_ctx_ib.wc_grp_send[i].wr_id,
                            ibv_wc_status_str(glb_gaspi_ctx_ib.wc_grp_send[i].status));

          return GASPI_ERROR;
        }
    }

  glb_gaspi_ctx.ne_count_grp = 0;

  return GASPI_SUCCESS;
}
static UCS_F_MAYBE_UNUSED struct ibv_mr *uct_ib_md_create_umr(uct_ib_md_t *md, struct ibv_mr *mr) { #if HAVE_EXP_UMR struct ibv_exp_mem_region mem_reg; struct ibv_exp_send_wr wr, *bad_wr; struct ibv_exp_create_mr_in mrin; struct ibv_mr *umr; struct ibv_wc wc; int ret; size_t offset; if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) { return NULL; } offset = uct_ib_md_umr_offset(uct_ib_md_umr_id(md)); /* Create memory key */ memset(&mrin, 0, sizeof(mrin)); mrin.pd = md->pd; #ifdef HAVE_EXP_UMR_NEW_API mrin.attr.create_flags = IBV_EXP_MR_INDIRECT_KLMS; mrin.attr.exp_access_flags = UCT_IB_MEM_ACCESS_FLAGS; mrin.attr.max_klm_list_size = 1; #else mrin.attr.create_flags = IBV_MR_NONCONTIG_MEM; mrin.attr.access_flags = UCT_IB_MEM_ACCESS_FLAGS; mrin.attr.max_reg_descriptors = 1; #endif umr = ibv_exp_create_mr(&mrin); if (!umr) { ucs_error("Failed to create modified_mr: %m"); goto err; } /* Fill memory list and UMR */ memset(&wr, 0, sizeof(wr)); memset(&mem_reg, 0, sizeof(mem_reg)); mem_reg.base_addr = (uintptr_t) mr->addr; mem_reg.length = mr->length; #ifdef HAVE_EXP_UMR_NEW_API mem_reg.mr = mr; wr.ext_op.umr.umr_type = IBV_EXP_UMR_MR_LIST; wr.ext_op.umr.mem_list.mem_reg_list = &mem_reg; wr.ext_op.umr.exp_access = UCT_IB_MEM_ACCESS_FLAGS; wr.ext_op.umr.modified_mr = umr; wr.ext_op.umr.base_addr = (uint64_t) (uintptr_t) mr->addr + offset; wr.ext_op.umr.num_mrs = 1; #else mem_reg.m_key = mr; wr.ext_op.umr.memory_key.mkey_type = IBV_EXP_UMR_MEM_LAYOUT_NONCONTIG; wr.ext_op.umr.memory_key.mem_list.mem_reg_list = &mem_reg; wr.ext_op.umr.memory_key.access = UCT_IB_MEM_ACCESS_FLAGS; wr.ext_op.umr.memory_key.modified_mr = umr; wr.ext_op.umr.memory_key.region_base_addr = mr->addr + offset; wr.num_sge = 1; #endif wr.exp_opcode = IBV_EXP_WR_UMR_FILL; wr.exp_send_flags = IBV_EXP_SEND_INLINE | IBV_EXP_SEND_SIGNALED; /* Post UMR */ ret = ibv_exp_post_send(md->umr_qp, &wr, &bad_wr); if (ret) { ucs_error("ibv_exp_post_send(UMR_FILL) failed: %m"); goto err_free_umr; } /* Wait for send UMR completion */ for (;;) { ret = ibv_poll_cq(md->umr_cq, 1, &wc); if (ret < 0) { ucs_error("ibv_exp_poll_cq(umr_cq) failed: %m"); goto err_free_umr; } if (ret == 1) { if (wc.status != IBV_WC_SUCCESS) { ucs_error("UMR_FILL completed with error: %s vendor_err %d", ibv_wc_status_str(wc.status), wc.vendor_err); goto err_free_umr; } break; } } ucs_trace("UMR registered memory %p..%p offset 0x%x on %s lkey 0x%x rkey 0x%x", mr->addr, mr->addr + mr->length, (unsigned)offset, uct_ib_device_name(&md->dev), umr->lkey, umr->rkey); return umr; err_free_umr: ibv_dereg_mr(umr); err: #endif return NULL; }
static int
rdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user)
{
    struct pcap_rdmasniff *priv = handle->priv;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    struct ibv_wc wc;
    struct pcap_pkthdr pkth;
    u_char *pktd;
    int count = 0;

    if (!priv->cq_event) {
        while (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) {
            if (errno != EINTR) {
                return PCAP_ERROR;
            }
            if (handle->break_loop) {
                handle->break_loop = 0;
                return PCAP_ERROR_BREAK;
            }
        }
        ibv_ack_cq_events(priv->cq, 1);
        ibv_req_notify_cq(priv->cq, 0);
        priv->cq_event = 1;
    }

    while (count < max_packets || PACKET_COUNT_IS_UNLIMITED(max_packets)) {
        if (ibv_poll_cq(priv->cq, 1, &wc) != 1) {
            priv->cq_event = 0;
            break;
        }

        if (wc.status != IBV_WC_SUCCESS) {
            fprintf(stderr, "failed WC wr_id %lld status %d/%s\n",
                    (unsigned long long) wc.wr_id,
                    wc.status, ibv_wc_status_str(wc.status));
            continue;
        }

        pkth.len = wc.byte_len;
        pkth.caplen = min(pkth.len, (u_int)handle->snapshot);
        gettimeofday(&pkth.ts, NULL);

        pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE;

        if (handle->fcode.bf_insns == NULL ||
            pcap_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) {
            callback(user, &pkth, pktd);
            ++priv->packets_recv;
            ++count;
        }

        rdmasniff_post_recv(handle, wc.wr_id);

        if (handle->break_loop) {
            handle->break_loop = 0;
            return PCAP_ERROR_BREAK;
        }
    }

    return count;
}
void CompletionContext::processWorkComplete(struct ibv_wc* wc) { LOG_TRACE("Processing WC with ID %1% on queue %2% with status %3% %4%", wc->wr_id, wc->qp_num, wc->status, ibv_wc_status_str(wc->status)); WorkRequestId workId(wc->wr_id); std::error_code ec; auto i = mSocketMap.find(wc->qp_num); if (i == mSocketMap.end()) { LOG_ERROR("No matching socket for qp_num %1%", wc->qp_num); // In the case that we have no socket associated with the qp_num we just repost the buffer to the shared receive // queue or release the buffer in the case of send switch (workId.workType()) { // In the case the work request was a receive, we try to repost the shared receive buffer case WorkType::RECEIVE: { mDevice->postReceiveBuffer(workId.bufferId()); } break; // In the case the work request was a send we just release the send buffer case WorkType::SEND: { releaseSendBuffer(workId.bufferId()); } break; default: break; } return; } InfinibandSocketImpl* socket = i->second.get(); if (wc->status != IBV_WC_SUCCESS) { ec = std::error_code(wc->status, error::get_work_completion_category()); } else { assert(workId.workType() != WorkType::RECEIVE || wc->opcode & IBV_WC_RECV); assert(workId.workType() != WorkType::SEND || wc->opcode == IBV_WC_SEND); assert(workId.workType() != WorkType::READ || wc->opcode == IBV_WC_RDMA_READ); assert(workId.workType() != WorkType::WRITE || wc->opcode == IBV_WC_RDMA_WRITE); } switch (workId.workType()) { case WorkType::RECEIVE: { LOG_TRACE("Executing receive event of buffer %1%", workId.bufferId()); auto buffer = mDevice->acquireReceiveBuffer(workId.bufferId()); if (!buffer.valid()) { socket->onReceive(nullptr, 0x0u, error::invalid_buffer); break; } if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { socket->onImmediate(ntohl(wc->imm_data)); } else { socket->onReceive(buffer.data(), wc->byte_len, ec); } mDevice->postReceiveBuffer(buffer); } break; case WorkType::SEND: { LOG_TRACE("Executing send event of buffer %1%", workId.bufferId()); socket->onSend(workId.userId(), ec); releaseSendBuffer(workId.bufferId()); } break; case WorkType::READ: { LOG_TRACE("Executing read event of buffer %1%", workId.bufferId()); socket->onRead(workId.userId(), workId.bufferId(), ec); } break; case WorkType::WRITE: { LOG_TRACE("Executing write event of buffer %1%", workId.bufferId()); socket->onWrite(workId.userId(), workId.bufferId(), ec); } break; default: { LOG_TRACE("Unknown work type"); } break; } }
/** * DPDK callback for RX with scattered packets support. * * @param dpdk_rxq * Generic pointer to RX queue structure. * @param[out] pkts * Array to store received packets. * @param pkts_n * Maximum number of packets in array. * * @return * Number of packets successfully received (<= pkts_n). */ uint16_t mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) { struct rxq *rxq = (struct rxq *)dpdk_rxq; struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp; const unsigned int elts_n = rxq->elts_n; unsigned int elts_head = rxq->elts_head; unsigned int i; unsigned int pkts_ret = 0; int ret; if (unlikely(!rxq->sp)) return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n); if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */ return 0; for (i = 0; (i != pkts_n); ++i) { struct rxq_elt_sp *elt = &(*elts)[elts_head]; unsigned int len; unsigned int pkt_buf_len; struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */ struct rte_mbuf **pkt_buf_next = &pkt_buf; unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM; unsigned int j = 0; uint32_t flags; uint16_t vlan_tci; /* Sanity checks. */ assert(elts_head < rxq->elts_n); assert(rxq->elts_head < rxq->elts_n); ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci); if (unlikely(ret < 0)) { struct ibv_wc wc; int wcs_n; DEBUG("rxq=%p, poll_length() failed (ret=%d)", (void *)rxq, ret); /* ibv_poll_cq() must be used in case of failure. */ wcs_n = ibv_poll_cq(rxq->cq, 1, &wc); if (unlikely(wcs_n == 0)) break; if (unlikely(wcs_n < 0)) { DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)", (void *)rxq, wcs_n); break; } assert(wcs_n == 1); if (unlikely(wc.status != IBV_WC_SUCCESS)) { /* Whatever, just repost the offending WR. */ DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work" " completion status (%d): %s", (void *)rxq, wc.wr_id, wc.status, ibv_wc_status_str(wc.status)); #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment dropped packets counter. */ ++rxq->stats.idropped; #endif goto repost; } ret = wc.byte_len; } if (ret == 0) break; assert(ret >= (rxq->crc_present << 2)); len = ret - (rxq->crc_present << 2); pkt_buf_len = len; /* * Replace spent segments with new ones, concatenate and * return them as pkt_buf. */ while (1) { struct ibv_sge *sge = &elt->sges[j]; struct rte_mbuf *seg = elt->bufs[j]; struct rte_mbuf *rep; unsigned int seg_tailroom; assert(seg != NULL); /* * Fetch initial bytes of packet descriptor into a * cacheline while allocating rep. */ rte_prefetch0(seg); rep = __rte_mbuf_raw_alloc(rxq->mp); if (unlikely(rep == NULL)) { /* * Unable to allocate a replacement mbuf, * repost WR. */ DEBUG("rxq=%p: can't allocate a new mbuf", (void *)rxq); if (pkt_buf != NULL) { *pkt_buf_next = NULL; rte_pktmbuf_free(pkt_buf); } /* Increment out of memory counters. */ ++rxq->stats.rx_nombuf; ++rxq->priv->dev->data->rx_mbuf_alloc_failed; goto repost; } #ifndef NDEBUG /* Poison user-modifiable fields in rep. */ NEXT(rep) = (void *)((uintptr_t)-1); SET_DATA_OFF(rep, 0xdead); DATA_LEN(rep) = 0xd00d; PKT_LEN(rep) = 0xdeadd00d; NB_SEGS(rep) = 0x2a; PORT(rep) = 0x2a; rep->ol_flags = -1; #endif assert(rep->buf_len == seg->buf_len); assert(rep->buf_len == rxq->mb_len); /* Reconfigure sge to use rep instead of seg. */ assert(sge->lkey == rxq->mr->lkey); sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom); elt->bufs[j] = rep; ++j; /* Update pkt_buf if it's the first segment, or link * seg to the previous one and update pkt_buf_next. */ *pkt_buf_next = seg; pkt_buf_next = &NEXT(seg); /* Update seg information. 
*/ seg_tailroom = (seg->buf_len - seg_headroom); assert(sge->length == seg_tailroom); SET_DATA_OFF(seg, seg_headroom); if (likely(len <= seg_tailroom)) { /* Last segment. */ DATA_LEN(seg) = len; PKT_LEN(seg) = len; /* Sanity check. */ assert(rte_pktmbuf_headroom(seg) == seg_headroom); assert(rte_pktmbuf_tailroom(seg) == (seg_tailroom - len)); break; } DATA_LEN(seg) = seg_tailroom; PKT_LEN(seg) = seg_tailroom; /* Sanity check. */ assert(rte_pktmbuf_headroom(seg) == seg_headroom); assert(rte_pktmbuf_tailroom(seg) == 0); /* Fix len and clear headroom for next segments. */ len -= seg_tailroom; seg_headroom = 0; } /* Update head and tail segments. */ *pkt_buf_next = NULL; assert(pkt_buf != NULL); assert(j != 0); NB_SEGS(pkt_buf) = j; PORT(pkt_buf) = rxq->port_id; PKT_LEN(pkt_buf) = pkt_buf_len; if (rxq->csum | rxq->csum_l2tun | rxq->vlan_strip) { pkt_buf->packet_type = rxq_cq_to_pkt_type(flags); pkt_buf->ol_flags = rxq_cq_to_ol_flags(rxq, flags); #ifdef HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS if (flags & IBV_EXP_CQ_RX_CVLAN_STRIPPED_V1) { pkt_buf->ol_flags |= PKT_RX_VLAN_PKT; pkt_buf->vlan_tci = vlan_tci; } #endif /* HAVE_EXP_DEVICE_ATTR_VLAN_OFFLOADS */ } /* Return packet. */ *(pkts++) = pkt_buf; ++pkts_ret; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment bytes counter. */ rxq->stats.ibytes += pkt_buf_len; #endif repost: ret = rxq->recv(rxq->wq, elt->sges, RTE_DIM(elt->sges)); if (unlikely(ret)) { /* Inability to repost WRs is fatal. */ DEBUG("%p: recv_sg_list(): failed (ret=%d)", (void *)rxq->priv, ret); abort(); } if (++elts_head >= elts_n) elts_head = 0; continue; } if (unlikely(i == 0)) return 0; rxq->elts_head = elts_head; #ifdef MLX5_PMD_SOFT_COUNTERS /* Increment packets counter. */ rxq->stats.ipackets += pkts_ret; #endif return pkts_ret; }