/*
 * Post one experimental-verbs send work request on the endpoint's QP.
 *
 * ep     - endpoint whose QP (ep->super.qp) receives the request.
 * wr     - work request to post; its exp_send_flags and wr_id are updated here.
 * signal - send flags requested by the caller. When zero, the value returned
 *          by uct_rc_iface_tx_moderation() is used instead.
 *
 * A non-zero return from ibv_exp_post_send() is reported through ucs_fatal().
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr,
                           int signal)
{
    struct ibv_exp_send_wr *failed_wr;
    uct_rc_verbs_iface_t *verbs_iface;
    int rc;

    verbs_iface = ucs_derived_of(ep->super.super.super.iface,
                                 uct_rc_verbs_iface_t);

    /* The caller did not ask for any flag: let TX moderation decide */
    if (signal == 0) {
        signal = uct_rc_iface_tx_moderation(&verbs_iface->super, &ep->super,
                                            IBV_EXP_SEND_SIGNALED);
    }

    wr->wr_id            = ep->super.unsignaled;
    wr->exp_send_flags  |= signal;

    /* Only plain SEND requests are logged with the AM packet dumper */
    uct_ib_log_exp_post_send(ep->super.qp, wr,
                             (wr->exp_opcode == IBV_EXP_WR_SEND) ?
                             uct_rc_ep_am_packet_dump : NULL);

    rc = ibv_exp_post_send(ep->super.qp, wr, &failed_wr);
    if (rc != 0) {
        ucs_fatal("ibv_exp_post_send() returned %d (%m)", rc);
    }

    uct_rc_verbs_ep_posted(ep, signal);
}
/*
 * Post one experimental-verbs send work request on the endpoint's TX QP.
 *
 * ep     - endpoint whose txqp (ep->super.txqp) receives the request.
 * wr     - work request to post; its exp_send_flags and wr_id are overwritten.
 * signal - send flags requested by the caller; the result of
 *          uct_rc_iface_tx_moderation() is OR-ed into it before use.
 *
 * A non-zero return from ibv_exp_post_send() is reported through ucs_fatal().
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr,
                           uint64_t signal)
{
    uct_rc_verbs_iface_t *verbs_iface = ucs_derived_of(ep->super.super.super.iface,
                                                       uct_rc_verbs_iface_t);
    struct ibv_exp_send_wr *failed_wr;
    int rc;

    uct_rc_txqp_check(&ep->super.txqp);

    /* Merge the caller's flags with whatever TX moderation requests */
    signal = signal | uct_rc_iface_tx_moderation(&verbs_iface->super,
                                                 &ep->super.txqp,
                                                 IBV_EXP_SEND_SIGNALED);

    wr->exp_send_flags = signal;
    wr->wr_id          = uct_rc_txqp_unsignaled(&ep->super.txqp);

    /* Only plain SEND requests are logged with the AM packet dumper */
    uct_ib_log_exp_post_send(&verbs_iface->super.super, ep->super.txqp.qp, wr,
                             (wr->exp_opcode == IBV_EXP_WR_SEND) ?
                             uct_rc_ep_am_packet_dump : NULL);

    UCT_IB_INSTRUMENT_RECORD_SEND_EXP_WR_LEN("uct_rc_verbs_exp_post_send", wr);

    rc = ibv_exp_post_send(ep->super.txqp.qp, wr, &failed_wr);
    if (rc != 0) {
        ucs_fatal("ibv_exp_post_send() returned %d (%m)", rc);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &verbs_iface->super,
                             signal);
}
int send_nop(struct dc_ctx *ctx) { struct ibv_exp_send_wr *bad_wr; struct ibv_exp_send_wr wr; struct ibv_exp_wc wc; int err; int n; memset(&wr, 0, sizeof(wr)); wr.num_sge = 0; wr.exp_opcode = IBV_EXP_WR_NOP; wr.exp_send_flags = IBV_EXP_SEND_SIGNALED; err = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); if (err) { fprintf(stderr, "post nop failed\n"); return err; } do { n = ibv_exp_poll_cq(ctx->cq, 1, &wc, sizeof(wc)); if (n < 0) { fprintf(stderr, "poll CQ failed %d\n", n); return -1; } } while (!n); if (wc.status != IBV_WC_SUCCESS) { fprintf(stderr, "completion with error %d\n", wc.status); return -1; } return 0; }
/*
 * Post a single signaled SEND with IP checksum offload for `payload`.
 *
 * m       - interface state; m->qp is the queue pair posted to and
 *           m->send_mr supplies the lkey of the scatter/gather entry.
 * payload - start of the buffer to send; its address is also stored in wr_id.
 * length  - number of bytes to send.
 *
 * Returns the value of ibv_exp_post_send().
 */
static int infini_post_send(struct ibvif *m, void *payload, int length)
{
    struct ibv_sge list;
    struct ibv_exp_send_wr wr;
    struct ibv_exp_send_wr *bad_wr = NULL;

    memset(&list, 0, sizeof(list));
    list.addr   = (uintptr_t) payload;
    list.length = length;
    list.lkey   = m->send_mr->lkey;

    /*
     * Clear the whole experimental work request. The size must come from the
     * object itself: clearing only sizeof(struct ibv_send_wr) bytes would leave
     * any part of struct ibv_exp_send_wr beyond that size uninitialised.
     */
    memset(&wr, 0, sizeof(wr));
    wr.wr_id          = list.addr;
    wr.sg_list        = &list;
    wr.num_sge        = 1;
    wr.exp_opcode     = IBV_EXP_WR_SEND;
    wr.exp_send_flags = IBV_EXP_SEND_SIGNALED | IBV_EXP_SEND_IP_CSUM;
    wr.next           = NULL;

    return ibv_exp_post_send(m->qp, &wr, &bad_wr);
}
struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, int use_event, enum pp_wr_calc_op calc_op, enum pp_wr_data_type calc_data_type, char *calc_operands_str) { struct pingpong_context *ctx; int rc; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; memset(ctx, 0, sizeof *ctx); ctx->size = size; ctx->rx_depth = rx_depth; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; ctx->buf = memalign(page_size, size); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_ctx; } memset(ctx->buf, 0, size); ctx->net_buf = memalign(page_size, size); if (!ctx->net_buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_buffer; } memset(ctx->net_buf, 0, size); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); goto clean_net_buf; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); goto clean_device; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->net_buf, size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); goto clean_pd; } if (calc_op != PP_CALC_INVALID) { int op_per_gather, num_op, max_num_op; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; num_op = pp_parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type, &ctx->calc_op, ctx->context, ctx->buf, ctx->net_buf); if (num_op < 0) { fprintf(stderr, "-E- failed parsing calc operators\n"); goto clean_mr; } rc = pp_query_calc_cap(ctx->context, ctx->calc_op.opcode, 
ctx->calc_op.data_type, ctx->calc_op.data_size, &op_per_gather, &max_num_op); if (rc) { fprintf(stderr, "-E- operation not supported on %s. valid ops are:\n", ibv_get_device_name(ib_dev)); pp_print_dev_calc_ops(ctx->context); goto clean_mr; } if (pp_prepare_sg_list(op_per_gather, num_op, ctx->mr->lkey, &ctx->calc_op, ctx->net_buf)) { fprintf(stderr, "-failed to prepare the sg list\n"); goto clean_mr; } } ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); goto clean_mr; } { struct ibv_exp_qp_init_attr attr = { .send_cq = ctx->cq, .recv_cq = ctx->cq, .cap = { .max_send_wr = 16, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->qp = ibv_exp_create_qp(ctx->context, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); goto clean_cq; } } { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); goto clean_qp; } } ctx->mcq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->mcq) { fprintf(stderr, "Couldn't create CQ for MQP\n"); goto clean_qp; } { struct ibv_exp_qp_init_attr mattr = { .send_cq = ctx->mcq, .recv_cq = ctx->mcq, .cap = { .max_send_wr = 1, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; mattr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; mattr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->mqp = ibv_exp_create_qp(ctx->context, &mattr); if (!ctx->qp) { fprintf(stderr, "Couldn't create MQP\n"); goto clean_mcq; } } { struct ibv_qp_attr mattr = { 
.qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->mqp, &mattr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify MQP to INIT\n"); goto clean_mqp; } } return ctx; clean_mqp: ibv_destroy_qp(ctx->mqp); clean_mcq: ibv_destroy_cq(ctx->mcq); clean_qp: ibv_destroy_qp(ctx->qp); clean_cq: ibv_destroy_cq(ctx->cq); clean_mr: ibv_dereg_mr(ctx->mr); clean_pd: ibv_dealloc_pd(ctx->pd); clean_comp_channel: if (ctx->channel) ibv_destroy_comp_channel(ctx->channel); clean_device: ibv_close_device(ctx->context); clean_net_buf: free(ctx->net_buf); clean_buffer: free(ctx->buf); clean_ctx: free(ctx); return NULL; } int pp_close_ctx(struct pingpong_context *ctx) { if (ibv_destroy_qp(ctx->qp)) { fprintf(stderr, "Couldn't destroy QP\n"); return 1; } if (ibv_destroy_qp(ctx->mqp)) { fprintf(stderr, "Couldn't destroy MQP\n"); return 1; } if (ibv_destroy_cq(ctx->cq)) { fprintf(stderr, "Couldn't destroy CQ\n"); return 1; } if (ibv_destroy_cq(ctx->mcq)) { fprintf(stderr, "Couldn't destroy MCQ\n"); return 1; } if (ibv_dereg_mr(ctx->mr)) { fprintf(stderr, "Couldn't deregister MR\n"); return 1; } if (ibv_dealloc_pd(ctx->pd)) { fprintf(stderr, "Couldn't deallocate PD\n"); return 1; } if (ctx->channel) { if (ibv_destroy_comp_channel(ctx->channel)) { fprintf(stderr, "Couldn't destroy completion channel\n"); return 1; } } if (ibv_close_device(ctx->context)) { fprintf(stderr, "Couldn't release context\n"); return 1; } free(ctx->buf); free(ctx->net_buf); free(ctx); return 0; } static int pp_post_recv(struct pingpong_context *ctx, int n) { int rc; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_recv_wr wr = { .wr_id = PP_RECV_WRID, .sg_list = &list, .num_sge = 1, }; struct ibv_recv_wr *bad_wr; int i; for (i = 0; i < n; ++i) { rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); if (rc) return rc; } return i; } static int 
pp_post_send(struct pingpong_context *ctx) { int ret; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_exp_send_wr wr = { .wr_id = PP_SEND_WRID, .sg_list = &list, .num_sge = 1, .exp_opcode = IBV_EXP_WR_SEND, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; /* If this is a calc operation - set the required params in the wr */ if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) { wr.exp_opcode = IBV_EXP_WR_SEND; wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC; wr.sg_list = ctx->calc_op.gather_list; wr.num_sge = ctx->calc_op.gather_list_size; wr.op.calc.calc_op = ctx->calc_op.opcode; wr.op.calc.data_type = ctx->calc_op.data_type; wr.op.calc.data_size = ctx->calc_op.data_size; } ret = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); return ret; } int pp_post_ext_wqe(struct pingpong_context *ctx, enum ibv_exp_wr_opcode op) { int ret; struct ibv_exp_send_wr wr = { .wr_id = PP_CQE_WAIT, .sg_list = NULL, .num_sge = 0, .exp_opcode = op, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; switch (op) { case IBV_EXP_WR_RECV_ENABLE: case IBV_EXP_WR_SEND_ENABLE: wr.task.wqe_enable.qp = ctx->qp; wr.task.wqe_enable.wqe_count = 0; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; case IBV_EXP_WR_CQE_WAIT: wr.task.cqe_wait.cq = ctx->cq; wr.task.cqe_wait.cq_count = 1; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; default: fprintf(stderr, "-E- unsupported m_wqe opcode %d\n", op); return -1; } ret = ibv_exp_post_send(ctx->mqp, &wr, &bad_wr); return ret; } int pp_poll_mcq(struct ibv_cq *cq, int num_cqe) { int ne; int i; struct ibv_wc wc[2]; if (num_cqe > 2) { fprintf(stderr, "-E- max num cqe exceeded\n"); return -1; } do { ne = ibv_poll_cq(cq, num_cqe, wc); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } } while (ne < 1); for (i = 0; i < ne; ++i) { if (wc[i].status != IBV_WC_SUCCESS) { fprintf(stderr, "Failed %s status %s (%d)\n", 
wr_id_str[(int)wc[i].wr_id], ibv_wc_status_str(wc[i].status), wc[i].status); return 1; } if ((int) wc[i].wr_id != PP_CQE_WAIT) { fprintf(stderr, "invalid wr_id %" PRIx64 "\n", wc[i].wr_id); return -1; } } return 0; } static int pp_calc_verify(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { uint64_t *op1 = &(ctx->last_result); uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t *res = (uint64_t *)ctx->buf; return !EXEC_VERIFY(calc_data_type, calc_opcode, 1, op1, op2, res); } static int pp_update_last_result(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { /* EXEC_VERIFY derefence result parameter */ uint64_t *dummy; uint64_t *op1 = (uint64_t *)ctx->buf; uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t res = (uint64_t)EXEC_VERIFY(calc_data_type, calc_opcode, 0, op1, op2, dummy); ctx->last_result = res; return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s <host> connect to server at <host>\n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); printf(" -s, --size=<size> size of message to exchange (default 4096 minimum 16)\n"); printf(" -m, --mtu=<size> path MTU (default 1024)\n"); printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); printf(" -n, --iters=<iters> number of exchanges (default 1000)\n"); printf(" -l, --sl=<sl> service level value\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -c, --calc=<operation> calc operation\n"); printf(" -t, --op_type=<type> calc operands type\n"); printf(" -o, --operands=<o1,o2,...> comma separated list of operands\n"); printf(" -w, --wait_cq=cqn 
wait for entries on cq\n"); printf(" -v, --verbose print verbose information\n"); printf(" -V, --verify verify calc operations\n"); }
gaspi_return_t pgaspi_dev_atomic_compare_swap (gaspi_context_t * const gctx, const gaspi_segment_id_t segment_id, const gaspi_offset_t offset, const gaspi_rank_t rank, const gaspi_atomic_value_t comparator, const gaspi_atomic_value_t val_new) { int i; struct ibv_sge slist; slist.addr = (uintptr_t) (gctx->nsrc.data.buf); slist.length = sizeof(gaspi_atomic_value_t); slist.lkey = ((struct ibv_mr *) gctx->nsrc.mr[0])->lkey; #ifdef GPI2_EXP_VERBS struct ibv_exp_send_wr *bad_wr; struct ibv_exp_send_wr swr; swr.exp_opcode = IBV_WR_ATOMIC_CMP_AND_SWP; swr.exp_send_flags = IBV_SEND_SIGNALED; #else struct ibv_send_wr *bad_wr; struct ibv_send_wr swr; swr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP; swr.send_flags = IBV_SEND_SIGNALED; #endif gaspi_ib_ctx * const ib_dev_ctx = (gaspi_ib_ctx*) gctx->device->ctx; swr.wr.atomic.remote_addr = gctx->rrmd[segment_id][rank].data.addr + offset; swr.wr.atomic.rkey = gctx->rrmd[segment_id][rank].rkey[0]; swr.wr.atomic.compare_add = comparator; swr.wr.atomic.swap = val_new; swr.wr_id = rank; swr.sg_list = &slist; swr.num_sge = 1; swr.next = NULL; #ifdef GPI2_EXP_VERBS if (ibv_exp_post_send (ib_dev_ctx->qpGroups[rank], &swr, &bad_wr)) { return GASPI_ERROR; } #else if (ibv_post_send (ib_dev_ctx->qpGroups[rank], &swr, &bad_wr)) { return GASPI_ERROR; } #endif gctx->ne_count_grp++; int ne = 0; for (i = 0; i < gctx->ne_count_grp; i++) { do { ne = ibv_poll_cq (ib_dev_ctx->scqGroups, 1, ib_dev_ctx->wc_grp_send); } while (ne == 0); if ((ne < 0) || (ib_dev_ctx->wc_grp_send[i].status != IBV_WC_SUCCESS)) { return GASPI_ERROR; } } //TODO: gctx->ne_count_grp = 0; return GASPI_SUCCESS; }
/*
 * Create an indirect (UMR) memory region that maps `mr` at a shifted base
 * address.
 *
 * md - memory domain; its umr_qp and umr_cq are used to post and complete
 *      the UMR_FILL request, and its pd owns the new key.
 * mr - existing memory region to be covered by the new key.
 *
 * Returns the new memory region, or NULL when UMR support is compiled out,
 * the UMR QP/CQ are missing, or any step fails (the partially created key is
 * deregistered in that case).
 */
static UCS_F_MAYBE_UNUSED struct ibv_mr *uct_ib_md_create_umr(uct_ib_md_t *md,
                                                              struct ibv_mr *mr)
{
#if HAVE_EXP_UMR
    struct ibv_exp_create_mr_in create_in;
    struct ibv_exp_send_wr fill_wr, *failed_wr;
    struct ibv_exp_mem_region region;
    struct ibv_mr *indirect_mr;
    struct ibv_wc completion;
    size_t umr_offset;
    int rc;

    if (md->umr_qp == NULL || md->umr_cq == NULL) {
        return NULL;
    }

    umr_offset = uct_ib_md_umr_offset(uct_ib_md_umr_id(md));

    /* Create memory key */
    memset(&create_in, 0, sizeof(create_in));
    create_in.pd = md->pd;
#ifdef HAVE_EXP_UMR_NEW_API
    create_in.attr.max_klm_list_size   = 1;
    create_in.attr.exp_access_flags    = UCT_IB_MEM_ACCESS_FLAGS;
    create_in.attr.create_flags        = IBV_EXP_MR_INDIRECT_KLMS;
#else
    create_in.attr.max_reg_descriptors = 1;
    create_in.attr.access_flags        = UCT_IB_MEM_ACCESS_FLAGS;
    create_in.attr.create_flags        = IBV_MR_NONCONTIG_MEM;
#endif

    indirect_mr = ibv_exp_create_mr(&create_in);
    if (indirect_mr == NULL) {
        ucs_error("Failed to create modified_mr: %m");
        goto err;
    }

    /* Fill memory list and UMR */
    memset(&region, 0, sizeof(region));
    memset(&fill_wr, 0, sizeof(fill_wr));

    region.length    = mr->length;
    region.base_addr = (uintptr_t) mr->addr;

#ifdef HAVE_EXP_UMR_NEW_API
    region.mr                                     = mr;
    fill_wr.ext_op.umr.umr_type                   = IBV_EXP_UMR_MR_LIST;
    fill_wr.ext_op.umr.mem_list.mem_reg_list      = &region;
    fill_wr.ext_op.umr.exp_access                 = UCT_IB_MEM_ACCESS_FLAGS;
    fill_wr.ext_op.umr.modified_mr                = indirect_mr;
    fill_wr.ext_op.umr.base_addr                  = (uint64_t) (uintptr_t) mr->addr + umr_offset;
    fill_wr.ext_op.umr.num_mrs                    = 1;
#else
    region.m_key                                          = mr;
    fill_wr.ext_op.umr.memory_key.mkey_type               = IBV_EXP_UMR_MEM_LAYOUT_NONCONTIG;
    fill_wr.ext_op.umr.memory_key.mem_list.mem_reg_list   = &region;
    fill_wr.ext_op.umr.memory_key.access                  = UCT_IB_MEM_ACCESS_FLAGS;
    fill_wr.ext_op.umr.memory_key.modified_mr             = indirect_mr;
    fill_wr.ext_op.umr.memory_key.region_base_addr        = mr->addr + umr_offset;
    fill_wr.num_sge                                       = 1;
#endif

    fill_wr.exp_send_flags = IBV_EXP_SEND_INLINE | IBV_EXP_SEND_SIGNALED;
    fill_wr.exp_opcode     = IBV_EXP_WR_UMR_FILL;

    /* Post UMR */
    rc = ibv_exp_post_send(md->umr_qp, &fill_wr, &failed_wr);
    if (rc) {
        ucs_error("ibv_exp_post_send(UMR_FILL) failed: %m");
        goto err_free_umr;
    }

    /* Wait for send UMR completion: spin until exactly one entry is polled */
    do {
        rc = ibv_poll_cq(md->umr_cq, 1, &completion);
        if (rc < 0) {
            ucs_error("ibv_exp_poll_cq(umr_cq) failed: %m");
            goto err_free_umr;
        }
    } while (rc != 1);

    if (completion.status != IBV_WC_SUCCESS) {
        ucs_error("UMR_FILL completed with error: %s vendor_err %d",
                  ibv_wc_status_str(completion.status), completion.vendor_err);
        goto err_free_umr;
    }

    ucs_trace("UMR registered memory %p..%p offset 0x%x on %s lkey 0x%x rkey 0x%x",
              mr->addr, mr->addr + mr->length, (unsigned)umr_offset,
              uct_ib_device_name(&md->dev), indirect_mr->lkey,
              indirect_mr->rkey);
    return indirect_mr;

err_free_umr:
    ibv_dereg_mr(indirect_mr);
err:
#endif
    return NULL;
}