/*
 * build_verbs - create the per-connection verbs objects on the given device
 * context: protection domain, completion channel, CQ (armed for events), and
 * a poller thread that services the CQ.
 *
 * TEST_Z/TEST_NZ presumably abort on NULL / non-zero respectively -- project
 * macros, confirm their definitions before relying on error behavior.
 */
static void build_verbs(IbvConnection *conn, struct ibv_context *verbs)
{
    conn->ibvctx = verbs;
    TEST_Z(conn->pd = ibv_alloc_pd(conn->ibvctx));
    TEST_Z(conn->comp_channel = ibv_create_comp_channel(conn->ibvctx));
    TEST_Z(conn->cq = ibv_create_cq(conn->ibvctx, 10, NULL, conn->comp_channel, 0)); /* cqe=10 is arbitrary */
    /* Arm the CQ so the first completion raises an event on comp_channel. */
    TEST_NZ(ibv_req_notify_cq(conn->cq, 0));
    /* poll_cq receives this connection as its argument. */
    TEST_NZ(pthread_create(&conn->cq_poller_thread, NULL, poll_cq, conn));
}
/// Opens an ibverbs completion channel on the given device context.
/// Throws std::system_error (built from errno) if creation fails, so a
/// constructed object always owns a valid channel.
CompletionChannel::CompletionChannel(ibv_context* context)
        : mChannel(ibv_create_comp_channel(context)) {
    if (!mChannel) {
        throw std::system_error(errno, std::generic_category());
    }
    LOG_TRACE("Created completion channel");
}
// Creates an ibverbs completion channel on the given device context and wraps
// it in an owning CcPtr whose deleter is ibv_destroy_comp_channel.
//
// @param ctx  handle to the opened device context
// @return     owning pointer to the new completion channel (never null)
// @throws std::runtime_error if ibv_create_comp_channel() fails
static CcPtr make_cc(CtxPtr ctx)
{
    auto raw = ibv_create_comp_channel(ctx.get());
    if (!raw) {
        // Fixed copy-paste bug: this function creates a completion channel,
        // not a CQ, so the message said the wrong thing.
        throw std::runtime_error("cannot create completion channel");
    }
    return CcPtr(raw, ibv_destroy_comp_channel);
}
// RDMAAdapter: opens the default RDMA device and allocates a protection
// domain (via the member initializers), then creates a completion channel and
// an event-armed CQ, and finally starts the internal CQ-servicing thread.
// CHECK() presumably aborts with the streamed message on failure -- project
// macro, confirm.
RDMAAdapter::RDMAAdapter()
    : context_(open_default_device()), pd_(alloc_protection_domain(context_)) {
  channel_ = ibv_create_comp_channel(context_);
  CHECK(channel_) << "Failed to create completion channel";
  // CQ sized for twice the maximum number of concurrent writes.
  cq_ = ibv_create_cq(context_, MAX_CONCURRENT_WRITES * 2, NULL, channel_, 0);
  CHECK(cq_) << "Failed to create completion queue";
  // Arm the CQ so completions raise events on channel_.
  CHECK(!ibv_req_notify_cq(cq_, 0)) << "Failed to request CQ notification";
  StartInternalThread();
}
static int rping_setup_qp(struct rping_cb *cb, struct rdma_cm_id *cm_id) { int ret; cb->pd = ibv_alloc_pd(cm_id->verbs); if (!cb->pd) { fprintf(stderr, "ibv_alloc_pd failed\n"); return errno; } DEBUG_LOG("created pd %p\n", cb->pd); cb->channel = ibv_create_comp_channel(cm_id->verbs); if (!cb->channel) { fprintf(stderr, "ibv_create_comp_channel failed\n"); ret = errno; goto err1; } DEBUG_LOG("created channel %p\n", cb->channel); cb->cq = ibv_create_cq(cm_id->verbs, RPING_SQ_DEPTH * 2, cb, cb->channel, 0); if (!cb->cq) { fprintf(stderr, "ibv_create_cq failed\n"); ret = errno; goto err2; } DEBUG_LOG("created cq %p\n", cb->cq); ret = ibv_req_notify_cq(cb->cq, 0); if (ret) { fprintf(stderr, "ibv_create_cq failed\n"); ret = errno; goto err3; } ret = rping_create_qp(cb); if (ret) { perror("rdma_create_qp"); goto err3; } DEBUG_LOG("created qp %p\n", cb->qp); return 0; err3: ibv_destroy_cq(cb->cq); err2: ibv_destroy_comp_channel(cb->channel); err1: ibv_dealloc_pd(cb->pd); return ret; }
/*
 * build_context - lazily create the process-wide verbs context (PD, completion
 * channel, event-armed CQ) for the given device. All connections must share
 * one device context; a second device is a fatal error. Idempotent for the
 * same device.
 */
void build_context(struct ibv_context *verbs)
{
	if (s_ctx) {
		if (s_ctx->ctx != verbs) {
			die("cannot handle events in more than one context.");
		}
		return;
	}

	/* Fixed: malloc() result was previously used unchecked; TEST_Z
	 * presumably aborts on NULL, consistent with its other uses here. */
	TEST_Z(s_ctx = (rdma_ctx_t *)malloc(sizeof(rdma_ctx_t)));

	s_ctx->ctx = verbs;
	TEST_Z(s_ctx->pd = ibv_alloc_pd(s_ctx->ctx));
	TEST_Z(s_ctx->comp_channel = ibv_create_comp_channel(s_ctx->ctx));
	TEST_Z(s_ctx->cq = ibv_create_cq(s_ctx->ctx, 10, NULL, s_ctx->comp_channel, 0)); /* cqe=10 is arbitrary */
	TEST_NZ(ibv_req_notify_cq(s_ctx->cq, 0));
}
/*
 * build_context - lazily create the shared verbs context (PD, completion
 * channel, event-armed CQ). Only a single device context is supported; a call
 * with a different device dies. Safe to call repeatedly for the same device.
 */
static void build_context(struct ibv_context *verbs)
{
	if (s_ctx) {
		if (s_ctx->ctx != verbs)
			die("cannot handle events in more than one context.");
		return;
	}

	/* Fixed: malloc() result was previously used unchecked; TEST_Z
	 * presumably aborts on NULL, consistent with its other uses here. */
	TEST_Z(s_ctx = (struct context *)malloc(sizeof(struct context)));

	s_ctx->ctx = verbs;
	TEST_Z(s_ctx->pd = ibv_alloc_pd(s_ctx->ctx));
	TEST_Z(s_ctx->comp_channel = ibv_create_comp_channel(s_ctx->ctx));
	TEST_Z(s_ctx->cq = ibv_create_cq(s_ctx->ctx, 10, NULL, s_ctx->comp_channel, 0)); /* cqe=10 is arbitrary */
	TEST_NZ(ibv_req_notify_cq(s_ctx->cq, 0));

	// TEST_NZ(pthread_create(&s_ctx->cq_poller_thread, NULL, poll_cq, NULL));
}
void Connector::build_context(struct ibv_context* verb_) { if (s_ctx_ && s_ctx_->ctx_ != verb_) { log_(ERROR, "cannot handle events in more than one context.") exit(EXIT_FAILURE); } s_ctx_ = (struct context*)malloc(sizeof(struct context) ); s_ctx_->ctx_ = verb_; TEST_Z(s_ctx_->pd_ = ibv_alloc_pd(s_ctx_->ctx_) ); TEST_Z(s_ctx_->comp_channel_ = ibv_create_comp_channel(s_ctx_->ctx_) ); TEST_Z(s_ctx_->cq_ = ibv_create_cq(s_ctx_->ctx_, MAX_QP__CQ_LENGTH, NULL, s_ctx_->comp_channel_, 0) ); TEST_NZ(ibv_req_notify_cq(s_ctx_->cq_, 0) ) // TODO // TEST_NZ(pthread_create(pthread_v.back(), NULL, &Connector::bst_poll_cq, (void*)(this) ) ) pthread_v.push_back(new pthread_t() ); wrap_Connector* wrap_ = new wrap_Connector(this, s_ctx_); TEST_NZ(pthread_create(pthread_v.back(), NULL, call_poll_cq_w_wrap, wrap_) ) }
static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int tx_depth, int port,struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->size = size; ctx->tx_depth = tx_depth; /* in case of UD need space for the GRH */ if (user_parm->connection_type==UD) { ctx->buf = memalign(page_size, ( size + 40 ) * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, ( size + 40 ) * 2); } else { ctx->buf = memalign(page_size, size * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2); } ctx->post_buf = (char*)ctx->buf + (size - 1); ctx->poll_buf = (char*)ctx->buf + (2 * size - 1); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) { user_parm->mtu = 1024; } else { user_parm->mtu = 2048; } } if (user_parm->use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } if (user_parm->connection_type==UD) { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, (size + 40 ) * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } } else { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return 
NULL; } } ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->scq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } ctx->rcq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->rcq) { fprintf(stderr, "Couldn't create Recieve CQ\n"); return NULL; } { struct ibv_qp_init_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ctx->scq; attr.recv_cq = ctx->rcq; attr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ attr.cap.max_recv_wr = tx_depth; attr.cap.max_send_sge = 1; attr.cap.max_recv_sge = 1; attr.cap.max_inline_data = user_parm->inline_size; switch (user_parm->connection_type) { case RC : attr.qp_type = IBV_QPT_RC; break; case UC : attr.qp_type = IBV_QPT_UC; break; case UD : attr.qp_type = IBV_QPT_UD; break; default: fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type); return NULL; } attr.sq_sig_all = 0; ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } } { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; if (user_parm->connection_type==UD) { attr.qkey = 0x11111111; } else { attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; } if (user_parm->connection_type==UD) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) { fprintf(stderr, "Failed to modify UD QP to INIT\n"); return NULL; } if (user_parm->use_mcg) { union ibv_gid gid; uint8_t mcg_gid[16] = MCG_GID; /* use the local QP number as part of the mcg */ mcg_gid[11] = (user_parm->servername) ? 
0 : 1; *(uint32_t *)(&mcg_gid[12]) = ctx->qp->qp_num; memcpy(gid.raw, mcg_gid, 16); if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) { fprintf(stderr, "Couldn't attach QP to mcg\n"); return NULL; } } } else if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } //send ctx->wr.wr_id = PINGPONG_SEND_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.next = NULL; //recieve ctx->rwr.wr_id = PINGPONG_RECV_WRID; ctx->rwr.sg_list = &ctx->recv_list; ctx->rwr.num_sge = 1; ctx->rwr.next = NULL; return ctx; }
static int fio_rdmaio_setup_qp(struct thread_data *td) { struct rdmaio_data *rd = td->io_ops->data; struct ibv_qp_init_attr init_attr; int qp_depth = td->o.iodepth * 2; /* 2 times of io depth */ if (rd->is_client == 0) rd->pd = ibv_alloc_pd(rd->child_cm_id->verbs); else rd->pd = ibv_alloc_pd(rd->cm_id->verbs); if (rd->pd == NULL) { log_err("fio: ibv_alloc_pd fail\n"); return 1; } if (rd->is_client == 0) rd->channel = ibv_create_comp_channel(rd->child_cm_id->verbs); else rd->channel = ibv_create_comp_channel(rd->cm_id->verbs); if (rd->channel == NULL) { log_err("fio: ibv_create_comp_channel fail\n"); goto err1; } if (qp_depth < 16) qp_depth = 16; if (rd->is_client == 0) rd->cq = ibv_create_cq(rd->child_cm_id->verbs, qp_depth, rd, rd->channel, 0); else rd->cq = ibv_create_cq(rd->cm_id->verbs, qp_depth, rd, rd->channel, 0); if (rd->cq == NULL) { log_err("fio: ibv_create_cq failed\n"); goto err2; } if (ibv_req_notify_cq(rd->cq, 0) != 0) { log_err("fio: ibv_create_cq failed\n"); goto err3; } /* create queue pair */ memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = qp_depth; init_attr.cap.max_recv_wr = qp_depth; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IBV_QPT_RC; init_attr.send_cq = rd->cq; init_attr.recv_cq = rd->cq; if (rd->is_client == 0) { if (rdma_create_qp(rd->child_cm_id, rd->pd, &init_attr) != 0) { log_err("fio: rdma_create_qp failed\n"); goto err3; } rd->qp = rd->child_cm_id->qp; } else { if (rdma_create_qp(rd->cm_id, rd->pd, &init_attr) != 0) { log_err("fio: rdma_create_qp failed\n"); goto err3; } rd->qp = rd->cm_id->qp; } return 0; err3: ibv_destroy_cq(rd->cq); err2: ibv_destroy_comp_channel(rd->channel); err1: ibv_dealloc_pd(rd->pd); return 1; }
/*
 * rdma_backend_init - initialize the emulated-RDMA backend: pick an ibverbs
 * device (by name, or the first one found), open it, create the completion
 * channel, query device caps and set up MAD handling. On success the opened
 * context and channel are kept in backend_dev; the device list is always
 * freed. Returns 0 or a negative errno value.
 */
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        rdma_error_report("Failed to get IB devices list");
        return -EIO;
    }
    if (num_ibv_devices == 0) {
        rdma_error_report("No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        /* dev_list is NULL-terminated, so dev_list[i] is NULL if no match. */
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }
        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            rdma_error_report("Failed to find IB device %s",
                              backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        rdma_error_report("Failed to open IB device %s",
                          ibv_get_device_name(backend_dev->ib_dev));
        ret = -EIO;
        /* Fixed: was "goto out", which skipped ibv_free_device_list()
         * and leaked the device list. */
        goto out_free_dev_list;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        rdma_error_report("Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        rdma_error_report("Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        rdma_error_report("Failed to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    /* Success: keep context/channel open, only release the device list. */
    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

    return ret;
}
/**
 * Initialize the common IB interface object: resolve the requested port,
 * compute the receive-buffer layout offsets, initialize pkey/GID/LMC state,
 * create the completion channel (switched to O_NONBLOCK) and the send and
 * receive CQs, then derive the address type and size.
 *
 * @param rx_headroom Headroom requested by the user.
 * @param rx_priv_len Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len Length of transport network header.
 * @param mss Maximal segment size (transport limit).
 *
 * NOTE(review): the error labels unwind in reverse creation order;
 * self->path_bits freed at err_free_path_bits is presumably allocated by
 * uct_ib_iface_init_lmc() -- confirm before reordering the init steps.
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, uct_worker_h worker, const uct_iface_params_t *params, unsigned rx_priv_len, unsigned rx_hdr_len, unsigned tx_cq_len, size_t mss, const uct_ib_iface_config_t *config) { uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; ucs_status_t status; uint8_t port_num; UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker, &config->super UCS_STATS_ARG(dev->stats)); status = uct_ib_device_find_port(dev, params->dev_name, &port_num); if (status != UCS_OK) { goto err; } self->ops = ops; self->config.rx_payload_offset = sizeof(uct_ib_iface_recv_desc_t) + ucs_max(sizeof(uct_am_recv_desc_t) + params->rx_headroom, rx_priv_len + rx_hdr_len); self->config.rx_hdr_offset = self->config.rx_payload_offset - rx_hdr_len; self->config.rx_headroom_offset= self->config.rx_payload_offset - params->rx_headroom; self->config.seg_size = ucs_min(mss, config->super.max_bcopy); self->config.tx_max_poll = config->tx.max_poll; self->config.rx_max_poll = config->rx.max_poll; self->config.rx_max_batch = ucs_min(config->rx.max_batch, config->rx.queue_len / 4); self->config.port_num = port_num; self->config.sl = config->sl; self->config.gid_index = config->gid_index; status = uct_ib_iface_init_pkey(self, config); if (status != UCS_OK) { goto err; } status = uct_ib_device_query_gid(dev, self->config.port_num, self->config.gid_index, &self->gid); if (status != UCS_OK) { goto err; } status = uct_ib_iface_init_lmc(self, config); if (status != UCS_OK) { goto err; } self->comp_channel = ibv_create_comp_channel(dev->ibv_context); if (self->comp_channel == NULL) { ucs_error("ibv_create_comp_channel() failed: %m"); status = UCS_ERR_IO_ERROR; goto err_free_path_bits; } status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0); if (status != UCS_OK) { goto err_destroy_comp_channel; } status = uct_ib_iface_create_cq(self, tx_cq_len, 0, &self->send_cq); if (status != UCS_OK) { goto err_destroy_comp_channel; } status = uct_ib_iface_create_cq(self, config->rx.queue_len, config->rx.inl, &self->recv_cq); if (status != UCS_OK) { goto err_destroy_send_cq; } /* Address scope and size */ if (config->addr_type == UCT_IB_IFACE_ADDRESS_TYPE_AUTO) { if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(self))) { self->addr_type = UCT_IB_ADDRESS_TYPE_ETH; } else { self->addr_type = uct_ib_address_scope(self->gid.global.subnet_prefix); } } else { ucs_assert(config->addr_type < UCT_IB_ADDRESS_TYPE_LAST); self->addr_type = config->addr_type; } self->addr_size = uct_ib_address_size(self->addr_type); ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d", self->config.rx_headroom_offset, self->config.rx_payload_offset, self->config.rx_hdr_offset, self->config.seg_size); return UCS_OK; err_destroy_send_cq: ibv_destroy_cq(self->send_cq); err_destroy_comp_channel: ibv_destroy_comp_channel(self->comp_channel); err_free_path_bits: ucs_free(self->path_bits); err: return status; }
/*
 * rdmasniff_activate - bring up the RDMA sniffer capture: open the verbs
 * device, allocate PD, completion channel and CQ, create a RAW_PACKET QP,
 * drive it to INIT then RTR, attach an IBV_FLOW_ATTR_SNIFFER flow, register
 * the receive buffer MR, pre-post all receives, and wire up the pcap_t
 * handler callbacks. The completion channel fd becomes the selectable fd.
 *
 * On any failure the single "error" exit tears down whatever was created
 * (priv fields are presumably zero-initialized by the caller -- TODO confirm)
 * and returns PCAP_ERROR; on success returns 0.
 *
 * NOTE(review): the ibv_req_notify_cq() return value is ignored here;
 * failure would only cost event wakeups, but worth confirming upstream.
 */
static int rdmasniff_activate(pcap_t *handle) { struct pcap_rdmasniff *priv = handle->priv; struct ibv_qp_init_attr qp_init_attr; struct ibv_qp_attr qp_attr; struct ibv_flow_attr flow_attr; struct ibv_port_attr port_attr; int i; priv->context = ibv_open_device(priv->rdma_device); if (!priv->context) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to open device %s", handle->opt.device); goto error; } priv->pd = ibv_alloc_pd(priv->context); if (!priv->pd) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to alloc PD for device %s", handle->opt.device); goto error; } priv->channel = ibv_create_comp_channel(priv->context); if (!priv->channel) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to create comp channel for device %s", handle->opt.device); goto error; } priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES, NULL, priv->channel, 0); if (!priv->cq) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to create CQ for device %s", handle->opt.device); goto error; } ibv_req_notify_cq(priv->cq, 0); memset(&qp_init_attr, 0, sizeof qp_init_attr); qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq; qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES; qp_init_attr.cap.max_recv_sge = 1; qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; priv->qp = ibv_create_qp(priv->pd, &qp_init_attr); if (!priv->qp) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to create QP for device %s", handle->opt.device); goto error; } memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = IBV_QPS_INIT; qp_attr.port_num = priv->port_num; if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to modify QP to INIT for device %s", handle->opt.device); goto error; } memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = IBV_QPS_RTR; if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to modify QP to RTR for device %s", handle->opt.device); goto error; } memset(&flow_attr, 0, sizeof flow_attr); flow_attr.type = IBV_FLOW_ATTR_SNIFFER; flow_attr.size = sizeof flow_attr; flow_attr.port = priv->port_num; priv->flow = ibv_create_flow(priv->qp, &flow_attr); if (!priv->flow) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to create flow for device %s", handle->opt.device); goto error; } handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE; handle->buffer = malloc(handle->bufsize); if (!handle->buffer) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to allocate receive buffer for device %s", handle->opt.device); goto error; } priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE); if (!priv->oneshot_buffer) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to allocate oneshot buffer for device %s", handle->opt.device); goto error; } priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE); if (!priv->mr) { pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE, "Failed to register MR for device %s", handle->opt.device); goto error; } for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) { rdmasniff_post_recv(handle, i); } if (!ibv_query_port(priv->context, priv->port_num, &port_attr) && port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) { handle->linktype = DLT_INFINIBAND; } else { handle->linktype = DLT_EN10MB; } if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE) handle->snapshot = RDMASNIFF_RECEIVE_SIZE; handle->offset = 0; handle->read_op = rdmasniff_read; handle->stats_op = rdmasniff_stats; handle->cleanup_op = rdmasniff_cleanup; handle->setfilter_op = install_bpf_program; handle->setdirection_op = NULL; handle->set_datalink_op = NULL; handle->getnonblock_op = pcap_getnonblock_fd; handle->setnonblock_op = pcap_setnonblock_fd; handle->oneshot_callback = rdmasniff_oneshot; handle->selectable_fd = priv->channel->fd; return 0; error: if (priv->mr) { ibv_dereg_mr(priv->mr); } if (priv->flow) { ibv_destroy_flow(priv->flow); } if (priv->qp) { ibv_destroy_qp(priv->qp); } if (priv->cq) { ibv_destroy_cq(priv->cq); } if (priv->channel) { ibv_destroy_comp_channel(priv->channel); } if (priv->pd) { ibv_dealloc_pd(priv->pd); } if (priv->context) { ibv_close_device(priv->context); } if (priv->oneshot_buffer) { free(priv->oneshot_buffer); } return PCAP_ERROR; }
/*
 * mca_oob_ud_device_setup - open one IB device for OOB/UD messaging: open the
 * device context, query its attributes, create a completion channel and a
 * protection domain, then try each physical port in order until one sets up
 * successfully (only the first usable port is appended to device->ports).
 *
 * Returns ORTE_SUCCESS if at least one port is usable, ORTE_ERROR otherwise,
 * or ORTE_ERR_OUT_OF_RESOURCE on allocation failure.
 *
 * NOTE(review): error returns leave the already-created context / channel /
 * pd attached to *device -- presumably released by the device destructor;
 * confirm before reusing this helper elsewhere.
 */
static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device, struct ibv_device *ib_device) { int rc, port_num; struct ibv_device_attr dev_attr; OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup attempting to setup ib device %p", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device)); device->ib_context = ibv_open_device (ib_device); if (NULL == device->ib_context) { OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error opening device. errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERROR; } rc = ibv_query_device (device->ib_context, &dev_attr); if (0 != rc) { OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error querying device. errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERROR; } device->ib_channel = ibv_create_comp_channel (device->ib_context); if (NULL == device->ib_channel) { OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error completing completion channel." "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERROR; } device->ib_pd = ibv_alloc_pd (device->ib_context); if (NULL == device->ib_pd) { OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error allocating protection domain." "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno)); return ORTE_ERROR; } for (port_num = 1 ; port_num <= dev_attr.phys_port_cnt ; ++port_num) { mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t); if (NULL == port) { opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno); return ORTE_ERR_OUT_OF_RESOURCE; } port->device = device; port->port_num = port_num; rc = mca_oob_ud_port_setup (port); if (ORTE_SUCCESS != rc) { OBJ_RELEASE(port); continue; } opal_list_append (&device->ports, (opal_list_item_t *) port); break; } if (0 == opal_list_get_size(&device->ports)) { OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup could not init device. no usable " "ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_ERROR; } return ORTE_SUCCESS; }
/** * \brief Create an RDMA transport server * * \param cmid The CM id passed up in the connect event * \param q_depth A hint from the client on the depth of it's SQ/RQ * \param msize The max message size * \returns A pointer to the newly allocated transport */ Nptrans * np_rdmatrans_create(struct rdma_cm_id *cmid, int q_depth, int msize) { int i, ret; u8 *p; struct Nptrans *trans; struct Rdmatrans *rdma; struct ibv_qp_init_attr qp_attr; struct rdma_conn_param cparam; rdma = calloc(1, sizeof *rdma); if (!rdma) goto error; ret = pthread_mutex_init(&rdma->lock, NULL); if (ret) goto error; ret = pthread_cond_init(&rdma->cond, NULL); if (ret) goto error; rdma->connected = 0; rdma->cm_id = cmid; rdma->context = cmid->verbs; rdma->q_depth = q_depth; rdma->msize = msize + sizeof(Rdmactx); rdma->pd = ibv_alloc_pd(rdma->context); if (!rdma->pd) goto error; /* Create receive buffer space and register it */ rdma->rcv_buf = malloc(rdma->msize * q_depth); if (!rdma->rcv_buf) goto error; rdma->rcv_mr = ibv_reg_mr(rdma->pd, rdma->rcv_buf, rdma->msize * q_depth, IBV_ACCESS_LOCAL_WRITE); if (!rdma->rcv_mr) goto error; /* Create send buffer space and register it */ rdma->snd_buf = malloc(rdma->msize * q_depth); if (!rdma->snd_buf) goto error; rdma->next_buf = 0; rdma->snd_mr = ibv_reg_mr(rdma->pd, rdma->snd_buf, rdma->msize * q_depth, 0); if (!rdma->snd_mr) goto error; rdma->ch = ibv_create_comp_channel(rdma->context); if (!rdma->ch) goto error; rdma->fd = rdma->ch->fd; rdma->cq = ibv_create_cq(rdma->context, 2*q_depth, rdma, rdma->ch, 0); if (!rdma->cq) goto error; ibv_req_notify_cq(rdma->cq, 0); /* Create the CQ */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.send_cq = rdma->cq; qp_attr.recv_cq = rdma->cq; qp_attr.cap.max_send_wr = q_depth; qp_attr.cap.max_recv_wr = q_depth; qp_attr.cap.max_send_sge = 1; qp_attr.cap.max_send_sge = 1; qp_attr.cap.max_recv_sge = 1; qp_attr.cap.max_inline_data = 64; qp_attr.qp_type = IBV_QPT_RC; ret = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); 
if (ret) goto error; rdma->qp = rdma->cm_id->qp; p = rdma->rcv_buf; for (i = 0; i < q_depth; i++) rdma_post_recv(rdma, (Rdmactx *)(p + i*rdma->msize)); trans = np_trans_create(rdma, rdma_trans_recv, rdma_trans_send, rdma_trans_destroy); if (!trans) goto error; rdma->trans = trans; memset(&cparam, 0, sizeof(cparam)); cparam.responder_resources = 1; cparam.initiator_depth = 1; cparam.private_data = NULL; cparam.private_data_len = 0; ret = rdma_accept(cmid, &cparam); if (ret) { np_uerror(ret); goto error; } rdma->connected = 1; return trans; error: if (rdma) rdma_trans_destroy(rdma); rdma_reject(cmid, NULL, 0); return NULL; }
/*
 * Glue-layer wrapper: forwards directly to ibv_create_comp_channel().
 * Returns the new completion channel, or NULL on failure (errno set by
 * libibverbs).
 */
static struct ibv_comp_channel *
mlx5_glue_create_comp_channel(struct ibv_context *context)
{
	return ibv_create_comp_channel(context);
}
/**
 * Initialize the common IB interface object: resolve the requested port,
 * compute the receive-buffer layout offsets, initialize pkey/GID/LMC state,
 * create the completion channel (switched to O_NONBLOCK) and the send and
 * receive CQs, verify the port link layer is IB, then derive the address
 * scope and size from the GID subnet prefix.
 *
 * @param rx_headroom Headroom requested by the user.
 * @param rx_priv_len Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len Length of transport network header.
 * @param mss Maximal segment size (transport limit).
 *
 * NOTE(review): the MLX5_CQE_SIZE setenv dance temporarily requests 128-byte
 * CQEs for the recv CQ when rx.inl exceeds 32, then restores 64 -- this is a
 * driver-environment side channel; confirm against the mlx5 provider docs.
 * path_bits freed at err_free_path_bits is presumably allocated by
 * uct_ib_iface_init_lmc() -- confirm before reordering init steps.
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md, uct_worker_h worker, const char *dev_name, unsigned rx_headroom, unsigned rx_priv_len, unsigned rx_hdr_len, unsigned tx_cq_len, size_t mss, uct_ib_iface_config_t *config) { uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev; ucs_status_t status; uint8_t port_num; UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker, &config->super UCS_STATS_ARG(dev->stats)); status = uct_ib_device_find_port(dev, dev_name, &port_num); if (status != UCS_OK) { goto err; } self->port_num = port_num; self->sl = config->sl; self->config.rx_payload_offset = sizeof(uct_ib_iface_recv_desc_t) + ucs_max(sizeof(uct_am_recv_desc_t) + rx_headroom, rx_priv_len + rx_hdr_len); self->config.rx_hdr_offset = self->config.rx_payload_offset - rx_hdr_len; self->config.rx_headroom_offset= self->config.rx_payload_offset - rx_headroom; self->config.seg_size = ucs_min(mss, config->super.max_bcopy); self->config.tx_max_poll = config->tx.max_poll; self->config.rx_max_poll = config->rx.max_poll; self->config.rx_max_batch = ucs_min(config->rx.max_batch, config->rx.queue_len / 4); self->ops = ops; status = uct_ib_iface_init_pkey(self, config); if (status != UCS_OK) { goto err; } status = uct_ib_iface_init_gid(self, config); if (status != UCS_OK) { goto err; } status = uct_ib_iface_init_lmc(self, config); if (status != UCS_OK) { goto err; } self->comp_channel = ibv_create_comp_channel(dev->ibv_context); if (self->comp_channel == NULL) { ucs_error("Failed to create completion channel: %m"); status = UCS_ERR_IO_ERROR; goto err_free_path_bits; } status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0); if (status != UCS_OK) { goto err_destroy_comp_channel; } /* TODO inline scatter for send SQ */ self->send_cq = ibv_create_cq(dev->ibv_context, tx_cq_len, NULL, self->comp_channel, 0); if (self->send_cq == NULL) { ucs_error("Failed to create send cq: %m"); status = UCS_ERR_IO_ERROR; goto err_destroy_comp_channel; } if (config->rx.inl > 32 /*UCT_IB_MLX5_CQE64_MAX_INL*/) { ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "128", 1); } self->recv_cq = ibv_create_cq(dev->ibv_context, config->rx.queue_len, NULL, self->comp_channel, 0); ibv_exp_setenv(dev->ibv_context, "MLX5_CQE_SIZE", "64", 1); if (self->recv_cq == NULL) { ucs_error("Failed to create recv cq: %m"); status = UCS_ERR_IO_ERROR; goto err_destroy_send_cq; } if (!uct_ib_device_is_port_ib(dev, self->port_num)) { ucs_error("Unsupported link layer"); status = UCS_ERR_UNSUPPORTED; goto err_destroy_recv_cq; } /* Address scope and size */ self->addr_scope = uct_ib_address_scope(self->gid.global.subnet_prefix); self->addr_size = uct_ib_address_size(self->addr_scope); ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d", self->config.rx_headroom_offset, self->config.rx_payload_offset, self->config.rx_hdr_offset, self->config.seg_size); return UCS_OK; err_destroy_recv_cq: ibv_destroy_cq(self->recv_cq); err_destroy_send_cq: ibv_destroy_cq(self->send_cq); err_destroy_comp_channel: ibv_destroy_comp_channel(self->comp_channel); err_free_path_bits: ucs_free(self->path_bits); err: return status; }
struct xfer_context *xfer_rdma_init_ctx(void *ptr, struct xfer_data *data) { struct xfer_context *ctx; struct rdma_cm_id *cm_id = NULL; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->tx_depth = data->tx_depth; if (data->use_cma) { cm_id = (struct rdma_cm_id *)ptr; ctx->context = cm_id->verbs; if (!ctx->context) { fprintf(stderr, "%d:%s: Unbound cm_id!!\n", pid, __func__); return NULL; } } else { // use alternative to CMA here } ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "%d:%s: Couldn't allocate PD\n", pid, __func__); return NULL; } // setup the message buffers ctx->send_msg = malloc(sizeof(struct message)); ctx->recv_msg = malloc(sizeof(struct message)); ctx->recv_mr = ibv_reg_mr(ctx->pd, ctx->recv_msg, sizeof(struct message), IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->recv_mr) { fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__); return NULL; } ctx->send_mr = ibv_reg_mr(ctx->pd, ctx->send_msg, sizeof(struct message), IBV_ACCESS_LOCAL_WRITE); if (!ctx->send_mr) { fprintf(stderr, "%d:%s: Couldn't allocate MR\n", pid, __func__); return NULL; } ctx->ch = ibv_create_comp_channel(ctx->context); if (!ctx->ch) { fprintf(stderr, "%d:%s: Couldn't create comp channel\n", pid, __func__); return NULL; } ctx->cq = ibv_create_cq(ctx->context, ctx->tx_depth+1, ctx, ctx->ch, 0); if (!ctx->cq) { fprintf(stderr, "%d:%s: Couldn't create CQ\n", pid, __func__); return NULL; } if (ibv_req_notify_cq(ctx->cq, 0)) { fprintf(stderr, "%d:%s: Couldn't request CQ notification\n", pid, __func__); return NULL; } struct ibv_qp_init_attr attr = { .qp_context = ctx, .send_cq = ctx->cq, .recv_cq = ctx->cq, .cap = { .max_send_wr = ctx->tx_depth+1, .max_recv_wr = ctx->tx_depth+1, .max_send_sge = 1, .max_recv_sge = 1, .max_inline_data = 0 }, .qp_type = IBV_QPT_RC, .sq_sig_all = 1, .srq = NULL }; if (data->use_cma) { if (rdma_create_qp(cm_id, ctx->pd, &attr)) { fprintf(stderr, "%d:%s: Couldn't create QP\n", pid, __func__); return NULL; } 
ctx->qp = cm_id->qp; ctx->cm_id = cm_id; // arm the QP __xfer_rdma_post_recv(ctx); return ctx; } else { // use an alternative to CMA here ctx = NULL; return ctx; } }
/***************************************** * Function: resources_create *****************************************/ static int resources_create( struct resources *res) { struct ibv_qp_init_attr qp_init_attr; struct ibv_device *ib_dev = NULL; size_t size; int i; int mr_flags = 0; int cq_size = 0; int num_devices; int rc; /* if client side */ if (config.server_name) { res->sock = sock_client_connect(config.server_name, config.tcp_port); if (res->sock < 0) { fprintf(stderr, "failed to establish TCP connection to server %s, port %d\n", config.server_name, config.tcp_port); return -1; } } else { fprintf(stdout, "waiting on port %d for TCP connection\n", config.tcp_port); res->sock = sock_daemon_connect(config.tcp_port); if (res->sock < 0) { fprintf(stderr, "failed to establish TCP connection with client on port %d\n", config.tcp_port); return -1; } } fprintf(stdout, "TCP connection was established\n"); fprintf(stdout, "searching for IB devices in host\n"); /* get device names in the system */ res->dev_list = ibv_get_device_list(&num_devices); if (!res->dev_list) { fprintf(stderr, "failed to get IB devices list\n"); return 1; } /* if there isn't any IB device in host */ if (!num_devices) { fprintf(stderr, "found %d device(s)\n", num_devices); return 1; } fprintf(stdout, "found %d device(s)\n", num_devices); /* search for the specific device we want to work with */ for (i = 0; i < num_devices; i ++) { if (!strcmp(ibv_get_device_name(res->dev_list[i]), config.dev_name)) { ib_dev = res->dev_list[i]; break; } } /* if the device wasn't found in host */ if (!ib_dev) { fprintf(stderr, "IB device %s wasn't found\n", config.dev_name); return 1; } /* get device handle */ res->ib_ctx = ibv_open_device(ib_dev); if (!res->ib_ctx) { fprintf(stderr, "failed to open device %s\n", config.dev_name); return 1; } /* query port properties */ if (ibv_query_port(res->ib_ctx, config.ib_port, &res->port_attr)) { fprintf(stderr, "ibv_query_port on port %u failed\n", config.ib_port); return 1; } /* 
allocate Protection Domain */ res->pd = ibv_alloc_pd(res->ib_ctx); if (!res->pd) { fprintf(stderr, "ibv_alloc_pd failed\n"); return 1; } res->comp_channel = ibv_create_comp_channel(res->ib_ctx); if (!res->comp_channel) { fprintf(stderr, "ibv_create_comp_channel failed\n"); return 1; } /* each side will send only one WR, so Completion Queue with 1 entry is enough */ cq_size = 1; res->cq = ibv_create_cq(res->ib_ctx, cq_size, NULL, res->comp_channel, 0); if (!res->cq) { fprintf(stderr, "failed to create CQ with %u entries\n", cq_size); return 1; } /* Arm the CQ before any completion is expected (to prevent races) */ rc = ibv_req_notify_cq(res->cq, 0); if (rc) { fprintf(stderr, "failed to arm the CQ\n"); return 1; } fprintf(stdout, "CQ was armed\n"); /* allocate the memory buffer that will hold the data */ size = MSG_SIZE; res->buf = malloc(size); if (!res->buf) { fprintf(stderr, "failed to malloc %Zu bytes to memory buffer\n", size); return 1; } /* only in the daemon side put the message in the memory buffer */ if (!config.server_name) { strcpy(res->buf, MSG); fprintf(stdout, "going to send the message: '%s'\n", res->buf); } else memset(res->buf, 0, size); /* register this memory buffer */ mr_flags = (config.server_name) ? 
IBV_ACCESS_LOCAL_WRITE : 0; res->mr = ibv_reg_mr(res->pd, res->buf, size, mr_flags); if (!res->mr) { fprintf(stderr, "ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags); return 1; } fprintf(stdout, "MR was registered with addr=%p, lkey=0x%x, rkey=0x%x, flags=0x%x\n", res->buf, res->mr->lkey, res->mr->rkey, mr_flags); /* create the Queue Pair */ memset(&qp_init_attr, 0, sizeof(qp_init_attr)); qp_init_attr.qp_type = IBV_QPT_RC; qp_init_attr.sq_sig_all = 1; qp_init_attr.send_cq = res->cq; qp_init_attr.recv_cq = res->cq; qp_init_attr.cap.max_send_wr = 1; qp_init_attr.cap.max_recv_wr = 1; qp_init_attr.cap.max_send_sge = 1; qp_init_attr.cap.max_recv_sge = 1; res->qp = ibv_create_qp(res->pd, &qp_init_attr); if (!res->qp) { fprintf(stderr, "failed to create QP\n"); return 1; } fprintf(stdout, "QP was created, QP number=0x%x\n", res->qp->qp_num); return 0; }
static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev,int size, struct perftest_parameters *user_parm) { struct pingpong_context *ctx; ALLOCATE(ctx,struct pingpong_context,1); ctx->size = size; ctx->tx_depth = user_parm->tx_depth; ctx->buf = memalign(page_size, BUFF_SIZE(size)); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, BUFF_SIZE(size)); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n",ibv_get_device_name(ib_dev)); return NULL; } // Finds the link type and configure the HCA accordingly. if (ctx_set_link_layer(ctx->context,user_parm)) { fprintf(stderr, " Couldn't set the link layer\n"); return NULL; } // Configure the Link MTU acoording to the user or the active mtu. if (ctx_set_mtu(ctx->context,user_parm)) { fprintf(stderr, "Couldn't set the link layer\n"); return NULL; } if (user_parm->use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf,BUFF_SIZE(size),IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } // Creates the CQ according to ctx_cq_create in perfetst_resources. ctx->cq = ctx_cq_create(ctx->context,ctx->channel,user_parm); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } ctx->qp = ctx_qp_create(ctx->pd,ctx->cq,ctx->cq,user_parm); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } if (ctx_modify_qp_to_init(ctx->qp,user_parm)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } return ctx; }
/*
 * RDMA-CM "add two integers" server (demo).
 *
 * Flow: create a CM event channel and listening id, bind to TCP-port-space
 * port 20079, wait for CONNECT_REQUEST.  With the client's cm_id known,
 * build the verbs objects: PD, completion channel, CQ (depth 2) armed for
 * notification, and a 2*uint32 buffer registered LOCAL_WRITE | REMOTE_READ |
 * REMOTE_WRITE so the client can RDMA-write its operand into buf[0].
 * A receive is pre-posted for buf[1] BEFORE rdma_accept() so the client's
 * SEND can never arrive with no receive posted.  The accept private data
 * advertises buf's address and rkey (htonll/htonl: wire byte order).
 * After ESTABLISHED the server waits for the receive completion, adds the
 * two network-order operands, SENDs the 32-bit sum back (signaled), waits
 * for the send completion and acks both CQ events in one call.
 *
 * NOTE(review): error paths return without releasing CM/verbs resources,
 * evt_cq returned by ibv_get_cq_event is never compared against cq, and
 * the CQ is not re-armed after the second event -- all acceptable only
 * because the process exits immediately.
 */
int main(int argc, char *argv[]) { struct pdata rep_pdata; struct rdma_event_channel *cm_channel; struct rdma_cm_id *listen_id; struct rdma_cm_id *cm_id; struct rdma_cm_event *event; struct rdma_conn_param conn_param = { }; struct ibv_pd *pd; struct ibv_comp_channel *comp_chan; struct ibv_cq *cq; struct ibv_cq *evt_cq; struct ibv_mr *mr; struct ibv_qp_init_attr qp_attr = { }; struct ibv_sge sge; struct ibv_send_wr send_wr = { }; struct ibv_send_wr *bad_send_wr; struct ibv_recv_wr recv_wr = { }; struct ibv_recv_wr *bad_recv_wr; struct ibv_wc wc; void *cq_context; struct sockaddr_in sin; uint32_t *buf; int err; /* Set up RDMA CM structures */ cm_channel = rdma_create_event_channel(); if (!cm_channel) return 1; err = rdma_create_id(cm_channel, &listen_id, NULL, RDMA_PS_TCP); if (err) return err; sin.sin_family = AF_INET; sin.sin_port = htons(20079); sin.sin_addr.s_addr = INADDR_ANY; /* Bind to local port and listen for connection request */ err = rdma_bind_addr(listen_id, (struct sockaddr *) &sin); if (err) return 1; err = rdma_listen(listen_id, 1); if (err) return 1; err = rdma_get_cm_event(cm_channel, &event); if (err) return err; printf("after get_cm_event\n"); if (event->event != RDMA_CM_EVENT_CONNECT_REQUEST) return 1; cm_id = event->id; rdma_ack_cm_event(event); /* Create verbs objects now that we know which device to use */ pd = ibv_alloc_pd(cm_id->verbs); if (!pd) return 1; comp_chan = ibv_create_comp_channel(cm_id->verbs); if (!comp_chan) return 1; cq = ibv_create_cq(cm_id->verbs, 2, NULL, comp_chan, 0); if (!cq) return 1; if (ibv_req_notify_cq(cq, 0)) return 1; buf = calloc(2, sizeof(uint32_t)); if (!buf) return 1; mr = ibv_reg_mr(pd, buf, 2 * sizeof(uint32_t), IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE); if (!mr) return 1; qp_attr.cap.max_send_wr = 1; qp_attr.cap.max_send_sge = 1; qp_attr.cap.max_recv_wr = 1; qp_attr.cap.max_recv_sge = 1; qp_attr.send_cq = cq; qp_attr.recv_cq = cq; qp_attr.qp_type = IBV_QPT_RC; err =
/* Create the RC QP on the client's cm_id, pre-post the receive for the
 * client's SEND operand into buf[1] (sge.addr skips the first uint32),
 * then accept while advertising buf's va/rkey in the private data.
 * After ESTABLISHED: wait for the receive completion event, re-arm the CQ,
 * compute buf[0] += buf[1] in network byte order, SEND the sum back
 * signaled, wait for the send completion, and ack the two CQ events. */
rdma_create_qp(cm_id, pd, &qp_attr); if (err) return err; /* Post receive before accepting connection */ sge.addr = (uintptr_t) buf + sizeof(uint32_t); sge.length = sizeof(uint32_t); sge.lkey = mr->lkey; recv_wr.sg_list = &sge; recv_wr.num_sge = 1; if (ibv_post_recv(cm_id->qp, &recv_wr, &bad_recv_wr)) return 1; rep_pdata.buf_va = htonll((uintptr_t) buf); rep_pdata.buf_rkey = htonl(mr->rkey); conn_param.responder_resources = 1; conn_param.private_data = &rep_pdata; conn_param.private_data_len = sizeof rep_pdata; /* Accept connection */ printf("before accept\n"); err = rdma_accept(cm_id, &conn_param); if (err) return 1; printf("after accept\n"); err = rdma_get_cm_event(cm_channel, &event); if (err) return err; if (event->event != RDMA_CM_EVENT_ESTABLISHED) return 1; rdma_ack_cm_event(event); /* Wait for receive completion */ if (ibv_get_cq_event(comp_chan, &evt_cq, &cq_context)) return 1; if (ibv_req_notify_cq(cq, 0)) return 1; if (ibv_poll_cq(cq, 1, &wc) < 1) return 1; if (wc.status != IBV_WC_SUCCESS) return 1; /* Add two integers and send reply back */ buf[0] = htonl(ntohl(buf[0]) + ntohl(buf[1])); sge.addr = (uintptr_t) buf; sge.length = sizeof(uint32_t); sge.lkey = mr->lkey; send_wr.opcode = IBV_WR_SEND; send_wr.send_flags = IBV_SEND_SIGNALED; send_wr.sg_list = &sge; send_wr.num_sge = 1; if (ibv_post_send(cm_id->qp, &send_wr, &bad_send_wr)) return 1; /* Wait for send completion */ if (ibv_get_cq_event(comp_chan, &evt_cq, &cq_context)) return 1; if (ibv_poll_cq(cq, 1, &wc) < 1) return 1; if (wc.status != IBV_WC_SUCCESS) return 1; printf("before ack cq 2\n"); ibv_ack_cq_events(cq, 2); return 0; }
/*
 * network_init - client-side RDMA-CM bring-up; all state lives in
 * file-scope globals (cm_channel, cm_id, pd, cq, mr_data, qp_attr, ...).
 *
 * Resolves the server's address and route via the CM event channel
 * (ADDR_RESOLVED then ROUTE_RESOLVED, each event acked), then builds the
 * verbs objects: PD, completion channel, CQ of depth 10 armed for
 * notification, an MR over `data` with LOCAL_WRITE | REMOTE_WRITE (so the
 * server can RDMA-write into it) and an MR over `ack_buffer` with
 * LOCAL_WRITE only.  An RC QP is created on the cm_id and one receive
 * covering the whole data buffer is posted BEFORE rdma_connect(), so no
 * incoming message can find the receive queue empty.
 *
 * Connection private data advertises the data buffer's address and rkey
 * in wire byte order (htonll/htonl).
 *
 * NOTE(review): every failure is handled with assert(), so any error
 * aborts the process -- fine for a test harness, not for production.
 */
void network_init() { /* Set up RDMA CM structures */ cm_channel = rdma_create_event_channel(); assert(cm_channel); err = rdma_create_id(cm_channel, &cm_id, 0, RDMA_PS_TCP); assert(err == 0); /* Resolve server address and route */ n = getaddrinfo(server_ip, server_port_string, &hints, &res); assert(n >= 0); for (t = res; t; t = t->ai_next) { err = rdma_resolve_addr(cm_id, 0, t->ai_addr, RESOLVE_TIMEOUT_MS); if (!err) break; } assert(err == 0); err = rdma_get_cm_event(cm_channel, &event); assert(err == 0); assert(event->event == RDMA_CM_EVENT_ADDR_RESOLVED); rdma_ack_cm_event(event); err = rdma_resolve_route(cm_id, RESOLVE_TIMEOUT_MS); assert(err == 0); err = rdma_get_cm_event(cm_channel, &event); assert(err == 0); assert(event->event == RDMA_CM_EVENT_ROUTE_RESOLVED); rdma_ack_cm_event(event); /* Create verbs objects now that we know which device to use */ pd = ibv_alloc_pd(cm_id->verbs); assert(pd); comp_chan = ibv_create_comp_channel(cm_id->verbs); assert(comp_chan); cq = ibv_create_cq(cm_id->verbs, 10, 0, comp_chan, 0); assert(cq); err = ibv_req_notify_cq(cq, 0); assert(err == 0); mr_data = ibv_reg_mr(pd, data, BUFFER_SIZE, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE); assert(mr_data); mr_ack_buffer = ibv_reg_mr(pd, &ack_buffer, sizeof(ack_buffer), IBV_ACCESS_LOCAL_WRITE); assert(mr_ack_buffer); qp_attr.cap.max_send_wr = 10; qp_attr.cap.max_send_sge = 10; qp_attr.cap.max_recv_wr = 10; qp_attr.cap.max_recv_sge = 10; qp_attr.send_cq = cq; qp_attr.recv_cq = cq; qp_attr.qp_type = IBV_QPT_RC; err = rdma_create_qp(cm_id, pd, &qp_attr); assert(err == 0); /* Post receive for data before connecting */ sge_data.addr = (uintptr_t)data; sge_data.length = BUFFER_SIZE; sge_data.lkey = mr_data->lkey; recv_wr.sg_list = &sge_data; recv_wr.num_sge = 1; err = ibv_post_recv(cm_id->qp, &recv_wr, &bad_recv_wr); assert(err == 0); /* Construct connection params */ client_pdata.data_va = htonll((uintptr_t)data); client_pdata.data_rkey = htonl(mr_data->rkey); conn_param.private_data
/* Connect and wait for ESTABLISHED.  The server's private data (index and
 * ack-buffer coordinates) is memcpy'd out BEFORE the event is acked,
 * because acking releases the event's storage; the values are then mirrored
 * into client_pdata for later use. */
= &client_pdata; conn_param.private_data_len = sizeof(client_pdata); conn_param.initiator_depth = 1; conn_param.retry_count = 7; /* Connect to server */ err = rdma_connect(cm_id, &conn_param); assert(err == 0); err = rdma_get_cm_event(cm_channel, &event); assert(err == 0); assert(event->event == RDMA_CM_EVENT_ESTABLISHED); memcpy(&server_pdata, event->param.conn.private_data, sizeof(server_pdata)); rdma_ack_cm_event(event); printf("My index == %d\n", server_pdata.index); /* Construct connection params */ client_pdata.index = server_pdata.index; client_pdata.ack_buffer_va = server_pdata.ack_buffer_va; client_pdata.ack_buffer_rkey = server_pdata.ack_buffer_rkey; }
/*
 * ibw_setup_cq_qp - build the per-connection verbs objects for an ibw_conn.
 *
 * Creates, in order: a completion channel (its fd is registered with the
 * tevent loop so completions are handled by ibw_event_handler_verbs), a PD,
 * the connection's memory registrations (ibw_init_memory), a single CQ
 * sized max_recv_wr + max_send_wr and armed for notification, and an RC QP
 * (created via rdma_create_qp, so the result lands in pconn->cm_id->qp).
 * Finishes with an ibv_query_qp sanity query and pre-fills the receive
 * queue via ibw_fill_cq.
 *
 * Returns 0 on success; on failure writes a message into ibw_lasterr and
 * returns -1 or the failing verb's rc.  Partially created objects are left
 * for the caller/talloc teardown to release -- TODO confirm.
 */
static int ibw_setup_cq_qp(struct ibw_conn *conn) { struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv); struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv); struct ibv_qp_init_attr init_attr; struct ibv_qp_attr attr; int rc; DEBUG(DEBUG_DEBUG, ("ibw_setup_cq_qp(cmid: %p)\n", pconn->cm_id)); /* init verbs */ pconn->verbs_channel = ibv_create_comp_channel(pconn->cm_id->verbs); if (!pconn->verbs_channel) { sprintf(ibw_lasterr, "ibv_create_comp_channel failed %d\n", errno); return -1; } DEBUG(DEBUG_DEBUG, ("created channel %p\n", pconn->verbs_channel)); pconn->verbs_channel_event = tevent_add_fd(pctx->ectx, NULL, /* not pconn or conn */ pconn->verbs_channel->fd, TEVENT_FD_READ, ibw_event_handler_verbs, conn); pconn->pd = ibv_alloc_pd(pconn->cm_id->verbs); if (!pconn->pd) { sprintf(ibw_lasterr, "ibv_alloc_pd failed %d\n", errno); return -1; } DEBUG(DEBUG_DEBUG, ("created pd %p\n", pconn->pd)); /* init mr */ if (ibw_init_memory(conn)) return -1; /* init cq */ pconn->cq = ibv_create_cq(pconn->cm_id->verbs, pctx->opts.max_recv_wr + pctx->opts.max_send_wr, conn, pconn->verbs_channel, 0); if (pconn->cq==NULL) { sprintf(ibw_lasterr, "ibv_create_cq failed\n"); return -1; } rc = ibv_req_notify_cq(pconn->cq, 0); if (rc) { sprintf(ibw_lasterr, "ibv_req_notify_cq failed with %d\n", rc); return rc; } /* init qp */ memset(&init_attr, 0, sizeof(init_attr)); init_attr.cap.max_send_wr = pctx->opts.max_send_wr; init_attr.cap.max_recv_wr = pctx->opts.max_recv_wr; init_attr.cap.max_recv_sge = 1; init_attr.cap.max_send_sge = 1; init_attr.qp_type = IBV_QPT_RC; init_attr.send_cq = pconn->cq; init_attr.recv_cq = pconn->cq; rc = rdma_create_qp(pconn->cm_id, pconn->pd, &init_attr); if (rc) { sprintf(ibw_lasterr, "rdma_create_qp failed with %d\n", rc); return rc; } /* else result is in pconn->cm_id->qp */ rc = ibv_query_qp(pconn->cm_id->qp, &attr, IBV_QP_PATH_MTU, &init_attr); if (rc) { sprintf(ibw_lasterr, "ibv_query_qp failed with %d\n", rc); return rc; } return ibw_fill_cq(conn); }
/** * the first step in original MPID_nem_ib_setup_conn() function * open hca, create ptags and create cqs */ int MPID_nem_ib_open_ports() { int mpi_errno = MPI_SUCCESS; /* Infiniband Verb Structures */ struct ibv_port_attr port_attr; struct ibv_device_attr dev_attr; int nHca; /* , curRank, rail_index ; */ MPIDI_STATE_DECL(MPID_STATE_MPIDI_OPEN_HCA); MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_OPEN_HCA); for (nHca = 0; nHca < ib_hca_num_hcas; nHca++) { if (ibv_query_device(hca_list[nHca].nic_context, &dev_attr)) { MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "Error getting HCA attributes"); } /* detecting active ports */ if (rdma_default_port < 0 || ib_hca_num_ports > 1) { int nPort; int k = 0; for (nPort = 1; nPort <= RDMA_DEFAULT_MAX_PORTS; nPort ++) { if ((! ibv_query_port(hca_list[nHca].nic_context, nPort, &port_attr)) && port_attr.state == IBV_PORT_ACTIVE && (port_attr.lid || (!port_attr.lid && use_iboeth))) { if (use_iboeth) { if (ibv_query_gid(hca_list[nHca].nic_context, nPort, 0, &hca_list[nHca].gids[k])) { /* new error information function needed */ MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "Failed to retrieve gid on rank %d", process_info.rank); } DEBUG_PRINT("[%d] %s(%d): Getting gid[%d][%d] for" " port %d subnet_prefix = %llx," " intf_id = %llx\r\n", process_info.rank, __FUNCTION__, __LINE__, nHca, k, k, hca_list[nHca].gids[k].global.subnet_prefix, hca_list[nHca].gids[k].global.interface_id); } else { hca_list[nHca].lids[k] = port_attr.lid; } hca_list[nHca].ports[k++] = nPort; if (check_attrs(&port_attr, &dev_attr)) { MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "Attributes failed sanity check"); } } } if (k < ib_hca_num_ports) { MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**activeports", "**activeports %d", ib_hca_num_ports); } } else { if(ibv_query_port(hca_list[nHca].nic_context, rdma_default_port, &port_attr) || (!port_attr.lid && !use_iboeth) || (port_attr.state != 
IBV_PORT_ACTIVE)) { MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**portquery", "**portquery %d", rdma_default_port); } hca_list[nHca].ports[0] = rdma_default_port; if (use_iboeth) { if (ibv_query_gid(hca_list[nHca].nic_context, 0, 0, &hca_list[nHca].gids[0])) { /* new error function needed */ MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "Failed to retrieve gid on rank %d", process_info.rank); } if (check_attrs(&port_attr, &dev_attr)) { MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %s", "Attributes failed sanity check"); } } else { hca_list[nHca].lids[0] = port_attr.lid; } } if (rdma_use_blocking) { hca_list[nHca].comp_channel = ibv_create_comp_channel(hca_list[nHca].nic_context); if (!hca_list[nHca].comp_channel) { MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot create completion channel"); } hca_list[nHca].send_cq_hndl = NULL; hca_list[nHca].recv_cq_hndl = NULL; hca_list[nHca].cq_hndl = ibv_create_cq(hca_list[nHca].nic_context, rdma_default_max_cq_size, NULL, hca_list[nHca].comp_channel, 0); if (!hca_list[nHca].cq_hndl) { MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot create cq"); } if (ibv_req_notify_cq(hca_list[nHca].cq_hndl, 0)) { MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail, "**fail", "**fail %s", "cannot request cq notification"); }
/***************************************************************************//** * Description * Init rdma global resources * ******************************************************************************/ static struct thread_context* init_rdma_thread_resources() { struct thread_context *ctx = calloc(1, sizeof(struct thread_context)); ctx->qp_hash = hashtable_create(1024); int num_device; if ( !(ctx->device_ctx_list = rdma_get_devices(&num_device)) ) { perror("rdma_get_devices()"); return NULL; } ctx->device_ctx = *ctx->device_ctx_list; if (verbose) { printf("Get device: %d\n", num_device); } if ( !(ctx->pd = ibv_alloc_pd(ctx->device_ctx)) ) { perror("ibv_alloc_pd()"); return NULL; } if ( !(ctx->comp_channel = ibv_create_comp_channel(ctx->device_ctx)) ) { perror("ibv_create_comp_channel()"); return NULL; } struct ibv_srq_init_attr srq_init_attr; srq_init_attr.srq_context = NULL; srq_init_attr.attr.max_sge = 16; srq_init_attr.attr.max_wr = srq_size; srq_init_attr.attr.srq_limit = srq_size; /* RDMA TODO: what is srq_limit? 
*/ if ( !(ctx->srq = ibv_create_srq(ctx->pd, &srq_init_attr)) ) { perror("ibv_create_srq()"); return NULL; } if ( !(ctx->send_cq = ibv_create_cq(ctx->device_ctx, cq_size, NULL, ctx->comp_channel, 0)) ) { perror("ibv_create_cq()"); return NULL; } if (0 != ibv_req_notify_cq(ctx->send_cq, 0)) { perror("ibv_reg_notify_cq()"); return NULL; } if ( !(ctx->recv_cq = ibv_create_cq(ctx->device_ctx, cq_size, NULL, ctx->comp_channel, 0)) ) { perror("ibv_create_cq()"); return NULL; } if (0 != ibv_req_notify_cq(ctx->recv_cq, 0)) { perror("ibv_reg_notify_cq()"); return NULL; } ctx->rsize = BUFF_SIZE; ctx->rbuf_list = calloc(buff_per_thread, sizeof(char *)); ctx->rmr_list = calloc(buff_per_thread, sizeof(struct ibv_mr*)); ctx->poll_wc = calloc(poll_wc_size, sizeof(struct ibv_wc)); int i = 0; for (i = 0; i < buff_per_thread; ++i) { ctx->rbuf_list[i] = malloc(ctx->rsize); if (ctx->rbuf_list[i] == 0) { break; } } if (i != buff_per_thread) { int j = 0; for (j = 0; j < i; ++j) { free(ctx->rbuf_list[j]); } free(ctx->rbuf_list); ctx->rbuf_list = 0; } if (!ctx->rmr_list || !ctx->rbuf_list) { fprintf(stderr, "out of ctxmory in init_rdma_thread_resources()\n"); return NULL; } struct ibv_recv_wr *bad = NULL; struct ibv_sge sge; struct ibv_recv_wr rwr; for (i = 0; i < buff_per_thread; ++i) { ctx->rmr_list[i] = ibv_reg_mr(ctx->pd, ctx->rbuf_list[i], ctx->rsize, IBV_ACCESS_LOCAL_WRITE); sge.addr = (uintptr_t)ctx->rbuf_list[i]; sge.length = ctx->rsize; sge.lkey = ctx->rmr_list[i]->lkey; rwr.wr_id = (uintptr_t)ctx->rmr_list[i]; rwr.next = NULL; rwr.sg_list = &sge; rwr.num_sge = 1; if (0 != ibv_post_srq_recv(ctx->srq, &rwr, &bad)) { perror("ibv_post_srq_recv()"); return NULL; } } return ctx; }
/*
 * pp_init_ctx - build the pingpong context for the experimental-verbs
 * (cross-channel / calc) test: two page-aligned buffers (buf = host-order
 * scratch, net_buf = the wire buffer that is registered), device context,
 * optional completion channel, PD, MR, optional parsed calc operation with
 * its gather list, a data CQ/QP pair and a management CQ/QP pair (both QPs
 * created with IBV_EXP_QP_CREATE_CROSS_CHANNEL and moved to INIT).
 * Returns NULL on failure after unwinding via the goto ladder.
 *
 * Fix vs. previous revision: after creating the management QP the code
 * tested `if (!ctx->qp)` (the already-validated data QP) instead of
 * `if (!ctx->mqp)`, so an MQP creation failure went undetected.
 */
struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, int use_event, enum pp_wr_calc_op calc_op, enum pp_wr_data_type calc_data_type, char *calc_operands_str) { struct pingpong_context *ctx; int rc; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; memset(ctx, 0, sizeof *ctx); ctx->size = size; ctx->rx_depth = rx_depth; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; ctx->buf = memalign(page_size, size); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_ctx; } memset(ctx->buf, 0, size); ctx->net_buf = memalign(page_size, size); if (!ctx->net_buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_buffer; } memset(ctx->net_buf, 0, size); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); goto clean_net_buf; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); goto clean_device; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->net_buf, size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); goto clean_pd; } if (calc_op != PP_CALC_INVALID) { int op_per_gather, num_op, max_num_op; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; num_op = pp_parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type, &ctx->calc_op, ctx->context, ctx->buf, ctx->net_buf); if (num_op < 0) { fprintf(stderr, "-E- failed parsing calc operators\n"); goto clean_mr; } rc = pp_query_calc_cap(ctx->context, ctx->calc_op.opcode,
/* Verify the device supports the requested calc op, build the gather list,
 * then create the data CQ/QP (cross-channel capable) and move the QP to
 * INIT, followed by the management CQ/QP pair used for CQE-wait / enable
 * work requests. */
ctx->calc_op.data_type, ctx->calc_op.data_size, &op_per_gather, &max_num_op); if (rc) { fprintf(stderr, "-E- operation not supported on %s. valid ops are:\n", ibv_get_device_name(ib_dev)); pp_print_dev_calc_ops(ctx->context); goto clean_mr; } if (pp_prepare_sg_list(op_per_gather, num_op, ctx->mr->lkey, &ctx->calc_op, ctx->net_buf)) { fprintf(stderr, "-failed to prepare the sg list\n"); goto clean_mr; } } ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); goto clean_mr; } { struct ibv_exp_qp_init_attr attr = { .send_cq = ctx->cq, .recv_cq = ctx->cq, .cap = { .max_send_wr = 16, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->qp = ibv_exp_create_qp(ctx->context, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); goto clean_cq; } } { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); goto clean_qp; } } ctx->mcq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->mcq) { fprintf(stderr, "Couldn't create CQ for MQP\n"); goto clean_qp; } { struct ibv_exp_qp_init_attr mattr = { .send_cq = ctx->mcq, .recv_cq = ctx->mcq, .cap = { .max_send_wr = 1, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; mattr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; mattr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->mqp = ibv_exp_create_qp(ctx->context, &mattr); if (!ctx->mqp) { fprintf(stderr, "Couldn't create MQP\n"); goto clean_mcq; } } { struct ibv_qp_attr mattr = {
/* Move the management QP to INIT; on any failure unwind everything created
 * so far.  pp_close_ctx is the matching teardown for a fully built ctx. */
.qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->mqp, &mattr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify MQP to INIT\n"); goto clean_mqp; } } return ctx; clean_mqp: ibv_destroy_qp(ctx->mqp); clean_mcq: ibv_destroy_cq(ctx->mcq); clean_qp: ibv_destroy_qp(ctx->qp); clean_cq: ibv_destroy_cq(ctx->cq); clean_mr: ibv_dereg_mr(ctx->mr); clean_pd: ibv_dealloc_pd(ctx->pd); clean_comp_channel: if (ctx->channel) ibv_destroy_comp_channel(ctx->channel); clean_device: ibv_close_device(ctx->context); clean_net_buf: free(ctx->net_buf); clean_buffer: free(ctx->buf); clean_ctx: free(ctx); return NULL; } int pp_close_ctx(struct pingpong_context *ctx) { if (ibv_destroy_qp(ctx->qp)) { fprintf(stderr, "Couldn't destroy QP\n"); return 1; } if (ibv_destroy_qp(ctx->mqp)) { fprintf(stderr, "Couldn't destroy MQP\n"); return 1; } if (ibv_destroy_cq(ctx->cq)) { fprintf(stderr, "Couldn't destroy CQ\n"); return 1; } if (ibv_destroy_cq(ctx->mcq)) { fprintf(stderr, "Couldn't destroy MCQ\n"); return 1; } if (ibv_dereg_mr(ctx->mr)) { fprintf(stderr, "Couldn't deregister MR\n"); return 1; } if (ibv_dealloc_pd(ctx->pd)) { fprintf(stderr, "Couldn't deallocate PD\n"); return 1; } if (ctx->channel) { if (ibv_destroy_comp_channel(ctx->channel)) { fprintf(stderr, "Couldn't destroy completion channel\n"); return 1; } } if (ibv_close_device(ctx->context)) { fprintf(stderr, "Couldn't release context\n"); return 1; } free(ctx->buf); free(ctx->net_buf); free(ctx); return 0; } static int pp_post_recv(struct pingpong_context *ctx, int n) { int rc; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_recv_wr wr = { .wr_id = PP_RECV_WRID, .sg_list = &list, .num_sge = 1, }; struct ibv_recv_wr *bad_wr; int i; for (i = 0; i < n; ++i) { rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); if (rc) return rc; } return i; } static int
/* pp_post_send: post one signaled SEND of net_buf on the data QP; when a
 * calc operation is configured (opcode != the NUMBER sentinel) the WR is
 * switched to the calc gather list with IBV_EXP_SEND_WITH_CALC.
 * pp_post_ext_wqe: post a management WR (SEND/RECV_ENABLE or CQE_WAIT on
 * the data CQ) on the management QP.
 * pp_poll_mcq: busy-poll up to two completions off the management CQ and
 * require each to be a successful PP_CQE_WAIT. */
pp_post_send(struct pingpong_context *ctx) { int ret; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_exp_send_wr wr = { .wr_id = PP_SEND_WRID, .sg_list = &list, .num_sge = 1, .exp_opcode = IBV_EXP_WR_SEND, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; /* If this is a calc operation - set the required params in the wr */ if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) { wr.exp_opcode = IBV_EXP_WR_SEND; wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC; wr.sg_list = ctx->calc_op.gather_list; wr.num_sge = ctx->calc_op.gather_list_size; wr.op.calc.calc_op = ctx->calc_op.opcode; wr.op.calc.data_type = ctx->calc_op.data_type; wr.op.calc.data_size = ctx->calc_op.data_size; } ret = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); return ret; } int pp_post_ext_wqe(struct pingpong_context *ctx, enum ibv_exp_wr_opcode op) { int ret; struct ibv_exp_send_wr wr = { .wr_id = PP_CQE_WAIT, .sg_list = NULL, .num_sge = 0, .exp_opcode = op, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; switch (op) { case IBV_EXP_WR_RECV_ENABLE: case IBV_EXP_WR_SEND_ENABLE: wr.task.wqe_enable.qp = ctx->qp; wr.task.wqe_enable.wqe_count = 0; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; case IBV_EXP_WR_CQE_WAIT: wr.task.cqe_wait.cq = ctx->cq; wr.task.cqe_wait.cq_count = 1; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; default: fprintf(stderr, "-E- unsupported m_wqe opcode %d\n", op); return -1; } ret = ibv_exp_post_send(ctx->mqp, &wr, &bad_wr); return ret; } int pp_poll_mcq(struct ibv_cq *cq, int num_cqe) { int ne; int i; struct ibv_wc wc[2]; if (num_cqe > 2) { fprintf(stderr, "-E- max num cqe exceeded\n"); return -1; } do { ne = ibv_poll_cq(cq, num_cqe, wc); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } } while (ne < 1); for (i = 0; i < ne; ++i) { if (wc[i].status != IBV_WC_SUCCESS) { fprintf(stderr, "Failed %s status %s (%d)\n",
/* pp_calc_verify / pp_update_last_result: recompute the calc result on the
 * host via EXEC_VERIFY (operands live at fixed offsets in ctx->buf) to
 * check, respectively update, ctx->last_result.  usage: CLI help text. */
wr_id_str[(int)wc[i].wr_id], ibv_wc_status_str(wc[i].status), wc[i].status); return 1; } if ((int) wc[i].wr_id != PP_CQE_WAIT) { fprintf(stderr, "invalid wr_id %" PRIx64 "\n", wc[i].wr_id); return -1; } } return 0; } static int pp_calc_verify(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { uint64_t *op1 = &(ctx->last_result); uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t *res = (uint64_t *)ctx->buf; return !EXEC_VERIFY(calc_data_type, calc_opcode, 1, op1, op2, res); } static int pp_update_last_result(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { /* EXEC_VERIFY dereferences the result parameter */ uint64_t *dummy; uint64_t *op1 = (uint64_t *)ctx->buf; uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t res = (uint64_t)EXEC_VERIFY(calc_data_type, calc_opcode, 0, op1, op2, dummy); ctx->last_result = res; return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s <host> connect to server at <host>\n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); printf(" -s, --size=<size> size of message to exchange (default 4096 minimum 16)\n"); printf(" -m, --mtu=<size> path MTU (default 1024)\n"); printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); printf(" -n, --iters=<iters> number of exchanges (default 1000)\n"); printf(" -l, --sl=<sl> service level value\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -c, --calc=<operation> calc operation\n"); printf(" -t, --op_type=<type> calc operands type\n"); printf(" -o, --operands=<o1,o2,...> comma separated list of operands\n"); printf(" -w, --wait_cq=cqn wait for entries on cq\n"); printf(" -v, --verbose print verbose information\n"); printf(" -V, --verify verify calc operations\n"); }
int main(int argc, char *argv[]) { struct ibv_pd *pd1, *pd2; struct ibv_comp_channel *comp_chan1, *comp_chan2; struct ibv_cq *cq1, *cq2; struct ibv_cq *evt_cq = NULL; struct ibv_mr *mr1, *mr2; struct ibv_qp_init_attr qp_attr1 = { }, qp_attr2 = {}; struct ibv_sge sge; struct ibv_send_wr send_wr = { }; struct ibv_send_wr *bad_send_wr = NULL; struct ibv_wc wc; struct ibv_qp *qp1, *qp2; void *cq_context = NULL; union ibv_gid gid1, gid2; int n; uint8_t *buf1, *buf2; int err; int num_devices; struct ibv_context * verbs1, *verbs2; struct ibv_device ** dev_list = ibv_get_device_list(&num_devices); struct ibv_device_attr dev_attr; int use = 0; int port = 1; int x = 0; unsigned long mb = 0; unsigned long bytes = 0; unsigned long save_diff = 0; struct timeval start, stop, diff; int iterations = 0; struct rusage usage; struct timeval ustart, uend; struct timeval sstart, send; struct timeval tstart, tend; DPRINTF("There are %d devices\n", num_devices); for(x = 0; x < num_devices; x++) { printf("Device: %d, %s\n", x, ibv_get_device_name(dev_list[use])); } if(num_devices == 0 || dev_list == NULL) { printf("No devices found\n"); return 1; } if(argc < 2) { printf("Which RDMA device to use? 0, 1, 2, 3...\n"); return 1; } use = atoi(argv[1]); DPRINTF("Using device %d\n", use); verbs1 = ibv_open_device(dev_list[use]); if(verbs1 == NULL) { printf("Failed to open device!\n"); return 1; } DPRINTF("Device open %s\n", ibv_get_device_name(dev_list[use])); verbs2 = ibv_open_device(dev_list[use]); if(verbs2 == NULL) { printf("Failed to open device again!\n"); return 1; } if(ibv_query_device(verbs1, &dev_attr)) { printf("Failed to query device attributes.\n"); return 1; } printf("Device open: %d, %s which has %d ports\n", x, ibv_get_device_name(dev_list[use]), dev_attr.phys_port_cnt); if(argc < 3) { printf("Which port on the device to use? 
1, 2, 3...\n"); return 1; } port = atoi(argv[2]); if(port <= 0) { printf("Port #%d invalid, must start with 1, 2, 3, ...\n", port); return 1; } printf("Using port %d\n", port); if(argc < 4) { printf("How many iterations to perform?\n"); return 1; } iterations = atoi(argv[3]); printf("Will perform %d iterations\n", iterations); pd1 = ibv_alloc_pd(verbs1); if (!pd1) return 1; if(argc < 5) { printf("How many megabytes to allocate? (This will be allocated twice. Once for source, once for destination.)\n"); return 1; } mb = atoi(argv[4]); if(mb <= 0) { printf("Megabytes %lu invalid\n", mb); return 1; } DPRINTF("protection domain1 allocated\n"); pd2 = ibv_alloc_pd(verbs2); if (!pd2) return 1; DPRINTF("protection domain2 allocated\n"); comp_chan1 = ibv_create_comp_channel(verbs1); if (!comp_chan1) return 1; DPRINTF("completion chan1 created\n"); comp_chan2 = ibv_create_comp_channel(verbs2); if (!comp_chan2) return 1; DPRINTF("completion chan2 created\n"); cq1 = ibv_create_cq(verbs1, 2, NULL, comp_chan1, 0); if (!cq1) return 1; DPRINTF("CQ1 created\n"); cq2 = ibv_create_cq(verbs2, 2, NULL, comp_chan2, 0); if (!cq2) return 1; DPRINTF("CQ2 created\n"); bytes = mb * 1024UL * 1024UL; buf1 = malloc(bytes); if (!buf1) return 1; buf2 = malloc(bytes); if (!buf2) return 1; printf("Populating %lu MB memory.\n", mb * 2); for(x = 0; x < bytes; x++) { buf1[x] = 123; } buf1[bytes - 1] = 123; mr1 = ibv_reg_mr(pd1, buf1, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr1) { printf("Failed to register memory.\n"); return 1; } mr2 = ibv_reg_mr(pd2, buf2, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr2) { printf("Failed to register memory.\n"); return 1; } DPRINTF("memory registered.\n"); qp_attr1.cap.max_send_wr = 10; qp_attr1.cap.max_send_sge = 10; qp_attr1.cap.max_recv_wr = 10; qp_attr1.cap.max_recv_sge = 10; qp_attr1.sq_sig_all = 1; qp_attr1.send_cq = cq1; qp_attr1.recv_cq = cq1; qp_attr1.qp_type = 
IBV_QPT_RC; qp1 = ibv_create_qp(pd1, &qp_attr1); if (!qp1) { printf("failed to create queue pair #1\n"); return 1; } DPRINTF("queue pair1 created\n"); qp_attr2.cap.max_send_wr = 10; qp_attr2.cap.max_send_sge = 10; qp_attr2.cap.max_recv_wr = 10; qp_attr2.cap.max_recv_sge = 10; qp_attr2.sq_sig_all = 1; qp_attr2.send_cq = cq2; qp_attr2.recv_cq = cq2; qp_attr2.qp_type = IBV_QPT_RC; qp2 = ibv_create_qp(pd2, &qp_attr2); if (!qp2) { printf("failed to create queue pair #2\n"); return 1; } DPRINTF("queue pair2 created\n"); struct ibv_qp_attr attr1 = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE, }; if(ibv_modify_qp(qp1, &attr1, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { printf("verbs 1 Failed to go to init\n"); return 1; } DPRINTF("verbs1 to init\n"); struct ibv_qp_attr attr2 = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE, }; if(ibv_modify_qp(qp2, &attr2, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { printf("verbs 2 Failed to go to init\n"); return 1; } DPRINTF("verbs2 to init\n"); //struct ibv_gid gid1, gid2; struct ibv_port_attr port1, port2; uint64_t psn1 = lrand48() & 0xffffff; uint64_t psn2 = lrand48() & 0xffffff; if(ibv_query_port(verbs1, port, &port1)) return 1; DPRINTF("got port1 information\n"); if(ibv_query_port(verbs2, port, &port2)) return 1; DPRINTF("got port2 information\n"); if(ibv_query_gid(verbs1, 1, 0, &gid1)) return 1; DPRINTF("got gid1 information\n"); if(ibv_query_gid(verbs2, 1, 0, &gid2)) return 1; DPRINTF("got gid2 information\n"); struct ibv_qp_attr next2 = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_1024, .dest_qp_num = qp2->qp_num, .rq_psn = psn2, .max_dest_rd_atomic = 5, .min_rnr_timer = 12, .ah_attr = { .is_global = 0, .dlid = port2.lid, .sl = 0, .src_path_bits = 0, 
.port_num = port, } }; if(gid2.global.interface_id) { next2.ah_attr.is_global = 1; next2.ah_attr.grh.hop_limit = 1; next2.ah_attr.grh.dgid = gid2; next2.ah_attr.grh.sgid_index = 0; } struct ibv_qp_attr next1 = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_1024, .dest_qp_num = qp1->qp_num, .rq_psn = psn1, .max_dest_rd_atomic = 1, .min_rnr_timer = 12, .ah_attr = { .is_global = 0, .dlid = port1.lid, .sl = 0, .src_path_bits = 0, .port_num = port, } }; if(gid1.global.interface_id) { next1.ah_attr.is_global = 1; next1.ah_attr.grh.hop_limit = 1; next1.ah_attr.grh.dgid = gid1; next1.ah_attr.grh.sgid_index = 0; } if(ibv_modify_qp(qp2, &next1, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { printf("Failed to modify verbs2 to ready\n"); return 1; } DPRINTF("verbs2 RTR\n"); if(ibv_modify_qp(qp1, &next2, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { printf("Failed to modify verbs1 to ready\n"); return 1; } DPRINTF("verbs1 RTR\n"); next2.qp_state = IBV_QPS_RTS; next2.timeout = 14; next2.retry_cnt = 7; next2.rnr_retry = 7; next2.sq_psn = psn1; next2.max_rd_atomic = 1; if(ibv_modify_qp(qp1, &next2, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { printf("Failed again to modify verbs1 to ready\n"); return 1; } DPRINTF("verbs1 RTS\n"); next1.qp_state = IBV_QPS_RTS; next1.timeout = 14; next1.retry_cnt = 7; next1.rnr_retry = 7; next1.sq_psn = psn2; next1.max_rd_atomic = 1; if(ibv_modify_qp(qp2, &next1, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { printf("Failed again to modify verbs2 to ready\n"); return 1; } DPRINTF("verbs2 RTS\n"); printf("Performing RDMA first.\n"); iterations = atoi(argv[3]); getrusage(RUSAGE_SELF, &usage); ustart = usage.ru_utime; sstart = usage.ru_stime; 
gettimeofday(&tstart, NULL); while(iterations-- > 0) { sge.addr = (uintptr_t) buf1; sge.length = bytes; sge.lkey = mr1->lkey; send_wr.wr_id = 1; send_wr.opcode = IBV_WR_RDMA_WRITE; send_wr.sg_list = &sge; send_wr.num_sge = 1; send_wr.send_flags = IBV_SEND_SIGNALED; send_wr.wr.rdma.rkey = mr2->rkey; send_wr.wr.rdma.remote_addr = (uint64_t) buf2; DPRINTF("Iterations left: %d\n", iterations); if (ibv_req_notify_cq(cq1, 0)) return 1; DPRINTF("Submitting local RDMA\n"); gettimeofday(&start, NULL); if (ibv_post_send(qp1, &send_wr, &bad_send_wr)) return 1; DPRINTF("RDMA posted %p %p\n", &send_wr, bad_send_wr); DPRINTF("blocking...\n"); if(ibv_get_cq_event(comp_chan1, &evt_cq, &cq_context)) { printf("failed to get CQ event\n"); return 1; } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); DPRINTF("RDMA took: %lu us\n", diff.tv_usec); ibv_ack_cq_events(evt_cq, 1); DPRINTF("got event\n"); n = ibv_poll_cq(cq1, 1, &wc); if (n > 0) { DPRINTF("return from poll: %lu\n", wc.wr_id); if (wc.status != IBV_WC_SUCCESS) { printf("poll failed %s\n", ibv_wc_status_str(wc.status)); return 1; } if (wc.wr_id == 1) { DPRINTF("Finished %d bytes %d %d\n", n, buf1[bytes - 1], buf2[bytes - 1]); } else { printf("didn't find completion\n"); } } if (n < 0) { printf("poll returned error\n"); return 1; } DPRINTF("Poll returned %d bytes %d %d\n", n, buf1[0], buf2[0]); } gettimeofday(&tend, NULL); getrusage(RUSAGE_SELF, &usage); uend = usage.ru_utime; send = usage.ru_stime; save_diff = 0; timersub(&uend, &ustart, &diff); save_diff += diff.tv_usec; printf("User CPU time: %lu us\n", diff.tv_usec); timersub(&send, &sstart, &diff); save_diff += diff.tv_usec; printf("System CPU time: %lu us\n", diff.tv_usec); timersub(&tend, &tstart, &diff); printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff); printf("Wall clock CPU time: %lu us\n", diff.tv_usec); iterations = atoi(argv[3]); printf("Now using the CPU instead....\n"); getrusage(RUSAGE_SELF, &usage); ustart = usage.ru_utime; sstart = 
usage.ru_stime; gettimeofday(&tstart, NULL); while(iterations-- > 0) { DPRINTF("Repeating without RDMA...\n"); gettimeofday(&start, NULL); memcpy(buf2, buf1, bytes); gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); DPRINTF("Regular copy too took: %lu us\n", diff.tv_usec); } gettimeofday(&tend, NULL); getrusage(RUSAGE_SELF, &usage); uend = usage.ru_utime; send = usage.ru_stime; save_diff = 0; timersub(&uend, &ustart, &diff); save_diff += diff.tv_usec; printf("User CPU time: %lu us\n", diff.tv_usec); timersub(&send, &sstart, &diff); save_diff += diff.tv_usec; printf("System CPU time: %lu us\n", diff.tv_usec); timersub(&tend, &tstart, &diff); printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff); printf("Wall clock CPU time: %lu us\n", diff.tv_usec); return 0; }
/*
 * Set up all verbs resources for the XRC test into the file-scope ctx
 * (presumably a global — defined outside this chunk; verify): per-client
 * QP arrays, device/port info, registered buffer, optional completion
 * channel, XRC domain backed by /tmp/xrc_domain, recv/send CQs, and the
 * XRC SRQ.  Returns 0 on success, 1 on failure.
 *
 * NOTE(review): error paths return without releasing earlier resources;
 * presumably the caller exits on failure — confirm before reusing.
 */
static int pp_init_ctx(char *ib_devname)
{
	struct ibv_srq_init_attr_ex attr;
	struct ibv_xrcd_init_attr xrcd_attr;
	struct ibv_port_attr port_attr;

	/* One recv QP, send QP, and remote-dest record per client. */
	ctx.recv_qp = calloc(ctx.num_clients, sizeof *ctx.recv_qp);
	ctx.send_qp = calloc(ctx.num_clients, sizeof *ctx.send_qp);
	ctx.rem_dest = calloc(ctx.num_clients, sizeof *ctx.rem_dest);
	if (!ctx.recv_qp || !ctx.send_qp || !ctx.rem_dest)
		return 1;

	if (open_device(ib_devname)) {
		fprintf(stderr, "Failed to open device\n");
		return 1;
	}

	if (pp_get_port_info(ctx.context, ctx.ib_port, &port_attr)) {
		fprintf(stderr, "Failed to get port info\n");
		return 1;
	}
	ctx.lid = port_attr.lid;
	/* A zero LID is only valid on Ethernet (RoCE) link layers. */
	if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET && !ctx.lid) {
		fprintf(stderr, "Couldn't get local LID\n");
		return 1;
	}

	/* Page-aligned work buffer, zero-filled before registration. */
	ctx.buf = memalign(page_size, ctx.size);
	if (!ctx.buf) {
		fprintf(stderr, "Couldn't allocate work buf.\n");
		return 1;
	}
	memset(ctx.buf, 0, ctx.size);

	/* Completion channel only when event-driven (-e) mode is on;
	 * otherwise ctx.channel stays NULL and the CQ below is polled. */
	if (ctx.use_event) {
		ctx.channel = ibv_create_comp_channel(ctx.context);
		if (!ctx.channel) {
			fprintf(stderr, "Couldn't create completion channel\n");
			return 1;
		}
	}

	ctx.pd = ibv_alloc_pd(ctx.context);
	if (!ctx.pd) {
		fprintf(stderr, "Couldn't allocate PD\n");
		return 1;
	}

	ctx.mr = ibv_reg_mr(ctx.pd, ctx.buf, ctx.size, IBV_ACCESS_LOCAL_WRITE);
	if (!ctx.mr) {
		fprintf(stderr, "Couldn't register MR\n");
		return 1;
	}

	/* Backing file lets multiple processes share the XRC domain.
	 * Failure is deliberately non-fatal: fd = -1 plus O_CREAT below
	 * still yields a process-private domain. */
	ctx.fd = open("/tmp/xrc_domain", O_RDONLY | O_CREAT, S_IRUSR | S_IRGRP);
	if (ctx.fd < 0) {
		fprintf(stderr,
			"Couldn't create the file for the XRC Domain "
			"but not stopping %d\n", errno);
		ctx.fd = -1;
	}

	memset(&xrcd_attr, 0, sizeof xrcd_attr);
	xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS;
	xrcd_attr.fd = ctx.fd;
	xrcd_attr.oflags = O_CREAT;
	ctx.xrcd = ibv_open_xrcd(ctx.context, &xrcd_attr);
	if (!ctx.xrcd) {
		fprintf(stderr, "Couldn't Open the XRC Domain %d\n", errno);
		return 1;
	}

	/* Recv CQ: cq_context is set to &ctx.recv_cq (self-identifying
	 * cookie for the event handler, by the look of it — verify against
	 * the event loop); channel is NULL unless use_event. */
	ctx.recv_cq = ibv_create_cq(ctx.context, ctx.num_clients, &ctx.recv_cq,
				    ctx.channel, 0);
	if (!ctx.recv_cq) {
		fprintf(stderr, "Couldn't create recv CQ\n");
		return 1;
	}
	if (ctx.use_event) {
		if (ibv_req_notify_cq(ctx.recv_cq, 0)) {
			fprintf(stderr, "Couldn't request CQ notification\n");
			return 1;
		}
	}

	/* Send CQ is always polled — no channel, no context cookie. */
	ctx.send_cq = ibv_create_cq(ctx.context, ctx.num_clients, NULL, NULL, 0);
	if (!ctx.send_cq) {
		fprintf(stderr, "Couldn't create send CQ\n");
		return 1;
	}

	/* XRC SRQ shared by all clients, tied to the XRCD, recv CQ and PD. */
	memset(&attr, 0, sizeof attr);
	attr.attr.max_wr = ctx.num_clients;
	attr.attr.max_sge = 1;
	attr.comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_XRCD |
			 IBV_SRQ_INIT_ATTR_CQ | IBV_SRQ_INIT_ATTR_PD;
	attr.srq_type = IBV_SRQT_XRC;
	attr.xrcd = ctx.xrcd;
	attr.cq = ctx.recv_cq;
	attr.pd = ctx.pd;
	ctx.srq = ibv_create_srq_ex(ctx.context, &attr);
	if (!ctx.srq) {
		fprintf(stderr, "Couldn't create SRQ\n");
		return 1;
	}

	if (create_qps())
		return 1;
	return 0;
}