/* Create the local side of an XRC send QP and transition it to INIT.
 *
 * Only the send side is configured: for XRC connections all receives are
 * posted to an SRQ, so the QP is created with a zero-length receive queue.
 * On success the endpoint's lcl_qp/lcl_psn slots are populated and
 * mca_btl_openib_endpoint_post_recvs() is called to prime receive buffers
 * and credits.
 *
 * @param endpoint  endpoint whose qps[0] slot receives the new QP
 * @return OMPI_ERROR on any verbs failure, otherwise the result of
 *         mca_btl_openib_endpoint_post_recvs().
 */
static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint)
{
    int prio = BTL_OPENIB_LP_CQ; /* all send completions go to low prio CQ */
    uint32_t send_wr;
    struct ibv_qp **qp;
    uint32_t *psn;
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_qp_attr attr;
    int ret;
    size_t req_inline;
    mca_btl_openib_module_t *openib_btl =
        (mca_btl_openib_module_t*)endpoint->endpoint_btl;

    /* Prepare QP structs */
    BTL_VERBOSE(("Creating Send QP\n"));
    qp = &endpoint->qps[0].qp->lcl_qp;
    psn = &endpoint->qps[0].qp->lcl_psn;

    /* reserve additional wr for eager rdma credit management */
    send_wr = endpoint->ib_addr->qp->sd_wqe +
        (mca_btl_openib_component.use_eager_rdma ?
         mca_btl_openib_component.max_eager_rdma : 0);

    memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
    memset(&attr, 0, sizeof(struct ibv_qp_attr));

    qp_init_attr.send_cq = qp_init_attr.recv_cq =
        openib_btl->device->ib_cq[prio];

    /* no need recv queue; receives are posted to srq */
    qp_init_attr.cap.max_recv_wr = 0;
    qp_init_attr.cap.max_send_wr = send_wr;
    qp_init_attr.cap.max_inline_data = req_inline =
        openib_btl->device->max_inline_data;
    qp_init_attr.cap.max_send_sge = 1; /* this one is ignored by driver */
    qp_init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
    qp_init_attr.qp_type = IBV_QPT_XRC;
    qp_init_attr.xrc_domain = openib_btl->device->xrc_domain;

    *qp = ibv_create_qp(openib_btl->device->ib_pd, &qp_init_attr);
    if (NULL == *qp) {
        BTL_ERROR(("Error creating QP, errno says: %s", strerror(errno)));
        return OMPI_ERROR;
    }

    /* The provider may grant less inline data than requested; record what
     * we actually got and warn the user about the truncation. */
    if (qp_init_attr.cap.max_inline_data < req_inline) {
        endpoint->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data;
        /* BUG FIX: the boolean "want_error_header" argument was missing,
         * shifting every subsequent vararg by one position (compare the
         * identical call in qp_create_one()). */
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true,
                       orte_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       openib_btl->port_num,
                       req_inline, qp_init_attr.cap.max_inline_data);
    } else {
        endpoint->qps[0].ib_inline_max = req_inline;
    }

    /* RESET -> INIT transition. */
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = openib_btl->pkey_index;
    attr.port_num = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;

    ret = ibv_modify_qp(*qp, &attr,
                        IBV_QP_STATE |
                        IBV_QP_PKEY_INDEX |
                        IBV_QP_PORT |
                        IBV_QP_ACCESS_FLAGS);
    if (ret) {
        /* ibv_modify_qp returns the error code directly, so print ret. */
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]",
                   (*qp)->qp_num, strerror(ret), ret));
        return OMPI_ERROR;
    }

    /* Setup meta data on the endpoint */
    *psn = lrand48() & 0xffffff;

    /* Now that all the qp's are created locally, post some receive
       buffers, setup credits, etc. */
    return mca_btl_openib_endpoint_post_recvs(endpoint);
}
/*-----------------------------------------------------------------------------------*/ static void low_level_init(struct netif *netif) { struct ibvif *ibvif; int num_of_device, flags = IBV_ACCESS_LOCAL_WRITE; struct ibv_qp_init_attr attr; struct ibv_qp_attr qp_attr; uint8_t port_num = 1; int qp_flags; struct ibv_device **ib_dev_list; struct tcpip_thread *thread; struct ibv_exp_cq_init_attr cq_attr; ibvif = (struct ibvif *)netif->state; /* Obtain MAC address from network interface. */ ibvif->ethaddr->addr[0] = 0x00; ibvif->ethaddr->addr[1] = 0x02; ibvif->ethaddr->addr[2] = 0xc9; ibvif->ethaddr->addr[3] = 0xa4; ibvif->ethaddr->addr[4] = 0x59; ibvif->ethaddr->addr[5] = 0x41; ibvif->buf_size = ALIGN_TO_PAGE_SIZE(PBUF_POOL_SIZE * TCP_MAX_PACKET_SIZE); /* Do things needed for using Raw Packet Verbs */ ib_dev_list = ibv_get_device_list(&num_of_device); if (num_of_device <= 0 || !ib_dev_list || !ib_dev_list[0]) { perror("IBV no device found\n"); exit(1); } ibvif->context = ibv_open_device(ib_dev_list[1]); if (!ibvif->context) { perror("IBV can't open device\n"); exit(1); } ibv_free_device_list(ib_dev_list); if (set_link_layer(ibvif->context, 1) == LINK_FAILURE) { perror("IBV can't allocate PD\n"); exit(1); } ibvif->pd = ibv_alloc_pd(ibvif->context); if (!ibvif->pd) { perror("IBV can't allocate PD\n"); exit(1); } /*if (!ibv_buffer(ibvif)) { LWIP_DEBUGF(NETIF_DEBUG, ("Buffer allocation failed\n")); exit(1); }*/ ibvif->recv_buf = netif->prot_thread->pbuf_rx_handle.buf; ibvif->send_buf = netif->prot_thread->pbuf_tx_handle.buf; ibvif->send_size = TCP_MAX_PACKET_SIZE; ibvif->rx_depth = PBUF_POOL_SIZE; ibvif->tx_depth = PBUF_POOL_SIZE; ibvif->send_mr = ibv_reg_mr(ibvif->pd, ibvif->send_buf, ibvif->buf_size, flags); if (!ibvif->send_mr) { perror("IBV error reg send mr\n"); exit(1); } ibvif->recv_mr = ibv_reg_mr(ibvif->pd, ibvif->recv_buf, ibvif->buf_size, flags); if (!ibvif->recv_mr) { perror("IBV error reg recv mr\n"); exit(1); } ibvif->send_cq = ibv_create_cq(ibvif->context, 
ibvif->tx_depth, NULL, NULL, 0); if (!ibvif->send_cq) { perror("IBV can't create send cq\n"); exit(1); } cq_attr.flags = IBV_EXP_CQ_TIMESTAMP; cq_attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_FLAGS; ibvif->recv_cq = ibv_exp_create_cq(ibvif->context, ibvif->rx_depth, NULL, NULL, 0, &cq_attr); if (!ibvif->recv_cq) { perror("IBV can't create recv cq\n"); exit(1); } memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ibvif->send_cq; attr.recv_cq = ibvif->recv_cq; attr.cap.max_send_wr = ibvif->tx_depth; attr.cap.max_send_sge = 1; attr.cap.max_recv_wr = ibvif->rx_depth; attr.cap.max_recv_sge = 1; attr.qp_type = IBV_QPT_RAW_PACKET; ibvif->qp = ibv_create_qp(ibvif->pd, &attr); if (!ibvif->qp) { perror("IBV can't create QP\n"); exit(1); } qp_flags = IBV_QP_STATE | IBV_QP_PORT; memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); qp_attr.qp_state = IBV_QPS_INIT; qp_attr.pkey_index = 0; qp_attr.port_num = port_num; qp_attr.qp_access_flags = 0; if (ibv_modify_qp(ibvif->qp, &qp_attr, qp_flags)) { perror("IBV can't set qp to init\n"); exit(1); } ibv_attach_device(netif); }
static struct ibv_qp * mlx5_glue_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr) { return ibv_create_qp(pd, qp_init_attr); }
/* Probe the QP capabilities the provider will actually grant.
 *
 * Creates a throw-away PD, CQ and RC QP using the configured default
 * sizes (clamped against the device attributes), then copies the values
 * the provider granted back into 'info' and tears everything down.
 *
 * Returns 0 on success or a negative errno on failure.  Cleanup uses
 * fall-through labels: the success path intentionally falls into err2/err1.
 */
static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx,
		struct ibv_device_attr *device_attr,
		struct fi_info *info)
{
	struct ibv_pd *pd;
	struct ibv_cq *cq;
	struct ibv_qp *qp;
	struct ibv_qp_init_attr init_attr;
	int ret = 0;

	pd = ibv_alloc_pd(ctx);
	if (!pd) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_alloc_pd", errno);
		return -errno;
	}

	/* A 1-entry CQ suffices: the QP never carries traffic. */
	cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
	if (!cq) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_cq", errno);
		ret = -errno;
		goto err1;
	}

	/* TODO: serialize access to string buffers */
	fi_read_file(FI_CONF_DIR, "def_tx_ctx_size",
			def_tx_ctx_size, sizeof def_tx_ctx_size);
	fi_read_file(FI_CONF_DIR, "def_rx_ctx_size",
			def_rx_ctx_size, sizeof def_rx_ctx_size);
	fi_read_file(FI_CONF_DIR, "def_tx_iov_limit",
			def_tx_iov_limit, sizeof def_tx_iov_limit);
	fi_read_file(FI_CONF_DIR, "def_rx_iov_limit",
			def_rx_iov_limit, sizeof def_rx_iov_limit);
	fi_read_file(FI_CONF_DIR, "def_inject_size",
			def_inject_size, sizeof def_inject_size);

	/* Request the configured defaults, clamped to device maxima. */
	memset(&init_attr, 0, sizeof init_attr);
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.cap.max_send_wr = MIN(atoi(def_tx_ctx_size), device_attr->max_qp_wr);
	init_attr.cap.max_recv_wr = MIN(atoi(def_rx_ctx_size), device_attr->max_qp_wr);
	init_attr.cap.max_send_sge = MIN(atoi(def_tx_iov_limit), device_attr->max_sge);
	init_attr.cap.max_recv_sge = MIN(atoi(def_rx_iov_limit), device_attr->max_sge);
	init_attr.cap.max_inline_data = atoi(def_inject_size);
	init_attr.qp_type = IBV_QPT_RC;

	qp = ibv_create_qp(pd, &init_attr);
	if (!qp) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_qp", errno);
		ret = -errno;
		goto err2;
	}

	/* ibv_create_qp updated init_attr.cap with the granted values. */
	info->tx_attr->inject_size = init_attr.cap.max_inline_data;
	info->tx_attr->iov_limit = init_attr.cap.max_send_sge;
	info->tx_attr->size = init_attr.cap.max_send_wr;
	info->rx_attr->iov_limit = init_attr.cap.max_recv_sge;
	/*
	 * On some HW ibv_create_qp can increase max_recv_wr value more than
	 * it really supports. So, alignment with device capability is needed.
	 */
	info->rx_attr->size = MIN(init_attr.cap.max_recv_wr,
				device_attr->max_qp_wr);

	ibv_destroy_qp(qp);
err2:
	ibv_destroy_cq(cq);
err1:
	ibv_dealloc_pd(pd);
	return ret;
}
/*
 * Create the local side of one qp. The remote side will be connected
 * later.
 *
 * Allocates an RC QP (receives go to 'srq' for SRQ-type QPs, or to a
 * private receive queue of 'max_recv_wr' entries for per-peer QPs),
 * records the granted inline-data limit on the endpoint, and moves the
 * QP from RESET to INIT.  Returns OMPI_SUCCESS or OMPI_ERROR.
 */
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp,
        struct ibv_srq *srq, uint32_t max_recv_wr, uint32_t max_send_wr)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    struct ibv_qp *my_qp;
    struct ibv_qp_init_attr init_attr;
    struct ibv_qp_attr attr;
    size_t req_inline;

    memset(&init_attr, 0, sizeof(init_attr));
    memset(&attr, 0, sizeof(attr));

    init_attr.qp_type = IBV_QPT_RC;
    /* Send completions always use the low-priority CQ; receive
     * completions use the CQ matching this QP's priority. */
    init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
    init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)];
    init_attr.srq = srq;
    init_attr.cap.max_inline_data = req_inline =
        max_inline_size(qp, openib_btl->device);
    init_attr.cap.max_send_sge = 1;
    init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
    /* Per-peer QPs own their receive queue; SRQ-type QPs post to the SRQ. */
    if(BTL_OPENIB_QP_TYPE_PP(qp)) {
        init_attr.cap.max_recv_wr = max_recv_wr;
    } else {
        init_attr.cap.max_recv_wr = 0;
    }
    init_attr.cap.max_send_wr = max_send_wr;

    my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr);
    if (NULL == my_qp) {
        BTL_ERROR(("error creating qp errno says %s", strerror(errno)));
        return OMPI_ERROR;
    }
    endpoint->qps[qp].qp->lcl_qp = my_qp;

    /* The provider may grant less inline data than requested; keep the
     * granted value and warn the user about the truncation. */
    if (init_attr.cap.max_inline_data < req_inline) {
        endpoint->qps[qp].ib_inline_max = init_attr.cap.max_inline_data;
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true, orte_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       openib_btl->port_num, req_inline,
                       init_attr.cap.max_inline_data);
    } else {
        endpoint->qps[qp].ib_inline_max = req_inline;
    }

    /* RESET -> INIT transition. */
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = openib_btl->pkey_index;
    attr.port_num = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;

    if (ibv_modify_qp(endpoint->qps[qp].qp->lcl_qp,
                      &attr,
                      IBV_QP_STATE |
                      IBV_QP_PKEY_INDEX |
                      IBV_QP_PORT |
                      IBV_QP_ACCESS_FLAGS )) {
        BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(errno)));
        return OMPI_ERROR;
    }

    /* Setup meta data on the endpoint */
    endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff;
    endpoint->qps[qp].credit_frag = NULL;

    return OMPI_SUCCESS;
}
/* * move_to_rts */ static int move_to_rts(struct ibv_qp *qp) { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_RTS, .timeout = 14, /* old = 10 */ .retry_cnt = 7, /* old = 1 */ .rnr_retry = 7, /* old = 1 */ .sq_psn = 0, /* Packet sequence number */ .max_rd_atomic = 1, /* Number of outstanding RDMA rd/atomic ops at destination ?*/ }; if (ibv_modify_qp(qp, &attr, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) goto err_VAPI_modify_qp; return 0; /* --- */ err_VAPI_modify_qp: psoib_err_errno("ibv_modify_qp() move to RTS failed", errno); return -1; } int psoib_con_init(psoib_con_info_t *con_info, hca_info_t *hca_info, port_info_t *port_info) { unsigned int i; if (!hca_info) hca_info = &default_hca; if (!port_info) port_info = &default_port; con_info->ctx = hca_info->ctx; con_info->port_info = port_info; con_info->qp = NULL; con_info->hca_info = hca_info; con_info->send.bufs.mr = NULL; con_info->recv.bufs.mr = NULL; con_info->con_broken = 0; INIT_LIST_HEAD(&con_info->next_con_info); { struct ibv_qp_init_attr attr = { .send_cq = hca_info->cq, .recv_cq = hca_info->cq, .cap = { //.max_send_wr = 128, /* Max outstanding WR on the SQ ??*/ //.max_recv_wr = 128, /* Max outstanding WR on the RQ ??*/ //.max_send_sge = 4, /* Max scatter/gather descriptor entries on the SQ ??*/ //.max_recv_sge = 4, /* Max scatter/gather descriptor entries on the RQ */ .max_send_wr = 128, /* Max outstanding WR on the SQ ??*/ .max_recv_wr = 128, /* Max outstanding WR on the RQ ??*/ .max_send_sge = 1, /* Max scatter/gather descriptor entries on the SQ ??*/ .max_recv_sge = 1, /* Max scatter/gather descriptor entries on the RQ */ .max_inline_data = IB_MAX_INLINE, }, .qp_type = IBV_QPT_RC }; con_info->qp = ibv_create_qp(hca_info->pd, &attr); if (!con_info->qp) goto err_create_qp; }
static int init_ud_qp(struct ibv_context *context_arg, struct mca_btl_openib_sa_qp_cache *cache) { struct ibv_qp_init_attr iattr; struct ibv_qp_attr mattr; int rc; /* create cq */ cache->cq = ibv_create_cq(cache->context, 4, NULL, NULL, 0); if (NULL == cache->cq) { BTL_ERROR(("error creating cq, errno says %s", strerror(errno))); opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q", true, opal_proc_local_get()->proc_hostname, __FILE__, __LINE__, "ibv_create_cq", strerror(errno), errno, ibv_get_device_name(context_arg->device)); return OPAL_ERROR; } /* create qp */ memset(&iattr, 0, sizeof(iattr)); iattr.send_cq = cache->cq; iattr.recv_cq = cache->cq; iattr.cap.max_send_wr = 2; iattr.cap.max_recv_wr = 2; iattr.cap.max_send_sge = 1; iattr.cap.max_recv_sge = 1; iattr.qp_type = IBV_QPT_UD; cache->qp = ibv_create_qp(cache->pd, &iattr); if (NULL == cache->qp) { BTL_ERROR(("error creating qp %s (%d)", strerror(errno), errno)); return OPAL_ERROR; } /* modify qp to IBV_QPS_INIT */ memset(&mattr, 0, sizeof(mattr)); mattr.qp_state = IBV_QPS_INIT; mattr.port_num = cache->port_num; mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY); rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY); if (rc) { BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]", cache->qp->qp_num, strerror(errno), errno)); return OPAL_ERROR; } /* modify qp to IBV_QPS_RTR */ memset(&mattr, 0, sizeof(mattr)); mattr.qp_state = IBV_QPS_RTR; rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE); if (rc) { BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]", cache->qp->qp_num, strerror(errno), errno)); return OPAL_ERROR; } /* modify qp to IBV_QPS_RTS */ mattr.qp_state = IBV_QPS_RTS; rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_SQ_PSN); if (rc) { BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]", cache->qp->qp_num, strerror(errno), errno)); return OPAL_ERROR; } return OPAL_SUCCESS; }
/* Activate an rdmasniff capture handle.
 *
 * Opens the verbs device, allocates PD/completion channel/CQ, creates a
 * RAW_PACKET QP, moves it to RTR, attaches an IBV_FLOW_ATTR_SNIFFER flow
 * so the HCA mirrors all traffic to the QP, allocates receive buffers,
 * registers them as an MR, posts the initial receives and wires up the
 * pcap_t operation vector.
 *
 * Returns 0 on success or PCAP_ERROR after best-effort teardown of
 * whatever was created; errbuf carries the failure description.
 */
static int
rdmasniff_activate(pcap_t *handle)
{
	struct pcap_rdmasniff *priv = handle->priv;
	struct ibv_qp_init_attr qp_init_attr;
	struct ibv_qp_attr qp_attr;
	struct ibv_flow_attr flow_attr;
	struct ibv_port_attr port_attr;
	int i;

	priv->context = ibv_open_device(priv->rdma_device);
	if (!priv->context) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to open device %s",
			      handle->opt.device);
		goto error;
	}

	priv->pd = ibv_alloc_pd(priv->context);
	if (!priv->pd) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to alloc PD for device %s",
			      handle->opt.device);
		goto error;
	}

	priv->channel = ibv_create_comp_channel(priv->context);
	if (!priv->channel) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create comp channel for device %s",
			      handle->opt.device);
		goto error;
	}

	priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES,
				 NULL, priv->channel, 0);
	if (!priv->cq) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create CQ for device %s",
			      handle->opt.device);
		goto error;
	}

	/* Arm the CQ so completions wake the completion channel fd. */
	ibv_req_notify_cq(priv->cq, 0);

	/* Receive-only QP: no send resources are configured. */
	memset(&qp_init_attr, 0, sizeof qp_init_attr);
	qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq;
	qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES;
	qp_init_attr.cap.max_recv_sge = 1;
	qp_init_attr.qp_type = IBV_QPT_RAW_PACKET;
	priv->qp = ibv_create_qp(priv->pd, &qp_init_attr);
	if (!priv->qp) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create QP for device %s",
			      handle->opt.device);
		goto error;
	}

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.qp_state = IBV_QPS_INIT;
	qp_attr.port_num = priv->port_num;
	if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to modify QP to INIT for device %s",
			      handle->opt.device);
		goto error;
	}

	/* RTR is enough for sniffing -- we never send on this QP. */
	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.qp_state = IBV_QPS_RTR;
	if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to modify QP to RTR for device %s",
			      handle->opt.device);
		goto error;
	}

	/* Sniffer flow steers a copy of all port traffic to this QP. */
	memset(&flow_attr, 0, sizeof flow_attr);
	flow_attr.type = IBV_FLOW_ATTR_SNIFFER;
	flow_attr.size = sizeof flow_attr;
	flow_attr.port = priv->port_num;
	priv->flow = ibv_create_flow(priv->qp, &flow_attr);
	if (!priv->flow) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create flow for device %s",
			      handle->opt.device);
		goto error;
	}

	handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE;
	handle->buffer = malloc(handle->bufsize);
	if (!handle->buffer) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to allocate receive buffer for device %s",
			      handle->opt.device);
		goto error;
	}

	priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE);
	if (!priv->oneshot_buffer) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to allocate oneshot buffer for device %s",
			      handle->opt.device);
		goto error;
	}

	priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize,
			      IBV_ACCESS_LOCAL_WRITE);
	if (!priv->mr) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to register MR for device %s",
			      handle->opt.device);
		goto error;
	}

	/* Pre-post the full ring of receives. */
	for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) {
		rdmasniff_post_recv(handle, i);
	}

	/* Pick the link type from the port's link layer (best effort). */
	if (!ibv_query_port(priv->context, priv->port_num, &port_attr) &&
	    port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
		handle->linktype = DLT_INFINIBAND;
	} else {
		handle->linktype = DLT_EN10MB;
	}

	if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE)
		handle->snapshot = RDMASNIFF_RECEIVE_SIZE;

	handle->offset = 0;
	handle->read_op = rdmasniff_read;
	handle->stats_op = rdmasniff_stats;
	handle->cleanup_op = rdmasniff_cleanup;
	handle->setfilter_op = install_bpf_program;
	handle->setdirection_op = NULL;
	handle->set_datalink_op = NULL;
	handle->getnonblock_op = pcap_getnonblock_fd;
	handle->setnonblock_op = pcap_setnonblock_fd;
	handle->oneshot_callback = rdmasniff_oneshot;
	handle->selectable_fd = priv->channel->fd;

	return 0;

error:
	/* Best-effort teardown in reverse creation order.  handle->buffer
	 * is left for the generic pcap cleanup path. */
	if (priv->mr) {
		ibv_dereg_mr(priv->mr);
	}
	if (priv->flow) {
		ibv_destroy_flow(priv->flow);
	}
	if (priv->qp) {
		ibv_destroy_qp(priv->qp);
	}
	if (priv->cq) {
		ibv_destroy_cq(priv->cq);
	}
	if (priv->channel) {
		ibv_destroy_comp_channel(priv->channel);
	}
	if (priv->pd) {
		ibv_dealloc_pd(priv->pd);
	}
	if (priv->context) {
		ibv_close_device(priv->context);
	}
	if (priv->oneshot_buffer) {
		free(priv->oneshot_buffer);
	}
	return PCAP_ERROR;
}
/***************************************** * Function: resources_create *****************************************/ static int resources_create( struct resources *res) { struct ibv_qp_init_attr qp_init_attr; struct ibv_device *ib_dev = NULL; size_t size; int i; int mr_flags = 0; int cq_size = 0; int num_devices; int rc; /* if client side */ if (config.server_name) { res->sock = sock_client_connect(config.server_name, config.tcp_port); if (res->sock < 0) { fprintf(stderr, "failed to establish TCP connection to server %s, port %d\n", config.server_name, config.tcp_port); return -1; } } else { fprintf(stdout, "waiting on port %d for TCP connection\n", config.tcp_port); res->sock = sock_daemon_connect(config.tcp_port); if (res->sock < 0) { fprintf(stderr, "failed to establish TCP connection with client on port %d\n", config.tcp_port); return -1; } } fprintf(stdout, "TCP connection was established\n"); fprintf(stdout, "searching for IB devices in host\n"); /* get device names in the system */ res->dev_list = ibv_get_device_list(&num_devices); if (!res->dev_list) { fprintf(stderr, "failed to get IB devices list\n"); return 1; } /* if there isn't any IB device in host */ if (!num_devices) { fprintf(stderr, "found %d device(s)\n", num_devices); return 1; } fprintf(stdout, "found %d device(s)\n", num_devices); /* search for the specific device we want to work with */ for (i = 0; i < num_devices; i ++) { if (!strcmp(ibv_get_device_name(res->dev_list[i]), config.dev_name)) { ib_dev = res->dev_list[i]; break; } } /* if the device wasn't found in host */ if (!ib_dev) { fprintf(stderr, "IB device %s wasn't found\n", config.dev_name); return 1; } /* get device handle */ res->ib_ctx = ibv_open_device(ib_dev); if (!res->ib_ctx) { fprintf(stderr, "failed to open device %s\n", config.dev_name); return 1; } /* query port properties */ if (ibv_query_port(res->ib_ctx, config.ib_port, &res->port_attr)) { fprintf(stderr, "ibv_query_port on port %u failed\n", config.ib_port); return 1; } /* 
allocate Protection Domain */ res->pd = ibv_alloc_pd(res->ib_ctx); if (!res->pd) { fprintf(stderr, "ibv_alloc_pd failed\n"); return 1; } res->comp_channel = ibv_create_comp_channel(res->ib_ctx); if (!res->comp_channel) { fprintf(stderr, "ibv_create_comp_channel failed\n"); return 1; } /* each side will send only one WR, so Completion Queue with 1 entry is enough */ cq_size = 1; res->cq = ibv_create_cq(res->ib_ctx, cq_size, NULL, res->comp_channel, 0); if (!res->cq) { fprintf(stderr, "failed to create CQ with %u entries\n", cq_size); return 1; } /* Arm the CQ before any completion is expected (to prevent races) */ rc = ibv_req_notify_cq(res->cq, 0); if (rc) { fprintf(stderr, "failed to arm the CQ\n"); return 1; } fprintf(stdout, "CQ was armed\n"); /* allocate the memory buffer that will hold the data */ size = MSG_SIZE; res->buf = malloc(size); if (!res->buf) { fprintf(stderr, "failed to malloc %Zu bytes to memory buffer\n", size); return 1; } /* only in the daemon side put the message in the memory buffer */ if (!config.server_name) { strcpy(res->buf, MSG); fprintf(stdout, "going to send the message: '%s'\n", res->buf); } else memset(res->buf, 0, size); /* register this memory buffer */ mr_flags = (config.server_name) ? 
IBV_ACCESS_LOCAL_WRITE : 0; res->mr = ibv_reg_mr(res->pd, res->buf, size, mr_flags); if (!res->mr) { fprintf(stderr, "ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags); return 1; } fprintf(stdout, "MR was registered with addr=%p, lkey=0x%x, rkey=0x%x, flags=0x%x\n", res->buf, res->mr->lkey, res->mr->rkey, mr_flags); /* create the Queue Pair */ memset(&qp_init_attr, 0, sizeof(qp_init_attr)); qp_init_attr.qp_type = IBV_QPT_RC; qp_init_attr.sq_sig_all = 1; qp_init_attr.send_cq = res->cq; qp_init_attr.recv_cq = res->cq; qp_init_attr.cap.max_send_wr = 1; qp_init_attr.cap.max_recv_wr = 1; qp_init_attr.cap.max_send_sge = 1; qp_init_attr.cap.max_recv_sge = 1; res->qp = ibv_create_qp(res->pd, &qp_init_attr); if (!res->qp) { fprintf(stderr, "failed to create QP\n"); return 1; } fprintf(stdout, "QP was created, QP number=0x%x\n", res->qp->qp_num); return 0; }
static int fi_ibv_rdm_find_max_inline(struct ibv_pd *pd, struct ibv_context *context) { struct ibv_qp_init_attr qp_attr; struct ibv_qp *qp = NULL; struct ibv_cq *cq = ibv_create_cq(context, 1, NULL, NULL, 0); assert(cq); int max_inline = 2; int rst = 0; memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.send_cq = cq; qp_attr.recv_cq = cq; qp_attr.qp_type = IBV_QPT_RC; qp_attr.cap.max_send_wr = 1; qp_attr.cap.max_recv_wr = 1; qp_attr.cap.max_send_sge = 1; qp_attr.cap.max_recv_sge = 1; do { if (qp) ibv_destroy_qp(qp); qp_attr.cap.max_inline_data = max_inline; qp = ibv_create_qp(pd, &qp_attr); if (qp) { /* * truescale returns max_inline_data 0 */ if (qp_attr.cap.max_inline_data == 0) break; /* * iWarp is able to create qp with unsupported * max_inline, lets take first returned value. */ if (context->device->transport_type == IBV_TRANSPORT_IWARP) { max_inline = rst = qp_attr.cap.max_inline_data; break; } rst = max_inline; } } while (qp && (max_inline < INT_MAX / 2) && (max_inline *= 2)); if (rst != 0) { int pos = rst, neg = max_inline; do { max_inline = pos + (neg - pos) / 2; if (qp) ibv_destroy_qp(qp); qp_attr.cap.max_inline_data = max_inline; qp = ibv_create_qp(pd, &qp_attr); if (qp) pos = max_inline; else neg = max_inline; } while (neg - pos > 2); rst = pos; } if (qp) { ibv_destroy_qp(qp); } if (cq) { ibv_destroy_cq(cq); } return rst; }
/* Initialize the actual IB device.
 *
 * Enumerates the host's HCAs, optionally filters by the user-supplied
 * "<device>:<port>" specification, finds an ACTIVE port, then allocates
 * a PD, send/recv CQs and an RC QP.  The QP is connected to the peer by
 * exchanging lid/qp_num over the existing TCP control socket and walking
 * it through INIT -> RTR -> RTS.  An event-handler thread is spawned when
 * event-based completion was requested.
 *
 * Fills in the file-scope handles (ctx, hca, pd_hndl, s_cq_hndl,
 * r_cq_hndl, qp_hndl, lid, port_num, ...).  Returns 0 on success, -1 on
 * any failure.
 */
int initIB(ArgStruct *p)
{
    int i, j, ret;
    char *tmp;
    int num_devices = 0;
    struct ibv_device **hca_list, **filtered_hca_list;
    struct ibv_device_attr hca_attr;
#if !HAVE_IBV_DEVICE_LIST
    struct dlist *hca_dlist;
    struct ibv_device* hca_device;
#endif

    /* Find all the devices on this host */
#if HAVE_IBV_DEVICE_LIST
    hca_list = ibv_get_device_list(&num_devices);
#else
    hca_dlist = ibv_get_devices();
    dlist_start(hca_dlist);
    dlist_for_each_data(hca_dlist, hca_device, struct ibv_device)
        ++num_devices;
#endif

    /* If we didn't find any, return an error */
    if (0 == num_devices) {
        fprintf(stderr, "Couldn't find any IBV devices\n");
        return -1;
    }

#if !HAVE_IBV_DEVICE_LIST
    /* If we have the old version (ibv_get_devices()), convert it to the
       new form */
    hca_list = (struct ibv_device**) malloc(num_devices *
                                            sizeof(struct ibv_device*));
    if (NULL == hca_list) {
        fprintf(stderr, "%s:%s:%d: malloc failed\n", __FILE__,
                __func__, __LINE__);
        return -1;
    }
    i = 0;
    dlist_start(hca_dlist);
    dlist_for_each_data(hca_dlist, hca_device, struct ibv_device)
        hca_list[i++] = hca_device;
#endif

    /* Possible values for p->prot.device_and_port:

       1. <device>:<port> -- use only this device and only this port
       2. <device>        -- use the first active port on this device
       3. :<port>         -- use only this port, but on any device

       <device> names are matched exactly. */

    /* If a device name was specified on the command line, see if we can
       find it */
    tmp = NULL;
    port_num = -1;
    filtered_hca_list = hca_list;
    if (NULL != p->prot.device_and_port) {

        /* If there's a : in the string, then we have a port */
        tmp = strchr(p->prot.device_and_port, ':');
        if (NULL != tmp) {
            *tmp = '\0';
            ++tmp;
            port_num = atoi(tmp);
        }
        LOGPRINTF(("Pre-filter: looking for target device \"%s\", port %d",
                   p->prot.device_and_port, port_num));

        /* If the length of the device string left is >0, then there's a
           device specification */
        if (strlen(p->prot.device_and_port) > 0) {
            int found = 0;

            /* Loop through all the devices and find a matching name */
            for (i = 0; i < num_devices; ++i) {
                LOGPRINTF(("Pre-filter: found device: %s",
                           ibv_get_device_name(hca_list[i])));
                if (0 == strcmp(p->prot.device_and_port,
                                ibv_get_device_name(hca_list[i]))) {
                    LOGPRINTF(("Pre-filter: found target device: %s (%d of %d)",
                               p->prot.device_and_port, i, num_devices));
                    filtered_hca_list = &(hca_list[i]);
                    num_devices = 1;
                    found = 1;
                    break;
                }
            }

            /* If we didn't find it, abort */
            if (!found) {
                fprintf(stderr, "Unable to find device \"%s\", aborting\n",
                        p->prot.device_and_port);
                return -1;
            }
        }
    }

    /* Traverse the filtered HCA list and find a good port */
    for (hca = NULL, i = 0; NULL == hca && i < num_devices; ++i) {
        /* Get a ibv_context from the ibv_device */
        ctx = ibv_open_device(filtered_hca_list[i]);
        if (!ctx) {
            fprintf(stderr, "Couldn't create IBV context\n");
            return -1;
        } else {
            LOGPRINTF(("Found HCA %s",
                       ibv_get_device_name(filtered_hca_list[i])));
        }

        /* Get the device attributes */
        if (0 != ibv_query_device(ctx, &hca_attr)) {
            /* BUG FIX: previously passed 'hca' here, which is still NULL
             * at this point; use the device we actually queried. */
            fprintf(stderr, "Could not get device context for %s, aborting\n",
                    ibv_get_device_name(filtered_hca_list[i]));
            return -1;
        }

        for (j = 1; j <= hca_attr.phys_port_cnt; ++j) {
            /* If a specific port was asked for, *only* look at that port */
            if (port_num >= 0 && port_num != j) {
                continue;
            }
            LOGPRINTF(("Checking %s:%d...",
                       ibv_get_device_name(filtered_hca_list[i]), j));

            /* Query this port and see if it's active */
            if (0 != ibv_query_port(ctx, j, &hca_port)) {
                fprintf(stderr, "Unable to query port %s:%d, aborting\n",
                        ibv_get_device_name(filtered_hca_list[i]), j);
                return -1;
            }

            /* If this port is active, we have a winner! */
            if (IBV_PORT_ACTIVE == hca_port.state) {
                LOGPRINTF(("%s:%d is ACTIVE",
                           ibv_get_device_name(filtered_hca_list[i]), j));
                port_num = j;
                hca = filtered_hca_list[i];
                break;
            }
        }

        /* If we found one, we're done */
        if (hca) {
            break;
        }

        /* Otherwise, close the device (ignore any errors) */
        ibv_close_device(ctx);
        ctx = NULL;
    }

    /* If we didn't find a good device/port combo, abort */
    if (NULL == hca) {
        fprintf(stderr, "Could not find an active device and port, aborting\n");
        return -1;
    }

    /* free up the other devices in the event we would have multiple ib
       devices. if this isnt done, the device pointers will still be
       around in space somewhere -> bad */
#if HAVE_IBV_DEVICE_LIST
    ibv_free_device_list(hca_list);
#endif

    /* Get HCA properties */
    lid = hca_port.lid; /* local id, used to ref back to the device */
    LOGPRINTF((" lid = %d", lid));

    /* Allocate Protection Domain: needed to handle/register memory over
       the card */
    pd_hndl = ibv_alloc_pd(ctx);
    if (!pd_hndl) {
        fprintf(stderr, "Error allocating PD\n");
        return -1;
    } else {
        LOGPRINTF(("Allocated Protection Domain"));
    }

    /* Create send completion queue */
    num_cqe = 30000; /* Requested number of completion q elements */
    s_cq_hndl = ibv_create_cq(ctx, num_cqe, NULL, NULL, 0);
    if (!s_cq_hndl) {
        fprintf(stderr, "Error creating send CQ\n");
        return -1;
    } else {
        act_num_cqe = s_cq_hndl->cqe;
        LOGPRINTF(("Created Send Completion Queue with %d elements",
                   act_num_cqe));
    }

    /* Create recv completion queue */
    num_cqe = 20000; /* Requested number of completion q elements */
    r_cq_hndl = ibv_create_cq(ctx, num_cqe, NULL, NULL, 0);
    if (!r_cq_hndl) {
        /* BUG FIX: this message previously said "send CQ". */
        fprintf(stderr, "Error creating recv CQ\n");
        return -1;
    } else {
        act_num_cqe = r_cq_hndl->cqe;
        LOGPRINTF(("Created Recv Completion Queue with %d elements",
                   act_num_cqe));
    }

    /* Memory Regions are not set up here; see the MyMalloc(..) helper. */

    /* Create Queue Pair: work-queue depths, scatter/gather limits, the
       CQs to use, signalling type and transport type (RC). */
    memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
    qp_init_attr.cap.max_recv_wr = max_wq; /* Max outstanding WR on RQ */
    qp_init_attr.cap.max_send_wr = max_wq; /* Max outstanding WR on SQ */
    qp_init_attr.cap.max_recv_sge = 1; /* Max scatter/gather entries on RQ */
    qp_init_attr.cap.max_send_sge = 1; /* Max scatter/gather entries on SQ */
    qp_init_attr.recv_cq = r_cq_hndl; /* CQ handle for RQ */
    qp_init_attr.send_cq = s_cq_hndl; /* CQ handle for SQ */
    qp_init_attr.sq_sig_all = 0; /* Signalling type */
    qp_init_attr.qp_type = IBV_QPT_RC; /* Transmission type */

    qp_hndl = ibv_create_qp(pd_hndl, &qp_init_attr);
    if (!qp_hndl) {
        fprintf(stderr, "Error creating Queue Pair: %s\n", strerror(errno));
        return -1;
    } else {
        LOGPRINTF(("Created Queue Pair"));
    }

    /* Using the tcp connection, exchange the data needed to address the
       remote QP: (local: lid, qp_num), (remote: d_lid, d_qp_num) */
    if (write(p->commfd, &lid, sizeof(lid)) != sizeof(lid)) {
        fprintf(stderr, "Failed to send lid over socket\n");
        return -1;
    }
    if (write(p->commfd, &qp_hndl->qp_num, sizeof(qp_hndl->qp_num))
        != sizeof(qp_hndl->qp_num)) {
        fprintf(stderr, "Failed to send qpnum over socket\n");
        return -1;
    }
    if (read(p->commfd, &d_lid, sizeof(d_lid)) != sizeof(d_lid)) {
        fprintf(stderr, "Failed to read lid from socket\n");
        return -1;
    }
    if (read(p->commfd, &d_qp_num, sizeof(d_qp_num)) != sizeof(d_qp_num)) {
        fprintf(stderr, "Failed to read qpnum from socket\n");
        return -1;
    }
    LOGPRINTF(("Local: lid=%d qp_num=%d Remote: lid=%d qp_num=%d",
               lid, qp_hndl->qp_num, d_lid, d_qp_num));

    /* Bring up Queue Pair: the QP must be manually walked through the
       INIT and RTR states before it can carry traffic. */

    /******* INIT state ******/
    qp_attr.qp_state = IBV_QPS_INIT;
    qp_attr.pkey_index = 0;
    qp_attr.port_num = port_num;
    qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
    ret = ibv_modify_qp(qp_hndl, &qp_attr,
                        IBV_QP_STATE |
                        IBV_QP_PKEY_INDEX |
                        IBV_QP_PORT |
                        IBV_QP_ACCESS_FLAGS);
    if (ret) {
        fprintf(stderr, "Error modifying QP to INIT\n");
        return -1;
    }
    LOGPRINTF(("Modified QP to INIT"));

    /******* RTR (Ready-To-Receive) state *******/
    qp_attr.qp_state = IBV_QPS_RTR;
    qp_attr.max_dest_rd_atomic = 1;
    qp_attr.dest_qp_num = d_qp_num;
    qp_attr.ah_attr.sl = 0;
    qp_attr.ah_attr.is_global = 0;
    qp_attr.ah_attr.dlid = d_lid;
    qp_attr.ah_attr.static_rate = 0;
    qp_attr.ah_attr.src_path_bits = 0;
    qp_attr.ah_attr.port_num = port_num;
    qp_attr.path_mtu = p->prot.ib_mtu;
    qp_attr.rq_psn = 0;
    qp_attr.pkey_index = 0;
    qp_attr.min_rnr_timer = 5;
    ret = ibv_modify_qp(qp_hndl, &qp_attr,
                        IBV_QP_STATE |
                        IBV_QP_AV |
                        IBV_QP_PATH_MTU |
                        IBV_QP_DEST_QPN |
                        IBV_QP_RQ_PSN |
                        IBV_QP_MAX_DEST_RD_ATOMIC |
                        IBV_QP_MIN_RNR_TIMER);
    if (ret) {
        fprintf(stderr, "Error modifying QP to RTR\n");
        return -1;
    }
    LOGPRINTF(("Modified QP to RTR"));

    /* Sync before going to RTS state */
    Sync(p);

    /******* RTS (Ready-to-Send) state *******/
    qp_attr.qp_state = IBV_QPS_RTS;
    qp_attr.sq_psn = 0;
    qp_attr.timeout = 31;
    qp_attr.retry_cnt = 1;
    qp_attr.rnr_retry = 1;
    qp_attr.max_rd_atomic = 1;
    ret = ibv_modify_qp(qp_hndl, &qp_attr,
                        IBV_QP_STATE |
                        IBV_QP_TIMEOUT |
                        IBV_QP_RETRY_CNT |
                        IBV_QP_RNR_RETRY |
                        IBV_QP_SQ_PSN |
                        IBV_QP_MAX_QP_RD_ATOMIC);
    if (ret) {
        fprintf(stderr, "Error modifying QP to RTS\n");
        return -1;
    }
    LOGPRINTF(("Modified QP to RTS"));

    /* If using event completion, spawn the handler thread and request the
       initial notification on the receive CQ.  The 2nd argument of
       ibv_req_notify_cq selects whether only 'solicited' completions are
       noticed. */
    if (p->prot.comptype == NP_COMP_EVENT) {
        if (pthread_create(&thread, NULL, EventThread, NULL)) {
            fprintf(stderr, "Couldn't start event thread\n");
            return -1;
        }
        ibv_req_notify_cq(r_cq_hndl, 0); /* request completion notification */
    }

    return 0; /* if we get here, the connection is setup correctly */
}