/*
 * Create the XRC send QP for an endpoint.
 *
 * The QP is created on the low-priority CQ with no receive queue, the
 * inline-data limit actually granted by the device is recorded in
 * endpoint->qps[0].ib_inline_max, the QP is moved to INIT and a 24-bit
 * local PSN is chosen.  Finally receive buffers are posted.
 *
 * Returns OMPI_ERROR if the QP cannot be created or moved to INIT,
 * otherwise the result of mca_btl_openib_endpoint_post_recvs().
 */
static int xoob_send_qp_create (mca_btl_base_endpoint_t* endpoint)
{
    int prio = BTL_OPENIB_LP_CQ; /* all send completions go to low prio CQ */
    uint32_t send_wr;
    struct ibv_qp **qp;
    uint32_t *psn;
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_qp_attr attr;
    int ret;
    size_t req_inline;

    mca_btl_openib_module_t *openib_btl =
        (mca_btl_openib_module_t*)endpoint->endpoint_btl;

    /* Prepare QP structs */
    BTL_VERBOSE(("Creating Send QP\n"));
    qp = &endpoint->qps[0].qp->lcl_qp;
    psn = &endpoint->qps[0].qp->lcl_psn;
    /* reserve additional wr for eager rdma credit management */
    send_wr = endpoint->ib_addr->qp->sd_wqe +
        (mca_btl_openib_component.use_eager_rdma ?
         mca_btl_openib_component.max_eager_rdma : 0);
    memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr));
    memset(&attr, 0, sizeof(struct ibv_qp_attr));

    qp_init_attr.send_cq = qp_init_attr.recv_cq = openib_btl->device->ib_cq[prio];

    /* no need recv queue; receives are posted to srq */
    qp_init_attr.cap.max_recv_wr = 0;
    qp_init_attr.cap.max_send_wr = send_wr;
    qp_init_attr.cap.max_inline_data = req_inline =
        openib_btl->device->max_inline_data;
    qp_init_attr.cap.max_send_sge = 1;
    /* this one is ignored by driver */
    qp_init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
    qp_init_attr.qp_type = IBV_QPT_XRC;
    qp_init_attr.xrc_domain = openib_btl->device->xrc_domain;
    *qp = ibv_create_qp(openib_btl->device->ib_pd, &qp_init_attr);
    if (NULL == *qp) {
        BTL_ERROR(("Error creating QP, errno says: %s", strerror(errno)));
        return OMPI_ERROR;
    }

    /* ibv_create_qp() writes the granted capabilities back into
       qp_init_attr; warn when less inline data was granted than requested. */
    if (qp_init_attr.cap.max_inline_data < req_inline) {
        endpoint->qps[0].ib_inline_max = qp_init_attr.cap.max_inline_data;
        /* The third argument is the "want error header" flag; without it
           the nodename string would be consumed as the flag and every
           following argument would shift by one. */
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true, orte_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       openib_btl->port_num,
                       req_inline, qp_init_attr.cap.max_inline_data);
    } else {
        endpoint->qps[0].ib_inline_max = req_inline;
    }

    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = openib_btl->pkey_index;
    attr.port_num = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
    ret = ibv_modify_qp(*qp, &attr,
                      IBV_QP_STATE |
                      IBV_QP_PKEY_INDEX |
                      IBV_QP_PORT |
                      IBV_QP_ACCESS_FLAGS );
    if (ret) {
        /* ibv_modify_qp() reports the error code in its return value */
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]",
                    (*qp)->qp_num, strerror(ret), ret));
        return OMPI_ERROR;
    }

    /* Setup meta data on the endpoint: PSN is limited to 24 bits */
    *psn = lrand48() & 0xffffff;

    /* Now that all the qp's are created locally, post some receive
       buffers, setup credits, etc. */
    return mca_btl_openib_endpoint_post_recvs(endpoint);
}
Beispiel #2
0
/*-----------------------------------------------------------------------------------*/
/*
 * Bring up the raw-packet verbs resources behind a netif.
 *
 * Sets a fixed MAC address, opens the second device in the verbs device
 * list, allocates a PD, registers the protocol thread's rx/tx pbuf areas
 * as memory regions, creates send/recv CQs (the recv CQ with timestamps),
 * creates a RAW_PACKET QP, moves it to INIT and attaches the device.
 *
 * Any failure is fatal: a message is printed and the process exits.
 */
static void
low_level_init(struct netif *netif)
{
  struct ibvif *ibvif;
  int num_of_device = 0;
  int flags = IBV_ACCESS_LOCAL_WRITE;
  /* Index of the device to open in the list returned by
     ibv_get_device_list(). */
  const int dev_index = 1;
  struct ibv_qp_init_attr attr;
  struct ibv_qp_attr qp_attr;
  uint8_t port_num = 1;
  int    qp_flags;
  struct ibv_device **ib_dev_list;
  struct ibv_exp_cq_init_attr cq_attr;

  ibvif = (struct ibvif *)netif->state;

  /* Use a fixed, hard-coded MAC address for this interface. */
  ibvif->ethaddr->addr[0] = 0x00;
  ibvif->ethaddr->addr[1] = 0x02;
  ibvif->ethaddr->addr[2] = 0xc9;
  ibvif->ethaddr->addr[3] = 0xa4;
  ibvif->ethaddr->addr[4] = 0x59;
  ibvif->ethaddr->addr[5] = 0x41;

  ibvif->buf_size = ALIGN_TO_PAGE_SIZE(PBUF_POOL_SIZE * TCP_MAX_PACKET_SIZE);

  /* Do things needed for using Raw Packet Verbs */

  /* Check the list pointer first: the count is only meaningful when the
     call succeeded.  Then make sure the entry we are about to open really
     exists (the list is NULL-terminated, so on a single-device host index 1
     is the terminator). */
  ib_dev_list = ibv_get_device_list(&num_of_device);
  if (!ib_dev_list || num_of_device <= dev_index || !ib_dev_list[dev_index]) {
    perror("IBV no device found\n");
    exit(1);
  }

  ibvif->context = ibv_open_device(ib_dev_list[dev_index]);
  if (!ibvif->context) {
    perror("IBV can't open device\n");
    exit(1);
  }

  ibv_free_device_list(ib_dev_list);

  if (set_link_layer(ibvif->context, 1) == LINK_FAILURE) {
    perror("IBV can't set link layer\n");
    exit(1); 
  }

  ibvif->pd = ibv_alloc_pd(ibvif->context);
  if (!ibvif->pd) {
    perror("IBV can't allocate PD\n");
    exit(1);
  }

  /*if (!ibv_buffer(ibvif)) {
    LWIP_DEBUGF(NETIF_DEBUG, ("Buffer allocation failed\n"));
    exit(1);
  }*/

  /* Packet buffers are owned by the protocol thread; only register them. */
  ibvif->recv_buf     = netif->prot_thread->pbuf_rx_handle.buf;
  ibvif->send_buf     = netif->prot_thread->pbuf_tx_handle.buf;
  ibvif->send_size    = TCP_MAX_PACKET_SIZE;
  ibvif->rx_depth     = PBUF_POOL_SIZE;
  ibvif->tx_depth     = PBUF_POOL_SIZE;

  ibvif->send_mr = ibv_reg_mr(ibvif->pd, ibvif->send_buf, ibvif->buf_size, flags);
  if (!ibvif->send_mr) {
    perror("IBV error reg send mr\n");
    exit(1);
  }

  ibvif->recv_mr = ibv_reg_mr(ibvif->pd, ibvif->recv_buf, ibvif->buf_size, flags);
  if (!ibvif->recv_mr) {
    perror("IBV error reg recv mr\n");
    exit(1);
  }

  ibvif->send_cq = ibv_create_cq(ibvif->context, ibvif->tx_depth, NULL, NULL, 0);
  if (!ibvif->send_cq) {
    perror("IBV can't create send cq\n");
    exit(1);
  }

  /* Zero the whole attribute struct so fields not selected by comp_mask
     never carry stack garbage into the driver. */
  memset(&cq_attr, 0, sizeof(cq_attr));
  cq_attr.flags = IBV_EXP_CQ_TIMESTAMP;
  cq_attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_FLAGS;
  ibvif->recv_cq = ibv_exp_create_cq(ibvif->context, ibvif->rx_depth, NULL, NULL, 0, &cq_attr);
  if (!ibvif->recv_cq) {
    perror("IBV can't create recv cq\n");
    exit(1);
  }

  memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
  attr.send_cq = ibvif->send_cq;
  attr.recv_cq = ibvif->recv_cq;
  attr.cap.max_send_wr = ibvif->tx_depth;
  attr.cap.max_send_sge = 1;
  attr.cap.max_recv_wr = ibvif->rx_depth;
  attr.cap.max_recv_sge = 1;
  attr.qp_type = IBV_QPT_RAW_PACKET;

  ibvif->qp = ibv_create_qp(ibvif->pd, &attr);
  if (!ibvif->qp) {
    perror("IBV can't create QP\n");
    exit(1);
  }

  /* Only state and port are applied; the other qp_attr fields are zero. */
  qp_flags = IBV_QP_STATE | IBV_QP_PORT;
  memset(&qp_attr, 0, sizeof(struct ibv_qp_attr));
  qp_attr.qp_state = IBV_QPS_INIT;
  qp_attr.pkey_index = 0;
  qp_attr.port_num = port_num;
  qp_attr.qp_access_flags = 0;

  if (ibv_modify_qp(ibvif->qp, &qp_attr, qp_flags)) {
    perror("IBV can't set qp to init\n");
    exit(1);
  }
  ibv_attach_device(netif);
}
Beispiel #3
0
/*
 * Glue shim around ibv_create_qp(): forwards both arguments unchanged and
 * hands back whatever the verbs call returned.
 */
static struct ibv_qp *
mlx5_glue_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr)
{
	struct ibv_qp *created = ibv_create_qp(pd, qp_init_attr);

	return created;
}
Beispiel #4
0
/*
 * Probe the QP capabilities of a device and store them in @info.
 *
 * A throw-away PD, CQ and RC QP are created using the defaults read from
 * the files under FI_CONF_DIR (each clamped to the device limits where a
 * limit is available).  The capabilities written back by ibv_create_qp()
 * are copied into info->tx_attr / info->rx_attr, then all temporary
 * objects are released.
 *
 * Returns 0 on success or a negative errno value if the PD, CQ or QP
 * could not be created.
 */
static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx,
				    struct ibv_device_attr *device_attr,
				    struct fi_info *info)
{
	struct ibv_pd *pd;
	struct ibv_cq *cq;
	struct ibv_qp *qp;
	struct ibv_qp_init_attr init_attr;
	int ret = 0;
	int err;

	pd = ibv_alloc_pd(ctx);
	if (!pd) {
		/* Capture errno before logging, which may overwrite it. */
		err = errno;
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_alloc_pd", err);
		return -err;
	}

	cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
	if (!cq) {
		err = errno;
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_cq", err);
		ret = -err;
		goto err1;
	}

	/* TODO: serialize access to string buffers */
	fi_read_file(FI_CONF_DIR, "def_tx_ctx_size",
			def_tx_ctx_size, sizeof def_tx_ctx_size);
	fi_read_file(FI_CONF_DIR, "def_rx_ctx_size",
			def_rx_ctx_size, sizeof def_rx_ctx_size);
	fi_read_file(FI_CONF_DIR, "def_tx_iov_limit",
			def_tx_iov_limit, sizeof def_tx_iov_limit);
	fi_read_file(FI_CONF_DIR, "def_rx_iov_limit",
			def_rx_iov_limit, sizeof def_rx_iov_limit);
	fi_read_file(FI_CONF_DIR, "def_inject_size",
			def_inject_size, sizeof def_inject_size);

	memset(&init_attr, 0, sizeof init_attr);
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.cap.max_send_wr = MIN(atoi(def_tx_ctx_size), device_attr->max_qp_wr);
	init_attr.cap.max_recv_wr = MIN(atoi(def_rx_ctx_size), device_attr->max_qp_wr);
	init_attr.cap.max_send_sge = MIN(atoi(def_tx_iov_limit), device_attr->max_sge);
	init_attr.cap.max_recv_sge = MIN(atoi(def_rx_iov_limit), device_attr->max_sge);
	init_attr.cap.max_inline_data = atoi(def_inject_size);
	init_attr.qp_type = IBV_QPT_RC;

	qp = ibv_create_qp(pd, &init_attr);
	if (!qp) {
		err = errno;
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_qp", err);
		ret = -err;
		goto err2;
	}

	/* init_attr.cap now holds the values read back after QP creation. */
	info->tx_attr->inject_size	= init_attr.cap.max_inline_data;
	info->tx_attr->iov_limit 	= init_attr.cap.max_send_sge;
	info->tx_attr->size	 	= init_attr.cap.max_send_wr;

	info->rx_attr->iov_limit 	= init_attr.cap.max_recv_sge;
	/*
	 * On some HW ibv_create_qp can increase max_recv_wr value more than
	 * it really supports. So, alignment with device capability is needed.
	 */
	info->rx_attr->size	 	= MIN(init_attr.cap.max_recv_wr,
						device_attr->max_qp_wr);

	ibv_destroy_qp(qp);
err2:
	ibv_destroy_cq(cq);
err1:
	ibv_dealloc_pd(pd);

	return ret;
}
/*
 * Create the local side of one qp.  The remote side will be connected
 * later.
 *
 * endpoint     endpoint that owns the QP slot
 * qp           index of the QP slot in endpoint->qps[]
 * srq          shared receive queue to attach (passed through to verbs)
 * max_recv_wr  receive queue depth; only applied to per-peer QPs
 *              (BTL_OPENIB_QP_TYPE_PP), otherwise the depth is 0
 * max_send_wr  send queue depth
 *
 * Creates an RC QP, records the inline-data limit the device granted,
 * moves the QP to INIT and picks a 24-bit local PSN.
 *
 * Returns OMPI_SUCCESS, or OMPI_ERROR if the QP cannot be created or
 * moved to INIT.
 */
static int qp_create_one(mca_btl_base_endpoint_t* endpoint, int qp, 
        struct ibv_srq *srq, uint32_t max_recv_wr, uint32_t max_send_wr)
{
    mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
    struct ibv_qp *my_qp;
    struct ibv_qp_init_attr init_attr;
    struct ibv_qp_attr attr;
    size_t req_inline;
    int rc;

    memset(&init_attr, 0, sizeof(init_attr));
    memset(&attr, 0, sizeof(attr));

    init_attr.qp_type = IBV_QPT_RC;
    /* send completions always go to the low priority CQ */
    init_attr.send_cq = openib_btl->device->ib_cq[BTL_OPENIB_LP_CQ];
    init_attr.recv_cq = openib_btl->device->ib_cq[qp_cq_prio(qp)];
    init_attr.srq     = srq;
    init_attr.cap.max_inline_data = req_inline = 
        max_inline_size(qp, openib_btl->device);
    init_attr.cap.max_send_sge = 1;
    init_attr.cap.max_recv_sge = 1; /* we do not use SG list */
    if(BTL_OPENIB_QP_TYPE_PP(qp)) {
        init_attr.cap.max_recv_wr  = max_recv_wr;
    } else {
        init_attr.cap.max_recv_wr  = 0;
    }
    init_attr.cap.max_send_wr  = max_send_wr;

    my_qp = ibv_create_qp(openib_btl->device->ib_pd, &init_attr); 
    
    if (NULL == my_qp) { 
        BTL_ERROR(("error creating qp errno says %s", strerror(errno))); 
        return OMPI_ERROR; 
    }
    endpoint->qps[qp].qp->lcl_qp = my_qp;

    /* ibv_create_qp() writes the granted capabilities back into init_attr;
       warn when less inline data was granted than requested. */
    if (init_attr.cap.max_inline_data < req_inline) {
        endpoint->qps[qp].ib_inline_max = init_attr.cap.max_inline_data;
        orte_show_help("help-mpi-btl-openib-cpc-base.txt",
                       "inline truncated", true, orte_process_info.nodename,
                       ibv_get_device_name(openib_btl->device->ib_dev),
                       openib_btl->port_num,
                       req_inline, init_attr.cap.max_inline_data);
    } else {
        endpoint->qps[qp].ib_inline_max = req_inline;
    }
    
    attr.qp_state        = IBV_QPS_INIT;
    attr.pkey_index      = openib_btl->pkey_index;
    attr.port_num        = openib_btl->port_num;
    attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;

    /* ibv_modify_qp() reports failure through its return value, so report
       that code rather than whatever errno currently holds. */
    rc = ibv_modify_qp(endpoint->qps[qp].qp->lcl_qp, 
                       &attr, 
                       IBV_QP_STATE | 
                       IBV_QP_PKEY_INDEX | 
                       IBV_QP_PORT | 
                       IBV_QP_ACCESS_FLAGS );
    if (rc) { 
        BTL_ERROR(("error modifying qp to INIT errno says %s", strerror(rc))); 
        return OMPI_ERROR; 
    } 

    /* Setup meta data on the endpoint */
    endpoint->qps[qp].qp->lcl_psn = lrand48() & 0xffffff;
    endpoint->qps[qp].credit_frag = NULL;

    return OMPI_SUCCESS;
}
Beispiel #6
0
/*
 *  move_to_rts
 *
 *  Transition @qp to the RTS (ready to send) state with the timeout,
 *  retry counts, send PSN and max_rd_atomic values listed below.
 *
 *  Returns 0 on success, -1 if ibv_modify_qp() fails (the failure is
 *  reported through psoib_err_errno()).
 */
static
int move_to_rts(struct ibv_qp *qp)
{
    struct ibv_qp_attr attr = {
	.qp_state	= IBV_QPS_RTS,
	.timeout	= 14, /* old = 10 */
	.retry_cnt	= 7,  /* old = 1 */
	.rnr_retry	= 7,  /* old = 1 */
	.sq_psn	= 0,  /* Packet sequence number */
	.max_rd_atomic  = 1,  /* Number of outstanding RDMA rd/atomic ops at destination ?*/
    };
    int rc;

    rc = ibv_modify_qp(qp, &attr,
		       IBV_QP_STATE              |
		       IBV_QP_TIMEOUT            |
		       IBV_QP_RETRY_CNT          |
		       IBV_QP_RNR_RETRY          |
		       IBV_QP_SQ_PSN             |
		       IBV_QP_MAX_QP_RD_ATOMIC);
    if (rc) {
	/* ibv_modify_qp() returns the error code; errno may be stale. */
	psoib_err_errno("ibv_modify_qp() move to RTS failed", rc);
	return -1;
    }

    return 0;
}


int psoib_con_init(psoib_con_info_t *con_info, hca_info_t *hca_info, port_info_t *port_info)
{
    unsigned int i;

    if (!hca_info) hca_info = &default_hca;
    if (!port_info) port_info = &default_port;

    con_info->ctx = hca_info->ctx;
    con_info->port_info = port_info;
    con_info->qp = NULL;
    con_info->hca_info = hca_info;

    con_info->send.bufs.mr = NULL;
    con_info->recv.bufs.mr = NULL;
    con_info->con_broken = 0;
    INIT_LIST_HEAD(&con_info->next_con_info);

    {
	struct ibv_qp_init_attr attr = {
	    .send_cq = hca_info->cq,
	    .recv_cq = hca_info->cq,
	    .cap     = {
		//.max_send_wr  = 128, /* Max outstanding WR on the SQ ??*/
		//.max_recv_wr  = 128, /* Max outstanding WR on the RQ ??*/
		//.max_send_sge = 4,   /* Max scatter/gather descriptor entries on the SQ ??*/
		//.max_recv_sge = 4,   /* Max scatter/gather descriptor entries on the RQ */
		.max_send_wr  = 128, /* Max outstanding WR on the SQ ??*/
		.max_recv_wr  = 128, /* Max outstanding WR on the RQ ??*/
		.max_send_sge = 1,   /* Max scatter/gather descriptor entries on the SQ ??*/
		.max_recv_sge = 1,   /* Max scatter/gather descriptor entries on the RQ */
		.max_inline_data = IB_MAX_INLINE,
	    },
	    .qp_type = IBV_QPT_RC
	};

	con_info->qp = ibv_create_qp(hca_info->pd, &attr);
	if (!con_info->qp) goto err_create_qp;
    }
Beispiel #7
0
static int init_ud_qp(struct ibv_context *context_arg,
                      struct mca_btl_openib_sa_qp_cache *cache)
{
    struct ibv_qp_init_attr iattr;
    struct ibv_qp_attr mattr;
    int rc;

    /* create cq */
    cache->cq = ibv_create_cq(cache->context, 4, NULL, NULL, 0);
    if (NULL == cache->cq) {
        BTL_ERROR(("error creating cq, errno says %s", strerror(errno)));
        opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
                true, opal_proc_local_get()->proc_hostname,
                __FILE__, __LINE__, "ibv_create_cq",
                strerror(errno), errno,
                ibv_get_device_name(context_arg->device));
        return OPAL_ERROR;
    }

    /* create qp */
    memset(&iattr, 0, sizeof(iattr));
    iattr.send_cq = cache->cq;
    iattr.recv_cq = cache->cq;
    iattr.cap.max_send_wr = 2;
    iattr.cap.max_recv_wr = 2;
    iattr.cap.max_send_sge = 1;
    iattr.cap.max_recv_sge = 1;
    iattr.qp_type = IBV_QPT_UD;
    cache->qp = ibv_create_qp(cache->pd, &iattr);
    if (NULL == cache->qp) {
        BTL_ERROR(("error creating qp %s (%d)", strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_INIT */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_INIT;
    mattr.port_num = cache->port_num;
    mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY);
    rc = ibv_modify_qp(cache->qp, &mattr,
            IBV_QP_STATE              |
            IBV_QP_PKEY_INDEX         |
            IBV_QP_PORT               |
            IBV_QP_QKEY);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTR */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_RTR;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTS */
    mattr.qp_state = IBV_QPS_RTS;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_SQ_PSN);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]",
                    cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    return OPAL_SUCCESS;
}
Beispiel #8
0
/*
 * Activate an RDMA sniffer capture handle.
 *
 * Opens the RDMA device, allocates a PD, completion channel and CQ,
 * creates a RAW_PACKET QP and moves it to INIT then RTR, installs a
 * sniffer flow on the configured port, allocates and registers the
 * receive buffer, posts all receives and finally fills in the pcap
 * operation table.
 *
 * Returns 0 on success.  On failure a message is left in handle->errbuf,
 * every verbs object created so far is released and PCAP_ERROR is
 * returned.
 */
static int
rdmasniff_activate(pcap_t *handle)
{
	struct pcap_rdmasniff *priv = handle->priv;
	struct ibv_qp_init_attr qp_init_attr;
	struct ibv_qp_attr qp_attr;
	struct ibv_flow_attr flow_attr;
	struct ibv_port_attr port_attr;
	int i;

	priv->context = ibv_open_device(priv->rdma_device);
	if (!priv->context) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to open device %s", handle->opt.device);
		goto error;
	}

	priv->pd = ibv_alloc_pd(priv->context);
	if (!priv->pd) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to alloc PD for device %s", handle->opt.device);
		goto error;
	}

	priv->channel = ibv_create_comp_channel(priv->context);
	if (!priv->channel) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create comp channel for device %s", handle->opt.device);
		goto error;
	}

	priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES,
				 NULL, priv->channel, 0);
	if (!priv->cq) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create CQ for device %s", handle->opt.device);
		goto error;
	}

	ibv_req_notify_cq(priv->cq, 0);

	/* Receive-only QP: no send capabilities are requested. */
	memset(&qp_init_attr, 0, sizeof qp_init_attr);
	qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq;
	qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES;
	qp_init_attr.cap.max_recv_sge = 1;
	qp_init_attr.qp_type = IBV_QPT_RAW_PACKET;
	priv->qp = ibv_create_qp(priv->pd, &qp_init_attr);
	if (!priv->qp) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create QP for device %s", handle->opt.device);
		goto error;
	}

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.qp_state = IBV_QPS_INIT;
	qp_attr.port_num = priv->port_num;
	if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to modify QP to INIT for device %s", handle->opt.device);
		goto error;
	}

	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.qp_state = IBV_QPS_RTR;
	if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to modify QP to RTR for device %s", handle->opt.device);
		goto error;
	}

	memset(&flow_attr, 0, sizeof flow_attr);
	flow_attr.type = IBV_FLOW_ATTR_SNIFFER;
	flow_attr.size = sizeof flow_attr;
	flow_attr.port = priv->port_num;
	priv->flow = ibv_create_flow(priv->qp, &flow_attr);
	if (!priv->flow) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to create flow for device %s", handle->opt.device);
		goto error;
	}

	handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE;
	handle->buffer = malloc(handle->bufsize);
	if (!handle->buffer) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to allocate receive buffer for device %s", handle->opt.device);
		goto error;
	}

	priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE);
	if (!priv->oneshot_buffer) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to allocate oneshot buffer for device %s", handle->opt.device);
		goto error;
	}

	priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE);
	if (!priv->mr) {
		pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
			      "Failed to register MR for device %s", handle->opt.device);
		goto error;
	}


	for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) {
		rdmasniff_post_recv(handle, i);
	}

	/* Fall back to Ethernet when the port query fails or the link layer
	   is anything other than InfiniBand. */
	if (!ibv_query_port(priv->context, priv->port_num, &port_attr) &&
	    port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
		handle->linktype = DLT_INFINIBAND;
	} else {
		handle->linktype = DLT_EN10MB;
	}

	if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE)
		handle->snapshot = RDMASNIFF_RECEIVE_SIZE;

	handle->offset = 0;
	handle->read_op = rdmasniff_read;
	handle->stats_op = rdmasniff_stats;
	handle->cleanup_op = rdmasniff_cleanup;
	handle->setfilter_op = install_bpf_program;
	handle->setdirection_op = NULL;
	handle->set_datalink_op = NULL;
	handle->getnonblock_op = pcap_getnonblock_fd;
	handle->setnonblock_op = pcap_setnonblock_fd;
	handle->oneshot_callback = rdmasniff_oneshot;
	handle->selectable_fd = priv->channel->fd;

	return 0;

error:
	/*
	 * Release whatever was created, in the original teardown order, and
	 * clear each handle afterwards so no stale pointer is left in priv
	 * for any later cleanup to release a second time.
	 * NOTE(review): handle->buffer is not freed here - presumably the
	 * generic pcap cleanup owns it; confirm.
	 */
	if (priv->mr) {
		ibv_dereg_mr(priv->mr);
		priv->mr = NULL;
	}

	if (priv->flow) {
		ibv_destroy_flow(priv->flow);
		priv->flow = NULL;
	}

	if (priv->qp) {
		ibv_destroy_qp(priv->qp);
		priv->qp = NULL;
	}

	if (priv->cq) {
		ibv_destroy_cq(priv->cq);
		priv->cq = NULL;
	}

	if (priv->channel) {
		ibv_destroy_comp_channel(priv->channel);
		priv->channel = NULL;
	}

	if (priv->pd) {
		ibv_dealloc_pd(priv->pd);
		priv->pd = NULL;
	}

	if (priv->context) {
		ibv_close_device(priv->context);
		priv->context = NULL;
	}

	/* free(NULL) is a no-op, so no guard is needed */
	free(priv->oneshot_buffer);
	priv->oneshot_buffer = NULL;

	return PCAP_ERROR;
}
/*****************************************
* Function: resources_create
*****************************************/
static int resources_create(
	struct resources *res)
{
	struct ibv_qp_init_attr qp_init_attr;
	struct ibv_device *ib_dev = NULL;
	size_t size;
	int i;
	int mr_flags = 0;
	int cq_size = 0;
	int num_devices;
	int rc;

	/* if client side */
	if (config.server_name) {
		res->sock = sock_client_connect(config.server_name, config.tcp_port);
		if (res->sock < 0) {
			fprintf(stderr, "failed to establish TCP connection to server %s, port %d\n", 
				config.server_name, config.tcp_port);
			return -1;
		}
	} else {
		fprintf(stdout, "waiting on port %d for TCP connection\n", config.tcp_port);

		res->sock = sock_daemon_connect(config.tcp_port);
		if (res->sock < 0) {
			fprintf(stderr, "failed to establish TCP connection with client on port %d\n", 
				config.tcp_port);
			return -1;
		}
	}

	fprintf(stdout, "TCP connection was established\n");

	fprintf(stdout, "searching for IB devices in host\n");

	/* get device names in the system */
	res->dev_list = ibv_get_device_list(&num_devices);
	if (!res->dev_list) {
		fprintf(stderr, "failed to get IB devices list\n");
		return 1;
	}

	/* if there isn't any IB device in host */
	if (!num_devices) {
		fprintf(stderr, "found %d device(s)\n", num_devices);
		return 1;
	}

	fprintf(stdout, "found %d device(s)\n", num_devices);

	/* search for the specific device we want to work with */
	for (i = 0; i < num_devices; i ++) {
		if (!strcmp(ibv_get_device_name(res->dev_list[i]), config.dev_name)) {
			ib_dev = res->dev_list[i];
			break;
		}
	}

	/* if the device wasn't found in host */
	if (!ib_dev) {
		fprintf(stderr, "IB device %s wasn't found\n", config.dev_name);
		return 1;
	}

	/* get device handle */
	res->ib_ctx = ibv_open_device(ib_dev);
	if (!res->ib_ctx) {
		fprintf(stderr, "failed to open device %s\n", config.dev_name);
		return 1;
	}

	/* query port properties  */
	if (ibv_query_port(res->ib_ctx, config.ib_port, &res->port_attr)) {
		fprintf(stderr, "ibv_query_port on port %u failed\n", config.ib_port);
		return 1;
	}

	/* allocate Protection Domain */
	res->pd = ibv_alloc_pd(res->ib_ctx);
	if (!res->pd) {
		fprintf(stderr, "ibv_alloc_pd failed\n");
		return 1;
	}

	res->comp_channel = ibv_create_comp_channel(res->ib_ctx);
	if (!res->comp_channel) {
		fprintf(stderr, "ibv_create_comp_channel failed\n");
		return 1;
	}

	/* each side will send only one WR, so Completion Queue with 1 entry is enough */
	cq_size = 1;
	res->cq = ibv_create_cq(res->ib_ctx, cq_size, NULL, res->comp_channel, 0);
	if (!res->cq) {
		fprintf(stderr, "failed to create CQ with %u entries\n", cq_size);
		return 1;
	}

	/* Arm the CQ before any completion is expected (to prevent races) */
	rc = ibv_req_notify_cq(res->cq, 0);
	if (rc) {
		fprintf(stderr, "failed to arm the CQ\n");
		return 1;
	}
	fprintf(stdout, "CQ was armed\n");

	/* allocate the memory buffer that will hold the data */
	size = MSG_SIZE;
	res->buf = malloc(size);
	if (!res->buf) {
		fprintf(stderr, "failed to malloc %Zu bytes to memory buffer\n", size);
		return 1;
	}

	/* only in the daemon side put the message in the memory buffer */
	if (!config.server_name) {
		strcpy(res->buf, MSG);
		fprintf(stdout, "going to send the message: '%s'\n", res->buf);
	} else
		memset(res->buf, 0, size);

	/* register this memory buffer */
	mr_flags = (config.server_name) ? IBV_ACCESS_LOCAL_WRITE : 0;
	res->mr = ibv_reg_mr(res->pd, res->buf, size, mr_flags);
	if (!res->mr) {
		fprintf(stderr, "ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags);
		return 1;
	}

	fprintf(stdout, "MR was registered with addr=%p, lkey=0x%x, rkey=0x%x, flags=0x%x\n",
			      res->buf, res->mr->lkey, res->mr->rkey, mr_flags);


	/* create the Queue Pair */
	memset(&qp_init_attr, 0, sizeof(qp_init_attr));

	qp_init_attr.qp_type    = IBV_QPT_RC;
	qp_init_attr.sq_sig_all = 1;
	qp_init_attr.send_cq    = res->cq;
	qp_init_attr.recv_cq    = res->cq;
	qp_init_attr.cap.max_send_wr  = 1;
	qp_init_attr.cap.max_recv_wr  = 1;
	qp_init_attr.cap.max_send_sge = 1;
	qp_init_attr.cap.max_recv_sge = 1;

	res->qp = ibv_create_qp(res->pd, &qp_init_attr);
	if (!res->qp) {
		fprintf(stderr, "failed to create QP\n");
		return 1;
	}
	fprintf(stdout, "QP was created, QP number=0x%x\n", res->qp->qp_num);

	return 0;
}
/*
 * Probe the largest max_inline_data value for which an RC QP can be
 * created on @pd.
 *
 * Phase 1 doubles the requested inline size (starting at 2) until QP
 * creation fails or the value would approach INT_MAX.  Phase 2 bisects
 * between the last size that worked and the first that failed.  Two
 * special cases end phase 1 early: a device that reports 0 inline data
 * after creation, and iWarp devices, where the value written back by the
 * first successful creation is taken.
 *
 * Returns the inline size found, or 0 if none could be established
 * (including when the temporary CQ cannot be created).
 */
static int
fi_ibv_rdm_find_max_inline(struct ibv_pd *pd, struct ibv_context *context)
{
	struct ibv_qp_init_attr qp_attr;
	struct ibv_qp *qp = NULL;
	struct ibv_cq *cq;
	int max_inline = 2;
	int rst = 0;

	/*
	 * CQ creation can fail at runtime; an assert() would vanish under
	 * NDEBUG and let a NULL CQ reach ibv_create_qp(), so check it for real.
	 */
	cq = ibv_create_cq(context, 1, NULL, NULL, 0);
	if (!cq)
		return 0;

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.send_cq = cq;
	qp_attr.recv_cq = cq;
	qp_attr.qp_type = IBV_QPT_RC;
	qp_attr.cap.max_send_wr = 1;
	qp_attr.cap.max_recv_wr = 1;
	qp_attr.cap.max_send_sge = 1;
	qp_attr.cap.max_recv_sge = 1;

	/* Phase 1: exponential growth until creation fails. */
	do {
		if (qp)
			ibv_destroy_qp(qp);
		qp_attr.cap.max_inline_data = max_inline;
		qp = ibv_create_qp(pd, &qp_attr);
		if (qp) {
			/* 
			 * truescale returns max_inline_data 0
			 */
			if (qp_attr.cap.max_inline_data == 0)
				break;

			/*
			 * iWarp is able to create qp with unsupported
			 * max_inline, lets take first returned value.
			 */
			if (context->device->transport_type == IBV_TRANSPORT_IWARP) {
				max_inline = rst = qp_attr.cap.max_inline_data;
				break;
			}
			rst = max_inline;
		}
	} while (qp && (max_inline < INT_MAX / 2) && (max_inline *= 2));

	/*
	 * Phase 2: bisect between the last good size (pos) and the first
	 * size that failed or was never tried (neg).
	 */
	if (rst != 0) {
		int pos = rst, neg = max_inline;
		do {
			max_inline = pos + (neg - pos) / 2;
			if (qp)
				ibv_destroy_qp(qp);

			qp_attr.cap.max_inline_data = max_inline;
			qp = ibv_create_qp(pd, &qp_attr);
			if (qp)
				pos = max_inline;
			else
				neg = max_inline;

		} while (neg - pos > 2);

		rst = pos;
	}

	if (qp) {
		ibv_destroy_qp(qp);
	}

	ibv_destroy_cq(cq);

	return rst;
}
Beispiel #11
0
/* Initialize the actual IB device */
int initIB(ArgStruct *p)
{
  int i, j, ret;
  char *tmp;
  int num_devices = 0;
  struct ibv_device **hca_list, **filtered_hca_list;
  struct ibv_device_attr hca_attr;
#if !HAVE_IBV_DEVICE_LIST
  struct dlist *hca_dlist; 
  struct ibv_device* hca_device; 
#endif

  /* Find all the devices on this host */
#if HAVE_IBV_DEVICE_LIST
  hca_list = ibv_get_device_list(&num_devices);
#else
  hca_dlist = ibv_get_devices();
  dlist_start(hca_dlist); 
  dlist_for_each_data(hca_dlist, hca_device, struct ibv_device)
    ++num_devices;
#endif

  /* If we didn't find any, return an error */
  if (0 == num_devices) {
      fprintf(stderr, "Couldn't find any IBV devices\n");
      return -1;
  }
  
#if !HAVE_IBV_DEVICE_LIST
  /* If we have the old version (ibv_get_devices()), convert it to
     the new form */
  hca_list = (struct ibv_device**) malloc(num_devices * 
                                          sizeof(struct ibv_device*));
  if (NULL == hca_list) {
      fprintf(stderr, "%s:%s:%d: malloc failed\n", __FILE__,
              __func__, __LINE__);
      return -1;
  }
  
  i = 0; 
  dlist_start(hca_dlist); 
  dlist_for_each_data(hca_dlist, hca_device, struct ibv_device)
      hca_list[i++] = hca_device;
#endif    

  /* Possible values for p->prot.device_and_port:

     1. <device>:<port> -- use only this device and only this port
     2. <device> -- use the first active port on this device
     3. :<port> -- use only this port, but on any device

     <device> names are matched exactly.
  */

  /* If a device name was specified on the command line, see if we can
     find it */
  tmp = NULL;
  port_num = -1;
  filtered_hca_list = hca_list;
  if (NULL != p->prot.device_and_port) {
      /* If there's a : in the string, then we have a port */
      tmp = strchr(p->prot.device_and_port, ':');
      if (NULL != tmp) {
          *tmp = '\0';
          ++tmp;
          port_num = atoi(tmp);
      }
      LOGPRINTF(("Pre-filter: looking for target device \"%s\", port %d",
                 p->prot.device_and_port, port_num));

      /* If the length of the device string left is >0, then there's a
         device specification */
      if (strlen(p->prot.device_and_port) > 0) {
          int found = 0;

          /* Loop through all the devices and find a matching
             name */
          for (i = 0; i < num_devices; ++i) {
              LOGPRINTF(("Pre-filter: found device: %s",
                         ibv_get_device_name(hca_list[i])));
              if (0 == strcmp(p->prot.device_and_port, 
                              ibv_get_device_name(hca_list[i]))) {
                  LOGPRINTF(("Pre-filter: found target device: %s (%d of %d)",
                             p->prot.device_and_port, i, num_devices));
                  filtered_hca_list = &(hca_list[i]);
                  num_devices = 1;
                  found = 1;
                  break;
              }
          }

          /* If we didn't find it, abort */
          if (!found) {
              fprintf(stderr, "Unable to find device \"%s\", aborting\n",
                      p->prot.device_and_port);
              return -1;
          }
      }
  }

  /* Traverse the filtered HCA list and find a good port */
  for (hca = NULL, i = 0; NULL == hca && i < num_devices; ++i) {

      /* Get a ibv_context from the ibv_device  */
      ctx = ibv_open_device(filtered_hca_list[i]);
      if (!ctx) {
          fprintf(stderr, "Couldn't create IBV context\n");
          return -1;
      } else {
          LOGPRINTF(("Found HCA %s",
                     ibv_get_device_name(filtered_hca_list[i])));
      }
      
      /* Get the device attributes */
      if (0 != ibv_query_device(ctx, &hca_attr)) {
          fprintf(stderr, "Could not get device context for %s, aborting\n",
                  ibv_get_device_name(hca));
          return -1;
      }

      for (j = 1; j <= hca_attr.phys_port_cnt; ++j) {
          /* If a specific port was asked for, *only* look at that port */
          if (port_num >= 0 && port_num != j) {
              continue;
          }
          LOGPRINTF(("Checking %s:%d...", 
                     ibv_get_device_name(filtered_hca_list[i]), j));

          /* Query this port and see if it's active */
          if (0 != ibv_query_port(ctx, j, &hca_port)) {
              fprintf(stderr, "Unable to query port %s:%d, aborting\n",
                      ibv_get_device_name(filtered_hca_list[i]), j);
              return -1;
          }

          /* If this port is active, we have a winner! */
          if (IBV_PORT_ACTIVE == hca_port.state) {
              LOGPRINTF(("%s:%d is ACTIVE", 
                         ibv_get_device_name(filtered_hca_list[i]), j));
              port_num = j;
              hca = filtered_hca_list[i];
              break;
          }
      }

      /* If we found one, we're done */
      if (hca) {
          break;
      }

      /* Otherwise, close the device (ignore any errors) */
      ibv_close_device(ctx);
      ctx = NULL;
  }

  /* If we didn't find a good device/port combo, abort */
  if (NULL == hca) {
      fprintf(stderr, "Could not find an active device and port, aborting\n");
      return -1;
  }

  /* free up the other devices in the event we would have multiple ib
     devices. if this isnt done, the device pointers will still be
     around in space somewhere -> bad */

#if HAVE_IBV_DEVICE_LIST
  ibv_free_device_list(hca_list); 
#endif
  
  /* Get HCA properties */
  
  lid = hca_port.lid;		/* local id, used to ref back to the device */
  LOGPRINTF(("  lid = %d", lid));


  /* Allocate Protection Domain */
	/* need a Protection domain to handle/register memory over the card */
  pd_hndl = ibv_alloc_pd(ctx);	
  if(!pd_hndl) {
    fprintf(stderr, "Error allocating PD\n");
    return -1;
  } else {
    LOGPRINTF(("Allocated Protection Domain"));
  }


  /* Create send completion queue */
  
  num_cqe = 30000; /* Requested number of completion q elements */
  s_cq_hndl = ibv_create_cq(ctx, num_cqe, NULL, NULL, 0);
  if(!s_cq_hndl) {
    fprintf(stderr, "Error creating send CQ\n");
    return -1;
  } else {
    act_num_cqe = s_cq_hndl->cqe;
    LOGPRINTF(("Created Send Completion Queue with %d elements", act_num_cqe));
  }


  /* Create recv completion queue */
  
  num_cqe = 20000; /* Requested number of completion q elements */
  r_cq_hndl = ibv_create_cq(ctx, num_cqe, NULL, NULL, 0);
  if(!r_cq_hndl) {
    fprintf(stderr, "Error creating send CQ\n");
    return -1;
  } else {
    act_num_cqe = r_cq_hndl->cqe;
    LOGPRINTF(("Created Recv Completion Queue with %d elements", act_num_cqe));
  }


  /* Placeholder for MR */
	/* We don't actually set up the Memory Regions here; instead
	 * this is done in the 'MyMalloc(..)' helper function.
	 * You could, however, set them up here.
	 */

  /* Create Queue Pair */
    /* To set up a Queue Pair, the following QP initial attributes must be
     * specified and passed to the ibv_create_qp(..) function:
     * max send/recv work requests.  (max_recv/send_wr)
     * max scatter/gather entries. (max_recv/send_sge)
     * Completion queues to associate the QP with.  (recv/send_cq)
     * Signalling type:  1-> signal all events.  0-> don't, event handler will
     *   deal with this.
     * QP type.  (RC=reliable connection, UC=unreliable connection, etc.)
     *   defined in the verbs header.
     */
  memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); 
  qp_init_attr.cap.max_recv_wr    = max_wq; /* Max outstanding WR on RQ      */
  qp_init_attr.cap.max_send_wr    = max_wq; /* Max outstanding WR on SQ      */
  qp_init_attr.cap.max_recv_sge   = 1; /* Max scatter/gather entries on RQ */
  qp_init_attr.cap.max_send_sge   = 1; /* Max scatter/gather entries on SQ */
  qp_init_attr.recv_cq            = r_cq_hndl; /* CQ handle for RQ         */
  qp_init_attr.send_cq            = s_cq_hndl; /* CQ handle for SQ         */
  qp_init_attr.sq_sig_all         = 0; /* Signalling type */
  qp_init_attr.qp_type            = IBV_QPT_RC; /* Transmission type         */

  /* ibv_create_qp( ibv_pd *pd, ibv_qp_init_attr * attr) */  
  qp_hndl = ibv_create_qp(pd_hndl, &qp_init_attr);
  if(!qp_hndl) {
    fprintf(stderr, "Error creating Queue Pair: %s\n", strerror(errno));
    return -1;
  } else {
    LOGPRINTF(("Created Queue Pair"));
  }

    /* Using the TCP connection, exchange the data needed to map
     *  the remote memory:
     *  (local: lid, qp_hndl->qp_num ), (remote: d_lid, d_qp_num)
     */

  /* Exchange lid and qp_num with other node */
  
  if( write(p->commfd, &lid, sizeof(lid) ) != sizeof(lid) ) {
    fprintf(stderr, "Failed to send lid over socket\n");
    return -1;
  }
  if( write(p->commfd, &qp_hndl->qp_num, sizeof(qp_hndl->qp_num) ) != sizeof(qp_hndl->qp_num) ) {
    fprintf(stderr, "Failed to send qpnum over socket\n");
    return -1;
  }
  if( read(p->commfd, &d_lid, sizeof(d_lid) ) != sizeof(d_lid) ) {
    fprintf(stderr, "Failed to read lid from socket\n");
    return -1;
  }
  if( read(p->commfd, &d_qp_num, sizeof(d_qp_num) ) != sizeof(d_qp_num) ) {
    fprintf(stderr, "Failed to read qpnum from socket\n");
    return -1;
  }
  
  LOGPRINTF(("Local: lid=%d qp_num=%d Remote: lid=%d qp_num=%d",
             lid, qp_hndl->qp_num, d_lid, d_qp_num));
    /* Further setup must be done to finalize the QP 'connection'.
     * First set the state of the QP to initialization by filling in a
     * separate ibv_qp_attr variable with the initial values, and calling
     * ibv_modify_qp(..) to merge these settings into the QP.
     */
/* NOTE: According to openIB, ib_mthca's QP modify does not set alternate path
 *  fields in QP context, so you'll have to do this manually if necessary
 */

    /* Bring up Queue Pair */
  
  /******* INIT state ******/

  /* qp_attr is separately allocated per qp/connection */
  qp_attr.qp_state = IBV_QPS_INIT;
  qp_attr.pkey_index = 0;
  qp_attr.port_num = port_num;
  qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
  /* merge the qp_attributes into the queue pair */
  ret = ibv_modify_qp(qp_hndl, &qp_attr,
		      IBV_QP_STATE              |
		      IBV_QP_PKEY_INDEX         |
		      IBV_QP_PORT               |
		      IBV_QP_ACCESS_FLAGS);
  if(ret) {
    fprintf(stderr, "Error modifying QP to INIT\n");
    return -1;
  }

  LOGPRINTF(("Modified QP to INIT"));

/* To enable the Queue Pair to finally receive data, it must be
 * put into the 'RTR' (Ready-To-Receive) state.  The Queue Pair will NOT
 * function properly until it has been set up and manually put through
 * the INIT and RTR states.
 */
  
  /******* RTR (Ready-To-Receive) state *******/

  qp_attr.qp_state = IBV_QPS_RTR;
  qp_attr.max_dest_rd_atomic = 1;
  qp_attr.dest_qp_num = d_qp_num;
  qp_attr.ah_attr.sl = 0;
  qp_attr.ah_attr.is_global = 0;
  qp_attr.ah_attr.dlid = d_lid;
  qp_attr.ah_attr.static_rate = 0;
  qp_attr.ah_attr.src_path_bits = 0;
  qp_attr.ah_attr.port_num = port_num;
  qp_attr.path_mtu = p->prot.ib_mtu;
  qp_attr.rq_psn = 0;
  qp_attr.pkey_index = 0;
  qp_attr.min_rnr_timer = 5;
  /* merge these settings into the qp */
  ret = ibv_modify_qp(qp_hndl, &qp_attr,
		      IBV_QP_STATE              |
		      IBV_QP_AV                 |
		      IBV_QP_PATH_MTU           |
		      IBV_QP_DEST_QPN           |
		      IBV_QP_RQ_PSN             |
		      IBV_QP_MAX_DEST_RD_ATOMIC |
		      IBV_QP_MIN_RNR_TIMER);

  if(ret) {
    fprintf(stderr, "Error modifying QP to RTR\n");
    return -1;
  }

  LOGPRINTF(("Modified QP to RTR"));

  /* Sync before going to RTS state */
  Sync(p);

  /* In the same manner, 'enable' sending on the queue pair */
  
  /******* RTS (Ready-to-Send) state *******/

  qp_attr.qp_state = IBV_QPS_RTS;
  qp_attr.sq_psn = 0;
  qp_attr.timeout = 31;
  qp_attr.retry_cnt = 1;
  qp_attr.rnr_retry = 1;
  qp_attr.max_rd_atomic = 1;

  ret = ibv_modify_qp(qp_hndl, &qp_attr,
		      IBV_QP_STATE              |
		      IBV_QP_TIMEOUT            |
		      IBV_QP_RETRY_CNT          |
		      IBV_QP_RNR_RETRY          |
		      IBV_QP_SQ_PSN             |
		      IBV_QP_MAX_QP_RD_ATOMIC);

  if(ret) {
    fprintf(stderr, "Error modifying QP to RTS\n");
    return -1;
  }
  
  LOGPRINTF(("Modified QP to RTS"));

  /* If using event completion, request the initial notification */
  /* This spawns a separate thread to do the event handling and
   * notification.
   * NOTE:  This may have problems in systems with Weak Memory Consistency
   * since there are no mutex(*) calls to preserve coherency??
   */
  if( p->prot.comptype == NP_COMP_EVENT ) {
    if (pthread_create(&thread, NULL, EventThread, NULL)) {
      fprintf(stderr, "Couldn't start event thread\n");
      return -1;
    }
    ibv_req_notify_cq(r_cq_hndl, 0);	/* request completion notification  */
  }					/* for the receive cq.  2nd argument 
					   specifies if ONLY 'solicited'
					   completions will be 'noticed' */
  
 
  return 0; /* if we get here, the connection is setup correctly */
}