Example #1
static int print_device_info(void) {
  struct ibv_device **ibv_devs;
  int i = 0;
  int num_devs = 0;

  /* Returns a NULL-terminated array; num_devs receives the device count. */
  ibv_devs = ibv_get_device_list(&num_devs);
  if (!ibv_devs)
    return -1;

  for (i = 0; i < num_devs; i++) {
    struct ibv_context *ibv_contxt;
    struct ibv_device_attr device_attr;
    const char *dev_name;
    uint64_t dev_guid;

    ibv_contxt = ibv_open_device(ibv_devs[i]);
    if (!ibv_contxt)
      continue;

    dev_name = ibv_get_device_name(ibv_devs[i]);
    dev_guid = ibv_get_device_guid(ibv_devs[i]);
    printf("%s (0x%016llx):\n", dev_name, (unsigned long long)dev_guid);
    ibv_query_device(ibv_contxt, &device_attr);
    printf("      Record           : %d\n", i);
    printf("         max_mr_size   : %llu\n",
           (unsigned long long)device_attr.max_mr_size);
    printf("         max_mr        : %d\n", device_attr.max_mr);

    ibv_close_device(ibv_contxt);
  }

  ibv_free_device_list(ibv_devs);
  return 0;
}
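
A note on the GUID printed above: ibv_get_device_guid() returns the 64-bit GUID in network byte order, so it is usually byte-swapped before display. A minimal sketch, assuming glibc's be64toh() from <endian.h>:

#include <endian.h>
#include <inttypes.h>
#include <stdio.h>
#include <infiniband/verbs.h>

/* Print a device GUID in the conventional host-order hex form. */
static void print_guid(struct ibv_device *dev)
{
    uint64_t guid = be64toh(ibv_get_device_guid(dev));
    printf("%s GUID: 0x%016" PRIx64 "\n", ibv_get_device_name(dev), guid);
}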
Example #2
/*
 * Given an HFI and a path record's packet lifetime attribute, 
 * compute the packet lifetime for a queue pair.
 */
uint8_t op_path_compute_timeout(struct ibv_context *hfi_context, 
								uint8_t pkt_life)
{
	uint8_t result;
	struct ibv_device_attr device_attr;

	pkt_life &= 0x1f;

	if (ibv_query_device(hfi_context, &device_attr))
		return 0;

	/* Adding 1 has the effect of doubling the timeout. */
	/* We do this because we're looking for a round-trip figure. */
	result = pkt_life + 1; 

	/* If the HFI's local ACK delay exceeds the packet lifetime, */
	/* use the local delay instead, doubled. */
	if (pkt_life < device_attr.local_ca_ack_delay) {
		result = device_attr.local_ca_ack_delay + 1;
	}  else {
		/* The local delay is smaller than the packet life time, */
		/* so we'll simply double the timeout value again. */
		result += 1;
	}

	if (result > 0x1f) result = 0x1f;

	return result;
}
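
The +1 arithmetic above relies on the InfiniBand timeout encoding, where a 5-bit exponent t stands for 4.096 µs * 2^t, so incrementing the exponent doubles the wait. A minimal sketch of the conversion (helper name is illustrative):

#include <stdint.h>

/* Convert a 5-bit IB timeout exponent to microseconds: 4.096 us * 2^t. */
static double ib_timeout_to_usec(uint8_t t)
{
    return 4.096 * (double)(1ULL << (t & 0x1f));
}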
Example #3
static int fi_ibv_get_device_attrs(struct ibv_context *ctx, struct fi_info *info)
{
	struct ibv_device_attr device_attr;
	struct ibv_port_attr port_attr;
	int ret = 0;

	ret = ibv_query_device(ctx, &device_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_device", errno);
		return -errno;
	}

	info->domain_attr->cq_cnt 		= device_attr.max_cq;
	info->domain_attr->ep_cnt 		= device_attr.max_qp;
	info->domain_attr->tx_ctx_cnt 		= MIN(info->domain_attr->tx_ctx_cnt, device_attr.max_qp);
	info->domain_attr->rx_ctx_cnt 		= MIN(info->domain_attr->rx_ctx_cnt, device_attr.max_qp);
	info->domain_attr->max_ep_tx_ctx 	= device_attr.max_qp;
	info->domain_attr->max_ep_rx_ctx 	= device_attr.max_qp;

	ret = fi_ibv_get_qp_cap(ctx, &device_attr, info);
	if (ret)
		return ret;

	ret = ibv_query_port(ctx, 1, &port_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_port", errno);
		return -errno;
	}

	info->ep_attr->max_msg_size 		= port_attr.max_msg_sz;
	info->ep_attr->max_order_raw_size 	= port_attr.max_msg_sz;
	info->ep_attr->max_order_waw_size	= port_attr.max_msg_sz;

	return 0;
}
Example #4
static
void psofed_scan_hca_ports(struct ibv_device *ib_dev)
{
	struct ibv_context *ctx;
	struct ibv_device_attr device_attr;
	int rc;
	unsigned port_cnt;
	unsigned port;
	const char *dev_name;

	dev_name = ibv_get_device_name(ib_dev);
	if (!dev_name) dev_name = "unknown";

	ctx = ibv_open_device(ib_dev);
	if (!ctx) goto err_open_dev;

	rc = ibv_query_device(ctx, &device_attr);
	if (!rc) {
		port_cnt = device_attr.phys_port_cnt;
		if (port_cnt > 128) port_cnt = 128;
	} else {
		// Query failed. Assume 2 ports.
		port_cnt = 2;
	}

	for (port = 1; port <= port_cnt; port++) {
		struct ibv_port_attr port_attr;
		enum ibv_port_state port_state;
		const char *marker;

		rc = ibv_query_port(ctx, port, &port_attr);
		port_state = !rc ? port_attr.state : 999 /* unknown */;

		marker = "";
		if (port_state == IBV_PORT_ACTIVE &&
		    (!psofed_hca || !strcmp(dev_name, psofed_hca)) &&
		    (!psofed_port || psofed_port == port)) {
			// use this port for the communication:

			if (!psofed_hca) psofed_hca = strdup(dev_name);
			if (!psofed_port) psofed_port = port;
			marker = "*";
		}

		psofed_dprint(3, "IB port <%s:%u>: %s%s",
			      dev_name, port, port_state_str(port_state), marker);
	}

	if (ctx) ibv_close_device(ctx);

err_open_dev:
	return;
}
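
A side note on the port_state_str() helper used above: libibverbs already exports ibv_port_state_str() for the same purpose. A minimal sketch of an equivalent scan using only stock verbs calls (assumes an already opened context):

#include <stdio.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Log the state of every physical port of an opened device. */
static void dump_port_states(struct ibv_context *ctx)
{
    struct ibv_device_attr dev_attr;
    struct ibv_port_attr port_attr;
    uint8_t port;

    if (ibv_query_device(ctx, &dev_attr))
        return;

    for (port = 1; port <= dev_attr.phys_port_cnt; port++) {
        if (ibv_query_port(ctx, port, &port_attr))
            continue;
        printf("port %u: %s\n", port, ibv_port_state_str(port_attr.state));
    }
}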
Example #5
static int fi_ibv_get_device_attrs(struct ibv_context *ctx, struct fi_info *info)
{
	struct ibv_device_attr device_attr;
	struct ibv_port_attr port_attr;
	int ret = 0;

	ret = ibv_query_device(ctx, &device_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_device", errno);
		return -errno;
	}

	info->domain_attr->cq_cnt 		= device_attr.max_cq;
	info->domain_attr->ep_cnt 		= device_attr.max_qp;
	info->domain_attr->tx_ctx_cnt 		= MIN(info->domain_attr->tx_ctx_cnt, device_attr.max_qp);
	info->domain_attr->rx_ctx_cnt 		= MIN(info->domain_attr->rx_ctx_cnt, device_attr.max_qp);
	info->domain_attr->max_ep_tx_ctx 	= MIN(info->domain_attr->tx_ctx_cnt, device_attr.max_qp);
	info->domain_attr->max_ep_rx_ctx 	= MIN(info->domain_attr->rx_ctx_cnt, device_attr.max_qp);
	info->domain_attr->max_ep_srx_ctx	= device_attr.max_qp;
	info->domain_attr->mr_cnt		= device_attr.max_mr;

	if (info->ep_attr->type == FI_EP_RDM)
		info->domain_attr->cntr_cnt	= device_attr.max_qp * 4;

	info->tx_attr->size 			= device_attr.max_qp_wr;
	info->tx_attr->iov_limit 		= device_attr.max_sge;
	info->tx_attr->rma_iov_limit		= device_attr.max_sge;

	info->rx_attr->size 			= device_attr.max_srq_wr ?
						  MIN(device_attr.max_qp_wr,
						      device_attr.max_srq_wr) :
						      device_attr.max_qp_wr;
	info->rx_attr->iov_limit 		= MIN(device_attr.max_sge,
						      device_attr.max_srq_sge);

	ret = fi_ibv_get_qp_cap(ctx, info);
	if (ret)
		return ret;

	ret = ibv_query_port(ctx, 1, &port_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_port", errno);
		return -errno;
	}

	info->ep_attr->max_msg_size 		= port_attr.max_msg_sz;
	info->ep_attr->max_order_raw_size 	= port_attr.max_msg_sz;
	info->ep_attr->max_order_waw_size	= port_attr.max_msg_sz;

	return 0;
}
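
Both this variant and Example #3 hard-code port 1 in ibv_query_port(). On a multi-port HCA one could instead take the attributes from the first ACTIVE port; a minimal sketch, not part of the original provider (helper name is illustrative):

#include <stdint.h>
#include <infiniband/verbs.h>

/* Return the first ACTIVE port number, or 0 if none is active. */
static uint8_t first_active_port(struct ibv_context *ctx)
{
    struct ibv_device_attr dev_attr;
    struct ibv_port_attr port_attr;
    uint8_t port;

    if (ibv_query_device(ctx, &dev_attr))
        return 0;

    for (port = 1; port <= dev_attr.phys_port_cnt; port++) {
        if (!ibv_query_port(ctx, port, &port_attr) &&
            port_attr.state == IBV_PORT_ACTIVE)
            return port;
    }
    return 0;
}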
Example #6
RDMAServerSocket::RDMAServerSocket(std::vector<std::string> hosts,
                                   const std::string &port, uint32_t max_wr,
                                   int cq_entries)
    : ec(createEventChannel()), id(createCmId(hosts.back(), port, true)),
      cc(id), cq(id, cc, cq_entries, 1, 0), running(true) {
  assert(max_wr);

  check_zero(rdma_migrate_id(id.get(), ec.get()));

  ibv_srq_init_attr srq_attr = { nullptr, { max_wr, 1, 0 } };
  check_zero(rdma_create_srq(id.get(), nullptr, &srq_attr));

  log_info() << "Created id " << id.get() << " " << (void *)this;
  hosts.pop_back();

  for (const auto &host : hosts) {
    ibv_qp_init_attr attr = {};
    attr.cap.max_send_wr = 256;
    attr.cap.max_recv_wr = 0;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 0;
    attr.recv_cq = cq;
    attr.send_cq = cq;
    attr.srq = id->srq;
    attr.cap.max_inline_data = 72;
    attr.sq_sig_all = 1;
    auto client_id = createCmId(host, port, true, &attr, id->pd);

    check_zero(rdma_migrate_id(client_id.get(), ec.get()));

    ids.push_back(std::move(client_id));

    log_info() << srq_attr;
  }

  cm_events();
  if(id->verbs) {
    ibv_device_attr attr;
    check_zero(ibv_query_device(id->verbs, &attr));
    log_info() << attr;
  }
}
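
The positional initializer { nullptr, { max_wr, 1, 0 } } above fills ibv_srq_init_attr's srq_context pointer and its attr.{max_wr, max_sge, srq_limit} members. Restated as a plain-C sketch with designated initializers (values taken from the constructor above; helper name is illustrative):

#include <stddef.h>
#include <stdint.h>
#include <infiniband/verbs.h>

/* Build the same SRQ attributes with named fields. */
static struct ibv_srq_init_attr make_srq_attr(uint32_t max_wr)
{
    struct ibv_srq_init_attr srq_attr = {
        .srq_context = NULL,
        .attr = {
            .max_wr    = max_wr, /* outstanding receive WRs on the SRQ */
            .max_sge   = 1,      /* scatter entries per receive WR */
            .srq_limit = 0,      /* no SRQ-limit event armed */
        },
    };
    return srq_attr;
}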
Example #7
/**
 * Get the type of a device.
 *
 * @param dev the device.
 * @param ctx the device context.
 * @param hca_type the type (output).
 *
 * @return MPI_SUCCESS if succeeded, MPI_ERR_OTHER if failed
 *
 * \see HCA_Type
 */
static inline int get_hca_type (struct ibv_device* dev, struct ibv_context* ctx, HCA_Type* hca_type)
{
    MPIDI_STATE_DECL(MPID_STATE_GET_HCA_TYPE);
    MPIDI_FUNC_ENTER(MPID_STATE_GET_HCA_TYPE);
    int ret;
    int mpi_errno = MPI_SUCCESS;
    struct ibv_device_attr dev_attr;

    memset(&dev_attr, 0, sizeof(struct ibv_device_attr));

    char* dev_name = (char*) ibv_get_device_name(dev);
    if (!dev_name)
    {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**ibv_get_device_name");
    }

    ret = ibv_query_device(ctx, &dev_attr);
    if (ret)
    {
        MPIU_ERR_SETANDJUMP1(
            mpi_errno,
            MPI_ERR_OTHER,
            "**ibv_query_device",
            "**ibv_query_device %s",
            dev_name
        );
    }

    if ((mpi_errno = hcaNameToType(dev_name, hca_type)) != MPI_SUCCESS)
    {
        MPIU_ERR_POP(mpi_errno);
    }

fn_fail:
    MPIDI_FUNC_EXIT(MPID_STATE_GET_HCA_TYPE);
    return mpi_errno;
}
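
The name-to-type mapping itself (hcaNameToType) is MVAPICH-internal and not shown here. For illustration only, the fields that identify an HCA model are available from the very ibv_query_device() call above; a minimal sketch that merely prints them (not the channel's actual detection logic):

#include <stdio.h>
#include <infiniband/verbs.h>

/* Print the fields commonly used to identify an HCA model. */
static void print_hca_identity(struct ibv_context *ctx)
{
    struct ibv_device_attr attr;

    if (ibv_query_device(ctx, &attr))
        return;
    printf("vendor 0x%x part %u fw %s\n",
           attr.vendor_id, attr.vendor_part_id, attr.fw_ver);
}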
Example #8
/*
 * Find a list of ibv_ports matching a set of criteria.
 */
opal_list_t *opal_common_verbs_find_ports(const char *if_include,
                                          const char *if_exclude,
                                          int flags,
                                          int stream)
{
    int32_t num_devs;
    struct ibv_device **devices;
    struct ibv_device *device;
    struct ibv_context *device_context;
    struct ibv_device_attr device_attr;
    struct ibv_port_attr port_attr;
    char **if_include_list = NULL, **if_exclude_list = NULL, **if_sanity_list = NULL;
    opal_common_verbs_device_item_t *di;
    opal_common_verbs_port_item_t *pi;
    int rc;
    uint32_t j;
    opal_list_t *port_list = NULL;
    bool want;

    /* Sanity check the include/exclude params */
    if (NULL != if_include && NULL != if_exclude) {
        return NULL;
    }

    /* Query all the IBV devices on the machine.  Use an ompi
       compatibility function, because how to get this list changed
       over the history of the IBV API. */
    devices = opal_ibv_get_device_list(&num_devs);
    if (0 == num_devs) {
        opal_output_verbose(5, stream, "no verbs interfaces found");
        return NULL;
    }

    opal_output_verbose(5, stream, "found %d verbs interface%s",
                        num_devs, (num_devs != 1) ? "s" : "");

    /* Allocate a list to fill */
    port_list = OBJ_NEW(opal_list_t);
    if (NULL == port_list) {
        return NULL;
    }

    if (NULL != if_include) {
        opal_output_verbose(5, stream, "finding verbs interfaces, including %s",
                            if_include);
        if_include_list = opal_argv_split(if_include, ',');
        if_sanity_list = opal_argv_copy(if_include_list);
    } else if (NULL != if_exclude) {
        opal_output_verbose(5, stream, "finding verbs interfaces, excluding %s",
                            if_exclude);
        if_exclude_list = opal_argv_split(if_exclude, ',');
        if_sanity_list = opal_argv_copy(if_exclude_list);
    }

    /* Now loop through all the devices.  Get the attributes for each
       port on each device to see if they match our selection
       criteria. */
    for (int32_t i = 0; i < num_devs; ++i) {
        /* See if this device is on the include/exclude sanity check
           list.  If it is, remove it from the sanity check list
           (i.e., we should end up with an empty list at the end if
           all entries in the sanity check list exist) */
        device = devices[i];
        check_sanity(&if_sanity_list, ibv_get_device_name(device), -1);

        opal_output_verbose(5, stream, "examining verbs interface: %s",
                            ibv_get_device_name(device));

        device_context = ibv_open_device(device);
        if (NULL == device_context) {
            opal_show_help("help-opal-common-verbs.txt",
                           "ibv_open_device fail", true,
                           opal_proc_local_get()->proc_hostname,
                           ibv_get_device_name(device),
                           errno, strerror(errno));
            goto err_free_port_list;
        }

        if (ibv_query_device(device_context, &device_attr)){
            opal_show_help("help-opal-common-verbs.txt",
                           "ibv_query_device fail", true,
                           opal_proc_local_get()->proc_hostname,
                           ibv_get_device_name(device),
                           errno, strerror(errno));
            goto err_free_port_list;
        }

        /* Now that we have the attributes of this device, remove all
           ports of this device from the sanity check list.  Note that
           IBV ports are indexed from 1, not 0. */
        for (j = 1; j <= device_attr.phys_port_cnt; j++) {
            check_sanity(&if_sanity_list, ibv_get_device_name(device), j);
        }

        /* Check the device-specific flags to see if we want this
           device */
        want = false;

        if (flags & OPAL_COMMON_VERBS_FLAGS_TRANSPORT_IB &&
            IBV_TRANSPORT_IB == device->transport_type) {
            opal_output_verbose(5, stream, "verbs interface %s has right type (IB)",
                                ibv_get_device_name(device));
            want = true;
        }
        if (flags & OPAL_COMMON_VERBS_FLAGS_TRANSPORT_IWARP &&
            IBV_TRANSPORT_IWARP == device->transport_type) {
            opal_output_verbose(5, stream, "verbs interface %s has right type (IWARP)",
                                ibv_get_device_name(device));
            want = true;
        }

        /* Check for RC or UD QP support */
        if (flags & OPAL_COMMON_VERBS_FLAGS_RC) {
            rc = opal_common_verbs_qp_test(device_context, flags);
            if (OPAL_SUCCESS == rc) {
                want = true;
                opal_output_verbose(5, stream,
                                    "verbs interface %s supports RC QPs",
                                    ibv_get_device_name(device));
            } else {
                opal_output_verbose(5, stream,
                                    "verbs interface %s failed to make RC QP",
                                    ibv_get_device_name(device));
            }
        }
        if (flags & OPAL_COMMON_VERBS_FLAGS_UD) {
            rc = opal_common_verbs_qp_test(device_context, flags);
            if (OPAL_SUCCESS == rc) {
                want = true;
                opal_output_verbose(5, stream,
                                    "verbs interface %s supports UD QPs",
                                    ibv_get_device_name(device));
            } else if (OPAL_ERR_TYPE_MISMATCH == rc) {
                opal_output_verbose(5, stream,
                                    "verbs interface %s made an RC QP! we don't want RC-capable devices",
                                    ibv_get_device_name(device));
            } else {
                opal_output_verbose(5, stream,
                                    "verbs interface %s failed to make UD QP",
                                    ibv_get_device_name(device));
            }
        }

        /* If we didn't want it, go to the next device */
        if (!want) {
            continue;
        }

        /* Make a device_item_t to hold the device information */
        di = OBJ_NEW(opal_common_verbs_device_item_t);
        if (NULL == di) {
            goto err_free_port_list;
        }
        di->device = device;
        di->context = device_context;
        di->device_attr = device_attr;
        di->device_name = strdup(ibv_get_device_name(device));

        /* Note IBV ports are 1 based (not 0 based) */
        for (j = 1; j <= device_attr.phys_port_cnt; j++) {

            /* If we don't want this port (based on if_include /
               if_exclude lists), skip it */
            if (!want_this_port(if_include_list, if_exclude_list, di, j)) {
                opal_output_verbose(5, stream, "verbs interface %s:%d: rejected by include/exclude",
                                    ibv_get_device_name(device), j);
                continue;
            }

            /* Query the port */
            if (ibv_query_port(device_context, (uint8_t) j, &port_attr)) {
                opal_show_help("help-opal-common-verbs.txt",
                               "ibv_query_port fail", true,
                               opal_proc_local_get()->proc_hostname,
                               ibv_get_device_name(device),
                               errno, strerror(errno));
                goto err_free_port_list;
            }

            /* We definitely only want ACTIVE ports */
            if (IBV_PORT_ACTIVE != port_attr.state) {
                opal_output_verbose(5, stream, "verbs interface %s:%d: not ACTIVE",
                                    ibv_get_device_name(device), j);
                continue;
            }

            /* Check the port-specific flags to see if we want this
               port */
            want = false;
            if (0 == flags) {
                want = true;
            }

            if ((flags & (OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB |
                          OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET)) ==
                 (OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB |
                  OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET)) {
                /* If they specified both link layers, then we want this port */
                want = true;
            } else if ((flags & (OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB |
                                 OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET)) == 0) {
                /* If they specified neither link layer, then we want this port */
                want = true;
            }
#if HAVE_DECL_IBV_LINK_LAYER_ETHERNET
            else if (flags & OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB) {
                if (IBV_LINK_LAYER_INFINIBAND == port_attr.link_layer) {
                    want = true;
                } else {
                    opal_output_verbose(5, stream, "verbs interface %s:%d has wrong link layer (has %s, want IB)",
                                        ibv_get_device_name(device), j,
                                        link_layer_to_str(port_attr.link_layer));
                }
            } else if (flags & OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET) {
                if (IBV_LINK_LAYER_ETHERNET == port_attr.link_layer) {
                    want = true;
                } else {
                    opal_output_verbose(5, stream, "verbs interface %s:%d has wrong link layer (has %s, want Ethernet)",
                                        ibv_get_device_name(device), j,
                                        link_layer_to_str(port_attr.link_layer));
                }
            }
#endif

            if (!want) {
                continue;
            }

            /* If we got this far, we want the port.  Make an item for it. */
            pi = OBJ_NEW(opal_common_verbs_port_item_t);
            if (NULL == pi) {
                goto err_free_port_list;
            }
            pi->device = di;
            pi->port_num = j;
            pi->port_attr = port_attr;
            OBJ_RETAIN(di);

            /* Add the port item to the list */
            opal_list_append(port_list, &pi->super);
            opal_output_verbose(5, stream, "found acceptable verbs interface %s:%d",
                                ibv_get_device_name(device), j);
        }

        /* We're done with the device; if some ports are using it, its
           ref count will be > 0, and therefore the device won't be
           deleted here. */
        OBJ_RELEASE(di);
    }

    /* Sanity check that the devices specified in the if_include /
       if_exclude lists actually existed.  If this is true, then the
       sanity list will now be empty.  If there are still items left
       on the list, then they didn't exist.  Bad.  Print a warning (if
       the warning is not disabled). */
    if (0 != opal_argv_count(if_sanity_list)) {
        if (opal_common_verbs_warn_nonexistent_if) {
            char *str = opal_argv_join(if_sanity_list, ',');
            opal_show_help("help-opal-common-verbs.txt", "nonexistent port",
                           true, opal_proc_local_get()->proc_hostname,
                           ((NULL != if_include) ? "in" : "ex"), str);
            free(str);

            /* Only warn once per process */
            opal_common_verbs_warn_nonexistent_if = false;
        }
    }
    if (NULL != if_sanity_list) {
        opal_argv_free(if_sanity_list);
    }

    opal_argv_free(if_include_list);
    opal_argv_free(if_exclude_list);

    /* All done! */
    opal_ibv_free_device_list(devices);
    return port_list;

 err_free_port_list:
    OPAL_LIST_RELEASE(port_list);
    opal_ibv_free_device_list(devices);

    if (NULL != if_sanity_list) {
        opal_argv_free(if_sanity_list);
    }

    opal_argv_free(if_include_list);
    opal_argv_free(if_exclude_list);

    return NULL;
}
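
For readers without the OpenMPI opal_argv helpers: the include/exclude handling above boils down to splitting a comma-separated list and matching device names against it (the real code also understands device:port suffixes). A simplified sketch of just the name-matching step (function name is illustrative):

#include <stdbool.h>
#include <string.h>

/* Return true if dev_name appears in a comma-separated name list. */
static bool name_in_list(const char *dev_name, const char *list)
{
    char buf[256];
    char *save = NULL, *tok;

    if (!list)
        return false;
    strncpy(buf, list, sizeof(buf) - 1);
    buf[sizeof(buf) - 1] = '\0';

    for (tok = strtok_r(buf, ",", &save); tok; tok = strtok_r(NULL, ",", &save))
        if (strcmp(tok, dev_name) == 0)
            return true;
    return false;
}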
Example #9
static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size,
					    int tx_depth, int port,struct user_parameters *user_parm) {
	struct pingpong_context *ctx;
	struct ibv_device_attr device_attr;

	ctx = malloc(sizeof *ctx);
	if (!ctx)
		return NULL;

	ctx->size     = size;
	ctx->tx_depth = tx_depth;
	/* in case of UD need space for the GRH */
	if (user_parm->connection_type==UD) {
		ctx->buf = memalign(page_size, ( size + 40 ) * 2);
		if (!ctx->buf) {
			fprintf(stderr, "Couldn't allocate work buf.\n");
			return NULL;
		}
		memset(ctx->buf, 0, ( size + 40 ) * 2);
	} else {
		ctx->buf = memalign(page_size, size * 2);
		if (!ctx->buf) {
			fprintf(stderr, "Couldn't allocate work buf.\n");
			return NULL;
		}
		memset(ctx->buf, 0, size * 2);
	}

	ctx->post_buf = (char*)ctx->buf + (size - 1);
	ctx->poll_buf = (char*)ctx->buf + (2 * size - 1);

	ctx->context = ibv_open_device(ib_dev);
	if (!ctx->context) {
		fprintf(stderr, "Couldn't get context for %s\n",
			ibv_get_device_name(ib_dev));
		return NULL;
	}
	if (user_parm->mtu == 0) {/*user did not ask for specific mtu */
		if (ibv_query_device(ctx->context, &device_attr)) {
			fprintf(stderr, "Failed to query device props");
			return NULL;
		}
		if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) {
			user_parm->mtu = 1024;
		} else {
			user_parm->mtu = 2048;
		}
	}
    if (user_parm->use_event) {
		ctx->channel = ibv_create_comp_channel(ctx->context);
		if (!ctx->channel) {
			fprintf(stderr, "Couldn't create completion channel\n");
			return NULL;
		}
	} else
		ctx->channel = NULL;
	ctx->pd = ibv_alloc_pd(ctx->context);
	if (!ctx->pd) {
		fprintf(stderr, "Couldn't allocate PD\n");
		return NULL;
	}
	if (user_parm->connection_type==UD) {
		ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, (size + 40 ) * 2,
				     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
		if (!ctx->mr) {
			fprintf(stderr, "Couldn't allocate MR\n");
			return NULL;
		}
	} else {
		ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2,
				     IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
		if (!ctx->mr) {
			fprintf(stderr, "Couldn't allocate MR\n");
			return NULL;
		}
	}

	ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0);
	if (!ctx->scq) {
		fprintf(stderr, "Couldn't create CQ\n");
		return NULL;
	}
	ctx->rcq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0);
	if (!ctx->rcq) {
		fprintf(stderr, "Couldn't create Recieve CQ\n");
		return NULL;
	}
	{
		struct ibv_qp_init_attr attr;
		memset(&attr, 0, sizeof(struct ibv_qp_init_attr));
		attr.send_cq = ctx->scq;
		attr.recv_cq = ctx->rcq;
		attr.cap.max_send_wr  = tx_depth;
		/* Workaround: the driver doesn't support
		 * recv_wr = 0 */
		attr.cap.max_recv_wr  = tx_depth;
		attr.cap.max_send_sge = 1;
		attr.cap.max_recv_sge = 1;
		attr.cap.max_inline_data = user_parm->inline_size;
		switch (user_parm->connection_type) {
		case RC :
			attr.qp_type = IBV_QPT_RC;
			break;
		case UC :
			attr.qp_type = IBV_QPT_UC;
			break;
		case UD :
			attr.qp_type = IBV_QPT_UD;
			break;
		default:
			fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type);
			return NULL;
		}
		attr.sq_sig_all = 0;
		ctx->qp = ibv_create_qp(ctx->pd, &attr);
		if (!ctx->qp) {
			fprintf(stderr, "Couldn't create QP\n");
			return NULL;
		}
	}

	{
		struct ibv_qp_attr attr;
		memset(&attr, 0, sizeof(attr));
		attr.qp_state        = IBV_QPS_INIT;
		attr.pkey_index      = 0;
		attr.port_num        = port;
		if (user_parm->connection_type==UD) {
			attr.qkey            = 0x11111111;
		} else {
			attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE;
		}

		if (user_parm->connection_type==UD) {
			if (ibv_modify_qp(ctx->qp, &attr,
					  IBV_QP_STATE              |
					  IBV_QP_PKEY_INDEX         |
					  IBV_QP_PORT               |
					  IBV_QP_QKEY)) {
				fprintf(stderr, "Failed to modify UD QP to INIT\n");
				return NULL;
			}

			if (user_parm->use_mcg) {
				union ibv_gid gid;
				uint8_t mcg_gid[16] = MCG_GID;

				/* use the local QP number as part of the mcg */
				mcg_gid[11] = (user_parm->servername) ? 0 : 1;
				*(uint32_t *)(&mcg_gid[12]) = ctx->qp->qp_num;
				memcpy(gid.raw, mcg_gid, 16);

				if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) {
					fprintf(stderr, "Couldn't attach QP to mcg\n");
					return NULL;
				}
			}
		} else if (ibv_modify_qp(ctx->qp, &attr,
					 IBV_QP_STATE              |
					 IBV_QP_PKEY_INDEX         |
					 IBV_QP_PORT               |
					 IBV_QP_ACCESS_FLAGS)) {
			fprintf(stderr, "Failed to modify QP to INIT\n");
			return NULL;
		}
	}
	// send
	ctx->wr.wr_id      = PINGPONG_SEND_WRID;
	ctx->wr.sg_list    = &ctx->list;
	ctx->wr.num_sge    = 1;
	ctx->wr.opcode     = IBV_WR_SEND;
	ctx->wr.next       = NULL;
	// receive
	ctx->rwr.wr_id      = PINGPONG_RECV_WRID;
	ctx->rwr.sg_list    = &ctx->recv_list;
	ctx->rwr.num_sge    = 1;
	ctx->rwr.next       = NULL;
	return ctx;
}
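
The MTU default above is keyed off vendor_part_id 23108 (the Mellanox MT23108 "Tavor" HCA). As an alternative, shown only as a sketch and not as this benchmark's behaviour, the fallback could be derived from the port's currently active MTU:

#include <stdint.h>
#include <infiniband/verbs.h>

/* Map the port's active MTU enum to bytes; returns 0 on query failure. */
static int default_mtu_bytes(struct ibv_context *ctx, uint8_t port)
{
    struct ibv_port_attr port_attr;

    if (ibv_query_port(ctx, port, &port_attr))
        return 0;

    switch (port_attr.active_mtu) {
    case IBV_MTU_256:  return 256;
    case IBV_MTU_512:  return 512;
    case IBV_MTU_1024: return 1024;
    case IBV_MTU_2048: return 2048;
    case IBV_MTU_4096: return 4096;
    default:           return 0;
    }
}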
Example #10
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                              uint8_t qp_type, uint8_t sgid_idx,
                              union ibv_gid *dgid, uint32_t dqpn,
                              uint32_t rq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    union ibv_gid ibv_gid = {
        .global.interface_id = dgid->global.interface_id,
        .global.subnet_prefix = dgid->global.subnet_prefix
    };
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTR;
    attr_mask = IBV_QP_STATE;

    qp->sgid_idx = sgid_idx;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.path_mtu               = IBV_MTU_1024;
        attr.dest_qp_num            = dqpn;
        attr.max_dest_rd_atomic     = 1;
        attr.min_rnr_timer          = 12;
        attr.ah_attr.port_num       = backend_dev->port_num;
        attr.ah_attr.is_global      = 1;
        attr.ah_attr.grh.hop_limit  = 1;
        attr.ah_attr.grh.dgid       = ibv_gid;
        attr.ah_attr.grh.sgid_index = qp->sgid_idx;
        attr.rq_psn                 = rq_psn;

        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER;

        trace_rdma_backend_rc_qp_state_rtr(qp->ibqp->qp_num,
                                           be64_to_cpu(ibv_gid.global.
                                                       subnet_prefix),
                                           be64_to_cpu(ibv_gid.global.
                                                       interface_id),
                                           qp->sgid_idx, dqpn, rq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rtr(qp->ibqp->qp_num, use_qkey ? qkey :
                                           0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {};
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout       = 14;
        attr.retry_cnt     = 7;
        attr.rnr_retry     = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        trace_rdma_backend_rc_qp_state_rts(qp->ibqp->qp_num, sq_psn);
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        trace_rdma_backend_ud_qp_state_rts(qp->ibqp->qp_num, sq_psn,
                                           use_qkey ? qkey : 0);
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        rdma_error_report("ibv_modify_qp fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    return 0;
}

int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) {
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp, RdmaDeviceResources *dev_res)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
    g_slist_foreach(qp->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&qp->cqe_ctx_list);
}

int rdma_backend_create_srq(RdmaBackendSRQ *srq, RdmaBackendPD *pd,
                            uint32_t max_wr, uint32_t max_sge,
                            uint32_t srq_limit)
{
    struct ibv_srq_init_attr srq_init_attr = {};

    srq_init_attr.attr.max_wr = max_wr;
    srq_init_attr.attr.max_sge = max_sge;
    srq_init_attr.attr.srq_limit = srq_limit;

    srq->ibsrq = ibv_create_srq(pd->ibpd, &srq_init_attr);
    if (!srq->ibsrq) {
        rdma_error_report("ibv_create_srq failed, errno=%d", errno);
        return -EIO;
    }

    rdma_protected_gslist_init(&srq->cqe_ctx_list);

    return 0;
}

int rdma_backend_query_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_query_srq(srq->ibsrq, srq_attr);
}

int rdma_backend_modify_srq(RdmaBackendSRQ *srq, struct ibv_srq_attr *srq_attr,
                int srq_attr_mask)
{
    if (!srq->ibsrq) {
        return -EINVAL;
    }

    return ibv_modify_srq(srq->ibsrq, srq_attr, srq_attr_mask);
}

void rdma_backend_destroy_srq(RdmaBackendSRQ *srq, RdmaDeviceResources *dev_res)
{
    if (srq->ibsrq) {
        ibv_destroy_srq(srq->ibsrq);
    }
    g_slist_foreach(srq->cqe_ctx_list.list, free_cqe_ctx, dev_res);
    rdma_protected_gslist_destroy(&srq->cqe_ctx_list);
}

#define CHK_ATTR(req, dev, member, fmt) ({ \
    trace_rdma_check_dev_attr(#member, dev.member, req->member); \
    if (req->member > dev.member) { \
        rdma_warn_report("%s = "fmt" is higher than host device capability "fmt, \
                         #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
})

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    struct ibv_device_attr bk_dev_attr;
    int rc;

    rc = ibv_query_device(backend_dev->context, &bk_dev_attr);
    if (rc) {
        rdma_error_report("ibv_query_device fail, rc=%d, errno=%d", rc, errno);
        return -EIO;
    }

    dev_attr->max_sge = MAX_SGE;
    dev_attr->max_srq_sge = MAX_SGE;

    CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_srq, "%d");

    return 0;
}

static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);
    grh->sgid = *sgid;
    grh->dgid = *my_gid;
}
Example #11
int main(int argc, char *argv[])
{
    struct ibv_pd		       *pd1, *pd2;
    struct ibv_comp_channel	       *comp_chan1, *comp_chan2;
    struct ibv_cq		       *cq1, *cq2;
    struct ibv_cq		       *evt_cq = NULL;
    struct ibv_mr		       *mr1, *mr2;
    struct ibv_qp_init_attr		qp_attr1 = { }, qp_attr2 = {};
    struct ibv_sge			sge;
    struct ibv_send_wr		send_wr = { };
    struct ibv_send_wr	       *bad_send_wr = NULL;
    struct ibv_wc			wc;
    struct ibv_qp			*qp1, *qp2;
    void			       *cq_context = NULL;
    union ibv_gid			gid1, gid2;

    int				n;

    uint8_t			       *buf1, *buf2;

    int				err;
    int 				num_devices;
    struct ibv_context	*	verbs1, *verbs2;
    struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
    struct ibv_device_attr		dev_attr;
    int use = 0;
    int port = 1;
    int x = 0;
    unsigned long mb = 0;
    unsigned long bytes = 0;
    unsigned long save_diff = 0;
    struct timeval start, stop, diff;
    int iterations = 0;

    struct rusage usage;
    struct timeval ustart, uend;
    struct timeval sstart, send;
    struct timeval tstart, tend;

    DPRINTF("There are %d devices\n", num_devices);

    for(x = 0; x < num_devices; x++) {
        printf("Device: %d, %s\n", x, ibv_get_device_name(dev_list[use]));
    }

    if(num_devices == 0 || dev_list == NULL) {
        printf("No devices found\n");
        return 1;
    }

    if(argc < 2) {
        printf("Which RDMA device to use? 0, 1, 2, 3...\n");
        return 1;
    }

    use = atoi(argv[1]);

    DPRINTF("Using device %d\n", use);

    verbs1 = ibv_open_device(dev_list[use]);

    if(verbs1 == NULL) {
        printf("Failed to open device!\n");
        return 1;
    }

    DPRINTF("Device open %s\n", ibv_get_device_name(dev_list[use]));

    verbs2 = ibv_open_device(dev_list[use]);

    if(verbs2 == NULL) {
        printf("Failed to open device again!\n");
        return 1;
    }

    if(ibv_query_device(verbs1, &dev_attr)) {
        printf("Failed to query device attributes.\n");
        return 1;
    }

    printf("Device open: %d, %s which has %d ports\n", x, ibv_get_device_name(dev_list[use]), dev_attr.phys_port_cnt);

    if(argc < 3) {
        printf("Which port on the device to use? 1, 2, 3...\n");
        return 1;
    }

    port = atoi(argv[2]);

    if(port <= 0) {
        printf("Port #%d invalid, must start with 1, 2, 3, ...\n", port);
        return 1;
    }

    printf("Using port %d\n", port);

    if(argc < 4) {
        printf("How many iterations to perform?\n");
        return 1;
    }

    iterations = atoi(argv[3]);
    printf("Will perform %d iterations\n", iterations);

    pd1 = ibv_alloc_pd(verbs1);
    if (!pd1)
        return 1;

    if(argc < 5) {
        printf("How many megabytes to allocate? (This will be allocated twice. Once for source, once for destination.)\n");
        return 1;
    }

    mb = atoi(argv[4]);

    if(mb <= 0) {
        printf("Megabytes %lu invalid\n", mb);
        return 1;
    }

    DPRINTF("protection domain1 allocated\n");

    pd2 = ibv_alloc_pd(verbs2);
    if (!pd2)
        return 1;

    DPRINTF("protection domain2 allocated\n");

    comp_chan1 = ibv_create_comp_channel(verbs1);
    if (!comp_chan1)
        return 1;

    DPRINTF("completion chan1 created\n");

    comp_chan2 = ibv_create_comp_channel(verbs2);
    if (!comp_chan2)
        return 1;

    DPRINTF("completion chan2 created\n");

    cq1 = ibv_create_cq(verbs1, 2, NULL, comp_chan1, 0);
    if (!cq1)
        return 1;

    DPRINTF("CQ1 created\n");

    cq2 = ibv_create_cq(verbs2, 2, NULL, comp_chan2, 0);
    if (!cq2)
        return 1;

    DPRINTF("CQ2 created\n");

    bytes = mb * 1024UL * 1024UL;

    buf1 = malloc(bytes);
    if (!buf1)
        return 1;

    buf2 = malloc(bytes);
    if (!buf2)
        return 1;

    printf("Populating %lu MB memory.\n", mb * 2);

    for(x = 0; x < bytes; x++) {
        buf1[x] = 123;
    }

    buf1[bytes - 1] = 123;

    mr1 = ibv_reg_mr(pd1, buf1, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
    if (!mr1) {
        printf("Failed to register memory.\n");
        return 1;
    }

    mr2 = ibv_reg_mr(pd2, buf2, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ);
    if (!mr2) {
        printf("Failed to register memory.\n");
        return 1;
    }

    DPRINTF("memory registered.\n");

    qp_attr1.cap.max_send_wr	 = 10;
    qp_attr1.cap.max_send_sge = 10;
    qp_attr1.cap.max_recv_wr	 = 10;
    qp_attr1.cap.max_recv_sge = 10;
    qp_attr1.sq_sig_all = 1;

    qp_attr1.send_cq		 = cq1;
    qp_attr1.recv_cq		 = cq1;

    qp_attr1.qp_type		 = IBV_QPT_RC;

    qp1 = ibv_create_qp(pd1, &qp_attr1);
    if (!qp1) {
        printf("failed to create queue pair #1\n");
        return 1;
    }

    DPRINTF("queue pair1 created\n");

    qp_attr2.cap.max_send_wr	 = 10;
    qp_attr2.cap.max_send_sge = 10;
    qp_attr2.cap.max_recv_wr	 = 10;
    qp_attr2.cap.max_recv_sge = 10;
    qp_attr2.sq_sig_all = 1;

    qp_attr2.send_cq		 = cq2;
    qp_attr2.recv_cq		 = cq2;

    qp_attr2.qp_type		 = IBV_QPT_RC;


    qp2 = ibv_create_qp(pd2, &qp_attr2);
    if (!qp2) {
        printf("failed to create queue pair #2\n");
        return 1;
    }

    DPRINTF("queue pair2 created\n");

    struct ibv_qp_attr attr1 = {
        .qp_state = IBV_QPS_INIT,
        .pkey_index = 0,
        .port_num = port,
        .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE,
    };

    if(ibv_modify_qp(qp1, &attr1,
                     IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) {
        printf("verbs 1 Failed to go to init\n");
        return 1;
    }

    DPRINTF("verbs1 to init\n");

    struct ibv_qp_attr attr2 = {
        .qp_state = IBV_QPS_INIT,
        .pkey_index = 0,
        .port_num = port,
        .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE,
    };

    if(ibv_modify_qp(qp2, &attr2,
                     IBV_QP_STATE |
                     IBV_QP_PKEY_INDEX |
                     IBV_QP_PORT |
                     IBV_QP_ACCESS_FLAGS)) {
        printf("verbs 2 Failed to go to init\n");
        return 1;
    }

    DPRINTF("verbs2 to init\n");

    //struct ibv_gid gid1, gid2;
    struct ibv_port_attr port1, port2;
    uint64_t psn1 = lrand48() & 0xffffff;
    uint64_t psn2 = lrand48() & 0xffffff;

    if(ibv_query_port(verbs1, port, &port1))
        return 1;

    DPRINTF("got port1 information\n");

    if(ibv_query_port(verbs2, port, &port2))
        return 1;

    DPRINTF("got port2 information\n");

    if(ibv_query_gid(verbs1, 1, 0, &gid1))
        return 1;
    DPRINTF("got gid1 information\n");

    if(ibv_query_gid(verbs2, 1, 0, &gid2))
        return 1;

    DPRINTF("got gid2 information\n");

    struct ibv_qp_attr next2 = {
        .qp_state = IBV_QPS_RTR,
        .path_mtu = IBV_MTU_1024,
        .dest_qp_num = qp2->qp_num,
        .rq_psn = psn2,
        .max_dest_rd_atomic = 5,
        .min_rnr_timer = 12,
        .ah_attr = {
            .is_global = 0,
            .dlid = port2.lid,
            .sl = 0,
            .src_path_bits = 0,
            .port_num = port,
        }
    };

    if(gid2.global.interface_id) {
        next2.ah_attr.is_global = 1;
        next2.ah_attr.grh.hop_limit = 1;
        next2.ah_attr.grh.dgid = gid2;
        next2.ah_attr.grh.sgid_index = 0;
    }

    struct ibv_qp_attr next1 = {
        .qp_state = IBV_QPS_RTR,
        .path_mtu = IBV_MTU_1024,
        .dest_qp_num = qp1->qp_num,
        .rq_psn = psn1,
        .max_dest_rd_atomic = 1,
        .min_rnr_timer = 12,
        .ah_attr = {
            .is_global = 0,
            .dlid = port1.lid,
            .sl = 0,
            .src_path_bits = 0,
            .port_num = port,
        }
    };

    if(gid1.global.interface_id) {
        next1.ah_attr.is_global = 1;
        next1.ah_attr.grh.hop_limit = 1;
        next1.ah_attr.grh.dgid = gid1;
        next1.ah_attr.grh.sgid_index = 0;
    }

    if(ibv_modify_qp(qp2, &next1,
                     IBV_QP_STATE |
                     IBV_QP_AV |
                     IBV_QP_PATH_MTU |
                     IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN |
                     IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER)) {
        printf("Failed to modify verbs2 to ready\n");
        return 1;
    }

    DPRINTF("verbs2 RTR\n");

    if(ibv_modify_qp(qp1, &next2,
                     IBV_QP_STATE |
                     IBV_QP_AV |
                     IBV_QP_PATH_MTU |
                     IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN |
                     IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER)) {
        printf("Failed to modify verbs1 to ready\n");
        return 1;
    }

    DPRINTF("verbs1 RTR\n");

    next2.qp_state = IBV_QPS_RTS;
    next2.timeout = 14;
    next2.retry_cnt = 7;
    next2.rnr_retry = 7;
    next2.sq_psn = psn1;
    next2.max_rd_atomic = 1;

    if(ibv_modify_qp(qp1, &next2,
                     IBV_QP_STATE |
                     IBV_QP_TIMEOUT |
                     IBV_QP_RETRY_CNT |
                     IBV_QP_RNR_RETRY |
                     IBV_QP_SQ_PSN |
                     IBV_QP_MAX_QP_RD_ATOMIC)) {
        printf("Failed again to modify verbs1 to ready\n");
        return 1;
    }

    DPRINTF("verbs1 RTS\n");

    next1.qp_state = IBV_QPS_RTS;
    next1.timeout = 14;
    next1.retry_cnt = 7;
    next1.rnr_retry = 7;
    next1.sq_psn = psn2;
    next1.max_rd_atomic = 1;

    if(ibv_modify_qp(qp2, &next1,
                     IBV_QP_STATE |
                     IBV_QP_TIMEOUT |
                     IBV_QP_RETRY_CNT |
                     IBV_QP_RNR_RETRY |
                     IBV_QP_SQ_PSN |
                     IBV_QP_MAX_QP_RD_ATOMIC)) {
        printf("Failed again to modify verbs2 to ready\n");
        return 1;
    }

    DPRINTF("verbs2 RTS\n");

    printf("Performing RDMA first.\n");
    iterations = atoi(argv[3]);

    getrusage(RUSAGE_SELF, &usage);
    ustart = usage.ru_utime;
    sstart = usage.ru_stime;

    gettimeofday(&tstart, NULL);

    while(iterations-- > 0) {
        sge.addr   = (uintptr_t) buf1;
        sge.length = bytes;
        sge.lkey   = mr1->lkey;

        send_wr.wr_id		    = 1;
        send_wr.opcode		    = IBV_WR_RDMA_WRITE;
        send_wr.sg_list		    = &sge;
        send_wr.num_sge		    = 1;
        send_wr.send_flags          = IBV_SEND_SIGNALED;
        send_wr.wr.rdma.rkey 	    = mr2->rkey;
        send_wr.wr.rdma.remote_addr = (uint64_t) buf2;

        DPRINTF("Iterations left: %d\n", iterations);
        if (ibv_req_notify_cq(cq1, 0))
            return 1;

        DPRINTF("Submitting local RDMA\n");
        gettimeofday(&start, NULL);
        if (ibv_post_send(qp1, &send_wr, &bad_send_wr))
            return 1;

        DPRINTF("RDMA posted %p %p\n", &send_wr, bad_send_wr);

        DPRINTF("blocking...\n");
        if(ibv_get_cq_event(comp_chan1, &evt_cq, &cq_context)) {
            printf("failed to get CQ event\n");
            return 1;
        }
        gettimeofday(&stop, NULL);
        timersub(&stop, &start, &diff);

        DPRINTF("RDMA took: %lu us\n", diff.tv_usec);

        ibv_ack_cq_events(evt_cq, 1);

        DPRINTF("got event\n");

        n = ibv_poll_cq(cq1, 1, &wc);
        if (n > 0) {
            DPRINTF("return from poll: %lu\n", wc.wr_id);
            if (wc.status != IBV_WC_SUCCESS) {
                printf("poll failed %s\n", ibv_wc_status_str(wc.status));
                return 1;
            }

            if (wc.wr_id == 1) {
                DPRINTF("Finished %d bytes %d %d\n", n, buf1[bytes - 1], buf2[bytes - 1]);
            } else {
                printf("didn't find completion\n");
            }
        }

        if (n < 0) {
            printf("poll returned error\n");
            return 1;
        }

        DPRINTF("Poll returned %d bytes %d %d\n", n, buf1[0], buf2[0]);

    }

    gettimeofday(&tend, NULL);

    getrusage(RUSAGE_SELF, &usage);
    uend = usage.ru_utime;
    send = usage.ru_stime;

    save_diff = 0;
    timersub(&uend, &ustart, &diff);
    save_diff += diff.tv_usec;
    printf("User CPU time: %lu us\n", diff.tv_usec);
    timersub(&send, &sstart, &diff);
    save_diff += diff.tv_usec;
    printf("System CPU time: %lu us\n", diff.tv_usec);
    timersub(&tend, &tstart, &diff);
    printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff);
    printf("Wall clock CPU time: %lu us\n", diff.tv_usec);

    iterations = atoi(argv[3]);

    printf("Now using the CPU instead....\n");

    getrusage(RUSAGE_SELF, &usage);
    ustart = usage.ru_utime;
    sstart = usage.ru_stime;

    gettimeofday(&tstart, NULL);

    while(iterations-- > 0) {
        DPRINTF("Repeating without RDMA...\n");

        gettimeofday(&start, NULL);

        memcpy(buf2, buf1, bytes);

        gettimeofday(&stop, NULL);
        timersub(&stop, &start, &diff);
        DPRINTF("Regular copy too took: %lu us\n", diff.tv_usec);
    }

    gettimeofday(&tend, NULL);

    getrusage(RUSAGE_SELF, &usage);
    uend = usage.ru_utime;
    send = usage.ru_stime;

    save_diff = 0;
    timersub(&uend, &ustart, &diff);
    save_diff += diff.tv_usec;
    printf("User CPU time: %lu us\n", diff.tv_usec);
    timersub(&send, &sstart, &diff);
    save_diff += diff.tv_usec;
    printf("System CPU time: %lu us\n", diff.tv_usec);
    timersub(&tend, &tstart, &diff);
    printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff);
    printf("Wall clock CPU time: %lu us\n", diff.tv_usec);
    return 0;
}
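
The event handling in the RDMA loop above follows the standard completion-channel pattern: arm the CQ with ibv_req_notify_cq(), block in ibv_get_cq_event(), acknowledge the event with ibv_ack_cq_events(), then drain the CQ with ibv_poll_cq(). A compact sketch of that sequence for a single CQ (blocking use; helper name is illustrative):

#include <stdio.h>
#include <infiniband/verbs.h>

/* Wait for one completion event on chan/cq, then drain the CQ. */
static int wait_and_drain(struct ibv_comp_channel *chan, struct ibv_cq *cq)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    struct ibv_wc wc;
    int n;

    if (ibv_req_notify_cq(cq, 0))          /* arm before waiting */
        return -1;
    if (ibv_get_cq_event(chan, &ev_cq, &ev_ctx))
        return -1;
    ibv_ack_cq_events(ev_cq, 1);            /* every event must be acked */

    while ((n = ibv_poll_cq(cq, 1, &wc)) > 0) {
        if (wc.status != IBV_WC_SUCCESS) {
            fprintf(stderr, "WC error: %s\n", ibv_wc_status_str(wc.status));
            return -1;
        }
    }
    return n;                               /* 0 when drained, <0 on error */
}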
Example #12
/*
 * Create a queue pair.
 */
int
rd_create_qp(DEVICE *dev,
             CONNECTION *con,
             struct ibv_context *context,
             struct rdma_cm_id *id)
{
    /* Set up and verify rd_atomic parameters */
    {
        struct ibv_device_attr dev_attr;

        if (ibv_query_device(context, &dev_attr) != SUCCESS0)
            return error(SYS, "query device failed");
#if 1
        if (Req.rd_atomic == 0)
            con->local.rd_atomic = dev_attr.max_qp_rd_atom;
        else if (Req.rd_atomic <= dev_attr.max_qp_rd_atom)
            con->local.rd_atomic = Req.rd_atomic;
        else
            return error(0, "device only supports %d (< %d) RDMA reads or atomics",
                                    dev_attr.max_qp_rd_atom, Req.rd_atomic);
#endif
    }

    /* Create queue pair */
    {
        struct ibv_qp_init_attr qp_attr = {
            .send_cq = dev->cq,
            .recv_cq = dev->cq,
            .cap     = {
                .max_send_wr     = dev->max_send_wr,
                .max_recv_wr     = dev->max_recv_wr,
                .max_send_sge    = 1,
                .max_recv_sge    = 1,
            },
            .qp_type = IBV_QPT_RC,
        };

#if 0
        if (Req.use_cm) {
            if (rdma_create_qp(id, dev->pd, &qp_attr) != 0)
                return error(SYS, "failed to create QP");
            dev->qp = id->qp;
        } else {
#ifdef HAS_XRC
            if (dev->trans == IBV_QPT_XRC) {
                struct ibv_srq_init_attr srq_attr ={
                    .attr ={
                        .max_wr  = dev->max_recv_wr,
                        .max_sge = 1
                    }
                };

                dev->xrc = ibv_open_xrc_domain(context, -1, O_CREAT);
                if (!dev->xrc)
                    return error(SYS, "failed to open XRC domain");

                dev->srq = ibv_create_xrc_srq(dev->pd, dev->xrc, dev->cq,
                                                                    &srq_attr);
                if (!dev->srq)
                    return error(SYS, "failed to create SRQ");

                qp_attr.cap.max_recv_wr  = 0;
                qp_attr.cap.max_recv_sge = 0;
                qp_attr.xrc_domain       = dev->xrc;
            }
#endif /* HAS_XRC */
#endif

            con->qp = ibv_create_qp(dev->pd, &qp_attr);
            if (!con->qp)
                return error(SYS, "failed to create QP");
#if 0
        }
#endif
    }
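
A note on the error check above: ibv_query_device() returns 0 on success and an errno value on failure, so the failure reason can be reported directly from the return code. A minimal sketch (wrapper name is illustrative):

#include <stdio.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Query device attributes and report the reason on failure. */
static int query_dev_checked(struct ibv_context *ctx,
                             struct ibv_device_attr *attr)
{
    int rc = ibv_query_device(ctx, attr);

    if (rc)
        fprintf(stderr, "ibv_query_device: %s\n", strerror(rc));
    return rc;
}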
Example #13
static int _ibv_attach(map_segment_t *s, size_t size)
{
    int rc = OSHMEM_SUCCESS;
    static openib_device_t memheap_device;
    openib_device_t *device = &memheap_device;
    int num_devs = 0;

    assert(s);

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
#error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (num_devs == 0 || !device->ib_devs)
    {
        rc = OSHMEM_ERR_NOT_SUPPORTED;
    }

    /* Open device */
    if (!rc)
    {
        int i = 0;

        if (num_devs > 1)
        {
            if (NULL == mca_memheap_base_param_hca_name)
            {
                MEMHEAP_VERBOSE(5, "found %d HCAs, choosing the first", num_devs);
            }
            else
            {
                MEMHEAP_VERBOSE(5, "found %d HCAs, searching for %s", num_devs, mca_memheap_base_param_hca_name);
            }
        }

        for (i = 0; i < num_devs; i++)
        {
            device->ib_dev = device->ib_devs[i];

            device->ib_dev_context = ibv_open_device(device->ib_dev);
            if (NULL == device->ib_dev_context)
            {
                MEMHEAP_ERROR("error obtaining device context for %s errno says %d: %s",
                        ibv_get_device_name(device->ib_dev), errno, strerror(errno));
                rc = OSHMEM_ERR_RESOURCE_BUSY;
            }
            else
            {
                if (NULL != mca_memheap_base_param_hca_name)
                {
                    if (0 == strcmp(mca_memheap_base_param_hca_name,ibv_get_device_name(device->ib_dev)))
                    {
                        MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
                        rc = OSHMEM_SUCCESS;
                        break;
                    }
                }
                else
                {
                    MEMHEAP_VERBOSE(5, "mca_memheap_base_param_hca_name = %s, selected %s as %d of %d", mca_memheap_base_param_hca_name, ibv_get_device_name(device->ib_dev), i, num_devs);
                    rc = OSHMEM_SUCCESS;
                    break;
                }
            }
        }
    }

    /* Obtain device attributes */
    if (!rc)
    {
        if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr))
        {
            MEMHEAP_ERROR("error obtaining device attributes for %s errno says %d: %s",
                    ibv_get_device_name(device->ib_dev), errno, strerror(errno));
            rc = OSHMEM_ERR_RESOURCE_BUSY;
        }
        else
        {
            MEMHEAP_VERBOSE(5, "ibv device %s",
                    ibv_get_device_name(device->ib_dev));
        }
    }

    /* Allocate the protection domain for the device */
    if (!rc)
    {
        device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
        if (NULL == device->ib_pd)
        {
            MEMHEAP_ERROR("error allocating protection domain for %s errno says %d: %s",
                    ibv_get_device_name(device->ib_dev), errno, strerror(errno));
            rc = OSHMEM_ERR_RESOURCE_BUSY;
        }
    }

    /* Allocate memory */
    if (!rc)
    {
        void *addr = NULL;
        struct ibv_mr *ib_mr = NULL;
        int access_flag = IBV_ACCESS_LOCAL_WRITE |
        IBV_ACCESS_REMOTE_WRITE |
        IBV_ACCESS_REMOTE_READ;

        OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
        opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        access_flag |= IBV_ACCESS_ALLOCATE_MR |
        IBV_ACCESS_SHARED_MR_USER_READ |
        IBV_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */

        ib_mr = ibv_reg_mr(device->ib_pd, addr, size, access_flag);
        if (NULL == ib_mr)
        {
            MEMHEAP_ERROR("error to ibv_reg_mr() %llu bytes errno says %d: %s",
                    (unsigned long long)size, errno, strerror(errno));
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        }
        else
        {
            device->ib_mr_shared = ib_mr;
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
        }

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        if (!rc)
        {
            access_flag = IBV_ACCESS_LOCAL_WRITE |
            IBV_ACCESS_REMOTE_WRITE |
            IBV_ACCESS_REMOTE_READ|
            IBV_ACCESS_NO_RDMA;

            addr = (void *)mca_memheap_base_start_address;
            ib_mr = ibv_reg_shared_mr(device->ib_mr_shared->handle,
                    device->ib_pd, addr, access_flag);
            if (NULL == ib_mr)
            {
                MEMHEAP_ERROR("error to ibv_reg_shared_mr() %llu bytes errno says %d: %s",
                        (unsigned long long)size, errno, strerror(errno));
                rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            }
            else
            {
                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            }
        }
#endif /* MPAGE_ENABLE */

        if (!rc)
        {
            assert(size == device->ib_mr_shared->length);

            s->type = MAP_SEGMENT_ALLOC_IBV;
            s->shmid = device->ib_mr_shared->handle;
            s->start = ib_mr->addr;
            s->size = size;
            s->end = (void*)((uintptr_t)s->start + s->size);
            s->context = &memheap_device;
        }
    }

    return rc;
}
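
The attach path above allocates a protection domain and registers one or more MRs; the matching teardown is not shown in this snippet. For completeness, a minimal sketch of the usual release order (reverse of creation; names are illustrative):

#include <infiniband/verbs.h>

/* Release verbs resources in reverse order of creation. */
static void ibv_segment_teardown(struct ibv_mr *mr, struct ibv_pd *pd,
                                 struct ibv_context *ctx,
                                 struct ibv_device **dev_list)
{
    if (mr)
        ibv_dereg_mr(mr);           /* MRs first */
    if (pd)
        ibv_dealloc_pd(pd);         /* then the protection domain */
    if (ctx)
        ibv_close_device(ctx);      /* then the device context */
    if (dev_list)
        ibv_free_device_list(dev_list);
}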
Example #14
static int ucma_init(void)
{
	struct ibv_device **dev_list = NULL;
	struct cma_device *cma_dev;
	struct ibv_device_attr attr;
	int i, ret, dev_cnt;

	pthread_mutex_lock(&mut);
	if (cma_dev_cnt) {
		pthread_mutex_unlock(&mut);
		return 0;
	}

	ret = check_abi_version();
	if (ret)
		goto err1;

	dev_list = ibv_get_device_list(&dev_cnt);
	if (!dev_list) {
		printf("CMA: unable to get RDMA device list\n");
		ret = ERR(ENODEV);
		goto err1;
	}

	cma_dev_array = malloc(sizeof *cma_dev * dev_cnt);
	if (!cma_dev_array) {
		ret = ERR(ENOMEM);
		goto err2;
	}

	for (i = 0; dev_list[i];) {
		cma_dev = &cma_dev_array[i];

		cma_dev->guid = ibv_get_device_guid(dev_list[i]);
		cma_dev->verbs = ibv_open_device(dev_list[i]);
		if (!cma_dev->verbs) {
			printf("CMA: unable to open RDMA device\n");
			ret = ERR(ENODEV);
			goto err3;
		}

		i++;
		ret = ibv_query_device(cma_dev->verbs, &attr);
		if (ret) {
			printf("CMA: unable to query RDMA device\n");
			goto err3;
		}

		cma_dev->port_cnt = attr.phys_port_cnt;
		cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom;
		cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom;
	}

	cma_dev_cnt = dev_cnt;
	pthread_mutex_unlock(&mut);
	ibv_free_device_list(dev_list);
	return 0;

err3:
	while (i--)
		ibv_close_device(cma_dev_array[i].verbs);
	free(cma_dev_array);
err2:
	ibv_free_device_list(dev_list);
err1:
	pthread_mutex_unlock(&mut);
	return ret;
}
Example #15
/*
 * Opens the interface to the opp module.
 * Must be called before any use of the path functions.
 *
 * device		The verbs context for the HFI.
 *				Can be acquired via op_path_find_hfi.
 *
 * p			The port to use for sending queries.
 *
 * This information is used for querying pkeys and 
 * calculating timeouts.
 *
 * Returns a pointer to the op_path context on success, or returns NULL 
 * and sets errno if the device could not be opened.
 */
void *
op_path_open(struct ibv_device *device, int p)
{
	int         i, err;
	struct op_path_context *context;

	if (!device) {
		errno=ENXIO;
		return NULL;
	}

	context = malloc(sizeof(struct op_path_context));
	if (!context) {
		errno=ENOMEM;
		return NULL;
	}
	memset(context,0,sizeof(struct op_path_context));

	context->ibv_context = ibv_open_device(device);
	if (!context->ibv_context) {
		errno=ENODEV;
		goto open_device_failed;
	}

	context->port_num = p;

	context->reader = op_ppath_allocate_reader();
	if (!context->reader) {
		errno=ENOMEM;
		goto alloc_reader_failed;
	}

	err = op_ppath_create_reader(context->reader);
	if (err) {
		errno=err;
		goto create_reader_failed;
	}

	if ((err=ibv_query_device(context->ibv_context,
                         &(context->device_attr)))) {
		errno=EFAULT;
		goto query_attr_failed;
	}

	if ((err=ibv_query_port(context->ibv_context,
                       context->port_num,
                       &(context->port_attr)))) {
		errno=EFAULT;
		goto query_attr_failed;
	}

	context->pkey_table = malloc(context->device_attr.max_pkeys* sizeof(int));     
	if (!context->pkey_table) {
		errno= ENOMEM;
		goto query_attr_failed;
	}
	memset(context->pkey_table,0,context->device_attr.max_pkeys* sizeof(int));

	for (i = 0, err = 0; !err && i<context->device_attr.max_pkeys; i++) {
		err = ibv_query_pkey(context->ibv_context, context->port_num, i,
                             &(context->pkey_table[i]));
		if (err) {
			errno=EFAULT;
			goto query_pkey_failed;
		}
	}

	return context;

query_pkey_failed:
	free(context->pkey_table);
query_attr_failed:
	op_ppath_close_reader(context->reader);
create_reader_failed:
	free(context->reader);
alloc_reader_failed:
	ibv_close_device(context->ibv_context);
open_device_failed:
	free(context);
	return NULL;
}
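A short usage sketch for op_path_open(): the header comment suggests acquiring the device via op_path_find_hfi(), but to keep the sketch self-contained it takes the first device from ibv_get_device_list() and picks port 1 arbitrarily; the helper name below is made up.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Hypothetical caller: open an op_path context on the first device, port 1.
 * Assumes the declaration of op_path_open() from the listing above is visible. */
static void *open_first_device(void)
{
	struct ibv_device **devs = ibv_get_device_list(NULL);
	void *ctx;

	if (!devs || !devs[0]) {
		if (devs)
			ibv_free_device_list(devs);
		return NULL;
	}

	ctx = op_path_open(devs[0], 1);
	if (!ctx)
		fprintf(stderr, "op_path_open failed: %s\n", strerror(errno));

	/* The device list may be freed once the device has been opened. */
	ibv_free_device_list(devs);
	return ctx;
}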
Beispiel #16
0
/* ////////////////////////////////////////////////////////////////////////// */
static int
verbs_runtime_query(mca_base_module_t **module,
                    int *priority,
                    const char *hint)
{
    int rc = OSHMEM_SUCCESS;
    openib_device_t my_device;
    openib_device_t *device = &my_device;
    int num_devs = 0;
    int i = 0;

    *priority = 0;
    *module = NULL;

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
    #error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (num_devs == 0 || !device->ib_devs) {
        return OSHMEM_ERR_NOT_SUPPORTED;
    }

    /* Open device */
    if (NULL != mca_sshmem_verbs_component.hca_name) {
        for (i = 0; i < num_devs; i++) {
            if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) {
                device->ib_dev = device->ib_devs[i];
                break;
            }
        }
    } else {
        device->ib_dev = device->ib_devs[0];
    }

    if (NULL == device->ib_dev) {
        rc = OSHMEM_ERR_NOT_FOUND;
        goto out;
    }

    if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) {
        rc = OSHMEM_ERR_RESOURCE_BUSY;
        goto out;
    }

    /* Obtain device attributes */
    if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) {
        rc = OSHMEM_ERR_RESOURCE_BUSY;
        goto out;
    }

    /* Allocate the protection domain for the device */
    device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
    if (NULL == device->ib_pd) {
        rc = OSHMEM_ERR_RESOURCE_BUSY;
        goto out;
    }

    /* Allocate memory */
    if (!rc) {
        void *addr = NULL;
        size_t size = getpagesize();
        struct ibv_mr *ib_mr = NULL;
        uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ; 
        uint64_t exp_access_flag = 0;

        OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
        opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR  |
                          IBV_EXP_ACCESS_SHARED_MR_USER_READ |
                          IBV_EXP_ACCESS_SHARED_MR_USER_WRITE; 
#endif /* MPAGE_ENABLE */

        struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0};
        ib_mr = ibv_exp_reg_mr(&in);
        if (NULL == ib_mr) {
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        } else {
            device->ib_mr_shared = ib_mr;
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
        }

#if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0)
        if (!rc) {
            struct ibv_exp_reg_shared_mr_in in_smr;

            access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ|
                          IBV_EXP_ACCESS_NO_RDMA;

            addr = (void *)mca_sshmem_base_start_address;
            mca_sshmem_verbs_fill_shared_mr(&in_smr, device->ib_pd, device->ib_mr_shared->handle,  addr, access_flag);
            ib_mr = ibv_exp_reg_shared_mr(&in_smr);
            if (NULL == ib_mr) {
                mca_sshmem_verbs_component.has_shared_mr = 0;
            } else {
                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
                mca_sshmem_verbs_component.has_shared_mr = 1;
            }
        }
#endif /* MPAGE_ENABLE */
    }

    /* all is well - rainbows and butterflies */
    if (!rc) {
        *priority = mca_sshmem_verbs_component.priority;
        *module = (mca_base_module_t *)&mca_sshmem_verbs_module.super;
    }

out:
    if (device) {
        if (opal_value_array_get_size(&device->ib_mr_array)) {
            struct ibv_mr** array;
            struct ibv_mr* ib_mr = NULL;
            array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *);
            while (opal_value_array_get_size(&device->ib_mr_array) > 0) {
                ib_mr = array[0];
                ibv_dereg_mr(ib_mr);
                opal_value_array_remove_item(&device->ib_mr_array, 0);
            }

            if (device->ib_mr_shared) {
                device->ib_mr_shared = NULL;
            }
            OBJ_DESTRUCT(&device->ib_mr_array);
        }

        if (device->ib_pd) {
            ibv_dealloc_pd(device->ib_pd);
            device->ib_pd = NULL;
        }

        if(device->ib_dev_context) {
            ibv_close_device(device->ib_dev_context);
            device->ib_dev_context = NULL;
        }

        if(device->ib_devs) {
            ibv_free_device_list(device->ib_devs);
            device->ib_devs = NULL;
        }
    }

    return rc;
}
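Both memory-registration paths above go through the experimental ibv_exp_* interface. With stock libibverbs the equivalent register-and-unwind cycle uses ibv_reg_mr()/ibv_dereg_mr(); a minimal sketch, assuming an already allocated protection domain.

#include <stdlib.h>
#include <unistd.h>
#include <infiniband/verbs.h>

/* Register one page of freshly allocated memory against pd, then release it. */
static int register_one_page(struct ibv_pd *pd)
{
	size_t size = getpagesize();
	void *buf = malloc(size);
	struct ibv_mr *mr;

	if (!buf)
		return -1;

	mr = ibv_reg_mr(pd, buf, size,
			IBV_ACCESS_LOCAL_WRITE |
			IBV_ACCESS_REMOTE_WRITE |
			IBV_ACCESS_REMOTE_READ);
	if (!mr) {
		free(buf);
		return -1;
	}

	/* mr->lkey and mr->rkey are what work requests would reference. */

	ibv_dereg_mr(mr);
	free(buf);
	return 0;
}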
Beispiel #17
0
static int
mlx5_glue_query_device(struct ibv_context *context,
		       struct ibv_device_attr *device_attr)
{
	return ibv_query_device(context, device_attr);
}
Beispiel #18
0
int __ibv_query_device_1_0(struct ibv_context_1_0 *context,
			   struct ibv_device_attr *device_attr)
{
	fprintf(stderr, "%s:%s:%d \n", __func__, __FILE__, __LINE__);
	return ibv_query_device(context->real_context, device_attr);
}
Beispiel #19
0
/**
 *  Look at rdma_set_default_parameters() in
 * "mvapich2/trunk/src/mpid/ch3/channels/mrail/src/gen2/ibv_param.c"
 */
int MPID_nem_ib_set_default_params()
{
    int mpi_errno   = 0;

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_SET_DEFAULT_PARAMS);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_SET_DEFAULT_PARAMS);

    rdma_fp_buffer_size = RDMA_FP_DEFAULT_BUF_SIZE;
    mv2_arch_hca_type arch_hca_type = mv2_get_arch_hca_type ( hca_list[0].ib_dev );

    switch( arch_hca_type  ) {

        case MV2_ARCH_INTEL_XEON_E5630_8_HCA_MLX_CX_QDR:
            rdma_vbuf_total_size = 17 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 5 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 0; 
            break;

        case MV2_ARCH_INTEL_NEHLM_8_HCA_MLX_CX_QDR:
            rdma_vbuf_total_size = 17 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 5 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 8 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 0; 
            break;

        case MV2_ARCH_AMD_MGNYCRS_24_HCA_MLX_CX_QDR:
            rdma_vbuf_total_size = 16 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 5 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 4 * 1024;
            rdma_get_fallback_threshold  = 0; 
            break;

        case MV2_ARCH_AMD_BRCLNA_16_HCA_MLX_CX_DDR:
            rdma_vbuf_total_size = 16 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 9 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 128;
            rdma_get_fallback_threshold  = 0;
            break;

        case MV2_ARCH_INTEL_CLVRTWN_8_HCA_MLX_CX_DDR:
            rdma_vbuf_total_size = 17 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 9 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 0;
            break;

        CASE_MV2_ANY_ARCH_WITH_MLX_CX_QDR:
            rdma_vbuf_total_size = 16 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 5 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 0;
            break;

        CASE_MV2_ANY_ARCH_WITH_MLX_CX_DDR:
            rdma_vbuf_total_size = 16 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_fp_buffer_size = 9 * 1024;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 0;
            break;

        CASE_MV2_ANY_ARCH_WITH_MLX_PCI_X:
        CASE_MV2_ANY_ARCH_WITH_IBM_EHCA:
            rdma_vbuf_total_size     = 12*1024 + EAGER_THRESHOLD_ADJUST;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 394 * 1024;
            break;

        CASE_MV2_ANY_ARCH_WITH_CHELSIO_T3:
        CASE_MV2_ANY_ARCH_WITH_CHELSIO_T4:
            rdma_vbuf_total_size     = 9 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 394 * 1024;
            break;

        CASE_MV2_ANY_ARCH_WITH_INTEL_NE020:
            rdma_vbuf_total_size     = 9 * 1024 + EAGER_THRESHOLD_ADJUST;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc           = 4 * 1024;
            rdma_put_fallback_threshold  = 8 * 1024;
            rdma_get_fallback_threshold  = 394 * 1024;
            break;

        default:
            rdma_vbuf_total_size     = 12 * 1024;
            rdma_fp_buffer_size      = VBUF_BUFFER_SIZE;
            rdma_iba_eager_threshold = VBUF_BUFFER_SIZE;
            rdma_eagersize_1sc               = 4 * 1024;
            rdma_put_fallback_threshold      = 8 * 1024;
            rdma_get_fallback_threshold      = 256 * 1024;
            break;
    }

    num_rdma_buffer          = 16;
    if (hca_list[0].hca_type == PATH_HT) {
        rdma_default_qp_ous_rd_atom = 1;
    } else {
        rdma_default_qp_ous_rd_atom = 4;
    }

    if (hca_list[0].hca_type == IBM_EHCA) {
        rdma_max_inline_size = -1;
    } else if ((hca_list[0].hca_type == CHELSIO_T3) ||
            (hca_list[0].hca_type == CHELSIO_T4)) {
        rdma_max_inline_size = 64;
    } else {
        rdma_max_inline_size = 128 + INLINE_THRESHOLD_ADJUST;
    }

    if (hca_list[0].hca_type == MLX_PCI_EX_DDR) {
        rdma_default_mtu = IBV_MTU_2048;
    } else if(hca_list[0].hca_type == MLX_CX_QDR) {
        rdma_default_mtu = IBV_MTU_2048;
    } else {
        rdma_default_mtu = IBV_MTU_1024;
    }

    if ((hca_list[0].hca_type == CHELSIO_T3) ||
            (hca_list[0].hca_type == CHELSIO_T4)) {
        /* Trac #423 */
        struct ibv_device_attr dev_attr;
        int ret;

        /* query the device for its CQ depth limit; use 'ret' so the
         * function-level mpi_errno is not shadowed */
        ret = ibv_query_device(hca_list[0].nic_context, &dev_attr);

        if (!ret) {
            if (dev_attr.max_cqe < rdma_default_max_cq_size) {
                rdma_default_max_cq_size = dev_attr.max_cqe;
            }
        } else {
            rdma_default_max_cq_size = RDMA_DEFAULT_IWARP_CQ_SIZE;
        }

        rdma_prepost_noop_extra = 8;
    }

    if (process_info.has_srq) {
        rdma_credit_preserve = 100;
    } else {
        rdma_credit_preserve = 3;
    }

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_SET_DEFAULT_PARAMS);
    return mpi_errno;
}
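The Chelsio branch above clamps the requested CQ depth to the device's max_cqe before any completion queue is created. The same defensive pattern, distilled into a sketch with plain verbs calls; the function name and the caller-supplied cq_size are illustrative.

#include <infiniband/verbs.h>

/* Create a completion queue no deeper than the device supports. */
static struct ibv_cq *create_clamped_cq(struct ibv_context *ctx, int cq_size)
{
	struct ibv_device_attr attr;

	if (!ibv_query_device(ctx, &attr) && cq_size > attr.max_cqe)
		cq_size = attr.max_cqe;

	return ibv_create_cq(ctx, cq_size, NULL, NULL, 0);
}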
Beispiel #20
0
static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
                                           struct ibv_device *ib_device)
{
    int rc, port_num;
    struct ibv_device_attr dev_attr;

    OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup attempting to setup ib device %p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device));

    device->ib_context = ibv_open_device (ib_device);
    if (NULL == device->ib_context) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error opening device. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    rc = ibv_query_device (device->ib_context, &dev_attr); 
    if (0 != rc) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error querying device. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    device->ib_channel = ibv_create_comp_channel (device->ib_context);
    if (NULL == device->ib_channel) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error completing completion channel."
                             "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    device->ib_pd = ibv_alloc_pd (device->ib_context);
    if (NULL == device->ib_pd) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup error allocating protection domain."
                             "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    for (port_num = 1 ; port_num <= dev_attr.phys_port_cnt ; ++port_num) {
        mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);

        if (NULL == port) {
            opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        port->device = device;
        port->port_num = port_num;

        rc = mca_oob_ud_port_setup (port);
        if (ORTE_SUCCESS != rc) {
            OBJ_RELEASE(port);
            continue;
        }

        opal_list_append (&device->ports, (opal_list_item_t *) port);

        break;
    }

    if (0 == opal_list_get_size(&device->ports)) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output, "%s oob:ud:device_setup could not init device. no usable "
                             "ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
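Several listings in this collection walk the ports reported in phys_port_cnt and keep the first one that is usable. A distilled sketch of that scan, returning the first port whose state is IBV_PORT_ACTIVE, or 0 if none is.

#include <infiniband/verbs.h>

/* Return the first active port number on ctx, or 0 if no port is active. */
static int first_active_port(struct ibv_context *ctx)
{
	struct ibv_device_attr dev_attr;
	struct ibv_port_attr port_attr;
	int port;

	if (ibv_query_device(ctx, &dev_attr))
		return 0;

	for (port = 1; port <= dev_attr.phys_port_cnt; port++) {
		if (!ibv_query_port(ctx, port, &port_attr) &&
		    port_attr.state == IBV_PORT_ACTIVE)
			return port;
	}

	return 0;
}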
Beispiel #21
0
/* ////////////////////////////////////////////////////////////////////////// */
static int
segment_create(map_segment_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OSHMEM_SUCCESS;
    openib_device_t *device = &memheap_device;
    int num_devs = 0;
    int i = 0;

    assert(ds_buf);

    /* init the contents of map_segment_t */
    shmem_ds_reset(ds_buf);

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
#error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (num_devs == 0 || !device->ib_devs) {
        return OSHMEM_ERR_NOT_SUPPORTED;
    }

    /* Open device */
    if (NULL != mca_sshmem_verbs_component.hca_name) {
        for (i = 0; i < num_devs; i++) {
            if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) {
                device->ib_dev = device->ib_devs[i];
                break;
            }
        }
    } else {
        device->ib_dev = device->ib_devs[0];
    }

    if (NULL == device->ib_dev) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error getting device says %d: %s",
            errno, strerror(errno))
            );
        return OSHMEM_ERR_NOT_FOUND;
    }

    if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error obtaining device context for %s errno says %d: %s",
            ibv_get_device_name(device->ib_dev), errno, strerror(errno))
            );
        return OSHMEM_ERR_RESOURCE_BUSY;
    }

    /* Obtain device attributes */
    if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error obtaining device attributes for %s errno says %d: %s",
            ibv_get_device_name(device->ib_dev), errno, strerror(errno))
            );
        return OSHMEM_ERR_RESOURCE_BUSY;
    }

    /* Allocate the protection domain for the device */
    device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
    if (NULL == device->ib_pd) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error allocating protection domain for %s errno says %d: %s",
            ibv_get_device_name(device->ib_dev), errno, strerror(errno))
            );
        return OSHMEM_ERR_RESOURCE_BUSY;
    }

    /* Allocate memory */
    if (!rc) {
        void *addr = NULL;
        struct ibv_mr *ib_mr = NULL;
        uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ;
        uint64_t exp_access_flag = 0;

        OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
        opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if (MPAGE_ENABLE > 0)
        exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR |
                          IBV_EXP_ACCESS_SHARED_MR_USER_READ |
                          IBV_EXP_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */

        struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0};

#if MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS
        if (0 == mca_sshmem_verbs_component.has_shared_mr) {
            in.addr = (void *)mca_sshmem_base_start_address;
            in.comp_mask    = IBV_EXP_REG_MR_CREATE_FLAGS;
            in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG;
            in.exp_access   = access_flag;
        }
#endif
        ib_mr = ibv_exp_reg_mr(&in);
        if (NULL == ib_mr) {
            OPAL_OUTPUT_VERBOSE(
                (5, oshmem_sshmem_base_framework.framework_output,
                    "error to ibv_exp_reg_mr() %llu bytes errno says %d: %s",
                    (unsigned long long)size, errno, strerror(errno))
                );
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        } else {
            device->ib_mr_shared = ib_mr;
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
        }

#if (MPAGE_ENABLE > 0)
        if (!rc && mca_sshmem_verbs_component.has_shared_mr) {
            void *addr = NULL;
            access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ|
                          IBV_EXP_ACCESS_NO_RDMA;

            addr = (void *)mca_sshmem_base_start_address;
            struct ibv_exp_reg_shared_mr_in in;
            mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag);
            ib_mr = ibv_exp_reg_shared_mr(&in);
            if (NULL == ib_mr) {
                OPAL_OUTPUT_VERBOSE(
                    (5, oshmem_sshmem_base_framework.framework_output,
                        "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s has_shared_mr: %d",
                        (unsigned long long)size, errno, strerror(errno),
                        mca_sshmem_verbs_component.has_shared_mr
                        )
                    );
                rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            } else {
                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            }
        }
#endif /* MPAGE_ENABLE */

        if (!rc) {
            OPAL_OUTPUT_VERBOSE(
                (70, oshmem_sshmem_base_framework.framework_output,
                "ibv device %s shared_mr: %d",
                ibv_get_device_name(device->ib_dev),
                mca_sshmem_verbs_component.has_shared_mr)
                );

            if (mca_sshmem_verbs_component.has_shared_mr) {
                assert(size == device->ib_mr_shared->length);
                ds_buf->type = MAP_SEGMENT_ALLOC_IBV;
                ds_buf->seg_id = device->ib_mr_shared->handle;
            } else {
                ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR;
                ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
            }
            ds_buf->super.va_base = ib_mr->addr;
            ds_buf->seg_size = size;
            ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size);
        }
    }

    OPAL_OUTPUT_VERBOSE(
          (70, oshmem_sshmem_base_framework.framework_output,
           "%s: %s: create %s "
           "(id: %d, addr: %p size: %lu, name: %s)\n",
           mca_sshmem_verbs_component.super.base_version.mca_type_name,
           mca_sshmem_verbs_component.super.base_version.mca_component_name,
           (rc ? "failure" : "successful"),
           ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
      );

    return rc;
}
Beispiel #22
0
int __ibv_query_device_1_0(struct ibv_context_1_0 *context,
			   struct ibv_device_attr *device_attr)
{
	return ibv_query_device(context->real_context, device_attr);
}
Beispiel #23
0
/**
 * the first step in original MPID_nem_ib_setup_conn() function
 * open hca, create ptags  and create cqs
 */
int MPID_nem_ib_open_ports()
{
    int mpi_errno = MPI_SUCCESS;

    /* Infiniband Verb Structures */
    struct ibv_port_attr    port_attr;
    struct ibv_device_attr  dev_attr;

    int nHca; /* , curRank, rail_index ; */

    MPIDI_STATE_DECL(MPID_STATE_MPIDI_OPEN_HCA);
    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_OPEN_HCA);

    for (nHca = 0; nHca < ib_hca_num_hcas; nHca++) {
        if (ibv_query_device(hca_list[nHca].nic_context, &dev_attr)) {
            MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**fail",
                    "**fail %s", "Error getting HCA attributes");
        }

        /* detecting active ports */
        if (rdma_default_port < 0 || ib_hca_num_ports > 1) {
            int nPort;
            int k = 0;
            for (nPort = 1; nPort <= RDMA_DEFAULT_MAX_PORTS; nPort ++) {
                if ((! ibv_query_port(hca_list[nHca].nic_context, nPort, &port_attr)) &&
                            port_attr.state == IBV_PORT_ACTIVE &&
                            (port_attr.lid || (!port_attr.lid && use_iboeth))) {
                    if (use_iboeth) {
                        if (ibv_query_gid(hca_list[nHca].nic_context,
                                        nPort, 0, &hca_list[nHca].gids[k])) {
                            /* new error information function needed */
                            MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                                    "**fail", "Failed to retrieve gid on rank %d", process_info.rank);
                        }
                        DEBUG_PRINT("[%d] %s(%d): Getting gid[%d][%d] for"
                                " port %d subnet_prefix = %llx,"
                                " intf_id = %llx\r\n",
                                process_info.rank, __FUNCTION__, __LINE__, nHca, k, k,
                                hca_list[nHca].gids[k].global.subnet_prefix,
                                hca_list[nHca].gids[k].global.interface_id);
                    } else {
                        hca_list[nHca].lids[k]    = port_attr.lid;
                    }
                    hca_list[nHca].ports[k++] = nPort;

                    if (check_attrs(&port_attr, &dev_attr)) {
                        MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                                "**fail", "**fail %s",
                                "Attributes failed sanity check");
                    }
                }
            }
            if (k < ib_hca_num_ports) {
                MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                        "**activeports", "**activeports %d", ib_hca_num_ports);
            }
        } else {
            if(ibv_query_port(hca_list[nHca].nic_context,
                        rdma_default_port, &port_attr)
                || (!port_attr.lid && !use_iboeth)
                || (port_attr.state != IBV_PORT_ACTIVE)) {
                MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                        "**portquery", "**portquery %d", rdma_default_port);
            }

            hca_list[nHca].ports[0] = rdma_default_port;

            if (use_iboeth) {
                if (ibv_query_gid(hca_list[nHca].nic_context, 0, 0, &hca_list[nHca].gids[0])) {
                    /* new error function needed */
                    MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                            "**fail", "Failed to retrieve gid on rank %d", process_info.rank);
                }

                if (check_attrs(&port_attr, &dev_attr)) {
                    MPIU_ERR_SETFATALANDJUMP1(mpi_errno, MPI_ERR_OTHER,
                            "**fail", "**fail %s", "Attributes failed sanity check");
                }
            } else {
                hca_list[nHca].lids[0]  = port_attr.lid;
            }
        }

        if (rdma_use_blocking) {
            hca_list[nHca].comp_channel = ibv_create_comp_channel(hca_list[nHca].nic_context);

            if (!hca_list[nHca].comp_channel) {
                MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail,
                        "**fail", "**fail %s", "cannot create completion channel");
            }

            hca_list[nHca].send_cq_hndl = NULL;
            hca_list[nHca].recv_cq_hndl = NULL;
            hca_list[nHca].cq_hndl = ibv_create_cq(hca_list[nHca].nic_context,
                    rdma_default_max_cq_size, NULL, hca_list[nHca].comp_channel, 0);
            if (!hca_list[nHca].cq_hndl) {
                MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail,
                        "**fail", "**fail %s", "cannot create cq");
            }

            if (ibv_req_notify_cq(hca_list[nHca].cq_hndl, 0)) {
                MPIU_ERR_SETFATALANDSTMT1(mpi_errno, MPI_ERR_OTHER, goto fn_fail,
                        "**fail", "**fail %s", "cannot request cq notification");
            }
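The rdma_use_blocking branch above creates a completion channel, binds the CQ to it and arms notification, but the listing is cut off before any event is consumed. A minimal sketch of the consuming side, assuming a CQ created against such a channel: block on the channel, acknowledge and re-arm, then drain the CQ.

#include <infiniband/verbs.h>

/* Wait for one completion event on channel, then drain the signalled CQ. */
static int wait_and_drain(struct ibv_comp_channel *channel)
{
	struct ibv_cq *ev_cq;
	void *ev_ctx;
	struct ibv_wc wc;

	if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx))
		return -1;
	ibv_ack_cq_events(ev_cq, 1);

	/* Re-arm notification before polling so no completion is missed. */
	if (ibv_req_notify_cq(ev_cq, 0))
		return -1;

	while (ibv_poll_cq(ev_cq, 1, &wc) > 0) {
		if (wc.status != IBV_WC_SUCCESS)
			return -1;
	}

	return 0;
}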
Beispiel #24
0
File: mlx5.c Project: goby/dpdk
/**
 * DPDK callback to register a PCI device.
 *
 * This function creates an Ethernet device for each port of a given
 * PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, negative errno value on failure.
 */
static int
mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
{
	struct ibv_device **list;
	struct ibv_device *ibv_dev;
	int err = 0;
	struct ibv_context *attr_ctx = NULL;
	struct ibv_device_attr device_attr;
	unsigned int vf;
	int idx;
	int i;

	(void)pci_drv;
	assert(pci_drv == &mlx5_driver.pci_drv);
	/* Get mlx5_dev[] index. */
	idx = mlx5_dev_idx(&pci_dev->addr);
	if (idx == -1) {
		ERROR("this driver cannot support any more adapters");
		return -ENOMEM;
	}
	DEBUG("using driver device index %d", idx);

	/* Save PCI address. */
	mlx5_dev[idx].pci_addr = pci_dev->addr;
	list = ibv_get_device_list(&i);
	if (list == NULL) {
		assert(errno);
		if (errno == ENOSYS) {
			WARN("cannot list devices, is ib_uverbs loaded?");
			return 0;
		}
		return -errno;
	}
	assert(i >= 0);
	/*
	 * For each listed device, check related sysfs entry against
	 * the provided PCI ID.
	 */
	while (i != 0) {
		struct rte_pci_addr pci_addr;

		--i;
		DEBUG("checking device \"%s\"", list[i]->name);
		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
			continue;
		if ((pci_dev->addr.domain != pci_addr.domain) ||
		    (pci_dev->addr.bus != pci_addr.bus) ||
		    (pci_dev->addr.devid != pci_addr.devid) ||
		    (pci_dev->addr.function != pci_addr.function))
			continue;
		vf = ((pci_dev->id.device_id ==
		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
		      (pci_dev->id.device_id ==
		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF));
		INFO("PCI information matches, using device \"%s\" (VF: %s)",
		     list[i]->name, (vf ? "true" : "false"));
		attr_ctx = ibv_open_device(list[i]);
		err = errno;
		break;
	}
	if (attr_ctx == NULL) {
		ibv_free_device_list(list);
		switch (err) {
		case 0:
			WARN("cannot access device, is mlx5_ib loaded?");
			return 0;
		case EINVAL:
			WARN("cannot use device, are drivers up to date?");
			return 0;
		}
		assert(err > 0);
		return -err;
	}
	ibv_dev = list[i];

	DEBUG("device opened");
	if (ibv_query_device(attr_ctx, &device_attr))
		goto error;
	INFO("%u port(s) detected", device_attr.phys_port_cnt);

	for (i = 0; i < device_attr.phys_port_cnt; i++) {
		uint32_t port = i + 1; /* ports are indexed from one */
		uint32_t test = (1 << i);
		struct ibv_context *ctx = NULL;
		struct ibv_port_attr port_attr;
		struct ibv_pd *pd = NULL;
		struct priv *priv = NULL;
		struct rte_eth_dev *eth_dev;
#ifdef HAVE_EXP_QUERY_DEVICE
		struct ibv_exp_device_attr exp_device_attr;
#endif /* HAVE_EXP_QUERY_DEVICE */
		struct ether_addr mac;

#ifdef HAVE_EXP_QUERY_DEVICE
		exp_device_attr.comp_mask =
			IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
			IBV_EXP_DEVICE_ATTR_RX_HASH;
#endif /* HAVE_EXP_QUERY_DEVICE */

		DEBUG("using port %u (%08" PRIx32 ")", port, test);

		ctx = ibv_open_device(ibv_dev);
		if (ctx == NULL)
			goto port_error;

		/* Check port status. */
		err = ibv_query_port(ctx, port, &port_attr);
		if (err) {
			ERROR("port query failed: %s", strerror(err));
			goto port_error;
		}
		if (port_attr.state != IBV_PORT_ACTIVE)
			DEBUG("port %d is not active: \"%s\" (%d)",
			      port, ibv_port_state_str(port_attr.state),
			      port_attr.state);

		/* Allocate protection domain. */
		pd = ibv_alloc_pd(ctx);
		if (pd == NULL) {
			ERROR("PD allocation failure");
			err = ENOMEM;
			goto port_error;
		}

		mlx5_dev[idx].ports |= test;

		/* from rte_ethdev.c */
		priv = rte_zmalloc("ethdev private structure",
				   sizeof(*priv),
				   RTE_CACHE_LINE_SIZE);
		if (priv == NULL) {
			ERROR("priv allocation failure");
			err = ENOMEM;
			goto port_error;
		}

		priv->ctx = ctx;
		priv->device_attr = device_attr;
		priv->port = port;
		priv->pd = pd;
		priv->mtu = ETHER_MTU;
#ifdef HAVE_EXP_QUERY_DEVICE
		if (ibv_exp_query_device(ctx, &exp_device_attr)) {
			ERROR("ibv_exp_query_device() failed");
			goto port_error;
		}

		priv->hw_csum =
			((exp_device_attr.exp_device_cap_flags &
			  IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) &&
			 (exp_device_attr.exp_device_cap_flags &
			  IBV_EXP_DEVICE_RX_CSUM_IP_PKT));
		DEBUG("checksum offloading is %ssupported",
		      (priv->hw_csum ? "" : "not "));

		priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags &
					 IBV_EXP_DEVICE_VXLAN_SUPPORT);
		DEBUG("L2 tunnel checksum offloads are %ssupported",
		      (priv->hw_csum_l2tun ? "" : "not "));

		priv->ind_table_max_size = exp_device_attr.rx_hash_caps.max_rwq_indirection_table_size;
		DEBUG("maximum RX indirection table size is %u",
		      priv->ind_table_max_size);

#else /* HAVE_EXP_QUERY_DEVICE */
		priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE;
#endif /* HAVE_EXP_QUERY_DEVICE */

		priv->vf = vf;
		/* Allocate and register default RSS hash keys. */
		priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
					    sizeof((*priv->rss_conf)[0]), 0);
		if (priv->rss_conf == NULL) {
			err = ENOMEM;
			goto port_error;
		}
		err = rss_hash_rss_conf_new_key(priv,
						rss_hash_default_key,
						rss_hash_default_key_len,
						ETH_RSS_PROTO_MASK);
		if (err)
			goto port_error;
		/* Configure the first MAC address by default. */
		if (priv_get_mac(priv, &mac.addr_bytes)) {
			ERROR("cannot get MAC address, is mlx5_en loaded?"
			      " (errno: %s)", strerror(errno));
			goto port_error;
		}
		INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
		     priv->port,
		     mac.addr_bytes[0], mac.addr_bytes[1],
		     mac.addr_bytes[2], mac.addr_bytes[3],
		     mac.addr_bytes[4], mac.addr_bytes[5]);
		/* Register MAC and broadcast addresses. */
		claim_zero(priv_mac_addr_add(priv, 0,
					     (const uint8_t (*)[ETHER_ADDR_LEN])
					     mac.addr_bytes));
		claim_zero(priv_mac_addr_add(priv, (RTE_DIM(priv->mac) - 1),
					     &(const uint8_t [ETHER_ADDR_LEN])
					     { "\xff\xff\xff\xff\xff\xff" }));
#ifndef NDEBUG
		{
			char ifname[IF_NAMESIZE];

			if (priv_get_ifname(priv, &ifname) == 0)
				DEBUG("port %u ifname is \"%s\"",
				      priv->port, ifname);
			else
				DEBUG("port %u ifname is unknown", priv->port);
		}
#endif
		/* Get actual MTU if possible. */
		priv_get_mtu(priv, &priv->mtu);
		DEBUG("port %u MTU is %u", priv->port, priv->mtu);

		/* from rte_ethdev.c */
		{
			char name[RTE_ETH_NAME_MAX_LEN];

			snprintf(name, sizeof(name), "%s port %u",
				 ibv_get_device_name(ibv_dev), port);
			eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_PCI);
		}
		if (eth_dev == NULL) {
			ERROR("can not allocate rte ethdev");
			err = ENOMEM;
			goto port_error;
		}

		eth_dev->data->dev_private = priv;
		eth_dev->pci_dev = pci_dev;
		eth_dev->driver = &mlx5_driver;
		eth_dev->data->rx_mbuf_alloc_failed = 0;
		eth_dev->data->mtu = ETHER_MTU;

		priv->dev = eth_dev;
		eth_dev->dev_ops = &mlx5_dev_ops;
		eth_dev->data->mac_addrs = priv->mac;
		TAILQ_INIT(&eth_dev->link_intr_cbs);

		/* Bring Ethernet device up. */
		DEBUG("forcing Ethernet interface up");
		priv_set_flags(priv, ~IFF_UP, IFF_UP);
		continue;

port_error:
		rte_free(priv->rss_conf);
		rte_free(priv);
		if (pd)
			claim_zero(ibv_dealloc_pd(pd));
		if (ctx)
			claim_zero(ibv_close_device(ctx));
		break;
	}
Beispiel #25
0
/* Initialize the actual IB device */
int initIB(ArgStruct *p)
{
  int i, j, ret;
  char *tmp;
  int num_devices = 0;
  struct ibv_device **hca_list, **filtered_hca_list;
  struct ibv_device_attr hca_attr;
#if !HAVE_IBV_DEVICE_LIST
  struct dlist *hca_dlist; 
  struct ibv_device* hca_device; 
#endif

  /* Find all the devices on this host */
#if HAVE_IBV_DEVICE_LIST
  hca_list = ibv_get_device_list(&num_devices);
#else
  hca_dlist = ibv_get_devices();
  dlist_start(hca_dlist); 
  dlist_for_each_data(hca_dlist, hca_device, struct ibv_device)
    ++num_devices;
#endif

  /* If we didn't find any, return an error */
  if (0 == num_devices) {
      fprintf(stderr, "Couldn't find any IBV devices\n");
      return -1;
  }
  
#if !HAVE_IBV_DEVICE_LIST
  /* If we have the old version (ibv_get_devices()), convert it to
     the new form */
  hca_list = (struct ibv_device**) malloc(num_devices * 
                                          sizeof(struct ibv_device*));
  if (NULL == hca_list) {
      fprintf(stderr, "%s:%s:%d: malloc failed\n", __FILE__,
              __func__, __LINE__);
      return -1;
  }
  
  i = 0; 
  dlist_start(hca_dlist); 
  dlist_for_each_data(hca_dlist, hca_device, struct ibv_device)
      hca_list[i++] = hca_device;
#endif    

  /* Possible values for p->prot.device_and_port:

     1. <device>:<port> -- use only this device and only this port
     2. <device> -- use the first active port on this device
     3. :<port> -- use only this port, but on any device

     <device> names are matched exactly.
  */

  /* If a device name was specified on the command line, see if we can
     find it */
  tmp = NULL;
  port_num = -1;
  filtered_hca_list = hca_list;
  if (NULL != p->prot.device_and_port) {
      /* If there's a : in the string, then we have a port */
      tmp = strchr(p->prot.device_and_port, ':');
      if (NULL != tmp) {
          *tmp = '\0';
          ++tmp;
          port_num = atoi(tmp);
      }
      LOGPRINTF(("Pre-filter: looking for target device \"%s\", port %d",
                 p->prot.device_and_port, port_num));

      /* If the length of the device string left is >0, then there's a
         device specification */
      if (strlen(p->prot.device_and_port) > 0) {
          int found = 0;

          /* Loop through all the devices and find a matching
             name */
          for (i = 0; i < num_devices; ++i) {
              LOGPRINTF(("Pre-filter: found device: %s",
                         ibv_get_device_name(hca_list[i])));
              if (0 == strcmp(p->prot.device_and_port, 
                              ibv_get_device_name(hca_list[i]))) {
                  LOGPRINTF(("Pre-filter: found target device: %s (%d of %d)",
                             p->prot.device_and_port, i, num_devices));
                  filtered_hca_list = &(hca_list[i]);
                  num_devices = 1;
                  found = 1;
                  break;
              }
          }

          /* If we didn't find it, abort */
          if (!found) {
              fprintf(stderr, "Unable to find device \"%s\", aborting\n",
                      p->prot.device_and_port);
              return -1;
          }
      }
  }

  /* Traverse the filtered HCA list and find a good port */
  for (hca = NULL, i = 0; NULL == hca && i < num_devices; ++i) {

      /* Get a ibv_context from the ibv_device  */
      ctx = ibv_open_device(filtered_hca_list[i]);
      if (!ctx) {
          fprintf(stderr, "Couldn't create IBV context\n");
          return -1;
      } else {
          LOGPRINTF(("Found HCA %s",
                     ibv_get_device_name(filtered_hca_list[i])));
      }
      
      /* Get the device attributes */
      if (0 != ibv_query_device(ctx, &hca_attr)) {
          fprintf(stderr, "Could not get device attributes for %s, aborting\n",
                  ibv_get_device_name(filtered_hca_list[i]));
          return -1;
      }

      for (j = 1; j <= hca_attr.phys_port_cnt; ++j) {
          /* If a specific port was asked for, *only* look at that port */
          if (port_num >= 0 && port_num != j) {
              continue;
          }
          LOGPRINTF(("Checking %s:%d...", 
                     ibv_get_device_name(filtered_hca_list[i]), j));

          /* Query this port and see if it's active */
          if (0 != ibv_query_port(ctx, j, &hca_port)) {
              fprintf(stderr, "Unable to query port %s:%d, aborting\n",
                      ibv_get_device_name(filtered_hca_list[i]), j);
              return -1;
          }

          /* If this port is active, we have a winner! */
          if (IBV_PORT_ACTIVE == hca_port.state) {
              LOGPRINTF(("%s:%d is ACTIVE", 
                         ibv_get_device_name(filtered_hca_list[i]), j));
              port_num = j;
              hca = filtered_hca_list[i];
              break;
          }
      }

      /* If we found one, we're done */
      if (hca) {
          break;
      }

      /* Otherwise, close the device (ignore any errors) */
      ibv_close_device(ctx);
      ctx = NULL;
  }

  /* If we didn't find a good device/port combo, abort */
  if (NULL == hca) {
      fprintf(stderr, "Could not find an active device and port, aborting\n");
      return -1;
  }

  /* Free the device list now that the device we need is open. If this
     isn't done, the remaining device pointers are simply leaked. */

#if HAVE_IBV_DEVICE_LIST
  ibv_free_device_list(hca_list); 
#endif
  
  /* Get HCA properties */
  
  lid = hca_port.lid;		/* local id, used to ref back to the device */
  LOGPRINTF(("  lid = %d", lid));


  /* Allocate Protection Domain */
	/* need a Protection domain to handle/register memory over the card */
  pd_hndl = ibv_alloc_pd(ctx);	
  if(!pd_hndl) {
    fprintf(stderr, "Error allocating PD\n");
    return -1;
  } else {
    LOGPRINTF(("Allocated Protection Domain"));
  }


  /* Create send completion queue */
  
  num_cqe = 30000; /* Requested number of completion q elements */
  s_cq_hndl = ibv_create_cq(ctx, num_cqe, NULL, NULL, 0);
  if(!s_cq_hndl) {
    fprintf(stderr, "Error creating send CQ\n");
    return -1;
  } else {
    act_num_cqe = s_cq_hndl->cqe;
    LOGPRINTF(("Created Send Completion Queue with %d elements", act_num_cqe));
  }


  /* Create recv completion queue */
  
  num_cqe = 20000; /* Requested number of completion q elements */
  r_cq_hndl = ibv_create_cq(ctx, num_cqe, NULL, NULL, 0);
  if(!r_cq_hndl) {
    fprintf(stderr, "Error creating send CQ\n");
    return -1;
  } else {
    act_num_cqe = r_cq_hndl->cqe;
    LOGPRINTF(("Created Recv Completion Queue with %d elements", act_num_cqe));
  }


  /* Placeholder for MR */
	/* We don't actually set up the Memory Regions here; instead
	 * this is done in the 'MyMalloc(..)' helper function.
	 * You could, however, set them up here.
	 */

  /* Create Queue Pair */
    /* To setup a Queue Pair, the following qp initial attributes must be
     * specified and passed to the create_qp(..) function:
     * max send/recv write requests.  (max_recv/send_wr)
     * max scatter/gather entries. (max_recv/send_sge)
     * Command queues to associate the qp with.  (recv/send_cq)
     * Signalling type:  1-> signal all events.  0-> don't; the event handler
     *   will deal with this.
     * QP type.  (RC=reliable connection, UC=unreliable.. etc.) defined 
     *   in the verbs header.
     */
  memset(&qp_init_attr, 0, sizeof(struct ibv_qp_init_attr)); 
  qp_init_attr.cap.max_recv_wr    = max_wq; /* Max outstanding WR on RQ      */
  qp_init_attr.cap.max_send_wr    = max_wq; /* Max outstanding WR on SQ      */
  qp_init_attr.cap.max_recv_sge   = 1; /* Max scatter/gather entries on RQ */
  qp_init_attr.cap.max_send_sge   = 1; /* Max scatter/gather entries on SQ */
  qp_init_attr.recv_cq            = r_cq_hndl; /* CQ handle for RQ         */
  qp_init_attr.send_cq            = s_cq_hndl; /* CQ handle for SQ         */
  qp_init_attr.sq_sig_all         = 0; /* Signalling type */
  qp_init_attr.qp_type            = IBV_QPT_RC; /* Transmission type         */

  /* ibv_create_qp( ibv_pd *pd, ibv_qp_init_attr * attr) */  
  qp_hndl = ibv_create_qp(pd_hndl, &qp_init_attr);
  if(!qp_hndl) {
    fprintf(stderr, "Error creating Queue Pair: %s\n", strerror(errno));
    return -1;
  } else {
    LOGPRINTF(("Created Queue Pair"));
  }

    /* Using the TCP connection, exchange the necessary data needed to map
     *  the remote memory:
     *  (local: lid, qp_hndl->qp_num ), (remote: d_lid, d_qp_num)
     */

  /* Exchange lid and qp_num with other node */
  
  if( write(p->commfd, &lid, sizeof(lid) ) != sizeof(lid) ) {
    fprintf(stderr, "Failed to send lid over socket\n");
    return -1;
  }
  if( write(p->commfd, &qp_hndl->qp_num, sizeof(qp_hndl->qp_num) ) != sizeof(qp_hndl->qp_num) ) {
    fprintf(stderr, "Failed to send qpnum over socket\n");
    return -1;
  }
  if( read(p->commfd, &d_lid, sizeof(d_lid) ) != sizeof(d_lid) ) {
    fprintf(stderr, "Failed to read lid from socket\n");
    return -1;
  }
  if( read(p->commfd, &d_qp_num, sizeof(d_qp_num) ) != sizeof(d_qp_num) ) {
    fprintf(stderr, "Failed to read qpnum from socket\n");
    return -1;
  }
  
  LOGPRINTF(("Local: lid=%d qp_num=%d Remote: lid=%d qp_num=%d",
             lid, qp_hndl->qp_num, d_lid, d_qp_num));
    /* Further setup must be done to finalize the QP 'connection'.
     * First set the state of the QP to INIT by filling a separate
     * ibv_qp_attr variable with the initial values and calling
     * ibv_modify_qp(..) to merge these settings into the QP.
     */
/* NOTE: According to openIB, ib_mthca's QP modify does not set alternate path
 *  fields in QP context, so you'll have to do this manually if necessary
 */

    /* Bring up Queue Pair */
  
  /******* INIT state ******/

  /* qp_attr is separately allocated per qp/connection */
  qp_attr.qp_state = IBV_QPS_INIT;
  qp_attr.pkey_index = 0;
  qp_attr.port_num = port_num;
  qp_attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
  /* merge the qp_attributes into the queue pair */
  ret = ibv_modify_qp(qp_hndl, &qp_attr,
		      IBV_QP_STATE              |
		      IBV_QP_PKEY_INDEX         |
		      IBV_QP_PORT               |
		      IBV_QP_ACCESS_FLAGS);
  if(ret) {
    fprintf(stderr, "Error modifying QP to INIT\n");
    return -1;
  }

  LOGPRINTF(("Modified QP to INIT"));

/* To enable the Queue Pair to finally receive data, it must be 
 * put into the 'RTR' (Ready-To-Receive) state.  The Queue Pair will NOT
 * function properly until it has been set up and manually stepped through
 * the INIT and RTR states.
 */
  
  /******* RTR (Ready-To-Receive) state *******/

  qp_attr.qp_state = IBV_QPS_RTR;
  qp_attr.max_dest_rd_atomic = 1;
  qp_attr.dest_qp_num = d_qp_num;
  qp_attr.ah_attr.sl = 0;
  qp_attr.ah_attr.is_global = 0;
  qp_attr.ah_attr.dlid = d_lid;
  qp_attr.ah_attr.static_rate = 0;
  qp_attr.ah_attr.src_path_bits = 0;
  qp_attr.ah_attr.port_num = port_num;
  qp_attr.path_mtu = p->prot.ib_mtu;
  qp_attr.rq_psn = 0;
  qp_attr.pkey_index = 0;
  qp_attr.min_rnr_timer = 5;
  /* merge these settings into the qp */
  ret = ibv_modify_qp(qp_hndl, &qp_attr,
		      IBV_QP_STATE              |
		      IBV_QP_AV                 |
		      IBV_QP_PATH_MTU           |
		      IBV_QP_DEST_QPN           |
		      IBV_QP_RQ_PSN             |
		      IBV_QP_MAX_DEST_RD_ATOMIC |
		      IBV_QP_MIN_RNR_TIMER);

  if(ret) {
    fprintf(stderr, "Error modifying QP to RTR\n");
    return -1;
  }

  LOGPRINTF(("Modified QP to RTR"));

  /* Sync before going to RTS state */
  Sync(p);

  /* In the same manner, 'enable' sending on the queue pair */
  
  /******* RTS (Ready-to-Send) state *******/

  qp_attr.qp_state = IBV_QPS_RTS;
  qp_attr.sq_psn = 0;
  qp_attr.timeout = 31;
  qp_attr.retry_cnt = 1;
  qp_attr.rnr_retry = 1;
  qp_attr.max_rd_atomic = 1;

  ret = ibv_modify_qp(qp_hndl, &qp_attr,
		      IBV_QP_STATE              |
		      IBV_QP_TIMEOUT            |
		      IBV_QP_RETRY_CNT          |
		      IBV_QP_RNR_RETRY          |
		      IBV_QP_SQ_PSN             |
		      IBV_QP_MAX_QP_RD_ATOMIC);

  if(ret) {
    fprintf(stderr, "Error modifying QP to RTS\n");
    return -1;
  }
  
  LOGPRINTF(("Modified QP to RTS"));

  /* If using event completion, request the initial notification */
  /* This spawns a separate thread to do the event handling and
   * notification.
   * NOTE: this may have problems on systems with weak memory consistency,
   * since there are no mutex calls to preserve coherency.
   */
  if( p->prot.comptype == NP_COMP_EVENT ) {
    if (pthread_create(&thread, NULL, EventThread, NULL)) {
      fprintf(stderr, "Couldn't start event thread\n");
      return -1;
    }
    ibv_req_notify_cq(r_cq_hndl, 0);	/* request completion notification  */
  }					/* for the receive cq.  2nd argument 
					   specifies if ONLY 'solicited'
					   completions will be 'noticed' */
  
 
  return 0; /* if we get here, the connection is setup correctly */
}
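initIB() defers memory registration to a 'MyMalloc(..)' helper that the comments reference but the listing does not show. The sketch below is what such a helper typically does, assuming the protection domain allocated above is passed in; the function name and error handling are illustrative, not the original helper.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <infiniband/verbs.h>

/* Hypothetical MyMalloc-style helper: page-aligned allocation plus registration. */
static void *alloc_and_register(struct ibv_pd *pd, size_t size,
                                struct ibv_mr **mr_out)
{
	void *buf = NULL;

	if (posix_memalign(&buf, getpagesize(), size) != 0)
		return NULL;

	*mr_out = ibv_reg_mr(pd, buf, size,
			     IBV_ACCESS_LOCAL_WRITE |
			     IBV_ACCESS_REMOTE_WRITE |
			     IBV_ACCESS_REMOTE_READ);
	if (!*mr_out) {
		fprintf(stderr, "ibv_reg_mr failed\n");
		free(buf);
		return NULL;
	}

	return buf;
}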