/*
 * Open a verbs device and store its context in ctx.context (ctx is defined
 * outside this function).
 *
 * ib_devname: name compared against ibv_get_device_name(); when NULL the
 *             first entry of the device list is used.
 *
 * Returns 0 on success and -1 on failure, after printing a message to stderr.
 * The device list is released on every path, including the error paths.
 */
static int open_device(char *ib_devname)
{
    struct ibv_device **dev_list;
    int rc = -1;
    int i = 0;

    dev_list = ibv_get_device_list(NULL);
    if (!dev_list) {
        fprintf(stderr, "Failed to get IB devices list\n");
        return -1;
    }

    /* The list is NULL-terminated; stop at the first matching name. */
    if (ib_devname) {
        for (; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]), ib_devname))
                break;
        }
    }

    if (!dev_list[i]) {
        fprintf(stderr, "IB device %s not found\n", ib_devname ? ib_devname : "");
        goto out;
    }

    ctx.context = ibv_open_device(dev_list[i]);
    if (!ctx.context) {
        fprintf(stderr, "Couldn't get context for %s\n",
                ibv_get_device_name(dev_list[i]));
        goto out;
    }

    rc = 0;

out:
    /* Freed only after the open attempt, so the name lookup above is valid. */
    ibv_free_device_list(dev_list);
    return rc;
}
/*
 * Look up the verbs device called device_name and open it.
 *
 * Returns the opened context, or NULL when the device list cannot be
 * obtained, no device carries that name, or the open call fails.
 * The device list is always released before returning.
 */
static struct ibv_context *get_device_context(const char *device_name)
{
    struct ibv_context *opened = NULL;
    struct ibv_device **list;
    int count;
    int idx = 0;

    list = ibv_get_device_list(&count);
    if (!list) {
        fprintf(stderr, "Error, ibv_get_device_list() failed\n");
        return NULL;
    }

    while (idx < count) {
        struct ibv_device *candidate = list[idx++];

        /* Skip every device whose name differs from the requested one. */
        if (strcmp(ibv_get_device_name(candidate), device_name) != 0)
            continue;

        opened = ibv_open_device(candidate);
        if (opened)
            printf("The device '%s' was detected\n", device_name);
        else
            fprintf(stderr, "Error, failed to open the device '%s'\n",
                    ibv_get_device_name(candidate));

        /* Only the first name match is ever tried. */
        break;
    }

    ibv_free_device_list(list);
    return opened;
}
// Opens the i-th device of `devices` (resolved through get_device) and wraps
// the resulting context in a CtxPtr whose deleter is ibv_close_device.
// Throws std::runtime_error when the device cannot be opened.
static CtxPtr make_ctx(DevicesPtr devices, int i)
{
    auto raw = ibv_open_device(get_device(devices, i));
    if (raw) {
        return CtxPtr(raw, ibv_close_device);
    }
    throw std::runtime_error("cannot open device");
}
struct ibv_context_1_0 *__ibv_open_device_1_0(struct ibv_device_1_0 *device) { struct ibv_context *real_ctx; struct ibv_context_1_0 *ctx; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; real_ctx = ibv_open_device(device->real_device); if (!real_ctx) { free(ctx); return NULL; } ctx->device = device; ctx->real_context = real_ctx; ctx->ops.poll_cq = poll_cq_wrapper_1_0; ctx->ops.req_notify_cq = req_notify_cq_wrapper_1_0; ctx->ops.post_send = post_send_wrapper_1_0; ctx->ops.post_recv = post_recv_wrapper_1_0; ctx->ops.post_srq_recv = post_srq_recv_wrapper_1_0; return ctx; }
/*
 * Print the name, GUID and a few attributes of every verbs device.
 *
 * The device count is taken from ibv_get_device_list() rather than being
 * hard-coded. A device that cannot be opened or queried is reported on
 * stderr and skipped.
 *
 * Returns 0 on success, -1 when the device list cannot be obtained.
 */
static int print_device_info(void)
{
    struct ibv_device **ibv_devs;
    int num_devs = 0;
    int i;

    ibv_devs = ibv_get_device_list(&num_devs);
    if (!ibv_devs) {
        fprintf(stderr, "Failed to get IB devices list\n");
        return -1;
    }

    for (i = 0; i < num_devs; i++) {
        struct ibv_context *ibv_contxt;
        struct ibv_device_attr device_attr;
        const char *dev_name = ibv_get_device_name(ibv_devs[i]);
        /* Printed as the raw 64-bit value returned by the library. */
        unsigned long long dev_guid =
            (unsigned long long) ibv_get_device_guid(ibv_devs[i]);

        printf("%s (%llu):\n", dev_name ? dev_name : "unknown", dev_guid);

        ibv_contxt = ibv_open_device(ibv_devs[i]);
        if (!ibv_contxt) {
            fprintf(stderr, "Failed to open device %s\n",
                    dev_name ? dev_name : "unknown");
            continue;
        }

        if (ibv_query_device(ibv_contxt, &device_attr)) {
            fprintf(stderr, "Failed to query device %s\n",
                    dev_name ? dev_name : "unknown");
            ibv_close_device(ibv_contxt);
            continue;
        }

        printf(" Record : %d\n", i);
        /* max_mr_size is 64-bit, max_mr is a plain int: match the formats. */
        printf(" max_mr_size : %llu\n",
               (unsigned long long) device_attr.max_mr_size);
        printf(" max_mr : %d\n", device_attr.max_mr);

        ibv_close_device(ibv_contxt);
    }

    ibv_free_device_list(ibv_devs);
    return 0;
}
// Opens the first device reported by ibv_get_device_list() and returns its
// context. Each step is guarded by CHECK (presumably fatal on failure —
// confirm against the project's CHECK macro).
//
// The device list is released once the device has been opened; the returned
// context stays usable because the device was opened before the list was
// freed. The caller owns the context.
ibv_context* open_default_device() {
  ibv_device** dev_list = ibv_get_device_list(NULL);
  CHECK(dev_list) << "No InfiniBand device found";

  ibv_device* ib_dev = dev_list[0];
  CHECK(ib_dev) << "No InfiniBand device found";

  ibv_context* context = ibv_open_device(ib_dev);
  // The name must be read before the list is freed.
  CHECK(context) << "Open context failed for " << ibv_get_device_name(ib_dev);

  ibv_free_device_list(dev_list);
  return context;
}
static void psofed_scan_hca_ports(struct ibv_device *ib_dev) { struct ibv_context *ctx; struct ibv_device_attr device_attr; int rc; unsigned port_cnt; unsigned port; const char *dev_name; dev_name =ibv_get_device_name(ib_dev); if (!dev_name) dev_name = "unknown"; ctx = ibv_open_device(ib_dev); if (!ctx) goto err_open_dev; rc = ibv_query_device(ctx, &device_attr); if (!rc) { port_cnt = device_attr.phys_port_cnt; if (port_cnt > 128) port_cnt = 128; } else { // Query failed. Assume 2 ports. port_cnt = 2; } for (port = 1; port <= port_cnt; port++) { struct ibv_port_attr port_attr; enum ibv_port_state port_state; const char *marker; rc = ibv_query_port(ctx, port, &port_attr); port_state = !rc ? port_attr.state : 999 /* unknown */; marker = ""; if (port_state == IBV_PORT_ACTIVE && (!psofed_hca || !strcmp(dev_name, psofed_hca)) && (!psofed_port || psofed_port == port)) { // use this port for the communication: if (!psofed_hca) psofed_hca = strdup(dev_name); if (!psofed_port) psofed_port = port; marker = "*"; } psofed_dprint(3, "IB port <%s:%u>: %s%s", dev_name, port, port_state_str(port_state), marker); } if (ctx) ibv_close_device(ctx); err_open_dev: return; }
/*
 * Convenience function. Given the name of an HFI, opens it and returns the
 * verbs context; the matching ibv_device is stored in *device.
 *
 * HFI can be identified by name ("mthfi0") or by number
 * "1", "2", et cetera. A NULL or empty name selects the first HFI, and so
 * does a number outside 1..num_devices.
 *
 * Returns NULL and sets errno on failure:
 *   EFAULT - the device list is unavailable, the name was not found,
 *            or ibv_open_device() failed;
 *   ENODEV - the selected list entry is empty (e.g. no devices present).
 * *device is set to NULL on every failure path.
 *
 * The device list is cached in dev_list / num_devices (defined outside this
 * function) and is not released here.
 *
 * OPENS THE HFI! Use ibv_close_device() to release it.
 */
struct ibv_context *
op_path_find_hfi(const char *name, struct ibv_device **device)
{
    struct ibv_device *ibv_dev = NULL;
    struct ibv_context *context = NULL;
    int i;

    /* Give the caller a defined value on every early return. */
    if (device)
        *device = NULL;

    if (!dev_list) {
        dev_list = ibv_get_device_list(&num_devices);
    }
    if (!dev_list) {
        errno = EFAULT;
        return NULL;
    }

    if (name == NULL || name[0] == '\0') {
        i = 0;
    } else if (isdigit((unsigned char) name[0])) {
        /* HFI numbers are 1-based; anything out of range means "first". */
        long n = strtol(name, NULL, 0);

        i = (n >= 1 && n <= num_devices) ? (int) (n - 1) : 0;
    } else {
        for (i = 0; i < num_devices; i++) {
            if (!strcmp(ibv_get_device_name(dev_list[i]), name))
                break;
        }
        if (i >= num_devices) {
            errno = EFAULT;
            return NULL;
        }
    }

    /* With zero devices this is the list's NULL terminator. */
    ibv_dev = dev_list[i];

    /*
     * Opens the verbs interface to the HFI.
     * Note that this will increment the usage counter for that
     * HFI. This needs to be done before we release the device list.
     */
    if (ibv_dev) {
        context = ibv_open_device(ibv_dev);
        if (!context) {
            errno = EFAULT;
        } else if (device) {
            *device = ibv_dev;
        }
    } else {
        errno = ENODEV;
    }

    return context;
}
//uint16_t pc_init(struct ibv_pd* pd, struct pc_ibv_co *input_co, int device_num) uint16_t pc_init(struct pc_ibv_co *input_co, int device_num) { struct ibv_device **dev_list; struct ibv_device *dev; struct ibv_context *ctx; struct ibv_port_attr pattr; struct pc_hca *hca; uint16_t lid; int num_of_hcas; dev_list = ibv_get_device_list(&num_of_hcas); if (num_of_hcas <= device_num) { fprintf(stderr, "Invalide devic_num: %d @%s:%d\n", device_num, __FILE__, __LINE__); exit(1); } dev = dev_list[device_num]; ctx = ibv_open_device(dev); ibv_query_port(ctx, 1, &pattr); lid = pattr.lid; if (init_hcas_q == 0) { lq_init(&hcas_q); lq_init(&ctx_lid_q); init_hcas_q = 1; } /*remove*/ else { return -1; } /*remove*/ hca = (struct pc_hca*) malloc(sizeof(struct pc_hca)); hca->lid = lid; hca->co = input_co; hca->co->pdg_num = 0; hca->co->pdg_size = 0; lq_enq(&hcas_q, hca); lq_init(&pc_q); ibv_free_device_list(dev_list); return lid; }
/*
 * Resolve hca_name through psofed_get_dev_by_hca_name() and open the
 * resulting device. (Original note: if hca_name == NULL choose first HCA.)
 *
 * Returns the opened context, or NULL when no device was found or the open
 * failed; only the open failure is reported via psofed_err_errno().
 */
static struct ibv_context *psofed_open_hca(char *hca_name)
{
    struct ibv_device *dev = psofed_get_dev_by_hca_name(hca_name);
    struct ibv_context *opened;

    if (dev == NULL)
        return NULL;

    opened = ibv_open_device(dev);
    if (opened == NULL)
        psofed_err_errno("ibv_open_device() failed", errno);

    return opened;
}
/*
 * USNIC registers itself with the verbs framework although it is not a
 * usable device. Probe the devices by hand and fail gracefully when none
 * can be opened, so that the lower libraries (libibverbs and librdmacm)
 * do not print error messages to stderr.
 *
 * Returns 1 as soon as one device can be opened, 0 otherwise.
 */
static int fi_ibv_have_device(void)
{
    struct ibv_device **list = ibv_get_device_list(NULL);
    struct ibv_device **cur;
    int found = 0;

    if (list == NULL)
        return 0;

    /* The list is NULL-terminated; stop at the first device that opens. */
    for (cur = list; *cur != NULL && !found; cur++) {
        struct ibv_context *probe = ibv_open_device(*cur);

        if (probe != NULL) {
            ibv_close_device(probe);
            found = 1;
        }
    }

    ibv_free_device_list(list);
    return found;
}
/** =========================================================================
 * Open the verbs device whose name equals port->hfi_name and initialise the
 * port lock.
 *
 * On success port->verbs_ctx holds the open context and port->lock is an
 * initialised semaphore with value 1.
 *
 * Returns 0 on success and EIO on any failure: the device list is
 * unavailable, no device name matches, the open fails, or sem_init() fails
 * (the context is closed again in that case).
 */
static int open_verbs_ctx(struct oib_port *port)
{
    int i;
    int num_devices = 0;
    struct ibv_device **dev_list;

    dev_list = ibv_get_device_list(&num_devices);
    if (dev_list == NULL) {
        /* Nothing to search and nothing to free. */
        OUTPUT_ERROR("failed to get verbs device list\n");
        return EIO;
    }

    for (i = 0; i < num_devices; ++i)
        if (dev_list[i] != NULL
            && (strncmp(dev_list[i]->name, port->hfi_name,
                        sizeof(dev_list[i]->name)) == 0))
            break;

    if (i >= num_devices) {
        ibv_free_device_list(dev_list);
        OUTPUT_ERROR("failed to find verbs device\n");
        return EIO;
    }

    port->verbs_ctx = ibv_open_device(dev_list[i]);
    ibv_free_device_list(dev_list);

    if (port->verbs_ctx == NULL) {
        OUTPUT_ERROR("failed to open verbs device\n");
        return EIO;
    }

    if (sem_init(&port->lock, 0, 1) != 0) {
        ibv_close_device(port->verbs_ctx);
        OUTPUT_ERROR("failed to init registry lock\n");
        return EIO;
    }

    return 0;
}
int open_hca(void) { struct ibv_device **dev_list=NULL; struct ibv_context *cxt = NULL; int rc; int num_hcas; dev_list = ibv_get_device_list(&num_hcas); // Assume that the first device has an ACTIVE port // if it does not we do not handle this situation for now hca.ib_dev = dev_list[0]; hca.context = ibv_open_device(hca.ib_dev); if(!hca.context) { fprintf(stderr,"Couldn't get context %s\n", ibv_get_device_name(hca.ib_dev)); return 1; } hca.pd = ibv_alloc_pd(hca.context); assert(hca.pd != NULL); if(!hca.pd) { fprintf(stderr,"Couldn't get pd %s\n", ibv_get_device_name(hca.ib_dev)); return 1; } return 0; }
int main(int argc, char *argv[]) { struct ibv_device *ib_dev; struct pingpong_context ctx; struct pingpong_dest *my_dest,*rem_dest; struct perftest_parameters user_param; struct perftest_comm user_comm; int i = 0; memset(&ctx,0,sizeof(struct pingpong_context)); memset(&user_param, 0, sizeof(struct perftest_parameters)); memset(&user_comm,0,sizeof(struct perftest_comm)); user_param.verb = WRITE; user_param.tst = BW; user_param.spec = PL; user_param.version = VERSION; // Configure the parameters values according to user arguments or defalut values. if (parser(&user_param,argv,argc)) { fprintf(stderr," Parser function exited with Error\n"); return 1; } // Finding the IB device selected (or defalut if no selected). ib_dev = ctx_find_dev(user_param.ib_devname); if (!ib_dev) { fprintf(stderr," Unable to find the Infiniband/RoCE deivce\n"); return 1; } // Getting the relevant context from the device ctx.context = ibv_open_device(ib_dev); if (!ctx.context) { fprintf(stderr, " Couldn't get context for the device\n"); return 1; } // See if MTU and link type are valid and supported. if (check_link_and_mtu(ctx.context,&user_param)) { fprintf(stderr, " Couldn't get context for the device\n"); return FAILURE; } // Print basic test information. ctx_print_test_info(&user_param); ALLOCATE(my_dest , struct pingpong_dest , user_param.num_of_qps); memset(my_dest, 0, sizeof(struct pingpong_dest)*user_param.num_of_qps); ALLOCATE(rem_dest , struct pingpong_dest , user_param.num_of_qps); memset(rem_dest, 0, sizeof(struct pingpong_dest)*user_param.num_of_qps); // copy the rellevant user parameters to the comm struct + creating rdma_cm resources. if (create_comm_struct(&user_comm,&user_param)) { fprintf(stderr," Unable to create RDMA_CM resources\n"); return 1; } // Create (if nessacery) the rdma_cm ids and channel. 
if (user_param.work_rdma_cm == ON) { if (create_rdma_resources(&ctx,&user_param)) { fprintf(stderr," Unable to create the rdma_resources\n"); return FAILURE; } if (user_param.machine == CLIENT) { if (rdma_client_connect(&ctx,&user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } else { if (rdma_server_connect(&ctx,&user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } } else { // create all the basic IB resources (data buffer, PD, MR, CQ and events channel) if (ctx_init(&ctx,&user_param)) { fprintf(stderr, " Couldn't create IB resources\n"); return FAILURE; } } // Set up the Connection. if (set_up_connection(&ctx,&user_param,my_dest)) { fprintf(stderr," Unable to set up socket connection\n"); return FAILURE; } // Print this machine QP information for (i=0; i < user_param.num_of_qps; i++) ctx_print_pingpong_data(&my_dest[i],&user_comm); // Init the connection and print the local data. if (establish_connection(&user_comm)) { fprintf(stderr," Unable to init the socket connection\n"); return FAILURE; } // shaking hands and gather the other side info. for (i=0; i < user_param.num_of_qps; i++) { if (ctx_hand_shake(&user_comm,&my_dest[i],&rem_dest[i])) { fprintf(stderr," Failed to exchange date between server and clients\n"); return 1; } // Print remote machine QP information user_comm.rdma_params->side = REMOTE; ctx_print_pingpong_data(&rem_dest[i],&user_comm); if (user_param.work_rdma_cm == OFF) { if (pp_connect_ctx(&ctx,my_dest[i].psn,&rem_dest[i],&user_param,i)) { fprintf(stderr," Unable to Connect the HCA's through the link\n"); return FAILURE; } } // An additional handshake is required after moving qp to RTR. 
if (ctx_hand_shake(&user_comm,&my_dest[i],&rem_dest[i])) { fprintf(stderr," Failed to exchange date between server and clients\n"); return FAILURE; } } printf(RESULT_LINE); printf(RESULT_FMT); // For half duplex tests, server just waits for client to exit if (user_param.machine == SERVER && !user_param.duplex) { if (ctx_close_connection(&user_comm,&my_dest[0],&rem_dest[0])) { fprintf(stderr,"Failed to close connection between server and client\n"); return 1; } printf(RESULT_LINE); return 0; } ALLOCATE(tposted,cycles_t,user_param.iters*user_param.num_of_qps); ALLOCATE(tcompleted,cycles_t,user_param.iters*user_param.num_of_qps); if (user_param.all == ON) { for (i = 1; i < 24 ; ++i) { user_param.size = 1 << i; if(run_iter(&ctx,&user_param,rem_dest)) return 17; print_report(&user_param); } } else { if(run_iter(&ctx,&user_param,rem_dest)) return 18; print_report(&user_param); } free(tposted); free(tcompleted); // Closing connection. if (ctx_close_connection(&user_comm,&my_dest[0],&rem_dest[0])) { fprintf(stderr,"Failed to close connection between server and client\n"); return 1; } free(my_dest); free(rem_dest); printf(RESULT_LINE); return 0; }
/* * Find a list of ibv_ports matching a set of criteria. */ opal_list_t *opal_common_verbs_find_ports(const char *if_include, const char *if_exclude, int flags, int stream) { int32_t num_devs; struct ibv_device **devices; struct ibv_device *device; struct ibv_context *device_context; struct ibv_device_attr device_attr; struct ibv_port_attr port_attr; char **if_include_list = NULL, **if_exclude_list = NULL, **if_sanity_list = NULL; opal_common_verbs_device_item_t *di; opal_common_verbs_port_item_t *pi; int rc; uint32_t j; opal_list_t *port_list = NULL; bool want; /* Sanity check the include/exclude params */ if (NULL != if_include && NULL != if_exclude) { return NULL; } /* Query all the IBV devices on the machine. Use an ompi compatibility function, because how to get this list changed over the history of the IBV API. */ devices = opal_ibv_get_device_list(&num_devs); if (0 == num_devs) { opal_output_verbose(5, stream, "no verbs interfaces found"); return NULL; } opal_output_verbose(5, stream, "found %d verbs interface%s", num_devs, (num_devs != 1) ? "s" : ""); /* Allocate a list to fill */ port_list = OBJ_NEW(opal_list_t); if (NULL == port_list) { return NULL; } if (NULL != if_include) { opal_output_verbose(5, stream, "finding verbs interfaces, including %s", if_include); if_include_list = opal_argv_split(if_include, ','); if_sanity_list = opal_argv_copy(if_include_list); } else if (NULL != if_exclude) { opal_output_verbose(5, stream, "finding verbs interfaces, excluding %s", if_exclude); if_exclude_list = opal_argv_split(if_exclude, ','); if_sanity_list = opal_argv_copy(if_exclude_list); } /* Now loop through all the devices. Get the attributes for each port on each device to see if they match our selection criteria. */ for (int32_t i = 0; (int32_t) i < num_devs; ++i) { /* See if this device is on the include/exclude sanity check list. 
If it is, remove it from the sanity check list (i.e., we should end up with an empty list at the end if all entries in the sanity check list exist) */ device = devices[i]; check_sanity(&if_sanity_list, ibv_get_device_name(device), -1); opal_output_verbose(5, stream, "examining verbs interface: %s", ibv_get_device_name(device)); device_context = ibv_open_device(device); if (NULL == device_context) { opal_show_help("help-opal-common-verbs.txt", "ibv_open_device fail", true, opal_proc_local_get()->proc_hostname, ibv_get_device_name(device), errno, strerror(errno)); goto err_free_port_list; } if (ibv_query_device(device_context, &device_attr)){ opal_show_help("help-opal-common-verbs.txt", "ibv_query_device fail", true, opal_proc_local_get()->proc_hostname, ibv_get_device_name(device), errno, strerror(errno)); goto err_free_port_list; } /* Now that we have the attributes of this device, remove all ports of this device from the sanity check list. Note that IBV ports are indexed from 1, not 0. 
*/ for (j = 1; j <= device_attr.phys_port_cnt; j++) { check_sanity(&if_sanity_list, ibv_get_device_name(device), j); } /* Check the device-specific flags to see if we want this device */ want = false; if (flags & OPAL_COMMON_VERBS_FLAGS_TRANSPORT_IB && IBV_TRANSPORT_IB == device->transport_type) { opal_output_verbose(5, stream, "verbs interface %s has right type (IB)", ibv_get_device_name(device)); want = true; } if (flags & OPAL_COMMON_VERBS_FLAGS_TRANSPORT_IWARP && IBV_TRANSPORT_IWARP == device->transport_type) { opal_output_verbose(5, stream, "verbs interface %s has right type (IWARP)", ibv_get_device_name(device)); want = true; } /* Check for RC or UD QP support */ if (flags & OPAL_COMMON_VERBS_FLAGS_RC) { rc = opal_common_verbs_qp_test(device_context, flags); if (OPAL_SUCCESS == rc) { want = true; opal_output_verbose(5, stream, "verbs interface %s supports RC QPs", ibv_get_device_name(device)); } else { opal_output_verbose(5, stream, "verbs interface %s failed to make RC QP", ibv_get_device_name(device)); } } if (flags & OPAL_COMMON_VERBS_FLAGS_UD) { rc = opal_common_verbs_qp_test(device_context, flags); if (OPAL_SUCCESS == rc) { want = true; opal_output_verbose(5, stream, "verbs interface %s supports UD QPs", ibv_get_device_name(device)); } else if (OPAL_ERR_TYPE_MISMATCH == rc) { opal_output_verbose(5, stream, "verbs interface %s made an RC QP! 
we don't want RC-capable devices", ibv_get_device_name(device)); } else { opal_output_verbose(5, stream, "verbs interface %s failed to make UD QP", ibv_get_device_name(device)); } } /* If we didn't want it, go to the next device */ if (!want) { continue; } /* Make a device_item_t to hold the device information */ di = OBJ_NEW(opal_common_verbs_device_item_t); if (NULL == di) { goto err_free_port_list; } di->device = device; di->context = device_context; di->device_attr = device_attr; di->device_name = strdup(ibv_get_device_name(device)); /* Note IBV ports are 1 based (not 0 based) */ for (j = 1; j <= device_attr.phys_port_cnt; j++) { /* If we don't want this port (based on if_include / if_exclude lists), skip it */ if (!want_this_port(if_include_list, if_exclude_list, di, j)) { opal_output_verbose(5, stream, "verbs interface %s:%d: rejected by include/exclude", ibv_get_device_name(device), j); continue; } /* Query the port */ if (ibv_query_port(device_context, (uint8_t) j, &port_attr)) { opal_show_help("help-opal-common-verbs.txt", "ibv_query_port fail", true, opal_proc_local_get()->proc_hostname, ibv_get_device_name(device), errno, strerror(errno)); goto err_free_port_list; } /* We definitely only want ACTIVE ports */ if (IBV_PORT_ACTIVE != port_attr.state) { opal_output_verbose(5, stream, "verbs interface %s:%d: not ACTIVE", ibv_get_device_name(device), j); continue; } /* Check the port-specific flags to see if we want this port */ want = false; if (0 == flags) { want = true; } if ((flags & (OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB | OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET)) == (OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB | OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET)) { /* If they specified both link layers, then we want this port */ want = true; } else if ((flags & (OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB | OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET)) == 0) { /* If they specified neither link layer, then we want this port */ want = true; } #if 
HAVE_DECL_IBV_LINK_LAYER_ETHERNET else if (flags & OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_IB) { if (IBV_LINK_LAYER_INFINIBAND == port_attr.link_layer) { want = true; } else { opal_output_verbose(5, stream, "verbs interface %s:%d has wrong link layer (has %s, want IB)", ibv_get_device_name(device), j, link_layer_to_str(port_attr.link_layer)); } } else if (flags & OPAL_COMMON_VERBS_FLAGS_LINK_LAYER_ETHERNET) { if (IBV_LINK_LAYER_ETHERNET == port_attr.link_layer) { want = true; } else { opal_output_verbose(5, stream, "verbs interface %s:%d has wrong link layer (has %s, want Ethernet)", ibv_get_device_name(device), j, link_layer_to_str(port_attr.link_layer)); } } #endif if (!want) { continue; } /* If we got this far, we want the port. Make an item for it. */ pi = OBJ_NEW(opal_common_verbs_port_item_t); if (NULL == pi) { goto err_free_port_list; } pi->device = di; pi->port_num = j; pi->port_attr = port_attr; OBJ_RETAIN(di); /* Add the port item to the list */ opal_list_append(port_list, &pi->super); opal_output_verbose(5, stream, "found acceptable verbs interface %s:%d", ibv_get_device_name(device), j); } /* We're done with the device; if some ports are using it, its ref count will be > 0, and therefore the device won't be deleted here. */ OBJ_RELEASE(di); } /* Sanity check that the devices specified in the if_include / if_exclude lists actually existed. If this is true, then the sanity list will now be empty. If there are still items left on the list, then they didn't exist. Bad. Print a warning (if the warning is not disabled). */ if (0 != opal_argv_count(if_sanity_list)) { if (opal_common_verbs_warn_nonexistent_if) { char *str = opal_argv_join(if_sanity_list, ','); opal_show_help("help-opal-common-verbs.txt", "nonexistent port", true, opal_proc_local_get()->proc_hostname, ((NULL != if_include) ? 
"in" : "ex"), str); free(str); /* Only warn once per process */ opal_common_verbs_warn_nonexistent_if = false; } } if (NULL != if_sanity_list) { opal_argv_free(if_sanity_list); } opal_argv_free(if_include_list); opal_argv_free(if_exclude_list); /* All done! */ opal_ibv_free_device_list(devices); return port_list; err_free_port_list: OPAL_LIST_RELEASE(port_list); opal_ibv_free_device_list(devices); if (NULL != if_sanity_list) { opal_argv_free(if_sanity_list); } opal_argv_free(if_include_list); opal_argv_free(if_exclude_list); return NULL; }
static struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int tx_depth, int port,struct user_parameters *user_parm) { struct pingpong_context *ctx; struct ibv_device_attr device_attr; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; ctx->size = size; ctx->tx_depth = tx_depth; /* in case of UD need space for the GRH */ if (user_parm->connection_type==UD) { ctx->buf = memalign(page_size, ( size + 40 ) * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, ( size + 40 ) * 2); } else { ctx->buf = memalign(page_size, size * 2); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); return NULL; } memset(ctx->buf, 0, size * 2); } ctx->post_buf = (char*)ctx->buf + (size - 1); ctx->poll_buf = (char*)ctx->buf + (2 * size - 1); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); return NULL; } if (user_parm->mtu == 0) {/*user did not ask for specific mtu */ if (ibv_query_device(ctx->context, &device_attr)) { fprintf(stderr, "Failed to query device props"); return NULL; } if (device_attr.vendor_part_id == 23108 || user_parm->gid_index > -1) { user_parm->mtu = 1024; } else { user_parm->mtu = 2048; } } if (user_parm->use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); return NULL; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); return NULL; } if (user_parm->connection_type==UD) { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, (size + 40 ) * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return NULL; } } else { ctx->mr = ibv_reg_mr(ctx->pd, ctx->buf, size * 2, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't allocate MR\n"); return 
NULL; } } ctx->scq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->scq) { fprintf(stderr, "Couldn't create CQ\n"); return NULL; } ctx->rcq = ibv_create_cq(ctx->context, tx_depth, NULL, ctx->channel, 0); if (!ctx->rcq) { fprintf(stderr, "Couldn't create Recieve CQ\n"); return NULL; } { struct ibv_qp_init_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ctx->scq; attr.recv_cq = ctx->rcq; attr.cap.max_send_wr = tx_depth; /* Work around: driver doesnt support * recv_wr = 0 */ attr.cap.max_recv_wr = tx_depth; attr.cap.max_send_sge = 1; attr.cap.max_recv_sge = 1; attr.cap.max_inline_data = user_parm->inline_size; switch (user_parm->connection_type) { case RC : attr.qp_type = IBV_QPT_RC; break; case UC : attr.qp_type = IBV_QPT_UC; break; case UD : attr.qp_type = IBV_QPT_UD; break; default: fprintf(stderr, "Unknown connection type %d \n",user_parm->connection_type); return NULL; } attr.sq_sig_all = 0; ctx->qp = ibv_create_qp(ctx->pd, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); return NULL; } } { struct ibv_qp_attr attr; memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.qp_state = IBV_QPS_INIT; attr.pkey_index = 0; attr.port_num = port; if (user_parm->connection_type==UD) { attr.qkey = 0x11111111; } else { attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; } if (user_parm->connection_type==UD) { if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY)) { fprintf(stderr, "Failed to modify UD QP to INIT\n"); return NULL; } if (user_parm->use_mcg) { union ibv_gid gid; uint8_t mcg_gid[16] = MCG_GID; /* use the local QP number as part of the mcg */ mcg_gid[11] = (user_parm->servername) ? 
0 : 1; *(uint32_t *)(&mcg_gid[12]) = ctx->qp->qp_num; memcpy(gid.raw, mcg_gid, 16); if (ibv_attach_mcast(ctx->qp, &gid, MCG_LID)) { fprintf(stderr, "Couldn't attach QP to mcg\n"); return NULL; } } } else if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); return NULL; } } //send ctx->wr.wr_id = PINGPONG_SEND_WRID; ctx->wr.sg_list = &ctx->list; ctx->wr.num_sge = 1; ctx->wr.opcode = IBV_WR_SEND; ctx->wr.next = NULL; //recieve ctx->rwr.wr_id = PINGPONG_RECV_WRID; ctx->rwr.sg_list = &ctx->recv_list; ctx->rwr.num_sge = 1; ctx->rwr.next = NULL; return ctx; }
static int resources_create(struct resources *res) { struct ibv_device **dev_list = NULL; struct ibv_qp_init_attr qp_init_attr; struct ibv_device *ib_dev = NULL; size_t size; int i; int mr_flags = 0; int cq_size = 0; int num_devices; /* if client side */ if (config.server_name) { res->sock = sock_client_connect(config.server_name, config.tcp_port); if (res->sock < 0) { fprintf(stderr, "failed to establish TCP connection to server %s, port %d\n", config.server_name, config.tcp_port); return -1; } } else { fprintf(stdout, "waiting on port %d for TCP connection\n", config.tcp_port); res->sock = sock_daemon_connect(config.tcp_port); if (res->sock < 0) { fprintf(stderr, "failed to establish TCP connection with client on port %d\n", config.tcp_port); return -1; } } fprintf(stdout, "TCP connection was established\n"); fprintf(stdout, "searching for IB devices in host\n"); /* get device names in the system */ dev_list = ibv_get_device_list(&num_devices); if (!dev_list) { fprintf(stderr, "failed to get IB devices list\n"); return 1; } /* if there isn't any IB device in host */ if (!num_devices) { fprintf(stderr, "found %d device(s)\n", num_devices); return 1; } fprintf(stdout, "found %d device(s)\n", num_devices); /* search for the specific device we want to work with */ for (i = 0; i < num_devices; i ++) { if (!config.dev_name) { config.dev_name = strdup(ibv_get_device_name(dev_list[i])); fprintf(stdout, "device not specified, using first one found: %s\n", config.dev_name); } if (!strcmp(ibv_get_device_name(dev_list[i]), config.dev_name)) { ib_dev = dev_list[i]; break; } } /* if the device wasn't found in host */ if (!ib_dev) { fprintf(stderr, "IB device %s wasn't found\n", config.dev_name); return 1; } /* get device handle */ res->ib_ctx = ibv_open_device(ib_dev); if (!res->ib_ctx) { fprintf(stderr, "failed to open device %s\n", config.dev_name); return 1; } /* We are now done with device list, free it */ ibv_free_device_list(dev_list); dev_list = NULL; ib_dev = NULL; /* 
query port properties */ if (ibv_query_port(res->ib_ctx, config.ib_port, &res->port_attr)) { fprintf(stderr, "ibv_query_port on port %u failed\n", config.ib_port); return 1; } /* allocate Protection Domain */ res->pd = ibv_alloc_pd(res->ib_ctx); if (!res->pd) { fprintf(stderr, "ibv_alloc_pd failed\n"); return 1; } /* each side will send only one WR, so Completion Queue with 1 entry is enough */ cq_size = 1; res->cq = ibv_create_cq(res->ib_ctx, cq_size, NULL, NULL, 0); if (!res->cq) { fprintf(stderr, "failed to create CQ with %u entries\n", cq_size); return 1; } /* allocate the memory buffer that will hold the data */ size = MSG_SIZE; res->buf = malloc(size); if (!res->buf) { fprintf(stderr, "failed to malloc %Zu bytes to memory buffer\n", size); return 1; } /* only in the daemon side put the message in the memory buffer */ if (!config.server_name) { strcpy(res->buf, MSG); fprintf(stdout, "going to send the message: '%s'\n", res->buf); } else memset(res->buf, 0, size); /* register this memory buffer */ mr_flags = (config.server_name) ? IBV_ACCESS_LOCAL_WRITE : 0; res->mr = ibv_reg_mr(res->pd, res->buf, size, mr_flags); if (!res->mr) { fprintf(stderr, "ibv_reg_mr failed with mr_flags=0x%x\n", mr_flags); return 1; } fprintf(stdout, "MR was registered with addr=%p, lkey=0x%x, rkey=0x%x, flags=0x%x\n", res->buf, res->mr->lkey, res->mr->rkey, mr_flags); /* create the Queue Pair */ memset(&qp_init_attr, 0, sizeof(qp_init_attr)); qp_init_attr.qp_type = IBV_QPT_RC; qp_init_attr.sq_sig_all = 1; qp_init_attr.send_cq = res->cq; qp_init_attr.recv_cq = res->cq; qp_init_attr.cap.max_send_wr = 1; qp_init_attr.cap.max_recv_wr = 1; qp_init_attr.cap.max_send_sge = 1; qp_init_attr.cap.max_recv_sge = 1; res->qp = ibv_create_qp(res->pd, &qp_init_attr); if (!res->qp) { fprintf(stderr, "failed to create QP\n"); return 1; } fprintf(stdout, "QP was created, QP number=0x%x\n", res->qp->qp_num); return 0; }
/*
 * Entry point of the send bandwidth test: user_param.verb is set to SEND
 * and user_param.tst to BW before the command line is parsed.
 *
 * Overall flow, as visible below:
 *   1. parse arguments, find and open the device, validate link and MTU;
 *   2. establish the out-of-band connection with the peer and exchange
 *      versions / system data;
 *   3. create the verbs resources (through rdma_cm or ctx_init);
 *   4. exchange destination data with the peer and connect the QPs;
 *   5. run the test in one of three modes (RUN_ALL, RUN_REGULAR,
 *      RUN_INFINITELY) and print the bandwidth reports;
 *   6. close the connection, destroy resources and check the BW / message
 *      rate limits.
 *
 * Returns 0 on success.  Failures return 1, FAILURE or 17 depending on the
 * step that failed.
 * NOTE(review): the meaning of exit status 17 is not visible in this file
 * section.
 *
 * The ctx_hand_shake() calls are synchronisation points with the remote
 * side; their order relative to the post/run calls must not be changed.
 */
int main(int argc, char *argv[])
{
	struct ibv_device *ib_dev = NULL;
	struct pingpong_context ctx;
	struct pingpong_dest *my_dest = NULL;
	struct pingpong_dest *rem_dest = NULL;
	struct perftest_parameters user_param;
	struct perftest_comm user_comm;
	struct mcast_parameters mcg_params;
	struct bw_report_data my_bw_rep, rem_bw_rep;
	int ret_parser,i = 0;
	/* Upper bound (exclusive) of the message-size exponent swept in RUN_ALL mode. */
	int size_max_pow = 24;

	/* init default values to user's parameters */
	memset(&ctx, 0,sizeof(struct pingpong_context));
	memset(&user_param, 0 , sizeof(struct perftest_parameters));
	memset(&mcg_params, 0 , sizeof(struct mcast_parameters));
	memset(&user_comm, 0,sizeof(struct perftest_comm));

	user_param.verb = SEND;
	user_param.tst = BW;
	/*
	 * NOTE(review): strncpy() does not NUL-terminate when VERSION is at
	 * least as long as the destination; confirm VERSION always fits.
	 */
	strncpy(user_param.version, VERSION, sizeof(user_param.version));

	/* Configure the parameters values according to user arguments or default values. */
	ret_parser = parser(&user_param,argv,argc);
	if (ret_parser) {
		/* Version/help requests also exit with 1, but without the error message. */
		if (ret_parser != VERSION_EXIT && ret_parser != HELP_EXIT)
			fprintf(stderr," Parser function exited with Error\n");
		return 1;
	}

	/* DC or XRC in duplex mode doubles the number of QPs. */
	if((user_param.connection_type == DC || user_param.use_xrc) && user_param.duplex) {
		user_param.num_of_qps *= 2;
	}

	/* Checking that the user did not run with RawEth; the raw_ethernet_bw test exists for that. */
	if (user_param.connection_type == RawEth) {
		fprintf(stderr," This test cannot run Raw Ethernet QPs (you have chosen RawEth as connection type\n");
		fprintf(stderr," For this we have raw_ethernet_bw test in this package.\n");
		return FAILURE;
	}

	/* Finding the IB device selected (or default if none selected). */
	ib_dev = ctx_find_dev(user_param.ib_devname);
	if (!ib_dev) {
		fprintf(stderr," Unable to find the Infiniband/RoCE device\n");
		return 1;
	}

	/* For multicast, record the device name in the multicast parameters. */
	if (user_param.use_mcg)
		GET_STRING(mcg_params.ib_devname,ibv_get_device_name(ib_dev));

	/* Getting the relevant context from the device */
	ctx.context = ibv_open_device(ib_dev);
	if (!ctx.context) {
		fprintf(stderr, " Couldn't get context for the device\n");
		return 1;
	}

	/*
	 * See if MTU and link type are valid and supported.
	 * NOTE(review): the message printed on check_link() failure is the
	 * same "Couldn't get context" text used for ibv_open_device() above.
	 */
	if (check_link(ctx.context,&user_param)) {
		fprintf(stderr, " Couldn't get context for the device\n");
		return FAILURE;
	}

	/* copy the relevant user parameters to the comm struct + creating rdma_cm resources. */
	if (create_comm_struct(&user_comm,&user_param)) {
		fprintf(stderr," Unable to create RDMA_CM resources\n");
		return 1;
	}

	if (user_param.output == FULL_VERBOSITY && user_param.machine == SERVER) {
		printf("\n************************************\n");
		printf("* Waiting for client to connect... *\n");
		printf("************************************\n");
	}

	/* Initialize the connection and print the local data. */
	if (establish_connection(&user_comm)) {
		fprintf(stderr," Unable to init the socket connection\n");
		return FAILURE;
	}

	exchange_versions(&user_comm, &user_param);
	check_sys_data(&user_comm, &user_param);

	/*
	 * See if MTU and link type are valid and supported.
	 * NOTE(review): check_mtu() failure also reuses the "Couldn't get
	 * context" message.
	 */
	if (check_mtu(ctx.context,&user_param, &user_comm)) {
		fprintf(stderr, " Couldn't get context for the device\n");
		return FAILURE;
	}

	/* Print basic test information. */
	ctx_print_test_info(&user_param);

	/* One local and one remote destination record per QP, zero-initialised. */
	ALLOCATE(my_dest , struct pingpong_dest , user_param.num_of_qps);
	memset(my_dest, 0, sizeof(struct pingpong_dest)*user_param.num_of_qps);
	ALLOCATE(rem_dest , struct pingpong_dest , user_param.num_of_qps);
	memset(rem_dest, 0, sizeof(struct pingpong_dest)*user_param.num_of_qps);

	/* On iWARP transport, enable the receive-credit mechanism. */
	if (user_param.transport_type == IBV_TRANSPORT_IWARP)
		ctx.send_rcredit = 1;

	/* Allocating arrays needed for the test. */
	alloc_ctx(&ctx,&user_param);

	/* Create (if necessary) the rdma_cm ids and channel. */
	if (user_param.work_rdma_cm == ON) {

		if (user_param.machine == CLIENT) {
			if (retry_rdma_connect(&ctx,&user_param)) {
				fprintf(stderr,"Unable to perform rdma_client function\n");
				return FAILURE;
			}

		} else {
			if (create_rdma_resources(&ctx,&user_param)) {
				fprintf(stderr," Unable to create the rdma_resources\n");
				return FAILURE;
			}
			/* NOTE(review): this server-side failure prints the client-side message text. */
			if (rdma_server_connect(&ctx,&user_param)) {
				fprintf(stderr,"Unable to perform rdma_client function\n");
				return FAILURE;
			}
		}

	} else {

		/* create all the basic IB resources (data buffer, PD, MR, CQ and events channel) */
		if (ctx_init(&ctx,&user_param)) {
			fprintf(stderr, " Couldn't create IB resources\n");
			return FAILURE;
		}
	}

	/* Set up the Connection. */
	if (send_set_up_connection(&ctx,&user_param,my_dest,&mcg_params,&user_comm)) {
		fprintf(stderr," Unable to set up socket connection\n");
		return 1;
	}

	if (ctx.send_rcredit)
		ctx_alloc_credit(&ctx,&user_param,my_dest);

	/* Print the local destination data of every QP. */
	for (i=0; i < user_param.num_of_qps; i++)
		ctx_print_pingpong_data(&my_dest[i],&user_comm);

	user_comm.rdma_params->side = REMOTE;

	for (i=0; i < user_param.num_of_qps; i++) {

		/* shaking hands and gather the other side info. */
		if (ctx_hand_shake(&user_comm,&my_dest[i],&rem_dest[i])) {
			fprintf(stderr,"Failed to exchange data between server and clients\n");
			return 1;
		}

		ctx_print_pingpong_data(&rem_dest[i],&user_comm);
	}

	/* Without rdma_cm, verify that both sides use compatible GID types. */
	if (user_param.work_rdma_cm == OFF) {
		if (ctx_check_gid_compatibility(&my_dest[0], &rem_dest[0])) {
			fprintf(stderr,"\n Found Incompatibility issue with GID types.\n");
			fprintf(stderr," Please Try to use a different IP version.\n\n");
			return 1;
		}
	}

	/* If credit for available receive buffers is necessary,
	 * the credit sending is done via RDMA WRITE ops and the ctx_hand_shake above
	 * is used to exchange the rkeys and buf addresses for the RDMA WRITEs
	 */
	if (ctx.send_rcredit)
		ctx_set_credit_wqes(&ctx,&user_param,rem_dest);

	/* Joining the Send side port the Mcast gid */
	if (user_param.use_mcg && (user_param.machine == CLIENT || user_param.duplex)) {

		/* The multicast GID is taken from the first remote destination (16 raw bytes). */
		memcpy(mcg_params.mgid.raw, rem_dest[0].gid.raw, 16);
		if (set_mcast_group(&ctx,&user_param,&mcg_params)) {
			fprintf(stderr," Unable to Join Sender to Mcast gid\n");
			return 1;
		}

		/*
		 * The next stall in code (50 ms sleep) is a work around for fixing the
		 * the bug this test had in Multicast for the past 1 year.
		 * It appears, that when a switch involved, it takes ~ 10 ms for the join
		 * request to propagate on the IB fabric, thus we need to wait for it.
		 * what happened before this fix was client reaching the post_send
		 * code segment in about 350 ns from here, and the switch(es) dropped
		 * the packet because join request wasn't finished.
		 */
		usleep(50000);
	}

	if (user_param.work_rdma_cm == OFF) {

		/* Prepare IB resources for rtr/rts. */
		if (ctx_connect(&ctx,rem_dest,&user_param,my_dest)) {
			fprintf(stderr," Unable to Connect the HCA's through the link\n");
			return 1;
		}
	}

	/* shaking hands and gather the other side info. */
	if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) {
		fprintf(stderr,"Failed to exchange data between server and clients\n");
		return 1;
	}

	/* In event mode, arm both the send and the receive CQ for notifications. */
	if (user_param.use_event) {

		if (ibv_req_notify_cq(ctx.send_cq, 0)) {
			fprintf(stderr, " Couldn't request CQ notification\n");
			return 1;
		}

		if (ibv_req_notify_cq(ctx.recv_cq, 0)) {
			fprintf(stderr, " Couldn't request CQ notification\n");
			return 1;
		}
	}

	/* Print the results table header matching the chosen report format. */
	if (user_param.output == FULL_VERBOSITY) {
		if (user_param.report_per_port) {
			printf(RESULT_LINE_PER_PORT);
			printf((user_param.report_fmt == MBS ? RESULT_FMT_PER_PORT : RESULT_FMT_G_PER_PORT));
		}
		else {
			printf(RESULT_LINE);
			printf((user_param.report_fmt == MBS ? RESULT_FMT : RESULT_FMT_G));
		}
		printf((user_param.cpu_util_data.enable ? RESULT_EXT_CPU_UTIL : RESULT_EXT));
	}

	if (user_param.test_method == RUN_ALL) {

		/* For UD, the largest message size is derived from the current MTU. */
		if (user_param.connection_type == UD)
			size_max_pow = (int)UD_MSG_2_EXP(MTU_SIZE(user_param.curr_mtu)) + 1;

		/* Sweep message sizes 2^1 .. 2^(size_max_pow - 1). */
		for (i = 1; i < size_max_pow ; ++i) {

			user_param.size = (uint64_t)1 << i;

			if (user_param.machine == CLIENT || user_param.duplex)
				ctx_set_send_wqes(&ctx,&user_param,rem_dest);

			if (user_param.machine == SERVER || user_param.duplex) {
				if (ctx_set_recv_wqes(&ctx,&user_param)) {
					fprintf(stderr," Failed to post receive recv_wqes\n");
					return 1;
				}
			}

			/* Synchronise with the peer before starting this size. */
			if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) {
				fprintf(stderr,"Failed to exchange data between server and clients\n");
				return 1;
			}

			/* Reset the per-QP credit counters before each iteration. */
			if (ctx.send_rcredit) {
				int j;
				for (j = 0; j < user_param.num_of_qps; j++)
					ctx.credit_buf[j] = 0;
			}

			if (user_param.duplex) {
				if(run_iter_bi(&ctx,&user_param))
					return 17;

			} else if (user_param.machine == CLIENT) {
				if(run_iter_bw(&ctx,&user_param)) {
					return 17;
				}

			} else {
				if(run_iter_bw_server(&ctx,&user_param)) {
					return 17;
				}
			}

			print_report_bw(&user_param,&my_bw_rep);

			/* In duplex (non-duration) mode, exchange reports and print the combined result. */
			if (user_param.duplex && user_param.test_type != DURATION) {
				xchg_bw_reports(&user_comm, &my_bw_rep,&rem_bw_rep,atof(user_param.rem_version));
				print_full_bw_report(&user_param, &my_bw_rep, &rem_bw_rep);
			}

			/* Synchronise with the peer after finishing this size. */
			if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) {
				fprintf(stderr,"Failed to exchange data between server and clients\n");
				return 1;
			}

			/* Check if last iteration ended well in UC/UD */
			if (user_param.check_alive_exited) {
				break;
			}
		}

	} else if (user_param.test_method == RUN_REGULAR) {

		/* Single run at the configured message size. */
		if (user_param.machine == CLIENT || user_param.duplex)
			ctx_set_send_wqes(&ctx,&user_param,rem_dest);

		if (user_param.machine == SERVER || user_param.duplex) {
			if (ctx_set_recv_wqes(&ctx,&user_param)) {
				fprintf(stderr," Failed to post receive recv_wqes\n");
				return 1;
			}
		}

		if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) {
			fprintf(stderr,"Failed to exchange data between server and clients\n");
			return 1;
		}

		if (user_param.duplex) {
			if(run_iter_bi(&ctx,&user_param))
				return 17;

		} else if (user_param.machine == CLIENT) {
			if(run_iter_bw(&ctx,&user_param)) {
				return 17;
			}

		} else if(run_iter_bw_server(&ctx,&user_param)) {
			return 17;
		}

		print_report_bw(&user_param,&my_bw_rep);

		if (user_param.duplex && user_param.test_type != DURATION) {
			xchg_bw_reports(&user_comm, &my_bw_rep,&rem_bw_rep,atof(user_param.rem_version));
			print_full_bw_report(&user_param, &my_bw_rep, &rem_bw_rep);
		}

		/* Optionally print the local and the remote results as separate tables. */
		if (user_param.report_both && user_param.duplex) {
			printf(RESULT_LINE);
			printf("\n Local results: \n");
			printf(RESULT_LINE);
			printf((user_param.report_fmt == MBS ? RESULT_FMT : RESULT_FMT_G));
			printf((user_param.cpu_util_data.enable ? RESULT_EXT_CPU_UTIL : RESULT_EXT));
			print_full_bw_report(&user_param, &my_bw_rep, NULL);
			printf(RESULT_LINE);

			printf("\n Remote results: \n");
			printf(RESULT_LINE);
			printf((user_param.report_fmt == MBS ? RESULT_FMT : RESULT_FMT_G));
			printf((user_param.cpu_util_data.enable ? RESULT_EXT_CPU_UTIL : RESULT_EXT));
			print_full_bw_report(&user_param, &rem_bw_rep, NULL);
		}

	} else if (user_param.test_method == RUN_INFINITELY) {

		/* Infinite mode: the client only sends, the server only receives. */
		if (user_param.machine == CLIENT)
			ctx_set_send_wqes(&ctx,&user_param,rem_dest);

		else if (user_param.machine == SERVER) {

			if (ctx_set_recv_wqes(&ctx,&user_param)) {
				fprintf(stderr," Failed to post receive recv_wqes\n");
				return 1;
			}
		}

		if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) {
			fprintf(stderr,"Failed to exchange data between server and clients\n");
			return 1;
		}

		if (user_param.machine == CLIENT) {
			if(run_iter_bw_infinitely(&ctx,&user_param)) {
				fprintf(stderr," Error occured while running infinitely! aborting ...\n");
				return 1;
			}

		} else if (user_param.machine == SERVER) {
			if(run_iter_bw_infinitely_server(&ctx,&user_param)) {
				fprintf(stderr," Error occured while running infinitely on server! aborting ...\n");
				return 1;
			}
		}
	}

	/* Close the results table. */
	if (user_param.output == FULL_VERBOSITY) {
		if (user_param.report_per_port)
			printf(RESULT_LINE_PER_PORT);
		else
			printf(RESULT_LINE);
	}

	/* A failure to close the connection is reported but does not stop local cleanup. */
	if (ctx_close_connection(&user_comm,&my_dest[0],&rem_dest[0])) {
		fprintf(stderr," Failed to close connection between server and client\n");
		fprintf(stderr," Trying to close this side resources\n");
	}

	/* Destroy all test resources, including Mcast if exists */
	if (send_destroy_ctx(&ctx,&user_param,&mcg_params)) {
		fprintf(stderr,"Couldn't Destory all SEND resources\n");
		return FAILURE;
	}

	/* With rdma_cm, also destroy the resources held by the communication struct. */
	if (user_param.work_rdma_cm == ON) {

		user_comm.rdma_params->work_rdma_cm = ON;
		if (destroy_ctx(user_comm.rdma_ctx,user_comm.rdma_params)) {
			fprintf(stderr,"Failed to destroy resources\n");
			return 1;
		}
	}

	/* When a limit was requested, fail the run if BW or message rate fell below it. */
	if (!user_param.is_bw_limit_passed && (user_param.is_limit_bw == ON ) ) {
		fprintf(stderr,"Error: BW result is below bw limit\n");
		return 1;
	}

	/* NOTE(review): the message-rate check is gated on is_limit_bw, same as the BW check - confirm intended. */
	if (!user_param.is_msgrate_limit_passed && (user_param.is_limit_bw == ON )) {
		fprintf(stderr,"Error: Msg rate is below msg_rate limit\n");
		return 1;
	}

	return 0;
}
/*
 * rdma_backend_init - initialise @backend_dev on top of a host IB device.
 *
 * @backend_dev:         backend state to initialise (zeroed first)
 * @pdev:                stored in backend_dev->dev
 * @rdma_dev_res:        stored in backend_dev->rdma_dev_res
 * @backend_device_name: name of the host IB device to use; when NULL the
 *                       first device in the list is used
 * @port_num:            stored in backend_dev->port_num
 * @dev_attr:            passed to init_device_caps()
 * @mad_chr_be:          passed to mad_init()
 *
 * Opens the device, creates a completion channel, initialises the device
 * capabilities and MAD handling, and initialises the AH cache.
 *
 * Returns 0 on success, -ENXIO when no IB device exists, -EIO on any other
 * failure.  On failure the completion channel and device context created
 * here are released.  The device list is freed on every path that obtained
 * it, including the ibv_open_device() failure path.
 */
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr,
                      CharBackend *mad_chr_be)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        rdma_error_report("Failed to get IB devices list");
        return -EIO;
    }

    if (num_ibv_devices == 0) {
        rdma_error_report("No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        /* The list is NULL-terminated; i stops on the match or on the terminator. */
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            rdma_error_report("Failed to find IB device %s",
                              backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    rdma_info_report("uverb device %s", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        rdma_error_report("Failed to open IB device %s",
                          ibv_get_device_name(backend_dev->ib_dev));
        ret = -EIO;
        /* Nothing was opened, but the device list must still be released. */
        goto out_free_dev_list;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        rdma_error_report("Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        rdma_error_report("Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        rdma_error_report("Failed to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    /* Success: skip the error unwinding but still free the device list. */
    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

    return ret;
}
int main(int argc, char *argv[]) { struct ibv_pd *pd1, *pd2; struct ibv_comp_channel *comp_chan1, *comp_chan2; struct ibv_cq *cq1, *cq2; struct ibv_cq *evt_cq = NULL; struct ibv_mr *mr1, *mr2; struct ibv_qp_init_attr qp_attr1 = { }, qp_attr2 = {}; struct ibv_sge sge; struct ibv_send_wr send_wr = { }; struct ibv_send_wr *bad_send_wr = NULL; struct ibv_wc wc; struct ibv_qp *qp1, *qp2; void *cq_context = NULL; union ibv_gid gid1, gid2; int n; uint8_t *buf1, *buf2; int err; int num_devices; struct ibv_context * verbs1, *verbs2; struct ibv_device ** dev_list = ibv_get_device_list(&num_devices); struct ibv_device_attr dev_attr; int use = 0; int port = 1; int x = 0; unsigned long mb = 0; unsigned long bytes = 0; unsigned long save_diff = 0; struct timeval start, stop, diff; int iterations = 0; struct rusage usage; struct timeval ustart, uend; struct timeval sstart, send; struct timeval tstart, tend; DPRINTF("There are %d devices\n", num_devices); for(x = 0; x < num_devices; x++) { printf("Device: %d, %s\n", x, ibv_get_device_name(dev_list[use])); } if(num_devices == 0 || dev_list == NULL) { printf("No devices found\n"); return 1; } if(argc < 2) { printf("Which RDMA device to use? 0, 1, 2, 3...\n"); return 1; } use = atoi(argv[1]); DPRINTF("Using device %d\n", use); verbs1 = ibv_open_device(dev_list[use]); if(verbs1 == NULL) { printf("Failed to open device!\n"); return 1; } DPRINTF("Device open %s\n", ibv_get_device_name(dev_list[use])); verbs2 = ibv_open_device(dev_list[use]); if(verbs2 == NULL) { printf("Failed to open device again!\n"); return 1; } if(ibv_query_device(verbs1, &dev_attr)) { printf("Failed to query device attributes.\n"); return 1; } printf("Device open: %d, %s which has %d ports\n", x, ibv_get_device_name(dev_list[use]), dev_attr.phys_port_cnt); if(argc < 3) { printf("Which port on the device to use? 
1, 2, 3...\n"); return 1; } port = atoi(argv[2]); if(port <= 0) { printf("Port #%d invalid, must start with 1, 2, 3, ...\n", port); return 1; } printf("Using port %d\n", port); if(argc < 4) { printf("How many iterations to perform?\n"); return 1; } iterations = atoi(argv[3]); printf("Will perform %d iterations\n", iterations); pd1 = ibv_alloc_pd(verbs1); if (!pd1) return 1; if(argc < 5) { printf("How many megabytes to allocate? (This will be allocated twice. Once for source, once for destination.)\n"); return 1; } mb = atoi(argv[4]); if(mb <= 0) { printf("Megabytes %lu invalid\n", mb); return 1; } DPRINTF("protection domain1 allocated\n"); pd2 = ibv_alloc_pd(verbs2); if (!pd2) return 1; DPRINTF("protection domain2 allocated\n"); comp_chan1 = ibv_create_comp_channel(verbs1); if (!comp_chan1) return 1; DPRINTF("completion chan1 created\n"); comp_chan2 = ibv_create_comp_channel(verbs2); if (!comp_chan2) return 1; DPRINTF("completion chan2 created\n"); cq1 = ibv_create_cq(verbs1, 2, NULL, comp_chan1, 0); if (!cq1) return 1; DPRINTF("CQ1 created\n"); cq2 = ibv_create_cq(verbs2, 2, NULL, comp_chan2, 0); if (!cq2) return 1; DPRINTF("CQ2 created\n"); bytes = mb * 1024UL * 1024UL; buf1 = malloc(bytes); if (!buf1) return 1; buf2 = malloc(bytes); if (!buf2) return 1; printf("Populating %lu MB memory.\n", mb * 2); for(x = 0; x < bytes; x++) { buf1[x] = 123; } buf1[bytes - 1] = 123; mr1 = ibv_reg_mr(pd1, buf1, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr1) { printf("Failed to register memory.\n"); return 1; } mr2 = ibv_reg_mr(pd2, buf2, bytes, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ); if (!mr2) { printf("Failed to register memory.\n"); return 1; } DPRINTF("memory registered.\n"); qp_attr1.cap.max_send_wr = 10; qp_attr1.cap.max_send_sge = 10; qp_attr1.cap.max_recv_wr = 10; qp_attr1.cap.max_recv_sge = 10; qp_attr1.sq_sig_all = 1; qp_attr1.send_cq = cq1; qp_attr1.recv_cq = cq1; qp_attr1.qp_type = 
IBV_QPT_RC; qp1 = ibv_create_qp(pd1, &qp_attr1); if (!qp1) { printf("failed to create queue pair #1\n"); return 1; } DPRINTF("queue pair1 created\n"); qp_attr2.cap.max_send_wr = 10; qp_attr2.cap.max_send_sge = 10; qp_attr2.cap.max_recv_wr = 10; qp_attr2.cap.max_recv_sge = 10; qp_attr2.sq_sig_all = 1; qp_attr2.send_cq = cq2; qp_attr2.recv_cq = cq2; qp_attr2.qp_type = IBV_QPT_RC; qp2 = ibv_create_qp(pd2, &qp_attr2); if (!qp2) { printf("failed to create queue pair #2\n"); return 1; } DPRINTF("queue pair2 created\n"); struct ibv_qp_attr attr1 = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE, }; if(ibv_modify_qp(qp1, &attr1, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { printf("verbs 1 Failed to go to init\n"); return 1; } DPRINTF("verbs1 to init\n"); struct ibv_qp_attr attr2 = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE, }; if(ibv_modify_qp(qp2, &attr2, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { printf("verbs 2 Failed to go to init\n"); return 1; } DPRINTF("verbs2 to init\n"); //struct ibv_gid gid1, gid2; struct ibv_port_attr port1, port2; uint64_t psn1 = lrand48() & 0xffffff; uint64_t psn2 = lrand48() & 0xffffff; if(ibv_query_port(verbs1, port, &port1)) return 1; DPRINTF("got port1 information\n"); if(ibv_query_port(verbs2, port, &port2)) return 1; DPRINTF("got port2 information\n"); if(ibv_query_gid(verbs1, 1, 0, &gid1)) return 1; DPRINTF("got gid1 information\n"); if(ibv_query_gid(verbs2, 1, 0, &gid2)) return 1; DPRINTF("got gid2 information\n"); struct ibv_qp_attr next2 = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_1024, .dest_qp_num = qp2->qp_num, .rq_psn = psn2, .max_dest_rd_atomic = 5, .min_rnr_timer = 12, .ah_attr = { .is_global = 0, .dlid = port2.lid, .sl = 0, .src_path_bits = 0, 
.port_num = port, } }; if(gid2.global.interface_id) { next2.ah_attr.is_global = 1; next2.ah_attr.grh.hop_limit = 1; next2.ah_attr.grh.dgid = gid2; next2.ah_attr.grh.sgid_index = 0; } struct ibv_qp_attr next1 = { .qp_state = IBV_QPS_RTR, .path_mtu = IBV_MTU_1024, .dest_qp_num = qp1->qp_num, .rq_psn = psn1, .max_dest_rd_atomic = 1, .min_rnr_timer = 12, .ah_attr = { .is_global = 0, .dlid = port1.lid, .sl = 0, .src_path_bits = 0, .port_num = port, } }; if(gid1.global.interface_id) { next1.ah_attr.is_global = 1; next1.ah_attr.grh.hop_limit = 1; next1.ah_attr.grh.dgid = gid1; next1.ah_attr.grh.sgid_index = 0; } if(ibv_modify_qp(qp2, &next1, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { printf("Failed to modify verbs2 to ready\n"); return 1; } DPRINTF("verbs2 RTR\n"); if(ibv_modify_qp(qp1, &next2, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) { printf("Failed to modify verbs1 to ready\n"); return 1; } DPRINTF("verbs1 RTR\n"); next2.qp_state = IBV_QPS_RTS; next2.timeout = 14; next2.retry_cnt = 7; next2.rnr_retry = 7; next2.sq_psn = psn1; next2.max_rd_atomic = 1; if(ibv_modify_qp(qp1, &next2, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { printf("Failed again to modify verbs1 to ready\n"); return 1; } DPRINTF("verbs1 RTS\n"); next1.qp_state = IBV_QPS_RTS; next1.timeout = 14; next1.retry_cnt = 7; next1.rnr_retry = 7; next1.sq_psn = psn2; next1.max_rd_atomic = 1; if(ibv_modify_qp(qp2, &next1, IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC)) { printf("Failed again to modify verbs2 to ready\n"); return 1; } DPRINTF("verbs2 RTS\n"); printf("Performing RDMA first.\n"); iterations = atoi(argv[3]); getrusage(RUSAGE_SELF, &usage); ustart = usage.ru_utime; sstart = usage.ru_stime; 
gettimeofday(&tstart, NULL); while(iterations-- > 0) { sge.addr = (uintptr_t) buf1; sge.length = bytes; sge.lkey = mr1->lkey; send_wr.wr_id = 1; send_wr.opcode = IBV_WR_RDMA_WRITE; send_wr.sg_list = &sge; send_wr.num_sge = 1; send_wr.send_flags = IBV_SEND_SIGNALED; send_wr.wr.rdma.rkey = mr2->rkey; send_wr.wr.rdma.remote_addr = (uint64_t) buf2; DPRINTF("Iterations left: %d\n", iterations); if (ibv_req_notify_cq(cq1, 0)) return 1; DPRINTF("Submitting local RDMA\n"); gettimeofday(&start, NULL); if (ibv_post_send(qp1, &send_wr, &bad_send_wr)) return 1; DPRINTF("RDMA posted %p %p\n", &send_wr, bad_send_wr); DPRINTF("blocking...\n"); if(ibv_get_cq_event(comp_chan1, &evt_cq, &cq_context)) { printf("failed to get CQ event\n"); return 1; } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); DPRINTF("RDMA took: %lu us\n", diff.tv_usec); ibv_ack_cq_events(evt_cq, 1); DPRINTF("got event\n"); n = ibv_poll_cq(cq1, 1, &wc); if (n > 0) { DPRINTF("return from poll: %lu\n", wc.wr_id); if (wc.status != IBV_WC_SUCCESS) { printf("poll failed %s\n", ibv_wc_status_str(wc.status)); return 1; } if (wc.wr_id == 1) { DPRINTF("Finished %d bytes %d %d\n", n, buf1[bytes - 1], buf2[bytes - 1]); } else { printf("didn't find completion\n"); } } if (n < 0) { printf("poll returned error\n"); return 1; } DPRINTF("Poll returned %d bytes %d %d\n", n, buf1[0], buf2[0]); } gettimeofday(&tend, NULL); getrusage(RUSAGE_SELF, &usage); uend = usage.ru_utime; send = usage.ru_stime; save_diff = 0; timersub(&uend, &ustart, &diff); save_diff += diff.tv_usec; printf("User CPU time: %lu us\n", diff.tv_usec); timersub(&send, &sstart, &diff); save_diff += diff.tv_usec; printf("System CPU time: %lu us\n", diff.tv_usec); timersub(&tend, &tstart, &diff); printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff); printf("Wall clock CPU time: %lu us\n", diff.tv_usec); iterations = atoi(argv[3]); printf("Now using the CPU instead....\n"); getrusage(RUSAGE_SELF, &usage); ustart = usage.ru_utime; sstart = 
usage.ru_stime; gettimeofday(&tstart, NULL); while(iterations-- > 0) { DPRINTF("Repeating without RDMA...\n"); gettimeofday(&start, NULL); memcpy(buf2, buf1, bytes); gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); DPRINTF("Regular copy too took: %lu us\n", diff.tv_usec); } gettimeofday(&tend, NULL); getrusage(RUSAGE_SELF, &usage); uend = usage.ru_utime; send = usage.ru_stime; save_diff = 0; timersub(&uend, &ustart, &diff); save_diff += diff.tv_usec; printf("User CPU time: %lu us\n", diff.tv_usec); timersub(&send, &sstart, &diff); save_diff += diff.tv_usec; printf("System CPU time: %lu us\n", diff.tv_usec); timersub(&tend, &tstart, &diff); printf("Sleeping time: %lu us\n", diff.tv_usec - save_diff); printf("Wall clock CPU time: %lu us\n", diff.tv_usec); return 0; }
/* ////////////////////////////////////////////////////////////////////////// */
/*
 * segment_create - create a memory segment backed by a verbs memory region.
 *
 * @ds_buf:    segment descriptor to fill in; reset with shmem_ds_reset()
 *             before anything else is done
 * @file_name: not referenced anywhere in this function
 * @size:      number of bytes to register
 *
 * The function zeroes the file-level memheap_device state, selects an IB
 * device (the one named by mca_sshmem_verbs_component.hca_name, or the
 * first in the list when no name is set), opens it, queries its attributes,
 * allocates a protection domain and registers a memory region with
 * ibv_exp_reg_mr().  When MPAGE_ENABLE > 0 and has_shared_mr is set, a
 * second, shared registration is made with ibv_exp_reg_shared_mr().
 *
 * Returns OSHMEM_SUCCESS on success, otherwise one of
 * OSHMEM_ERR_NOT_SUPPORTED, OSHMEM_ERR_NOT_FOUND, OSHMEM_ERR_RESOURCE_BUSY
 * or OSHMEM_ERR_OUT_OF_RESOURCE.
 *
 * NOTE(review): the early returns below do not release the device list,
 * the device context or the PD.  They are stored in memheap_device, so
 * presumably another routine releases them - confirm.
 */
static int
segment_create(map_segment_t *ds_buf,
               const char *file_name,
               size_t size)
{
    int rc = OSHMEM_SUCCESS;
    openib_device_t *device = &memheap_device;
    int num_devs = 0;
    int i = 0;

    assert(ds_buf);

    /* init the contents of map_segment_t */
    shmem_ds_reset(ds_buf);

    memset(device, 0, sizeof(*device));

#ifdef HAVE_IBV_GET_DEVICE_LIST
    device->ib_devs = ibv_get_device_list(&num_devs);
#else
    #error unsupported ibv_get_device_list in infiniband/verbs.h
#endif

    if (num_devs == 0 || !device->ib_devs) {
        return OSHMEM_ERR_NOT_SUPPORTED;
    }

    /* Open device: pick the configured HCA by name, or the first one listed */
    if (NULL != mca_sshmem_verbs_component.hca_name) {
        for (i = 0; i < num_devs; i++) {
            if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) {
                device->ib_dev = device->ib_devs[i];
                break;
            }
        }
    } else {
        device->ib_dev = device->ib_devs[0];
    }

    /* ib_dev is still NULL (from the memset above) when no name matched */
    if (NULL == device->ib_dev) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error getting device says %d: %s",
            errno, strerror(errno))
            );
        return OSHMEM_ERR_NOT_FOUND;
    }

    if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error obtaining device context for %s errno says %d: %s",
            ibv_get_device_name(device->ib_dev), errno, strerror(errno))
            );
        return OSHMEM_ERR_RESOURCE_BUSY;
    }

    /* Obtain device attributes */
    if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error obtaining device attributes for %s errno says %d: %s",
            ibv_get_device_name(device->ib_dev), errno, strerror(errno))
            );
        return OSHMEM_ERR_RESOURCE_BUSY;
    }

    /* Allocate the protection domain for the device */
    device->ib_pd = ibv_alloc_pd(device->ib_dev_context);
    if (NULL == device->ib_pd) {
        OPAL_OUTPUT_VERBOSE(
            (5, oshmem_sshmem_base_framework.framework_output,
            "error allocating protection domain for %s errno says %d: %s",
            ibv_get_device_name(device->ib_dev), errno, strerror(errno))
            );
        return OSHMEM_ERR_RESOURCE_BUSY;
    }

    /*
     * Allocate memory.
     * rc has not been modified since its initialisation, so this guard is
     * taken whenever OSHMEM_SUCCESS is 0 (assumed - TODO confirm).
     */
    if (!rc) {
        void *addr = NULL;
        struct ibv_mr *ib_mr = NULL;
        uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ;
        uint64_t exp_access_flag = 0;

        /* Array that collects every memory region registered below */
        OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t);
        opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *));

#if (MPAGE_ENABLE > 0)
        exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR  |
                          IBV_EXP_ACCESS_SHARED_MR_USER_READ |
                          IBV_EXP_ACCESS_SHARED_MR_USER_WRITE;
#endif /* MPAGE_ENABLE */

        /* addr is NULL here; the request combines the base and experimental access flags */
        struct ibv_exp_reg_mr_in in = {device->ib_pd, addr, size, access_flag|exp_access_flag, 0};

#if MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS
        /*
         * Without shared MR support, register at the fixed base address with
         * the CONTIG create flag and only the base access flags.
         */
        if (0 == mca_sshmem_verbs_component.has_shared_mr) {
            in.addr = (void *)mca_sshmem_base_start_address;
            in.comp_mask    = IBV_EXP_REG_MR_CREATE_FLAGS;
            in.create_flags = IBV_EXP_REG_MR_CREATE_CONTIG;
            in.exp_access   = access_flag;
        }
#endif
        ib_mr = ibv_exp_reg_mr(&in);
        if (NULL == ib_mr) {
            OPAL_OUTPUT_VERBOSE(
                (5, oshmem_sshmem_base_framework.framework_output,
                    "error to ibv_exp_reg_mr() %llu bytes errno says %d: %s",
                    (unsigned long long)size, errno, strerror(errno))
                );
            rc = OSHMEM_ERR_OUT_OF_RESOURCE;
        } else {
            device->ib_mr_shared = ib_mr;
            opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
        }

#if (MPAGE_ENABLE > 0)
        if (!rc && mca_sshmem_verbs_component.has_shared_mr) {
            /* This addr shadows the outer addr declared at the top of the enclosing block */
            void *addr = NULL;
            access_flag = IBV_ACCESS_LOCAL_WRITE |
                          IBV_ACCESS_REMOTE_WRITE |
                          IBV_ACCESS_REMOTE_READ|
                          IBV_EXP_ACCESS_NO_RDMA;

            addr = (void *)mca_sshmem_base_start_address;
            struct ibv_exp_reg_shared_mr_in in;
            /* Register the shared MR by the handle of the region created above */
            mca_sshmem_verbs_fill_shared_mr(&in, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag);
            /* From here on ib_mr refers to the shared registration */
            ib_mr = ibv_exp_reg_shared_mr(&in);
            if (NULL == ib_mr) {
                OPAL_OUTPUT_VERBOSE(
                    (5, oshmem_sshmem_base_framework.framework_output,
                        "error to ibv_reg_shared_mr() %llu bytes errno says %d: %s has_shared_mr: %d",
                        (unsigned long long)size, errno, strerror(errno),
                        mca_sshmem_verbs_component.has_shared_mr
                        )
                    );
                rc = OSHMEM_ERR_OUT_OF_RESOURCE;
            } else {
                opal_value_array_append_item(&device->ib_mr_array, &ib_mr);
            }
        }
#endif /* MPAGE_ENABLE */

        /* On success, publish the segment type, id, base address and size in ds_buf */
        if (!rc) {
            OPAL_OUTPUT_VERBOSE(
                (70, oshmem_sshmem_base_framework.framework_output,
                "ibv device %s shared_mr: %d",
                ibv_get_device_name(device->ib_dev),
                mca_sshmem_verbs_component.has_shared_mr)
                );

            if (mca_sshmem_verbs_component.has_shared_mr) {
                assert(size == device->ib_mr_shared->length);
                ds_buf->type = MAP_SEGMENT_ALLOC_IBV;
                ds_buf->seg_id = device->ib_mr_shared->handle;
            } else {
                ds_buf->type = MAP_SEGMENT_ALLOC_IBV_NOSHMR;
                ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
            }
            /* The base address comes from the most recently registered region */
            ds_buf->super.va_base = ib_mr->addr;
            ds_buf->seg_size = size;
            ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size);
        }
    }

    /* Log the outcome; on failure ds_buf still holds its reset values */
    OPAL_OUTPUT_VERBOSE(
          (70, oshmem_sshmem_base_framework.framework_output,
           "%s: %s: create %s "
           "(id: %d, addr: %p size: %lu, name: %s)\n",
           mca_sshmem_verbs_component.super.base_version.mca_type_name,
           mca_sshmem_verbs_component.super.base_version.mca_component_name,
           (rc ? "failure" : "successful"),
           ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
      );

    return rc;
}
struct pingpong_context *pp_init_ctx(struct ibv_device *ib_dev, int size, int rx_depth, int port, int use_event, enum pp_wr_calc_op calc_op, enum pp_wr_data_type calc_data_type, char *calc_operands_str) { struct pingpong_context *ctx; int rc; ctx = malloc(sizeof *ctx); if (!ctx) return NULL; memset(ctx, 0, sizeof *ctx); ctx->size = size; ctx->rx_depth = rx_depth; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; ctx->buf = memalign(page_size, size); if (!ctx->buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_ctx; } memset(ctx->buf, 0, size); ctx->net_buf = memalign(page_size, size); if (!ctx->net_buf) { fprintf(stderr, "Couldn't allocate work buf.\n"); goto clean_buffer; } memset(ctx->net_buf, 0, size); ctx->context = ibv_open_device(ib_dev); if (!ctx->context) { fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(ib_dev)); goto clean_net_buf; } if (use_event) { ctx->channel = ibv_create_comp_channel(ctx->context); if (!ctx->channel) { fprintf(stderr, "Couldn't create completion channel\n"); goto clean_device; } } else ctx->channel = NULL; ctx->pd = ibv_alloc_pd(ctx->context); if (!ctx->pd) { fprintf(stderr, "Couldn't allocate PD\n"); goto clean_comp_channel; } ctx->mr = ibv_reg_mr(ctx->pd, ctx->net_buf, size, IBV_ACCESS_LOCAL_WRITE); if (!ctx->mr) { fprintf(stderr, "Couldn't register MR\n"); goto clean_pd; } if (calc_op != PP_CALC_INVALID) { int op_per_gather, num_op, max_num_op; ctx->calc_op.opcode = IBV_EXP_CALC_OP_NUMBER; ctx->calc_op.data_type = IBV_EXP_CALC_DATA_TYPE_NUMBER; ctx->calc_op.data_size = IBV_EXP_CALC_DATA_SIZE_NUMBER; num_op = pp_parse_calc_to_gather(calc_operands_str, calc_op, calc_data_type, &ctx->calc_op, ctx->context, ctx->buf, ctx->net_buf); if (num_op < 0) { fprintf(stderr, "-E- failed parsing calc operators\n"); goto clean_mr; } rc = pp_query_calc_cap(ctx->context, ctx->calc_op.opcode, 
ctx->calc_op.data_type, ctx->calc_op.data_size, &op_per_gather, &max_num_op); if (rc) { fprintf(stderr, "-E- operation not supported on %s. valid ops are:\n", ibv_get_device_name(ib_dev)); pp_print_dev_calc_ops(ctx->context); goto clean_mr; } if (pp_prepare_sg_list(op_per_gather, num_op, ctx->mr->lkey, &ctx->calc_op, ctx->net_buf)) { fprintf(stderr, "-failed to prepare the sg list\n"); goto clean_mr; } } ctx->cq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->cq) { fprintf(stderr, "Couldn't create CQ\n"); goto clean_mr; } { struct ibv_exp_qp_init_attr attr = { .send_cq = ctx->cq, .recv_cq = ctx->cq, .cap = { .max_send_wr = 16, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; attr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->qp = ibv_exp_create_qp(ctx->context, &attr); if (!ctx->qp) { fprintf(stderr, "Couldn't create QP\n"); goto clean_cq; } } { struct ibv_qp_attr attr = { .qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->qp, &attr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify QP to INIT\n"); goto clean_qp; } } ctx->mcq = ibv_create_cq(ctx->context, rx_depth + 1, NULL, ctx->channel, 0); if (!ctx->mcq) { fprintf(stderr, "Couldn't create CQ for MQP\n"); goto clean_qp; } { struct ibv_exp_qp_init_attr mattr = { .send_cq = ctx->mcq, .recv_cq = ctx->mcq, .cap = { .max_send_wr = 1, .max_recv_wr = rx_depth, .max_send_sge = 16, .max_recv_sge = 16 }, .qp_type = IBV_QPT_RC, .pd = ctx->pd }; mattr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | IBV_EXP_QP_INIT_ATTR_PD; mattr.exp_create_flags = IBV_EXP_QP_CREATE_CROSS_CHANNEL; ctx->mqp = ibv_exp_create_qp(ctx->context, &mattr); if (!ctx->qp) { fprintf(stderr, "Couldn't create MQP\n"); goto clean_mcq; } } { struct ibv_qp_attr mattr = { 
.qp_state = IBV_QPS_INIT, .pkey_index = 0, .port_num = port, .qp_access_flags = 0 }; if (ibv_modify_qp(ctx->mqp, &mattr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) { fprintf(stderr, "Failed to modify MQP to INIT\n"); goto clean_mqp; } } return ctx; clean_mqp: ibv_destroy_qp(ctx->mqp); clean_mcq: ibv_destroy_cq(ctx->mcq); clean_qp: ibv_destroy_qp(ctx->qp); clean_cq: ibv_destroy_cq(ctx->cq); clean_mr: ibv_dereg_mr(ctx->mr); clean_pd: ibv_dealloc_pd(ctx->pd); clean_comp_channel: if (ctx->channel) ibv_destroy_comp_channel(ctx->channel); clean_device: ibv_close_device(ctx->context); clean_net_buf: free(ctx->net_buf); clean_buffer: free(ctx->buf); clean_ctx: free(ctx); return NULL; } int pp_close_ctx(struct pingpong_context *ctx) { if (ibv_destroy_qp(ctx->qp)) { fprintf(stderr, "Couldn't destroy QP\n"); return 1; } if (ibv_destroy_qp(ctx->mqp)) { fprintf(stderr, "Couldn't destroy MQP\n"); return 1; } if (ibv_destroy_cq(ctx->cq)) { fprintf(stderr, "Couldn't destroy CQ\n"); return 1; } if (ibv_destroy_cq(ctx->mcq)) { fprintf(stderr, "Couldn't destroy MCQ\n"); return 1; } if (ibv_dereg_mr(ctx->mr)) { fprintf(stderr, "Couldn't deregister MR\n"); return 1; } if (ibv_dealloc_pd(ctx->pd)) { fprintf(stderr, "Couldn't deallocate PD\n"); return 1; } if (ctx->channel) { if (ibv_destroy_comp_channel(ctx->channel)) { fprintf(stderr, "Couldn't destroy completion channel\n"); return 1; } } if (ibv_close_device(ctx->context)) { fprintf(stderr, "Couldn't release context\n"); return 1; } free(ctx->buf); free(ctx->net_buf); free(ctx); return 0; } static int pp_post_recv(struct pingpong_context *ctx, int n) { int rc; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_recv_wr wr = { .wr_id = PP_RECV_WRID, .sg_list = &list, .num_sge = 1, }; struct ibv_recv_wr *bad_wr; int i; for (i = 0; i < n; ++i) { rc = ibv_post_recv(ctx->qp, &wr, &bad_wr); if (rc) return rc; } return i; } static int 
pp_post_send(struct pingpong_context *ctx) { int ret; struct ibv_sge list = { .addr = (uintptr_t) ctx->net_buf, .length = ctx->size, .lkey = ctx->mr->lkey }; struct ibv_exp_send_wr wr = { .wr_id = PP_SEND_WRID, .sg_list = &list, .num_sge = 1, .exp_opcode = IBV_EXP_WR_SEND, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; /* If this is a calc operation - set the required params in the wr */ if (ctx->calc_op.opcode != IBV_EXP_CALC_OP_NUMBER) { wr.exp_opcode = IBV_EXP_WR_SEND; wr.exp_send_flags |= IBV_EXP_SEND_WITH_CALC; wr.sg_list = ctx->calc_op.gather_list; wr.num_sge = ctx->calc_op.gather_list_size; wr.op.calc.calc_op = ctx->calc_op.opcode; wr.op.calc.data_type = ctx->calc_op.data_type; wr.op.calc.data_size = ctx->calc_op.data_size; } ret = ibv_exp_post_send(ctx->qp, &wr, &bad_wr); return ret; } int pp_post_ext_wqe(struct pingpong_context *ctx, enum ibv_exp_wr_opcode op) { int ret; struct ibv_exp_send_wr wr = { .wr_id = PP_CQE_WAIT, .sg_list = NULL, .num_sge = 0, .exp_opcode = op, .exp_send_flags = IBV_EXP_SEND_SIGNALED, }; struct ibv_exp_send_wr *bad_wr; switch (op) { case IBV_EXP_WR_RECV_ENABLE: case IBV_EXP_WR_SEND_ENABLE: wr.task.wqe_enable.qp = ctx->qp; wr.task.wqe_enable.wqe_count = 0; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; case IBV_EXP_WR_CQE_WAIT: wr.task.cqe_wait.cq = ctx->cq; wr.task.cqe_wait.cq_count = 1; wr.exp_send_flags |= IBV_EXP_SEND_WAIT_EN_LAST; break; default: fprintf(stderr, "-E- unsupported m_wqe opcode %d\n", op); return -1; } ret = ibv_exp_post_send(ctx->mqp, &wr, &bad_wr); return ret; } int pp_poll_mcq(struct ibv_cq *cq, int num_cqe) { int ne; int i; struct ibv_wc wc[2]; if (num_cqe > 2) { fprintf(stderr, "-E- max num cqe exceeded\n"); return -1; } do { ne = ibv_poll_cq(cq, num_cqe, wc); if (ne < 0) { fprintf(stderr, "poll CQ failed %d\n", ne); return 1; } } while (ne < 1); for (i = 0; i < ne; ++i) { if (wc[i].status != IBV_WC_SUCCESS) { fprintf(stderr, "Failed %s status %s (%d)\n", 
wr_id_str[(int)wc[i].wr_id], ibv_wc_status_str(wc[i].status), wc[i].status); return 1; } if ((int) wc[i].wr_id != PP_CQE_WAIT) { fprintf(stderr, "invalid wr_id %" PRIx64 "\n", wc[i].wr_id); return -1; } } return 0; } static int pp_calc_verify(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { uint64_t *op1 = &(ctx->last_result); uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t *res = (uint64_t *)ctx->buf; return !EXEC_VERIFY(calc_data_type, calc_opcode, 1, op1, op2, res); } static int pp_update_last_result(struct pingpong_context *ctx, enum pp_wr_data_type calc_data_type, enum pp_wr_calc_op calc_opcode) { /* EXEC_VERIFY derefence result parameter */ uint64_t *dummy; uint64_t *op1 = (uint64_t *)ctx->buf; uint64_t *op2 = (uint64_t *)ctx->buf + 2; uint64_t res = (uint64_t)EXEC_VERIFY(calc_data_type, calc_opcode, 0, op1, op2, dummy); ctx->last_result = res; return 0; } static void usage(const char *argv0) { printf("Usage:\n"); printf(" %s start a server and wait for connection\n", argv0); printf(" %s <host> connect to server at <host>\n", argv0); printf("\n"); printf("Options:\n"); printf(" -p, --port=<port> listen on/connect to port <port> (default 18515)\n"); printf(" -d, --ib-dev=<dev> use IB device <dev> (default first device found)\n"); printf(" -i, --ib-port=<port> use port <port> of IB device (default 1)\n"); printf(" -s, --size=<size> size of message to exchange (default 4096 minimum 16)\n"); printf(" -m, --mtu=<size> path MTU (default 1024)\n"); printf(" -r, --rx-depth=<dep> number of receives to post at a time (default 500)\n"); printf(" -n, --iters=<iters> number of exchanges (default 1000)\n"); printf(" -l, --sl=<sl> service level value\n"); printf(" -e, --events sleep on CQ events (default poll)\n"); printf(" -c, --calc=<operation> calc operation\n"); printf(" -t, --op_type=<type> calc operands type\n"); printf(" -o, --operands=<o1,o2,...> comma separated list of operands\n"); printf(" -w, --wait_cq=cqn 
wait for entries on cq\n"); printf(" -v, --verbose print verbose information\n"); printf(" -V, --verify verify calc operations\n"); }
/* ////////////////////////////////////////////////////////////////////////// */ static int verbs_runtime_query(mca_base_module_t **module, int *priority, const char *hint) { int rc = OSHMEM_SUCCESS; openib_device_t my_device; openib_device_t *device = &my_device; int num_devs = 0; int i = 0; *priority = 0; *module = NULL; memset(device, 0, sizeof(*device)); #ifdef HAVE_IBV_GET_DEVICE_LIST device->ib_devs = ibv_get_device_list(&num_devs); #else #error unsupported ibv_get_device_list in infiniband/verbs.h #endif if (num_devs == 0 || !device->ib_devs) { return OSHMEM_ERR_NOT_SUPPORTED; } /* Open device */ if (NULL != mca_sshmem_verbs_component.hca_name) { for (i = 0; i < num_devs; i++) { if (0 == strcmp(mca_sshmem_verbs_component.hca_name, ibv_get_device_name(device->ib_devs[i]))) { device->ib_dev = device->ib_devs[i]; break; } } } else { device->ib_dev = device->ib_devs[0]; } if (NULL == device->ib_dev) { rc = OSHMEM_ERR_NOT_FOUND; goto out; } if (NULL == (device->ib_dev_context = ibv_open_device(device->ib_dev))) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Obtain device attributes */ if (ibv_query_device(device->ib_dev_context, &device->ib_dev_attr)) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Allocate the protection domain for the device */ device->ib_pd = ibv_alloc_pd(device->ib_dev_context); if (NULL == device->ib_pd) { rc = OSHMEM_ERR_RESOURCE_BUSY; goto out; } /* Allocate memory */ if (!rc) { void *addr = NULL; size_t size = getpagesize(); struct ibv_mr *ib_mr = NULL; uint64_t access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; uint64_t exp_access_flag = 0; OBJ_CONSTRUCT(&device->ib_mr_array, opal_value_array_t); opal_value_array_init(&device->ib_mr_array, sizeof(struct ibv_mr *)); #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) exp_access_flag = IBV_EXP_ACCESS_ALLOCATE_MR | IBV_EXP_ACCESS_SHARED_MR_USER_READ | IBV_EXP_ACCESS_SHARED_MR_USER_WRITE; #endif /* MPAGE_ENABLE */ struct ibv_exp_reg_mr_in in = 
{device->ib_pd, addr, size, access_flag|exp_access_flag, 0}; ib_mr = ibv_exp_reg_mr(&in); if (NULL == ib_mr) { rc = OSHMEM_ERR_OUT_OF_RESOURCE; } else { device->ib_mr_shared = ib_mr; opal_value_array_append_item(&device->ib_mr_array, &ib_mr); } #if defined(MPAGE_ENABLE) && (MPAGE_ENABLE > 0) if (!rc) { struct ibv_exp_reg_shared_mr_in in_smr; access_flag = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ| IBV_EXP_ACCESS_NO_RDMA; addr = (void *)mca_sshmem_base_start_address; mca_sshmem_verbs_fill_shared_mr(&in_smr, device->ib_pd, device->ib_mr_shared->handle, addr, access_flag); ib_mr = ibv_exp_reg_shared_mr(&in_smr); if (NULL == ib_mr) { mca_sshmem_verbs_component.has_shared_mr = 0; } else { opal_value_array_append_item(&device->ib_mr_array, &ib_mr); mca_sshmem_verbs_component.has_shared_mr = 1; } } #endif /* MPAGE_ENABLE */ } /* all is well - rainbows and butterflies */ if (!rc) { *priority = mca_sshmem_verbs_component.priority; *module = (mca_base_module_t *)&mca_sshmem_verbs_module.super; } out: if (device) { if (opal_value_array_get_size(&device->ib_mr_array)) { struct ibv_mr** array; struct ibv_mr* ib_mr = NULL; array = OPAL_VALUE_ARRAY_GET_BASE(&device->ib_mr_array, struct ibv_mr *); while (opal_value_array_get_size(&device->ib_mr_array) > 0) { ib_mr = array[0]; ibv_dereg_mr(ib_mr); opal_value_array_remove_item(&device->ib_mr_array, 0); } if (device->ib_mr_shared) { device->ib_mr_shared = NULL; } OBJ_DESTRUCT(&device->ib_mr_array); } if (device->ib_pd) { ibv_dealloc_pd(device->ib_pd); device->ib_pd = NULL; } if(device->ib_dev_context) { ibv_close_device(device->ib_dev_context); device->ib_dev_context = NULL; } if(device->ib_devs) { ibv_free_device_list(device->ib_devs); device->ib_devs = NULL; } } return rc; }
/* * Main function. implements raw_ethernet_send_lat */ int main(int argc, char *argv[]) { struct ibv_device *ib_dev = NULL; struct pingpong_context ctx; struct raw_ethernet_info my_dest_info,rem_dest_info; int ret_parser; struct perftest_parameters user_param; #ifdef HAVE_RAW_ETH_EXP struct ibv_exp_flow *flow_create_result = NULL; struct ibv_exp_flow_attr *flow_rules = NULL; struct ibv_exp_flow *flow_promisc = NULL; #else struct ibv_flow *flow_create_result = NULL; struct ibv_flow_attr *flow_rules = NULL; #endif struct report_options report; //allocate memory space for user parameters memset(&ctx, 0, sizeof(struct pingpong_context)); memset(&user_param, 0, sizeof(struct perftest_parameters)); memset(&my_dest_info, 0 , sizeof(struct raw_ethernet_info)); memset(&rem_dest_info, 0 , sizeof(struct raw_ethernet_info)); /* init default values to user's parameters that's relvant for this test: * Raw Ethernet Send Latency Test */ user_param.verb = SEND; user_param.tst = LAT; strncpy(user_param.version, VERSION, sizeof(user_param.version)); user_param.connection_type = RawEth; user_param.r_flag = &report; if (check_flow_steering_support()) { return 1; } /* Configure the parameters values according to user arguments or default values. */ ret_parser = parser(&user_param, argv,argc); //check for parsing errors if (ret_parser) { if (ret_parser != VERSION_EXIT && ret_parser != HELP_EXIT) fprintf(stderr," Parser function exited with Error\n"); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return 1; } //this is a bidirectional test, so we need to let the init functions //to think we are in duplex mode //TODO: ask Ido if that's ok, or should I add another field in user_param user_param.duplex = 1; // Find the selected IB device (or default if the user didn't select one). 
ib_dev = ctx_find_dev(user_param.ib_devname); if (!ib_dev) { fprintf(stderr," Unable to find the Infiniband/RoCE device\n"); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return 1; } // Getting the relevant context from the device ctx.context = ibv_open_device(ib_dev); if (!ctx.context) { fprintf(stderr, " Couldn't get context for the device\n"); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return 1; } // See if MTU and link type are valid and supported. if (check_link_and_mtu(ctx.context, &user_param)) { fprintf(stderr, " Couldn't get context for the device\n"); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return FAILURE; } // Allocating arrays needed for the test. alloc_ctx(&ctx, &user_param); // Print basic test information. ctx_print_test_info(&user_param); //set up the connection, return the required flow rules (notice that user_param->duplex == TRUE) //so the function will setup like it's a bidirectional test if (send_set_up_connection(&flow_rules, &ctx, &user_param, &my_dest_info, &rem_dest_info)) { fprintf(stderr," Unable to set up socket connection\n"); return 1; } //print specifications of the test print_spec(flow_rules,&user_param); // Create (if necessary) the rdma_cm ids and channel. if (user_param.work_rdma_cm == ON) { //create resources if (create_rdma_resources(&ctx, &user_param)) { fprintf(stderr," Unable to create the rdma_resources\n"); return FAILURE; } if (user_param.machine == CLIENT) { //Connects the client to a QP on the other machine with rdma_cm if (rdma_client_connect(&ctx, &user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } else if (rdma_server_connect(&ctx, &user_param)) { //Assigning a server to listen on rdma_cm port and connect to it. 
fprintf(stderr,"Unable to perform rdma_server function\n"); return FAILURE; } } else { // initalize IB resources (data buffer, PD, MR, CQ and events channel) if (ctx_init(&ctx, &user_param)) { fprintf(stderr, " Couldn't create IB resources\n"); return FAILURE; } } //attaching the qp to the spec #ifdef HAVE_RAW_ETH_EXP flow_create_result = ibv_exp_create_flow(ctx.qp[0], flow_rules); #else flow_create_result = ibv_create_flow(ctx.qp[0], flow_rules); #endif if (!flow_create_result){ perror("error"); fprintf(stderr, "Couldn't attach QP\n"); return FAILURE; } #ifdef HAVE_RAW_ETH_EXP if (user_param.use_promiscuous) { struct ibv_exp_flow_attr attr = { .type = IBV_EXP_FLOW_ATTR_ALL_DEFAULT, .num_of_specs = 0, .port = user_param.ib_port, .flags = 0 }; if ((flow_promisc = ibv_exp_create_flow(ctx.qp[0], &attr)) == NULL) { perror("error"); fprintf(stderr, "Couldn't attach promiscous rule QP\n"); } } #endif //build ONE Raw Ethernet packets on ctx buffer create_raw_eth_pkt(&user_param,&ctx, &my_dest_info , &rem_dest_info); if (user_param.output == FULL_VERBOSITY) { printf(RESULT_LINE); printf("%s",(user_param.test_type == ITERATIONS) ? RESULT_FMT_LAT : RESULT_FMT_LAT_DUR); printf((user_param.cpu_util_data.enable ? RESULT_EXT_CPU_UTIL : RESULT_EXT)); } // Prepare IB resources for rtr(ready to read)/rts(ready to send) if (user_param.work_rdma_cm == OFF) { if (ctx_connect(&ctx, NULL, &user_param, NULL)) { fprintf(stderr," Unable to Connect the HCA's through the link\n"); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return 1; } } //Post Send send_wqes for current message size ctx_set_send_wqes(&ctx,&user_param,NULL); // Post receive recv_wqes for current message size if (ctx_set_recv_wqes(&ctx,&user_param)) { fprintf(stderr," Failed to post receive recv_wqes\n"); return 1; } //latency test function for SEND verb latency test. 
if (run_iter_lat_send(&ctx, &user_param)) { return 17; } //print report (like print_report_bw) in the correct format // (as set before: FMT_LAT or FMT_LAT_DUR) user_param.test_type == ITERATIONS ? print_report_lat(&user_param) : print_report_lat_duration(&user_param); //destory promisc flow #ifdef HAVE_RAW_ETH_EXP if (user_param.use_promiscuous) { if (ibv_exp_destroy_flow(flow_promisc)) { perror("error"); fprintf(stderr, "Couldn't Destory promisc flow\n"); return FAILURE; } } #endif //destroy flow #ifdef HAVE_RAW_ETH_EXP if (ibv_exp_destroy_flow(flow_create_result)) { #else if (ibv_destroy_flow(flow_create_result)) { #endif perror("error"); fprintf(stderr, "Couldn't Destory flow\n"); return FAILURE; } free(flow_rules); //Deallocate all perftest resources. if (destroy_ctx(&ctx, &user_param)) { fprintf(stderr,"Failed to destroy_ctx\n"); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return 1; } if (user_param.output == FULL_VERBOSITY) printf(RESULT_LINE); DEBUG_LOG(TRACE,"<<<<<<%s",__FUNCTION__); return 0; }
/**
 * Create the verbs resources for one process on the first listed IB device.
 *
 * On success @res holds an opened device context, the attributes of
 * @ib_port, a PD, separate send and receive CQs of MAX_CQ_CAPACITY entries,
 * a zeroed buffer of MAX_FIX_BUF_SIZE bytes registered as mr_list[0], and an
 * RC queue pair bound to both CQs.
 *
 * @param res      resource structure to fill; it is zeroed first
 * @param ib_port  port whose attributes are queried
 * @param myrank   rank used as a prefix in log and error messages
 *
 * @return 0 on success, 1 on failure.  On failure everything created so far
 *         is released and the matching fields in @res are reset.
 */
int resource_create(resource_t *res, int ib_port, int myrank)
{
    struct ibv_device **dev_list = NULL;
    struct ibv_qp_init_attr qp_init_attr;
    struct ibv_device *ib_dev = NULL;
    char *dev_name = NULL;          /* private copy, freed before returning */
    const char *shown_name = "(unknown)";
    size_t size;
    int mr_flags = 0;
    int dev_numm;
    int rc = 0;

    /* Init structure */
    memset(res, 0, sizeof(resource_t));

    /* Get the device list */
    dev_list = ibv_get_device_list(&dev_numm);
    if (!dev_list) {
        fprintf(stderr, "[%d] failed to get IB devices list\n", myrank);
        return 1;
    }

    /* if no device */
    if (!dev_numm) {
        fprintf(stderr, "[%d] No IB device is found\n", myrank);
        rc = 1;
        goto err_exit;
    }
    DEBUG { printf("[%d] found %d IB device(s)\n", myrank, dev_numm); }

    /* Open the requested device: the first entry of the list is used. */
    ib_dev = dev_list[0];
    if (ib_dev) {
        dev_name = strdup(ibv_get_device_name(ib_dev));
        if (dev_name) {
            shown_name = dev_name;
        }
        DEBUG { printf("[%d] IB device name: %s\n", myrank, shown_name); }
    }

    if (!ib_dev) {
        fprintf(stderr, "[%d] IB device %s wasn't found\n", myrank, shown_name);
        rc = 1;
        goto err_exit;
    }

    res->ib_ctx = ibv_open_device(ib_dev);
    DEBUG { printf("[%d] IB context = %lx\n", myrank, (unsigned long)(uintptr_t)res->ib_ctx); }
    if (!res->ib_ctx) {
        fprintf(stderr, "[%d] failed to open device %s\n", myrank, shown_name);
        rc = 1;
        goto err_exit;
    }

    /* free device list; the opened context no longer needs it */
    ibv_free_device_list(dev_list);
    dev_list = NULL;
    ib_dev = NULL;

    /* query port properties */
    if (ibv_query_port(res->ib_ctx, ib_port, &res->port_attr)) {
        fprintf(stderr, "[%d] ibv_query_port on port %u failed\n", myrank, (unsigned)ib_port);
        rc = 1;
        goto err_exit;
    }

    /* Create a PD */
    res->pd = ibv_alloc_pd(res->ib_ctx);
    if (!res->pd) {
        fprintf(stderr, "[%d] ibv_alloc_pd failed\n", myrank);
        rc = 1;
        goto err_exit;
    }

    /* Create send/recv CQ
     * inputs:
     *   device handle
     *   CQ capacity
     * Output:
     *   CQ handle
     */
    res->scq = ibv_create_cq(res->ib_ctx, MAX_CQ_CAPACITY, NULL, NULL, 0);
    res->rcq = ibv_create_cq(res->ib_ctx, MAX_CQ_CAPACITY, NULL, NULL, 0);
    if (!res->scq) {
        fprintf(stderr, "[%d] failed to create SCQ with %u entries\n", myrank, (unsigned)MAX_CQ_CAPACITY);
        rc = 1;
        goto err_exit;
    }
    if (!res->rcq) {
        fprintf(stderr, "[%d] failed to create RCQ with %u entries\n", myrank, (unsigned)MAX_CQ_CAPACITY);
        rc = 1;
        goto err_exit;
    }

    /* Allocate fix buffer */
    size = MAX_FIX_BUF_SIZE;
    res->buf_size = size;
    res->buf = (char *)malloc(size * sizeof(char));
    if (!res->buf) {
        fprintf(stderr, "[%d] failed to malloc %zu bytes to memory buffer\n", myrank, size);
        rc = 1;
        goto err_exit;
    }
    memset(res->buf, 0 , size);

    /* Memory Region
     * inputs:
     *   device handle
     *   PD
     *   Virtual Addr(addr of MR)
     *   Access Ctrl: LocalWrite, RemoteRead, RemoteWrite, RemoteAtomicOp, MemWindowBinding
     * outputs:
     *   MR handle
     *   L_Key
     *   R_Key
     */
    res->mr_list = malloc(sizeof(struct ibv_mr*) * MAX_MR_NUM);
    if (!res->mr_list) {
        fprintf(stderr, "[%d] failed to malloc %zu bytes to memory buffer\n",
                myrank, sizeof(struct ibv_mr*) * MAX_MR_NUM);
        rc = 1;
        goto err_exit;
    }
    res->mr_size = 1;

    mr_flags = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE ;
    res->mr_list[0] = ibv_reg_mr(res->pd, res->buf, size, mr_flags);
    if (!res->mr_list[0]) {
        fprintf(stderr, "[%d] ibv_reg_mr failed with mr_flags=0x%x\n", myrank, mr_flags);
        rc = 1;
        goto err_exit;
    }
    DEBUG { printf("[%d] fixed MR was registered with addr=%p, lkey=0x%x, rkey=0x%x, flags=0x%x\n",
                   myrank, res->buf, res->mr_list[0]->lkey, res->mr_list[0]->rkey, mr_flags); }

    /* Create QP
     * inputs:
     *   PD
     *   CQs for SQ,RQ
     *   capacity of SQ,RQ
     * Outputs:
     *   QP handle
     */
    memset(&qp_init_attr, 0, sizeof(qp_init_attr));
    qp_init_attr.qp_type = IBV_QPT_RC;
    qp_init_attr.sq_sig_all = 1;
    qp_init_attr.send_cq = res->scq;
    qp_init_attr.recv_cq = res->rcq;
    /* max SR/RR num in SQ/RQ */
    qp_init_attr.cap.max_send_wr = MAX_SQ_CAPACITY ;
    qp_init_attr.cap.max_recv_wr = MAX_RQ_CAPACITY;
    /* max SGE num */
    qp_init_attr.cap.max_send_sge = MAX_SGE_CAPACITY;
    qp_init_attr.cap.max_recv_sge = MAX_SGE_CAPACITY;
    qp_init_attr.cap.max_inline_data = 256;

    res->qp = ibv_create_qp(res->pd, &qp_init_attr);
    if (!res->qp) {
        fprintf(stderr, "failed to create QP\n");
        rc = 1;
        goto err_exit;
    }
    DEBUG { printf("[%d] QP was created, QP number=0x%x\n", myrank, res->qp->qp_num); }

    /* EXIT */
err_exit:
    if (rc) {
        /* Error encountered, cleanup */
        if (res->qp) {
            ibv_destroy_qp(res->qp);
            res->qp = NULL;
        }
        if (res->mr_list) {
            int i;
            for (i = 0; i < res->mr_size; i++) {
                /* A slot is NULL when its registration is what failed. */
                if (res->mr_list[i]) {
                    ibv_dereg_mr(res->mr_list[i]);
                    res->mr_list[i] = NULL;
                }
            }
            free(res->mr_list);
            res->mr_list = NULL;
            res->mr_size = 0;
        }
        if (res->buf) {
            free(res->buf);
            res->buf = NULL;
        }
        if (res->scq) {
            ibv_destroy_cq(res->scq);
            res->scq = NULL;
        }
        if (res->rcq) {
            ibv_destroy_cq(res->rcq);
            res->rcq = NULL;
        }
        if (res->comp_ch) {
            ibv_destroy_comp_channel(res->comp_ch);
            res->comp_ch = NULL;
        }
        if (res->pd) {
            ibv_dealloc_pd(res->pd);
            res->pd = NULL;
        }
        if (res->ib_ctx) {
            ibv_close_device(res->ib_ctx);
            res->ib_ctx = NULL;
        }
        if (dev_list) {
            ibv_free_device_list(dev_list);
            dev_list = NULL;
        }
    }

    /* The name copy is only used for messages in this function. */
    free(dev_name);

    return rc;
}
/**
 * Open an ibverbs device and attach the first usable port to @device.
 *
 * The device context, a completion channel and a protection domain are
 * created in that order and stored in @device.  Physical ports are then
 * tried from 1 up to the device's phys_port_cnt; the first port for which
 * mca_oob_ud_port_setup() succeeds is appended to device->ports and the
 * search stops.
 *
 * @param device     oob/ud device descriptor to populate
 * @param ib_device  ibverbs device to open
 *
 * @return ORTE_SUCCESS when a port was attached; ORTE_ERR_OUT_OF_RESOURCE
 *         when a port object could not be allocated; ORTE_ERROR for every
 *         other failure, including no usable port.
 *
 * NOTE(review): nothing stored in @device is released on the failure
 * paths here - presumably the caller tears the device down; confirm.
 */
static inline int mca_oob_ud_device_setup (mca_oob_ud_device_t *device,
                                           struct ibv_device *ib_device)
{
    struct ibv_device_attr attrs;
    int status;
    int candidate = 1;

    OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                         "%s oob:ud:device_setup attempting to setup ib device %p",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (void *) ib_device));

    device->ib_context = ibv_open_device (ib_device);
    if (NULL == device->ib_context) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:device_setup error opening device. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    status = ibv_query_device (device->ib_context, &attrs);
    if (0 != status) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:device_setup error querying device. errno = %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    device->ib_channel = ibv_create_comp_channel (device->ib_context);
    if (NULL == device->ib_channel) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:device_setup error completing completion channel."
                             "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    device->ib_pd = ibv_alloc_pd (device->ib_context);
    if (NULL == device->ib_pd) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:device_setup error allocating protection domain."
                             "errno = %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), errno));
        return ORTE_ERROR;
    }

    /* Port numbers are 1-based; stop at the first one that comes up. */
    while (candidate <= attrs.phys_port_cnt) {
        mca_oob_ud_port_t *port = OBJ_NEW(mca_oob_ud_port_t);

        if (NULL == port) {
            opal_output (0, "oob:ud:device_setup malloc failure. errno = %d", errno);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        port->device = device;
        port->port_num = candidate;

        status = mca_oob_ud_port_setup (port);
        if (ORTE_SUCCESS == status) {
            opal_list_append (&device->ports, (opal_list_item_t *) port);
            break;
        }

        /* This port is unusable: drop it and try the next one. */
        OBJ_RELEASE(port);
        ++candidate;
    }

    if (0 == opal_list_get_size(&device->ports)) {
        OPAL_OUTPUT_VERBOSE((5, mca_oob_base_output,
                             "%s oob:ud:device_setup could not init device. no usable "
                             "ports present", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_ERROR;
    }

    return ORTE_SUCCESS;
}
int main(int argc, char *argv[]) { int i = 0; int size_max_pow = 24; int ret_val; struct report_options report; struct pingpong_context ctx; struct pingpong_dest *my_dest = NULL; struct pingpong_dest *rem_dest = NULL; struct mcast_parameters mcg_params; struct ibv_device *ib_dev = NULL; struct perftest_parameters user_param; struct perftest_comm user_comm; /* init default values to user's parameters */ memset(&ctx, 0, sizeof(struct pingpong_context)); memset(&user_param, 0, sizeof(struct perftest_parameters)); memset(&user_comm , 0, sizeof(struct perftest_comm)); memset(&mcg_params, 0, sizeof(struct mcast_parameters)); user_param.verb = SEND; user_param.tst = LAT; strncpy(user_param.version, VERSION, sizeof(user_param.version)); user_param.r_flag = &report; // Configure the parameters values according to user arguments or defalut values. ret_val = parser(&user_param,argv,argc); if (ret_val) { if (ret_val != VERSION_EXIT && ret_val != HELP_EXIT) fprintf(stderr," Parser function exited with Error\n"); return 1; } if(user_param.use_xrc || user_param.connection_type == DC) { user_param.num_of_qps *= 2; } //Checking that the user did not run with RawEth. for this we have raw_etherent_bw test. if (user_param.connection_type == RawEth) { fprintf(stderr," This test cannot run Raw Ethernet QPs (you have chosen RawEth as connection type\n"); return FAILURE; } // Finding the IB device selected (or defalut if no selected). ib_dev = ctx_find_dev(user_param.ib_devname); if (!ib_dev) { fprintf(stderr," Unable to find the Infiniband/RoCE device\n"); return 1; } if (user_param.use_mcg) GET_STRING(mcg_params.ib_devname,ibv_get_device_name(ib_dev)); // Getting the relevant context from the device ctx.context = ibv_open_device(ib_dev); if (!ctx.context) { fprintf(stderr, " Couldn't get context for the device\n"); return 1; } // See if MTU and link type are valid and supported. 
if (check_link(ctx.context,&user_param)) { fprintf(stderr, " Couldn't get context for the device\n"); return FAILURE; } // copy the relevant user parameters to the comm struct + creating rdma_cm resources. if (create_comm_struct(&user_comm,&user_param)) { fprintf(stderr," Unable to create RDMA_CM resources\n"); return 1; } if (user_param.output == FULL_VERBOSITY && user_param.machine == SERVER) { printf("\n************************************\n"); printf("* Waiting for client to connect... *\n"); printf("************************************\n"); } // Initialize the connection and print the local data. if (establish_connection(&user_comm)) { fprintf(stderr," Unable to init the socket connection\n"); return FAILURE; } exchange_versions(&user_comm, &user_param); check_sys_data(&user_comm, &user_param); // See if MTU and link type are valid and supported. if (check_mtu(ctx.context,&user_param, &user_comm)) { fprintf(stderr, " Couldn't get context for the device\n"); return FAILURE; } // Print basic test information. ctx_print_test_info(&user_param); ALLOCATE(my_dest , struct pingpong_dest , user_param.num_of_qps); memset(my_dest, 0, sizeof(struct pingpong_dest)*user_param.num_of_qps); ALLOCATE(rem_dest , struct pingpong_dest , user_param.num_of_qps); memset(rem_dest, 0, sizeof(struct pingpong_dest)*user_param.num_of_qps); // Allocating arrays needed for the test. alloc_ctx(&ctx,&user_param); // Create (if nessacery) the rdma_cm ids and channel. 
if (user_param.work_rdma_cm == ON) { if (user_param.machine == CLIENT) { if (retry_rdma_connect(&ctx,&user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } else { if (create_rdma_resources(&ctx,&user_param)) { fprintf(stderr," Unable to create the rdma_resources\n"); return FAILURE; } if (rdma_server_connect(&ctx,&user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } } else { // create all the basic IB resources (data buffer, PD, MR, CQ and events channel) if (ctx_init(&ctx,&user_param)) { fprintf(stderr, " Couldn't create IB resources\n"); return FAILURE; } } // Set up the Connection. if (send_set_up_connection(&ctx,&user_param,my_dest,&mcg_params,&user_comm)) { fprintf(stderr," Unable to set up socket connection\n"); return 1; } for (i=0; i < user_param.num_of_qps; i++) ctx_print_pingpong_data(&my_dest[i],&user_comm); user_comm.rdma_params->side = REMOTE; for (i=0; i < user_param.num_of_qps; i++) { // shaking hands and gather the other side info. if (ctx_hand_shake(&user_comm,&my_dest[i],&rem_dest[i])) { fprintf(stderr,"Failed to exchange data between server and clients\n"); return 1; } ctx_print_pingpong_data(&rem_dest[i],&user_comm); } if (user_param.work_rdma_cm == OFF) { if (ctx_check_gid_compatibility(&my_dest[0], &rem_dest[0])) { fprintf(stderr,"\n Found Incompatibility issue with GID types.\n"); fprintf(stderr," Please Try to use a different IP version.\n\n"); return 1; } } if (user_param.use_mcg) { memcpy(mcg_params.base_mgid.raw,mcg_params.mgid.raw,16); memcpy(mcg_params.mgid.raw,rem_dest[0].gid.raw,16); mcg_params.base_mlid = mcg_params.mlid; mcg_params.is_2nd_mgid_used = ON; if (!strcmp(link_layer_str(user_param.link_type),"IB")) { // Request for Mcast group create registery in SM. 
if (join_multicast_group(SUBN_ADM_METHOD_SET,&mcg_params)) { fprintf(stderr," Failed to Join Mcast request\n"); return 1; } } /* * The next stall in code (50 ms sleep) is a work around for fixing the * the bug this test had in Multicast for the past 1 year. * It appears, that when a switch involved, it takes ~ 10 ms for the join * request to propogate on the IB fabric, thus we need to wait for it. * what happened before this fix was reaching the post_send * code segment in about 350 ns from here, and the switch(es) dropped * the packet because join request wasn't finished. */ usleep(50000); } if (user_param.work_rdma_cm == OFF) { // Prepare IB resources for rtr/rts. if (ctx_connect(&ctx,rem_dest,&user_param,my_dest)) { fprintf(stderr," Unable to Connect the HCA's through the link\n"); return 1; } } // shaking hands and gather the other side info. if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) { fprintf(stderr,"Failed to exchange data between server and clients\n"); return 1; } if (user_param.use_event) { if (ibv_req_notify_cq(ctx.send_cq, 0)) { fprintf(stderr, "Couldn't request RCQ notification\n"); return 1; } if (ibv_req_notify_cq(ctx.recv_cq, 0)) { fprintf(stderr, "Couldn't request RCQ notification\n"); return 1; } } if (user_param.output == FULL_VERBOSITY) { printf(RESULT_LINE); printf("%s",(user_param.test_type == ITERATIONS) ? RESULT_FMT_LAT : RESULT_FMT_LAT_DUR); printf((user_param.cpu_util_data.enable ? 
RESULT_EXT_CPU_UTIL : RESULT_EXT)); } ctx_set_send_wqes(&ctx,&user_param,rem_dest); if (user_param.test_method == RUN_ALL) { if (user_param.connection_type == UD) size_max_pow = (int)UD_MSG_2_EXP(MTU_SIZE(user_param.curr_mtu)) + 1; for (i = 1; i < size_max_pow ; ++i) { user_param.size = (uint64_t)1 << i; // Post recevie recv_wqes fo current message size if (ctx_set_recv_wqes(&ctx,&user_param)) { fprintf(stderr," Failed to post receive recv_wqes\n"); return 1; } // Sync between the client and server so the client won't send packets // Before the server has posted his receive wqes (in UC/UD it will result in a deadlock). if (ctx_hand_shake(&user_comm,&my_dest[0],&rem_dest[0])) { fprintf(stderr,"Failed to exchange data between server and clients\n"); return 1; } if(run_iter_lat_send(&ctx, &user_param)) return 17; user_param.test_type == ITERATIONS ? print_report_lat(&user_param) : print_report_lat_duration(&user_param); } } else { // Post recevie recv_wqes fo current message size if (ctx_set_recv_wqes(&ctx,&user_param)) { fprintf(stderr," Failed to post receive recv_wqes\n"); return 1; } // Sync between the client and server so the client won't send packets // Before the server has posted his receive wqes (in UC/UD it will result in a deadlock). if (ctx_hand_shake(&user_comm,my_dest,rem_dest)) { fprintf(stderr,"Failed to exchange data between server and clients\n"); return 1; } if(run_iter_lat_send(&ctx, &user_param)) return 17; user_param.test_type == ITERATIONS ? print_report_lat(&user_param) : print_report_lat_duration(&user_param); } if (user_param.output == FULL_VERBOSITY) { printf(RESULT_LINE); } if (ctx_close_connection(&user_comm,my_dest,rem_dest)) { fprintf(stderr,"Failed to close connection between server and client\n"); fprintf(stderr," Trying to close this side resources\n"); } return send_destroy_ctx(&ctx,&user_param,&mcg_params); }
/*-----------------------------------------------------------------------------------*/ static void low_level_init(struct netif *netif) { struct ibvif *ibvif; int num_of_device, flags = IBV_ACCESS_LOCAL_WRITE; struct ibv_qp_init_attr attr; struct ibv_qp_attr qp_attr; uint8_t port_num = 1; int qp_flags; struct ibv_device **ib_dev_list; struct tcpip_thread *thread; struct ibv_exp_cq_init_attr cq_attr; ibvif = (struct ibvif *)netif->state; /* Obtain MAC address from network interface. */ ibvif->ethaddr->addr[0] = 0x00; ibvif->ethaddr->addr[1] = 0x02; ibvif->ethaddr->addr[2] = 0xc9; ibvif->ethaddr->addr[3] = 0xa4; ibvif->ethaddr->addr[4] = 0x59; ibvif->ethaddr->addr[5] = 0x41; ibvif->buf_size = ALIGN_TO_PAGE_SIZE(PBUF_POOL_SIZE * TCP_MAX_PACKET_SIZE); /* Do things needed for using Raw Packet Verbs */ ib_dev_list = ibv_get_device_list(&num_of_device); if (num_of_device <= 0 || !ib_dev_list || !ib_dev_list[0]) { perror("IBV no device found\n"); exit(1); } ibvif->context = ibv_open_device(ib_dev_list[1]); if (!ibvif->context) { perror("IBV can't open device\n"); exit(1); } ibv_free_device_list(ib_dev_list); if (set_link_layer(ibvif->context, 1) == LINK_FAILURE) { perror("IBV can't allocate PD\n"); exit(1); } ibvif->pd = ibv_alloc_pd(ibvif->context); if (!ibvif->pd) { perror("IBV can't allocate PD\n"); exit(1); } /*if (!ibv_buffer(ibvif)) { LWIP_DEBUGF(NETIF_DEBUG, ("Buffer allocation failed\n")); exit(1); }*/ ibvif->recv_buf = netif->prot_thread->pbuf_rx_handle.buf; ibvif->send_buf = netif->prot_thread->pbuf_tx_handle.buf; ibvif->send_size = TCP_MAX_PACKET_SIZE; ibvif->rx_depth = PBUF_POOL_SIZE; ibvif->tx_depth = PBUF_POOL_SIZE; ibvif->send_mr = ibv_reg_mr(ibvif->pd, ibvif->send_buf, ibvif->buf_size, flags); if (!ibvif->send_mr) { perror("IBV error reg send mr\n"); exit(1); } ibvif->recv_mr = ibv_reg_mr(ibvif->pd, ibvif->recv_buf, ibvif->buf_size, flags); if (!ibvif->recv_mr) { perror("IBV error reg recv mr\n"); exit(1); } ibvif->send_cq = ibv_create_cq(ibvif->context, 
ibvif->tx_depth, NULL, NULL, 0); if (!ibvif->send_cq) { perror("IBV can't create send cq\n"); exit(1); } cq_attr.flags = IBV_EXP_CQ_TIMESTAMP; cq_attr.comp_mask = IBV_EXP_CQ_INIT_ATTR_FLAGS; ibvif->recv_cq = ibv_exp_create_cq(ibvif->context, ibvif->rx_depth, NULL, NULL, 0, &cq_attr); if (!ibvif->recv_cq) { perror("IBV can't create recv cq\n"); exit(1); } memset(&attr, 0, sizeof(struct ibv_qp_init_attr)); attr.send_cq = ibvif->send_cq; attr.recv_cq = ibvif->recv_cq; attr.cap.max_send_wr = ibvif->tx_depth; attr.cap.max_send_sge = 1; attr.cap.max_recv_wr = ibvif->rx_depth; attr.cap.max_recv_sge = 1; attr.qp_type = IBV_QPT_RAW_PACKET; ibvif->qp = ibv_create_qp(ibvif->pd, &attr); if (!ibvif->qp) { perror("IBV can't create QP\n"); exit(1); } qp_flags = IBV_QP_STATE | IBV_QP_PORT; memset(&qp_attr, 0, sizeof(struct ibv_qp_attr)); qp_attr.qp_state = IBV_QPS_INIT; qp_attr.pkey_index = 0; qp_attr.port_num = port_num; qp_attr.qp_access_flags = 0; if (ibv_modify_qp(ibvif->qp, &qp_attr, qp_flags)) { perror("IBV can't set qp to init\n"); exit(1); } ibv_attach_device(netif); }
/** * DPDK callback to register a PCI device. * * This function creates an Ethernet device for each port of a given * PCI device. * * @param[in] pci_drv * PCI driver structure (mlx5_driver). * @param[in] pci_dev * PCI device information. * * @return * 0 on success, negative errno value on failure. */ static int mlx5_pci_devinit(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev) { struct ibv_device **list; struct ibv_device *ibv_dev; int err = 0; struct ibv_context *attr_ctx = NULL; struct ibv_device_attr device_attr; unsigned int vf; int idx; int i; (void)pci_drv; assert(pci_drv == &mlx5_driver.pci_drv); /* Get mlx5_dev[] index. */ idx = mlx5_dev_idx(&pci_dev->addr); if (idx == -1) { ERROR("this driver cannot support any more adapters"); return -ENOMEM; } DEBUG("using driver device index %d", idx); /* Save PCI address. */ mlx5_dev[idx].pci_addr = pci_dev->addr; list = ibv_get_device_list(&i); if (list == NULL) { assert(errno); if (errno == ENOSYS) { WARN("cannot list devices, is ib_uverbs loaded?"); return 0; } return -errno; } assert(i >= 0); /* * For each listed device, check related sysfs entry against * the provided PCI ID. */ while (i != 0) { struct rte_pci_addr pci_addr; --i; DEBUG("checking device \"%s\"", list[i]->name); if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr)) continue; if ((pci_dev->addr.domain != pci_addr.domain) || (pci_dev->addr.bus != pci_addr.bus) || (pci_dev->addr.devid != pci_addr.devid) || (pci_dev->addr.function != pci_addr.function)) continue; vf = ((pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) || (pci_dev->id.device_id == PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF)); INFO("PCI information matches, using device \"%s\" (VF: %s)", list[i]->name, (vf ? 
"true" : "false")); attr_ctx = ibv_open_device(list[i]); err = errno; break; } if (attr_ctx == NULL) { ibv_free_device_list(list); switch (err) { case 0: WARN("cannot access device, is mlx5_ib loaded?"); return 0; case EINVAL: WARN("cannot use device, are drivers up to date?"); return 0; } assert(err > 0); return -err; } ibv_dev = list[i]; DEBUG("device opened"); if (ibv_query_device(attr_ctx, &device_attr)) goto error; INFO("%u port(s) detected", device_attr.phys_port_cnt); for (i = 0; i < device_attr.phys_port_cnt; i++) { uint32_t port = i + 1; /* ports are indexed from one */ uint32_t test = (1 << i); struct ibv_context *ctx = NULL; struct ibv_port_attr port_attr; struct ibv_pd *pd = NULL; struct priv *priv = NULL; struct rte_eth_dev *eth_dev; #ifdef HAVE_EXP_QUERY_DEVICE struct ibv_exp_device_attr exp_device_attr; #endif /* HAVE_EXP_QUERY_DEVICE */ struct ether_addr mac; #ifdef HAVE_EXP_QUERY_DEVICE exp_device_attr.comp_mask = IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS | IBV_EXP_DEVICE_ATTR_RX_HASH; #endif /* HAVE_EXP_QUERY_DEVICE */ DEBUG("using port %u (%08" PRIx32 ")", port, test); ctx = ibv_open_device(ibv_dev); if (ctx == NULL) goto port_error; /* Check port status. */ err = ibv_query_port(ctx, port, &port_attr); if (err) { ERROR("port query failed: %s", strerror(err)); goto port_error; } if (port_attr.state != IBV_PORT_ACTIVE) DEBUG("port %d is not active: \"%s\" (%d)", port, ibv_port_state_str(port_attr.state), port_attr.state); /* Allocate protection domain. 
*/ pd = ibv_alloc_pd(ctx); if (pd == NULL) { ERROR("PD allocation failure"); err = ENOMEM; goto port_error; } mlx5_dev[idx].ports |= test; /* from rte_ethdev.c */ priv = rte_zmalloc("ethdev private structure", sizeof(*priv), RTE_CACHE_LINE_SIZE); if (priv == NULL) { ERROR("priv allocation failure"); err = ENOMEM; goto port_error; } priv->ctx = ctx; priv->device_attr = device_attr; priv->port = port; priv->pd = pd; priv->mtu = ETHER_MTU; #ifdef HAVE_EXP_QUERY_DEVICE if (ibv_exp_query_device(ctx, &exp_device_attr)) { ERROR("ibv_exp_query_device() failed"); goto port_error; } priv->hw_csum = ((exp_device_attr.exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT) && (exp_device_attr.exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_IP_PKT)); DEBUG("checksum offloading is %ssupported", (priv->hw_csum ? "" : "not ")); priv->hw_csum_l2tun = !!(exp_device_attr.exp_device_cap_flags & IBV_EXP_DEVICE_VXLAN_SUPPORT); DEBUG("L2 tunnel checksum offloads are %ssupported", (priv->hw_csum_l2tun ? "" : "not ")); priv->ind_table_max_size = exp_device_attr.rx_hash_caps.max_rwq_indirection_table_size; DEBUG("maximum RX indirection table size is %u", priv->ind_table_max_size); #else /* HAVE_EXP_QUERY_DEVICE */ priv->ind_table_max_size = RSS_INDIRECTION_TABLE_SIZE; #endif /* HAVE_EXP_QUERY_DEVICE */ priv->vf = vf; /* Allocate and register default RSS hash keys. */ priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n, sizeof((*priv->rss_conf)[0]), 0); if (priv->rss_conf == NULL) { err = ENOMEM; goto port_error; } err = rss_hash_rss_conf_new_key(priv, rss_hash_default_key, rss_hash_default_key_len, ETH_RSS_PROTO_MASK); if (err) goto port_error; /* Configure the first MAC address by default. */ if (priv_get_mac(priv, &mac.addr_bytes)) { ERROR("cannot get MAC address, is mlx5_en loaded?" 
" (errno: %s)", strerror(errno)); goto port_error; } INFO("port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x", priv->port, mac.addr_bytes[0], mac.addr_bytes[1], mac.addr_bytes[2], mac.addr_bytes[3], mac.addr_bytes[4], mac.addr_bytes[5]); /* Register MAC and broadcast addresses. */ claim_zero(priv_mac_addr_add(priv, 0, (const uint8_t (*)[ETHER_ADDR_LEN]) mac.addr_bytes)); claim_zero(priv_mac_addr_add(priv, (RTE_DIM(priv->mac) - 1), &(const uint8_t [ETHER_ADDR_LEN]) { "\xff\xff\xff\xff\xff\xff" })); #ifndef NDEBUG { char ifname[IF_NAMESIZE]; if (priv_get_ifname(priv, &ifname) == 0) DEBUG("port %u ifname is \"%s\"", priv->port, ifname); else DEBUG("port %u ifname is unknown", priv->port); } #endif /* Get actual MTU if possible. */ priv_get_mtu(priv, &priv->mtu); DEBUG("port %u MTU is %u", priv->port, priv->mtu); /* from rte_ethdev.c */ { char name[RTE_ETH_NAME_MAX_LEN]; snprintf(name, sizeof(name), "%s port %u", ibv_get_device_name(ibv_dev), port); eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_PCI); } if (eth_dev == NULL) { ERROR("can not allocate rte ethdev"); err = ENOMEM; goto port_error; } eth_dev->data->dev_private = priv; eth_dev->pci_dev = pci_dev; eth_dev->driver = &mlx5_driver; eth_dev->data->rx_mbuf_alloc_failed = 0; eth_dev->data->mtu = ETHER_MTU; priv->dev = eth_dev; eth_dev->dev_ops = &mlx5_dev_ops; eth_dev->data->mac_addrs = priv->mac; TAILQ_INIT(ð_dev->link_intr_cbs); /* Bring Ethernet device up. */ DEBUG("forcing Ethernet interface up"); priv_set_flags(priv, ~IFF_UP, IFF_UP); continue; port_error: rte_free(priv->rss_conf); rte_free(priv); if (pd) claim_zero(ibv_dealloc_pd(pd)); if (ctx) claim_zero(ibv_close_device(ctx)); break; }
int main(int argc, char *argv[]) { int i = 0; struct report_options report = {}; struct pingpong_context ctx; struct ibv_device *ib_dev; struct perftest_parameters user_param; struct pingpong_dest my_dest,rem_dest; struct perftest_comm user_comm; /* init default values to user's parameters */ memset(&ctx,0,sizeof(struct pingpong_context)); memset(&user_param,0,sizeof(struct perftest_parameters)); memset(&user_comm,0,sizeof(struct perftest_comm)); memset(&my_dest,0,sizeof(struct pingpong_dest)); memset(&rem_dest,0,sizeof(struct pingpong_dest)); user_param.verb = READ; user_param.tst = LAT; user_param.r_flag = &report; user_param.version = VERSION; // Configure the parameters values according to user arguments or defalut values. if (parser(&user_param,argv,argc)) { fprintf(stderr," Parser function exited with Error\n"); return FAILURE; } // Finding the IB device selected (or defalut if no selected). ib_dev = ctx_find_dev(user_param.ib_devname); if (!ib_dev) { fprintf(stderr," Unable to find the Infiniband/RoCE deivce\n"); return FAILURE; } // Getting the relevant context from the device ctx.context = ibv_open_device(ib_dev); if (!ctx.context) { fprintf(stderr, " Couldn't get context for the device\n"); return 1; } // See if MTU and link type are valid and supported. if (check_link_and_mtu(ctx.context,&user_param)) { fprintf(stderr, " Couldn't get context for the device\n"); return FAILURE; } // Print basic test information. ctx_print_test_info(&user_param); // copy the rellevant user parameters to the comm struct + creating rdma_cm resources. if (create_comm_struct(&user_comm,&user_param)) { fprintf(stderr," Unable to create RDMA_CM resources\n"); return 1; } // Create (if nessacery) the rdma_cm ids and channel. 
if (user_param.work_rdma_cm == ON) { if (create_rdma_resources(&ctx,&user_param)) { fprintf(stderr," Unable to create the rdma_resources\n"); return FAILURE; } if (user_param.machine == CLIENT) { if (rdma_client_connect(&ctx,&user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } else { if (rdma_server_connect(&ctx,&user_param)) { fprintf(stderr,"Unable to perform rdma_client function\n"); return FAILURE; } } } else { // create all the basic IB resources (data buffer, PD, MR, CQ and events channel) if (ctx_init(&ctx,&user_param)) { fprintf(stderr, " Couldn't create IB resources\n"); return FAILURE; } } // Set up the Connection. if (set_up_connection(&ctx,&user_param,&my_dest)) { fprintf(stderr," Unable to set up socket connection\n"); return 1; } ctx_print_pingpong_data(&my_dest,&user_comm); // Init the connection and print the local data. if (establish_connection(&user_comm)) { fprintf(stderr," Unable to init the socket connection\n"); return 1; } // shaking hands and gather the other side info. if (ctx_hand_shake(&user_comm,&my_dest,&rem_dest)) { fprintf(stderr,"Failed to exchange date between server and clients\n"); return 1; } user_comm.rdma_params->side = REMOTE; ctx_print_pingpong_data(&rem_dest,&user_comm); if (user_param.work_rdma_cm == OFF) { if (pp_connect_ctx(&ctx,my_dest.psn,&rem_dest,my_dest.out_reads,&user_param)) { fprintf(stderr," Unable to Connect the HCA's through the link\n"); return 1; } } // An additional handshake is required after moving qp to RTR. if (ctx_hand_shake(&user_comm,&my_dest,&rem_dest)) { fprintf(stderr,"Failed to exchange date between server and clients\n"); return 1; } ALLOCATE(tstamp,cycles_t,user_param.iters); // Only Client post read request. 
if (user_param.machine == SERVER) { if (ctx_close_connection(&user_comm,&my_dest,&rem_dest)) { fprintf(stderr,"Failed to close connection between server and client\n"); return 1; } printf(RESULT_LINE); return 0; // destroy_ctx(&ctx,&user_param); } if (user_param.use_event) { if (ibv_req_notify_cq(ctx.send_cq, 0)) { fprintf(stderr, "Couldn't request CQ notification\n"); return 1; } } printf(RESULT_LINE); printf(RESULT_FMT_LAT); if (user_param.all == ON) { for (i = 1; i < 24 ; ++i) { user_param.size = 1 << i; if(run_iter(&ctx,&user_param,&rem_dest)) return 17; print_report(&user_param); } } else { if(run_iter(&ctx,&user_param,&rem_dest)) return 18; print_report(&user_param); } if (ctx_close_connection(&user_comm,&my_dest,&rem_dest)) { fprintf(stderr,"Failed to close connection between server and client\n"); return 1; } printf(RESULT_LINE); return 0; // destroy_ctx(&ctx,&user_param); }