/**
 * Find the user-configured pkey in the port's pkey table and store its
 * index/value on the interface.
 *
 * @param iface   IB interface to configure.
 * @param config  Interface configuration; config->pkey_value holds the
 *                requested partition key (low 15 bits only).
 *
 * @return UCS_OK if a full-membership pkey matching the requested partition
 *         was found; UCS_ERR_INVALID_PARAM otherwise.
 */
static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
                                           uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev  = uct_ib_iface_device(iface);
    uint16_t pkey_tbl_len = uct_ib_iface_port_attr(iface)->pkey_tbl_len;
    uint16_t pkey_index, port_pkey, pkey;

    /* The partition value itself must fit in the low 15 bits */
    if (config->pkey_value > UCT_IB_PKEY_PARTITION_MASK) {
        ucs_error("Requested pkey 0x%x is invalid, should be in the range 0..0x%x",
                  config->pkey_value, UCT_IB_PKEY_PARTITION_MASK);
        return UCS_ERR_INVALID_PARAM;
    }

    /* get the user's pkey value and find its index in the port's pkey table */
    for (pkey_index = 0; pkey_index < pkey_tbl_len; ++pkey_index) {
        /* get the pkey values from the port's pkeys table */
        if (ibv_query_pkey(dev->ibv_context, iface->port_num, pkey_index,
                           &port_pkey))
        {
            ucs_error("ibv_query_pkey(%s:%d, index=%d) failed: %m",
                      uct_ib_device_name(dev), iface->port_num, pkey_index);
            /* BUGFIX: port_pkey is undefined when the query fails - skip this
             * entry instead of comparing against an uninitialized value */
            continue;
        }

        pkey = ntohs(port_pkey);
        /* The top bit is the membership bit; without it the pkey grants only
         * limited (send-only) membership and cannot be used */
        if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK)) {
            ucs_debug("skipping send-only pkey[%d]=0x%x", pkey_index, pkey);
            continue;
        }

        /* take only the lower 15 bits for the comparison */
        if ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey_value) {
            iface->pkey_index = pkey_index;
            iface->pkey_value = pkey;
            ucs_debug("using pkey[%d] 0x%x on %s:%d", iface->pkey_index,
                      iface->pkey_value, uct_ib_device_name(dev),
                      iface->port_num);
            return UCS_OK;
        }
    }

    ucs_error("The requested pkey: 0x%x, cannot be used. "
              "It wasn't found or the configured pkey doesn't have full membership.",
              config->pkey_value);
    return UCS_ERR_INVALID_PARAM;
}
/**
 * Request a completion-channel notification on the given CQ.
 *
 * @param iface      Interface owning the CQ (used for error reporting).
 * @param cq         Completion queue to arm.
 * @param solicited  Nonzero to be notified on solicited completions only.
 *
 * @return UCS_OK on success, UCS_ERR_IO_ERROR if the verbs call fails.
 */
static ucs_status_t uct_ib_iface_arm_cq(uct_ib_iface_t *iface,
                                        struct ibv_cq *cq,
                                        int solicited)
{
    if (ibv_req_notify_cq(cq, solicited) != 0) {
        ucs_error("ibv_req_notify_cq(%s:%d, cq) failed: %m",
                  uct_ib_device_name(uct_ib_iface_device(iface)),
                  iface->port_num);
        return UCS_ERR_IO_ERROR;
    }

    return UCS_OK;
}
/**
 * Query the configured GID table entry and store it on the interface.
 *
 * @param iface   IB interface; iface->gid receives the queried GID.
 * @param config  Interface configuration; config->gid_index selects the
 *                GID table entry to read.
 *
 * @return UCS_OK on success, UCS_ERR_INVALID_PARAM if the query fails,
 *         UCS_ERR_INVALID_ADDR if the entry is all-zero (unpopulated).
 */
static ucs_status_t uct_ib_iface_init_gid(uct_ib_iface_t *iface,
                                          uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);

    if (ibv_query_gid(dev->ibv_context, iface->port_num, config->gid_index,
                      &iface->gid) != 0)
    {
        ucs_error("ibv_query_gid(index=%d) failed: %m", config->gid_index);
        return UCS_ERR_INVALID_PARAM;
    }

    /* A zero interface_id and subnet_prefix means the table slot is empty */
    if ((iface->gid.global.interface_id == 0) &&
        (iface->gid.global.subnet_prefix == 0))
    {
        ucs_error("Invalid gid[%d] on %s:%d", config->gid_index,
                  uct_ib_device_name(dev), iface->port_num);
        return UCS_ERR_INVALID_ADDR;
    }

    return UCS_OK;
}
/**
 * Create an address handle from an IB address.
 *
 * @param iface          Interface whose PD the AH is created on.
 * @param ib_addr        Remote IB address to resolve.
 * @param src_path_bits  Source path bits for the AH attributes.
 * @param ah_p           Filled with the created AH on success.
 *
 * @return UCS_OK on success, UCS_ERR_INVALID_ADDR if ibv_create_ah() fails
 *         (the failing attributes are formatted into the error message).
 */
ucs_status_t uct_ib_iface_create_ah(uct_ib_iface_t *iface,
                                    const uct_ib_address_t *ib_addr,
                                    uint8_t src_path_bits,
                                    struct ibv_ah **ah_p)
{
    struct ibv_ah_attr ah_attr;
    struct ibv_ah *ah;
    char buf[128];
    char *p, *endp;

    uct_ib_iface_fill_ah_attr(iface, ib_addr, src_path_bits, &ah_attr);
    ah = ibv_create_ah(uct_ib_iface_md(iface)->pd, &ah_attr);

    if (ah == NULL) {
        /* Render the AH attributes into buf for diagnostics */
        p    = buf;
        endp = buf + sizeof(buf);
        snprintf(p, endp - p, "dlid=%d sl=%d port=%d path_bits=%d",
                 ah_attr.dlid, ah_attr.sl, ah_attr.port_num,
                 ah_attr.src_path_bits);
        p += strlen(p);
        if (ah_attr.is_global) {
            /* BUGFIX: add a leading space so the output is
             * "... path_bits=N dgid=..." rather than "path_bits=Ndgid=..." */
            snprintf(p, endp - p, " dgid=");
            p += strlen(p);
            inet_ntop(AF_INET6, &ah_attr.grh.dgid, p, endp - p);
            p += strlen(p);
            snprintf(p, endp - p, " sgid_index=%d", ah_attr.grh.sgid_index);
        }
        ucs_error("ibv_create_ah(%s) on %s:%d failed: %m", buf,
                  uct_ib_device_name(uct_ib_iface_device(iface)),
                  iface->port_num);
        return UCS_ERR_INVALID_ADDR;
    }

    *ah_p = ah;
    return UCS_OK;
}
/**
 * Parse a resource name of the form "<ibdev_name>:<port_num>" and validate it
 * against the given device.
 *
 * @param dev                Device the name must match.
 * @param resource_dev_name  Name to parse, e.g. "mlx5_0:1".
 * @param p_port_num         Filled with the parsed port number on success.
 *
 * @return UCS_OK if the device name matches and the port number is within
 *         the device's port range; UCS_ERR_NO_DEVICE otherwise.
 */
static ucs_status_t uct_ib_iface_find_port(uct_ib_device_t *dev,
                                           const char *resource_dev_name,
                                           uint8_t *p_port_num)
{
    const char *ibdev_name;
    unsigned long port_num;
    size_t devname_len;
    char *p;

    p = strrchr(resource_dev_name, ':');
    if (p == NULL) {
        goto err; /* Wrong device name format */
    }
    devname_len = p - resource_dev_name;

    ibdev_name = uct_ib_device_name(dev);
    if ((strlen(ibdev_name) != devname_len) ||
        strncmp(ibdev_name, resource_dev_name, devname_len))
    {
        goto err; /* Device name is wrong */
    }

    /* BUGFIX: strtod parsed the port as a floating-point value, silently
     * accepting strings like "1.5" or "1e2"; parse it as a base-10 integer */
    port_num = strtoul(p + 1, &p, 10);
    if (*p != '\0') {
        goto err; /* Failed to parse port number */
    }
    if ((port_num < dev->first_port) ||
        (port_num >= dev->first_port + dev->num_ports))
    {
        goto err; /* Port number out of range */
    }

    *p_port_num = port_num;
    return UCS_OK;

err:
    return UCS_ERR_NO_DEVICE;
}
/**
 * Create a completion queue, controlling the mlx5 CQE size via the
 * MLX5_CQE_SIZE environment variable.
 *
 * If the user already set MLX5_CQE_SIZE, validate it against the minimum
 * required for the requested inline size. Otherwise set it temporarily
 * (through ibv_exp_setenv) for the duration of ibv_create_cq() and remove
 * it afterwards.
 *
 * @param iface      Interface whose device context / comp channel is used.
 * @param cq_length  Requested number of CQ entries.
 * @param inl        Requested inline data size; inline > 32 requires
 *                   128-byte CQEs.
 * @param cq_p       Filled with the created CQ on success.
 */
static ucs_status_t uct_ib_iface_create_cq(uct_ib_iface_t *iface, int cq_length,
                                           size_t inl, struct ibv_cq **cq_p)
{
    static const char *cqe_size_env_var = "MLX5_CQE_SIZE";
    uct_ib_device_t *dev                = uct_ib_iface_device(iface);
    const char *env_value;
    size_t cqe_size_min, cqe_size;
    char size_str[32];
    ucs_status_t status;
    struct ibv_cq *cq;
    int unset_needed = 0;

    /* 64-byte CQEs hold at most 32 bytes of inline data */
    cqe_size_min = (inl > 32) ? 128 : 64;

    env_value = getenv(cqe_size_env_var);
    if (env_value != NULL) {
        /* User-provided CQE size: only validate it */
        cqe_size = atol(env_value);
        if (cqe_size < cqe_size_min) {
            ucs_error("%s is set to %zu, but at least %zu is required (inl: %zu)",
                      cqe_size_env_var, cqe_size, cqe_size_min, inl);
            return UCS_ERR_INVALID_PARAM;
        }
    } else {
        /* CQE size is not defined by the environment, set it according to inline
         * size and cache line size. */
        cqe_size = ucs_max(cqe_size_min, UCS_SYS_CACHE_LINE_SIZE);
        cqe_size = ucs_max(cqe_size, 64);  /* at least 64 */
        cqe_size = ucs_min(cqe_size, 128); /* at most 128 */
        snprintf(size_str, sizeof(size_str), "%zu", cqe_size);
        ucs_debug("%s: setting %s=%s", uct_ib_device_name(dev),
                  cqe_size_env_var, size_str);
        if (ibv_exp_setenv(dev->ibv_context, cqe_size_env_var, size_str, 1)) {
            ucs_error("ibv_exp_setenv(%s=%s) failed: %m", cqe_size_env_var,
                      size_str);
            return UCS_ERR_INVALID_PARAM;
        }
        unset_needed = 1;
    }

    cq = ibv_create_cq(dev->ibv_context, cq_length, NULL, iface->comp_channel, 0);
    if (cq == NULL) {
        ucs_error("ibv_create_cq(cqe=%d) failed: %m", cq_length);
        status = UCS_ERR_IO_ERROR;
    } else {
        *cq_p  = cq;
        status = UCS_OK;
    }

    /* If we added the environment variable ourselves, remove it again
     * (on both success and failure of CQ creation) */
    if (unset_needed &&
        ibv_exp_unsetenv(dev->ibv_context, cqe_size_env_var)) {
        ucs_warn("unsetenv(%s) failed: %m", cqe_size_env_var);
    }

    return status;
}
/**
 * Create an indirect KSM memory key over an existing MR via a raw devx
 * CREATE_MKEY command, producing an atomic rkey whose base address is
 * shifted by `offset`.
 *
 * @param ibmd     Memory domain (must have UCT_IB_MLX5_MD_FLAG_KSM set).
 * @param ib_memh  Memory handle whose MR is wrapped; on success
 *                 memh->atomic_dvmr and memh->super.atomic_rkey are set.
 * @param offset   Address offset encoded into the mkey's start address and
 *                 its low "variant" byte.
 *
 * @return UCS_OK, UCS_ERR_UNSUPPORTED (no KSM support / command rejected),
 *         or UCS_ERR_NO_MEMORY.
 */
static ucs_status_t uct_ib_mlx5dv_create_ksm(uct_ib_md_t *ibmd,
                                             uct_ib_mem_t *ib_memh,
                                             off_t offset)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    uct_ib_mlx5_md_t *md    = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);
    uint32_t out[UCT_IB_MLX5DV_ST_SZ_DW(create_mkey_out)] = {};
    struct ibv_mr *mr       = memh->super.mr;
    ucs_status_t status     = UCS_OK;
    struct mlx5dv_pd dvpd   = {};
    struct mlx5dv_obj dv    = {};
    size_t reg_length, length, inlen;
    int list_size, i;
    void *mkc, *klm;
    uint32_t *in;
    intptr_t addr;

    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) {
        return UCS_ERR_UNSUPPORTED;
    }

    /* Align the start address down to the maximal MR size; each KLM entry
     * then covers one reg_length-sized chunk of the (extended) region */
    reg_length = UCT_IB_MD_MAX_MR_SIZE;
    addr       = (intptr_t)mr->addr & ~(reg_length - 1);
    length     = mr->length + (intptr_t)mr->addr - addr;
    list_size  = ucs_div_round_up(length, reg_length);
    /* Command mailbox: fixed header plus one KLM struct per chunk */
    inlen      = UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in) +
                 UCT_IB_MLX5DV_ST_SZ_BYTES(klm) * list_size;

    in = ucs_calloc(1, inlen, "mkey mailbox");
    if (in == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    /* Extract the PD number required by the mkey context */
    dv.pd.in  = md->super.pd;
    dv.pd.out = &dvpd;
    mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD);

    UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY);
    mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
    UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_KSM);
    /* Access rights: atomic, remote read/write, local read/write */
    UCT_IB_MLX5DV_SET(mkc, mkc, a, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn);
    UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, list_size);
    UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, ucs_ilog2(reg_length));
    UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff);
    /* Low byte of the mkey carries the offset "variant" */
    UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, offset & 0xff);
    UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, addr + offset);
    UCT_IB_MLX5DV_SET64(mkc, mkc, len, length);
    UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, list_size);

    /* Fill one KLM entry per chunk, all referencing the same underlying MR */
    klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
    for (i = 0; i < list_size; i++) {
        if (i == list_size - 1) {
            /* NOTE(review): when length is an exact multiple of reg_length
             * this sets the last byte_count to 0 instead of reg_length -
             * confirm this is the intended PRM encoding */
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, length % reg_length);
        } else {
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, reg_length);
        }
        UCT_IB_MLX5DV_SET(klm, klm, mkey, mr->lkey);
        UCT_IB_MLX5DV_SET64(klm, klm, address, addr + (i * reg_length));
        klm += UCT_IB_MLX5DV_ST_SZ_BYTES(klm);
    }

    memh->atomic_dvmr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in,
                                               inlen, out, sizeof(out));
    if (memh->atomic_dvmr == NULL) {
        /* Command rejected: treat KSM as unsupported and disable it so
         * subsequent registrations skip this path */
        ucs_debug("CREATE_MKEY KSM failed: %m");
        status = UCS_ERR_UNSUPPORTED;
        md->flags &= ~UCT_IB_MLX5_MD_FLAG_KSM;
        goto out;
    }

    /* rkey = (firmware mkey index << 8) | offset variant byte */
    memh->super.atomic_rkey =
        (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) |
        (offset & 0xff);

    ucs_debug("KSM registered memory %p..%p offset 0x%lx on %s rkey 0x%x",
              mr->addr, mr->addr + mr->length, offset,
              uct_ib_device_name(&md->super.dev), memh->super.atomic_rkey);
out:
    ucs_free(in);
    return status;
}
/**
 * Fill generic interface attributes (address length, latency, bandwidth)
 * from the port's active width, speed and MTU.
 *
 * @param iface          Interface to query.
 * @param xport_hdr_len  Transport-specific header length, added to the
 *                       per-packet overhead for bandwidth estimation.
 * @param iface_attr     Zeroed and filled with the computed attributes.
 *
 * @return UCS_OK, UCS_ERR_UNSUPPORTED if the port is not an IB port, or
 *         UCS_ERR_IO_ERROR on an unrecognized width/speed value.
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    /* Lane counts indexed by log2 of the active_width bitfield (1x/4x/8x/12x) */
    static const unsigned ib_port_widths[] = {
        [0] = 1,
        [1] = 4,
        [2] = 8,
        [3] = 12
    };
    uint8_t active_width, active_speed, active_mtu;
    double encoding, signal_rate, wire_speed;
    size_t mtu, width, extra_pkt_len;

    if (!uct_ib_device_is_port_ib(dev, iface->port_num)) {
        return UCS_ERR_UNSUPPORTED;
    }

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* Get active width - must be a single bit whose log2 indexes the table */
    if (!ucs_is_pow2(active_width) ||
        (active_width < 1) || (ucs_ilog2(active_width) > 3))
    {
        ucs_error("Invalid active_width on %s:%d: %d",
                  uct_ib_device_name(dev), iface->port_num, active_width);
        return UCS_ERR_IO_ERROR;
    }

    memset(iface_attr, 0, sizeof(*iface_attr));

    iface_attr->device_addr_len = iface->addr_size;

    /* Signal rate and line encoding per IB active_speed enumeration.
     * NOTE(review): newer speeds (e.g. HDR) are not listed and would hit the
     * error branch - confirm against the supported hardware set. */
    switch (active_speed) {
    case 1: /* SDR */
        iface_attr->latency = 5000e-9;
        signal_rate         = 2.5e9;
        encoding            = 8.0/10.0;
        break;
    case 2: /* DDR */
        iface_attr->latency = 2500e-9;
        signal_rate         = 5.0e9;
        encoding            = 8.0/10.0;
        break;
    case 4: /* QDR */
        iface_attr->latency = 1300e-9;
        signal_rate         = 10.0e9;
        encoding            = 8.0/10.0;
        break;
    case 8: /* FDR10 */
        iface_attr->latency = 700e-9;
        signal_rate         = 10.3125e9;
        encoding            = 64.0/66.0;
        break;
    case 16: /* FDR */
        iface_attr->latency = 700e-9;
        signal_rate         = 14.0625e9;
        encoding            = 64.0/66.0;
        break;
    case 32: /* EDR */
        iface_attr->latency = 600e-9;
        signal_rate         = 25.78125e9;
        encoding            = 64.0/66.0;
        break;
    default:
        ucs_error("Invalid active_speed on %s:%d: %d",
                  uct_ib_device_name(dev), iface->port_num, active_speed);
        return UCS_ERR_IO_ERROR;
    }

    /* Wire speed calculation: Width * SignalRate * Encoding */
    width      = ib_port_widths[ucs_ilog2(active_width)];
    wire_speed = (width * signal_rate * encoding) / 8.0;

    /* Calculate packet overhead */
    mtu = ucs_min(uct_ib_mtu_value(active_mtu), iface->config.seg_size);

    extra_pkt_len = UCT_IB_LRH_LEN + UCT_IB_BTH_LEN + xport_hdr_len +
                    UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;

    /* Effective bandwidth = wire speed scaled by payload/packet ratio */
    iface_attr->bandwidth = (wire_speed * mtu) / (mtu + extra_pkt_len);
    return UCS_OK;
}
/**
 * Create a UMR (user-mode memory registration) indirect MR aliasing an
 * existing MR at a per-MD offset, by posting a UMR_FILL work request on the
 * MD's dedicated UMR QP and polling its CQ for completion.
 *
 * Only available when compiled with experimental verbs (HAVE_EXP_UMR);
 * otherwise always returns NULL. Two exp API variants are supported,
 * selected by HAVE_EXP_UMR_NEW_API.
 *
 * @param md  Memory domain providing the PD, UMR QP and UMR CQ.
 * @param mr  Existing memory region to alias.
 *
 * @return The new indirect MR, or NULL on any failure (errors are logged).
 */
static UCS_F_MAYBE_UNUSED struct ibv_mr *uct_ib_md_create_umr(uct_ib_md_t *md,
                                                              struct ibv_mr *mr)
{
#if HAVE_EXP_UMR
    struct ibv_exp_mem_region mem_reg;
    struct ibv_exp_send_wr wr, *bad_wr;
    struct ibv_exp_create_mr_in mrin;
    struct ibv_mr *umr;
    struct ibv_wc wc;
    int ret;
    size_t offset;

    /* UMR requires the dedicated QP/CQ created elsewhere on the MD */
    if ((md->umr_qp == NULL) || (md->umr_cq == NULL)) {
        return NULL;
    }

    offset = uct_ib_md_umr_offset(uct_ib_md_umr_id(md));

    /* Create memory key */
    memset(&mrin, 0, sizeof(mrin));
    mrin.pd = md->pd;

#ifdef HAVE_EXP_UMR_NEW_API
    mrin.attr.create_flags     = IBV_EXP_MR_INDIRECT_KLMS;
    mrin.attr.exp_access_flags = UCT_IB_MEM_ACCESS_FLAGS;
    mrin.attr.max_klm_list_size = 1;
#else
    mrin.attr.create_flags     = IBV_MR_NONCONTIG_MEM;
    mrin.attr.access_flags     = UCT_IB_MEM_ACCESS_FLAGS;
    mrin.attr.max_reg_descriptors = 1;
#endif

    umr = ibv_exp_create_mr(&mrin);
    if (!umr) {
        ucs_error("Failed to create modified_mr: %m");
        goto err;
    }

    /* Fill memory list and UMR: a single region covering the whole MR,
     * re-based at mr->addr + offset */
    memset(&wr, 0, sizeof(wr));
    memset(&mem_reg, 0, sizeof(mem_reg));

    mem_reg.base_addr = (uintptr_t) mr->addr;
    mem_reg.length    = mr->length;

#ifdef HAVE_EXP_UMR_NEW_API
    mem_reg.mr = mr;

    wr.ext_op.umr.umr_type              = IBV_EXP_UMR_MR_LIST;
    wr.ext_op.umr.mem_list.mem_reg_list = &mem_reg;
    wr.ext_op.umr.exp_access            = UCT_IB_MEM_ACCESS_FLAGS;
    wr.ext_op.umr.modified_mr           = umr;
    wr.ext_op.umr.base_addr             = (uint64_t) (uintptr_t) mr->addr + offset;
    wr.ext_op.umr.num_mrs               = 1;
#else
    mem_reg.m_key = mr;

    wr.ext_op.umr.memory_key.mkey_type          = IBV_EXP_UMR_MEM_LAYOUT_NONCONTIG;
    wr.ext_op.umr.memory_key.mem_list.mem_reg_list = &mem_reg;
    wr.ext_op.umr.memory_key.access             = UCT_IB_MEM_ACCESS_FLAGS;
    wr.ext_op.umr.memory_key.modified_mr        = umr;
    wr.ext_op.umr.memory_key.region_base_addr   = mr->addr + offset;
    wr.num_sge = 1;
#endif
    wr.exp_opcode     = IBV_EXP_WR_UMR_FILL;
    wr.exp_send_flags = IBV_EXP_SEND_INLINE | IBV_EXP_SEND_SIGNALED;

    /* Post UMR */
    ret = ibv_exp_post_send(md->umr_qp, &wr, &bad_wr);
    if (ret) {
        ucs_error("ibv_exp_post_send(UMR_FILL) failed: %m");
        goto err_free_umr;
    }

    /* Wait for send UMR completion - busy-poll the UMR CQ until the single
     * signaled completion arrives */
    for (;;) {
        ret = ibv_poll_cq(md->umr_cq, 1, &wc);
        if (ret < 0) {
            ucs_error("ibv_exp_poll_cq(umr_cq) failed: %m");
            goto err_free_umr;
        }
        if (ret == 1) {
            if (wc.status != IBV_WC_SUCCESS) {
                ucs_error("UMR_FILL completed with error: %s vendor_err %d",
                          ibv_wc_status_str(wc.status), wc.vendor_err);
                goto err_free_umr;
            }
            break;
        }
    }

    ucs_trace("UMR registered memory %p..%p offset 0x%x on %s lkey 0x%x rkey 0x%x",
              mr->addr, mr->addr + mr->length, (unsigned)offset,
              uct_ib_device_name(&md->dev), umr->lkey, umr->rkey);
    return umr;

err_free_umr:
    ibv_dereg_mr(umr);
err:
#endif
    return NULL;
}