/*
 * Check whether the device supports an inline extended (masked) atomic of
 * the given operand size.
 *
 * @param dev_attr     Experimental-verbs device attributes to inspect.
 * @param atomic_size  Atomic operand size in bytes (power of two).
 *
 * @return Nonzero if both the inline-size limit and the supported-argument-
 *         size bitmask admit this operand size; 0 otherwise (always 0 when
 *         extended atomics were not compiled in).
 */
static inline int
uct_rc_verbs_is_ext_atomic_supported(struct ibv_exp_device_attr *dev_attr,
                                     size_t atomic_size)
{
#ifdef HAVE_IB_EXT_ATOMICS
    const struct ibv_exp_ext_atomics_params *params = &dev_attr->ext_atom;

    /* The device must accept at least log2(atomic_size) bytes inline */
    if (params->log_max_atomic_inline < ucs_ilog2(atomic_size)) {
        return 0;
    }

    /* atomic_size is a power of two, so it doubles as a single-bit mask
     * selecting bit log2(atomic_size) of the supported-sizes bitmap */
    return (params->log_atomic_arg_sizes & atomic_size) != 0;
#else
    return 0;
#endif
}
/*
 * Extract the raw mlx5 CQ state (CQE buffer, length, CQE size) from a verbs
 * CQ so the transport can poll completions directly, bypassing libibverbs.
 *
 * @param cq       Verbs completion queue to introspect; must be freshly
 *                 created (no completions consumed yet).
 * @param mlx5_cq  Filled with the direct-access CQ descriptors.
 *
 * @return UCS_OK on success, UCS_ERR_NO_DEVICE if the CQ state cannot be
 *         obtained (or is already in use), UCS_ERR_UNSUPPORTED if overrun
 *         ignore cannot be enabled.
 */
ucs_status_t uct_ib_mlx5_get_cq(struct ibv_cq *cq, uct_ib_mlx5_cq_t *mlx5_cq)
{
    unsigned cqe_size;
#if HAVE_DECL_IBV_MLX5_EXP_GET_CQ_INFO
    /* Preferred path: query via the experimental-verbs API */
    struct ibv_mlx5_cq_info ibv_cq_info;
    int ret;

    ret = ibv_mlx5_exp_get_cq_info(cq, &ibv_cq_info);
    if (ret != 0) {
        return UCS_ERR_NO_DEVICE;
    }

    mlx5_cq->cq_buf    = ibv_cq_info.buf;
    mlx5_cq->cq_ci     = 0;
    mlx5_cq->cq_length = ibv_cq_info.cqe_cnt;
    cqe_size           = ibv_cq_info.cqe_size;
#else
    /* Fallback: reach into the mlx5 provider's private struct layout.
     * NOTE(review): this depends on the installed provider matching the
     * struct mlx5_cq definition we were compiled against. */
    struct mlx5_cq *mcq = ucs_container_of(cq, struct mlx5_cq, ibv_cq);
    int ret;

    /* A non-zero consumer index would desync the shadow cq_ci we reset to 0 */
    if (mcq->cons_index != 0) {
        ucs_error("CQ consumer index is not 0 (%d)", mcq->cons_index);
        return UCS_ERR_NO_DEVICE;
    }

    mlx5_cq->cq_buf    = mcq->active_buf->buf;
    mlx5_cq->cq_ci     = 0;
    /* ibv_cq.cqe is the max index, so the ring holds cqe + 1 entries */
    mlx5_cq->cq_length = mcq->ibv_cq.cqe + 1;
    cqe_size           = mcq->cqe_sz;
#endif

    /* Move buffer forward for 128b CQE, so we would get pointer to the 2nd
     * 64b when polling. */
    mlx5_cq->cq_buf += cqe_size - sizeof(struct mlx5_cqe64);

    /* We poll the CQ ourselves, so hardware overrun tracking must be off */
    ret = ibv_exp_cq_ignore_overrun(cq);
    if (ret != 0) {
        ucs_error("Failed to modify send CQ to ignore overrun: %s",
                  strerror(ret));
        return UCS_ERR_UNSUPPORTED;
    }

    /* CQE size must be an exact power of two for the log to round-trip */
    mlx5_cq->cqe_size_log = ucs_ilog2(cqe_size);
    ucs_assert_always((1<<mlx5_cq->cqe_size_log) == cqe_size);
    return UCS_OK;
}
/*
 * Post an extended (masked) atomic work request on an RC verbs endpoint,
 * with the atomic arguments passed inline in the WR itself.
 *
 * @param ep            Endpoint to post on.
 * @param opcode        IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP or
 *                      IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD.
 * @param length        Atomic operand size in bytes (power of two).
 * @param compare_mask  Compare mask (CSWAP only).
 * @param compare_add   Compare value for CSWAP, or addend for FADD.
 * @param swap          Swap value (CSWAP only).
 * @param remote_addr   Remote virtual address of the operand.
 * @param rkey          Remote key for remote_addr.
 * @param desc          Send descriptor; its payload area receives the
 *                      fetched value on completion.
 * @param force_sig     Extra send flags (e.g. forced completion signal).
 * @param success       Status to return after a successful post.
 *
 * @return The caller-supplied 'success' status.
 */
static inline ucs_status_t
uct_rc_verbs_ext_atomic_post(uct_rc_verbs_ep_t *ep, int opcode, uint32_t length,
                             uint64_t compare_mask, uint64_t compare_add,
                             uint64_t swap, uint64_t remote_addr,
                             uct_rkey_t rkey, uct_rc_iface_send_desc_t *desc,
                             int force_sig, ucs_status_t success)
{
    struct ibv_exp_send_wr wr;
    struct ibv_sge sge;

    /* The HCA scatters the fetched value just past the descriptor header */
    sge.addr          = (uintptr_t)(desc + 1);
    sge.lkey          = desc->lkey;
    sge.length        = length;
    wr.next           = NULL;
    wr.sg_list        = &sge;
    wr.num_sge        = 1;
    wr.exp_opcode     = opcode;
    /* Arguments travel inline in the WR, not via a gather entry */
    wr.exp_send_flags = IBV_EXP_SEND_EXT_ATOMIC_INLINE;
    wr.comp_mask      = 0;
    wr.ext_op.masked_atomics.log_arg_sz  = ucs_ilog2(length);
    wr.ext_op.masked_atomics.remote_addr = remote_addr;
    wr.ext_op.masked_atomics.rkey        = rkey;

    switch (opcode) {
    case IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP:
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.compare_mask = compare_mask;
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.compare_val  = compare_add;
        /* All-ones swap mask: replace the whole operand, not a sub-field */
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.swap_mask    = (uint64_t)(-1);
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.swap_val     = swap;
        break;
    case IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD:
        wr.ext_op.masked_atomics.wr_data.inline_data.op.fetch_add.add_val        = compare_add;
        /* No field boundary: plain (non-masked) fetch-and-add semantics */
        wr.ext_op.masked_atomics.wr_data.inline_data.op.fetch_add.field_boundary = 0;
        break;
    }

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_verbs_exp_post_send(ep, &wr, force_sig);
    /* Track the descriptor so it is released when the completion arrives */
    uct_rc_verbs_ep_push_desc(ep, desc);
    return success;
}
/*
 * Fill generic interface attributes for an IB interface: device address
 * length, estimated latency, effective bandwidth (accounting for per-packet
 * protocol overhead), and device priority from the static device table.
 *
 * @param iface          IB interface to query.
 * @param xport_hdr_len  Transport-specific header length added per packet.
 * @param iface_attr     Output attributes; fully overwritten.
 *
 * @return UCS_OK, or UCS_ERR_IO_ERROR if the port reports an invalid
 *         active width/speed.
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    /* Lane counts indexed by log2 of the IBTA active_width flag bit */
    static const unsigned ib_port_widths[] = {
        [0] = 1,
        [1] = 4,
        [2] = 8,
        [3] = 12
    };
    uint8_t active_width, active_speed, active_mtu;
    double encoding, signal_rate, wire_speed;
    size_t mtu, width, extra_pkt_len;
    int i = 0;

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* Get active width */
    /* active_width must be exactly one of the 1x/4x/8x/12x flag bits */
    if (!ucs_is_pow2(active_width) ||
        (active_width < 1) || (ucs_ilog2(active_width) > 3))
    {
        ucs_error("Invalid active_width on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_width);
        return UCS_ERR_IO_ERROR;
    }

    memset(iface_attr, 0, sizeof(*iface_attr));
    iface_attr->device_addr_len = iface->addr_size;

    /* Map the IBTA active_speed flag to latency estimate, per-lane signal
     * rate, and line-code efficiency */
    switch (active_speed) {
    case 1: /* SDR */
        iface_attr->latency = 5000e-9;
        signal_rate         = 2.5e9;
        encoding            = 8.0/10.0;
        break;
    case 2: /* DDR */
        iface_attr->latency = 2500e-9;
        signal_rate         = 5.0e9;
        encoding            = 8.0/10.0;
        break;
    case 4: /* QDR */
        iface_attr->latency = 1300e-9;
        signal_rate         = 10.0e9;
        encoding            = 8.0/10.0;
        break;
    case 8: /* FDR10 */
        iface_attr->latency = 700e-9;
        signal_rate         = 10.3125e9;
        encoding            = 64.0/66.0;
        break;
    case 16: /* FDR */
        iface_attr->latency = 700e-9;
        signal_rate         = 14.0625e9;
        encoding            = 64.0/66.0;
        break;
    case 32: /* EDR */
        iface_attr->latency = 600e-9;
        signal_rate         = 25.78125e9;
        encoding            = 64.0/66.0;
        break;
    default:
        ucs_error("Invalid active_speed on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_speed);
        return UCS_ERR_IO_ERROR;
    }

    /* Wire speed calculation: Width * SignalRate * Encoding */
    width      = ib_port_widths[ucs_ilog2(active_width)];
    wire_speed = (width * signal_rate * encoding) / 8.0; /* bytes/sec */

    /* Calculate packet overhead */
    mtu           = ucs_min(uct_ib_mtu_value(active_mtu),
                            iface->config.seg_size);
    extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len +
                    UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;
    if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(iface))) {
        /* RoCE carries a GRH plus Ethernet framing on every packet */
        extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN;
    } else {
        /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */
        extra_pkt_len += UCT_IB_LRH_LEN;
    }

    /* Effective bandwidth = wire speed scaled by payload/packet ratio */
    iface_attr->bandwidth = (wire_speed * mtu) / (mtu + extra_pkt_len);

    /* Set priority of current device */
    iface_attr->priority = 0;
    /* Table is terminated by a zero vendor_part_id entry */
    while (uct_ib_device_info_table[i].vendor_part_id != 0) {
        if (uct_ib_device_info_table[i].vendor_part_id ==
            dev->dev_attr.vendor_part_id)
        {
            iface_attr->priority = uct_ib_device_info_table[i].priority;
            break;
        }
        i++;
    }

    return UCS_OK;
}
/*
 * Serialize the worker address blob: worker uuid, worker name, then for each
 * device its MD index + flags, device address, and per-transport entries
 * (name checksum, packed iface attributes, iface address, optional ep
 * address). The layout must stay in lockstep with the unpacking code and
 * with the size computation that produced 'size'.
 *
 * @param worker       Worker whose address is being packed.
 * @param ep           Endpoint whose per-transport ep addresses are packed,
 *                     or NULL to omit ep addresses.
 * @param buffer       Output buffer of exactly 'size' bytes.
 * @param size         Pre-computed total packed size (asserted at the end).
 * @param tl_bitmap    Bitmap of transports being packed, used to compute
 *                     address-order indices.
 * @param order        If non-NULL, order[k] receives the address index of
 *                     the k-th set bit of tl_bitmap.
 * @param devices      Packed-device descriptors to emit.
 * @param num_devices  Number of entries in 'devices'.
 *
 * @return UCS_OK, or the first failure from querying addresses.
 */
static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
                                        void *buffer, size_t size,
                                        uint64_t tl_bitmap, unsigned *order,
                                        const ucp_address_packed_device_t *devices,
                                        ucp_rsc_index_t num_devices)
{
    ucp_context_h context = worker->context;
    const ucp_address_packed_device_t *dev;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t md_index;
    ucs_status_t status;
    ucp_rsc_index_t i;
    size_t iface_addr_len;
    size_t ep_addr_len;
    uint64_t md_flags;
    unsigned index;
    void *ptr;
    uint8_t *iface_addr_len_ptr;

    ptr   = buffer;
    index = 0;

    *(uint64_t*)ptr = worker->uuid;
    ptr += sizeof(uint64_t);
    ptr = ucp_address_pack_string(ucp_worker_get_name(worker), ptr);

    if (num_devices == 0) {
        /* Empty address: a single sentinel byte instead of a device list */
        *((uint8_t*)ptr) = UCP_NULL_RESOURCE;
        ++ptr;
        goto out;
    }

    for (dev = devices; dev < devices + num_devices; ++dev) {

        /* MD index */
        /* One byte: MD index in the low bits, EMPTY/ALLOC/REG flags above */
        md_index = context->tl_rscs[dev->rsc_index].md_index;
        md_flags = context->tl_mds[md_index].attr.cap.flags;
        ucs_assert_always(!(md_index & ~UCP_ADDRESS_FLAG_MD_MASK));

        *(uint8_t*)ptr = md_index |
                         ((dev->tl_bitmap == 0)          ? UCP_ADDRESS_FLAG_EMPTY    : 0) |
                         ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) |
                         ((md_flags & UCT_MD_FLAG_REG)   ? UCP_ADDRESS_FLAG_MD_REG   : 0);
        ++ptr;

        /* Device address length */
        /* LAST flag marks the final device entry for the unpacker */
        ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST);
        *(uint8_t*)ptr = dev->dev_addr_len |
                         ((dev == (devices + num_devices - 1)) ? UCP_ADDRESS_FLAG_LAST : 0);
        ++ptr;

        /* Device address */
        status = uct_iface_get_device_address(worker->ifaces[dev->rsc_index].iface,
                                              (uct_device_addr_t*)ptr);
        if (status != UCS_OK) {
            return status;
        }

        ucp_address_memchek(ptr, dev->dev_addr_len,
                            &context->tl_rscs[dev->rsc_index].tl_rsc);
        ptr += dev->dev_addr_len;

        for (i = 0; i < context->num_tls; ++i) {

            if (!(UCS_BIT(i) & dev->tl_bitmap)) {
                continue;
            }

            /* Transport name checksum */
            *(uint16_t*)ptr = context->tl_rscs[i].tl_name_csum;
            ptr += sizeof(uint16_t);

            /* Transport information */
            ucp_address_pack_iface_attr(ptr, &worker->ifaces[i].attr,
                                        worker->atomic_tls & UCS_BIT(i));
            ucp_address_memchek(ptr, sizeof(ucp_address_packed_iface_attr_t),
                                &context->tl_rscs[dev->rsc_index].tl_rsc);
            ptr += sizeof(ucp_address_packed_iface_attr_t);

            iface_attr = &worker->ifaces[i].attr;

            /* A transport must be connectable one way or the other */
            if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
                !(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP)) {
                return UCS_ERR_INVALID_ADDR;
            }

            /* Pack iface address */
            /* The length byte is written last (below) so its flag bits can
             * be decided after the ep-address check; the address itself is
             * written at ptr + 1 now */
            iface_addr_len = iface_attr->iface_addr_len;
            ucs_assert(iface_addr_len < UCP_ADDRESS_FLAG_EP_ADDR);

            status = uct_iface_get_address(worker->ifaces[i].iface,
                                           (uct_iface_addr_t*)(ptr + 1));
            if (status != UCS_OK) {
                return status;
            }
            ucp_address_memchek(ptr + 1, iface_addr_len,
                                &context->tl_rscs[dev->rsc_index].tl_rsc);
            iface_addr_len_ptr  = ptr;
            /* LAST marks the final transport of this device: the highest set
             * bit of the device's tl_bitmap */
            *iface_addr_len_ptr = iface_addr_len |
                                  ((i == ucs_ilog2(dev->tl_bitmap)) ? UCP_ADDRESS_FLAG_LAST : 0);
            ptr += 1 + iface_addr_len;

            /* Pack ep address if present */
            if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
                (ep != NULL)) {
                /* Flag in the iface length byte tells the unpacker an ep
                 * address record follows */
                *iface_addr_len_ptr |= UCP_ADDRESS_FLAG_EP_ADDR;

                ep_addr_len    = iface_attr->ep_addr_len;
                ucs_assert(ep_addr_len < UINT8_MAX);
                *(uint8_t*)ptr = ep_addr_len;

                status = ucp_address_pack_ep_address(ep, i, ptr + 1);
                if (status != UCS_OK) {
                    return status;
                }
                ucp_address_memchek(ptr + 1, ep_addr_len,
                                    &context->tl_rscs[dev->rsc_index].tl_rsc);
                ptr += 1 + ep_addr_len;
            }

            /* Save the address index of this transport */
            if (order != NULL) {
                order[ucs_count_one_bits(tl_bitmap & UCS_MASK(i))] = index;
            }

            ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT
                      " md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
                      "lat_ovh: %e dev_priority %d",
                      index,
                      UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[i].tl_rsc),
                      md_flags, worker->ifaces[i].attr.cap.flags,
                      worker->ifaces[i].attr.bandwidth,
                      worker->ifaces[i].attr.overhead,
                      worker->ifaces[i].attr.latency.overhead,
                      worker->ifaces[i].attr.priority);
            ++index;
        }
    }

out:
    /* The packed stream must exactly fill the pre-computed size */
    ucs_assertv(buffer + size == ptr, "buffer=%p size=%zu ptr=%p ptr-buffer=%zd",
                buffer, size, ptr, ptr - buffer);
    return UCS_OK;
}
/*
 * Create an indirect KSM mkey that aliases the registered region at a
 * shifted address (addr + offset), used to expose an "atomic" rkey whose
 * mapping is offset from the original registration.
 *
 * The region is described as a list of KSM entries, each covering up to
 * reg_length (UCT_IB_MD_MAX_MR_SIZE) bytes of the underlying MR.
 *
 * @param ibmd     IB memory domain (must have UCT_IB_MLX5_MD_FLAG_KSM).
 * @param ib_memh  Memory handle whose MR is re-described; on success its
 *                 atomic_rkey and atomic_dvmr are set.
 * @param offset   Byte offset applied to the aliased start address; its low
 *                 8 bits also seed the mkey tag.
 *
 * @return UCS_OK on success; UCS_ERR_UNSUPPORTED if KSM is unavailable or
 *         the CREATE_MKEY command fails (KSM flag is then cleared on the MD);
 *         UCS_ERR_NO_MEMORY if the command mailbox cannot be allocated.
 */
static ucs_status_t uct_ib_mlx5dv_create_ksm(uct_ib_md_t *ibmd,
                                             uct_ib_mem_t *ib_memh,
                                             off_t offset)
{
    uct_ib_mlx5_mem_t *memh = ucs_derived_of(ib_memh, uct_ib_mlx5_mem_t);
    uct_ib_mlx5_md_t *md    = ucs_derived_of(ibmd, uct_ib_mlx5_md_t);
    uint32_t out[UCT_IB_MLX5DV_ST_SZ_DW(create_mkey_out)] = {};
    struct ibv_mr *mr       = memh->super.mr;
    ucs_status_t status     = UCS_OK;
    struct mlx5dv_pd dvpd   = {};
    struct mlx5dv_obj dv    = {};
    size_t reg_length, length, inlen, tail;
    int list_size, i;
    void *mkc, *klm;
    uint32_t *in;
    intptr_t addr;

    if (!(md->flags & UCT_IB_MLX5_MD_FLAG_KSM)) {
        return UCS_ERR_UNSUPPORTED;
    }

    /* Align the start down to a reg_length boundary (reg_length is assumed
     * to be a power of two) and extend the length accordingly, so every KSM
     * entry except possibly the last covers a full chunk */
    reg_length = UCT_IB_MD_MAX_MR_SIZE;
    addr       = (intptr_t)mr->addr & ~(reg_length - 1);
    length     = mr->length + (intptr_t)mr->addr - addr;
    list_size  = ucs_div_round_up(length, reg_length);
    inlen      = UCT_IB_MLX5DV_ST_SZ_BYTES(create_mkey_in) +
                 UCT_IB_MLX5DV_ST_SZ_BYTES(klm) * list_size;

    in = ucs_calloc(1, inlen, "mkey mailbox");
    if (in == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    /* Resolve the PD number needed by the CREATE_MKEY command */
    dv.pd.in  = md->super.pd;
    dv.pd.out = &dvpd;
    mlx5dv_init_obj(&dv, MLX5DV_OBJ_PD);

    UCT_IB_MLX5DV_SET(create_mkey_in, in, opcode, UCT_IB_MLX5_CMD_OP_CREATE_MKEY);
    mkc = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
    UCT_IB_MLX5DV_SET(mkc, mkc, access_mode_1_0, UCT_IB_MLX5_MKC_ACCESS_MODE_KSM);
    UCT_IB_MLX5DV_SET(mkc, mkc, a, 1);  /* atomic access */
    UCT_IB_MLX5DV_SET(mkc, mkc, rw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, rr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lw, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, lr, 1);
    UCT_IB_MLX5DV_SET(mkc, mkc, pd, dvpd.pdn);
    UCT_IB_MLX5DV_SET(mkc, mkc, translations_octword_size, list_size);
    UCT_IB_MLX5DV_SET(mkc, mkc, log_entity_size, ucs_ilog2(reg_length));
    UCT_IB_MLX5DV_SET(mkc, mkc, qpn, 0xffffff); /* not bound to a QP */
    UCT_IB_MLX5DV_SET(mkc, mkc, mkey_7_0, offset & 0xff);
    /* The aliased region starts at the shifted address */
    UCT_IB_MLX5DV_SET64(mkc, mkc, start_addr, addr + offset);
    UCT_IB_MLX5DV_SET64(mkc, mkc, len, length);
    UCT_IB_MLX5DV_SET(create_mkey_in, in, translations_octword_actual_size, list_size);

    /* One KLM entry per reg_length chunk of the underlying MR */
    klm = UCT_IB_MLX5DV_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
    for (i = 0; i < list_size; i++) {
        if (i == list_size - 1) {
            /* Tail entry covers the remainder. BUGFIX: when length is an
             * exact multiple of reg_length the remainder is a full chunk,
             * not 0 bytes (length % reg_length would yield 0 here). */
            tail = length % reg_length;
            UCT_IB_MLX5DV_SET(klm, klm, byte_count,
                              (tail != 0) ? tail : reg_length);
        } else {
            UCT_IB_MLX5DV_SET(klm, klm, byte_count, reg_length);
        }
        UCT_IB_MLX5DV_SET(klm, klm, mkey, mr->lkey);
        UCT_IB_MLX5DV_SET64(klm, klm, address, addr + (i * reg_length));
        klm += UCT_IB_MLX5DV_ST_SZ_BYTES(klm);
    }

    memh->atomic_dvmr = mlx5dv_devx_obj_create(md->super.dev.ibv_context, in,
                                               inlen, out, sizeof(out));
    if (memh->atomic_dvmr == NULL) {
        ucs_debug("CREATE_MKEY KSM failed: %m");
        status = UCS_ERR_UNSUPPORTED;
        /* Stop trying KSM on this MD for subsequent registrations */
        md->flags &= ~UCT_IB_MLX5_MD_FLAG_KSM;
        goto out;
    }

    /* rkey = mkey index in the high bits, offset-derived tag in the low 8 */
    memh->super.atomic_rkey =
        (UCT_IB_MLX5DV_GET(create_mkey_out, out, mkey_index) << 8) |
        (offset & 0xff);

    ucs_debug("KSM registered memory %p..%p offset 0x%lx on %s rkey 0x%x",
              mr->addr, mr->addr + mr->length, offset,
              uct_ib_device_name(&md->super.dev), memh->super.atomic_rkey);
out:
    ucs_free(in);
    return status;
}
/*
 * Serialize the worker address blob (older layout): worker uuid, worker
 * name, then for each device its PD index byte, device address, and
 * per-transport entries (transport name string, length byte, then either an
 * iface address or an ep address depending on the transport's connect
 * capability). The layout must stay in lockstep with the unpacking code and
 * with the size computation that produced 'size'.
 *
 * @param worker       Worker whose address is being packed.
 * @param ep           Endpoint whose ep addresses are packed for
 *                     connect-to-ep transports, or NULL to pack none.
 * @param buffer       Output buffer of exactly 'size' bytes.
 * @param size         Pre-computed total packed size (asserted at the end).
 * @param tl_bitmap    Bitmap of transports being packed, used to compute
 *                     address-order indices.
 * @param order        If non-NULL, order[k] receives the address index of
 *                     the k-th set bit of tl_bitmap.
 * @param devices      Packed-device descriptors to emit.
 * @param num_devices  Number of entries in 'devices'.
 *
 * @return UCS_OK, or the first failure from querying addresses; transports
 *         with neither connect capability yield UCS_ERR_INVALID_ADDR.
 */
static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
                                        void *buffer, size_t size,
                                        uint64_t tl_bitmap, unsigned *order,
                                        const ucp_address_packed_device_t *devices,
                                        ucp_rsc_index_t num_devices)
{
    ucp_context_h context = worker->context;
    const ucp_address_packed_device_t *dev;
    uct_iface_attr_t *iface_attr;
    ucs_status_t status;
    ucp_rsc_index_t i;
    size_t tl_addr_len;
    unsigned index;
    void *ptr;

    ptr   = buffer;
    index = 0;

    *(uint64_t*)ptr = worker->uuid;
    ptr += sizeof(uint64_t);
    ptr = ucp_address_pack_string(ucp_worker_get_name(worker), ptr);

    if (num_devices == 0) {
        /* Empty address: a single sentinel byte instead of a device list */
        *((uint8_t*)ptr) = UCP_NULL_RESOURCE;
        ++ptr;
        goto out;
    }

    for (dev = devices; dev < devices + num_devices; ++dev) {

        /* PD index */
        /* One byte: PD index with the EMPTY flag when the device carries no
         * transports */
        *(uint8_t*)ptr = context->tl_rscs[dev->rsc_index].pd_index |
                         ((dev->tl_bitmap == 0) ? UCP_ADDRESS_FLAG_EMPTY : 0);
        ++ptr;

        /* Device address length */
        /* LAST flag marks the final device entry for the unpacker */
        ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST);
        *(uint8_t*)ptr = dev->dev_addr_len |
                         ((dev == (devices + num_devices - 1)) ? UCP_ADDRESS_FLAG_LAST : 0);
        ++ptr;

        /* Device address */
        status = uct_iface_get_device_address(worker->ifaces[dev->rsc_index],
                                              (uct_device_addr_t*)ptr);
        if (status != UCS_OK) {
            return status;
        }

        ptr += dev->dev_addr_len;

        for (i = 0; i < context->num_tls; ++i) {

            if (!(UCS_BIT(i) & dev->tl_bitmap)) {
                continue;
            }

            /* Transport name */
            ptr = ucp_address_pack_string(context->tl_rscs[i].tl_rsc.tl_name, ptr);

            /* Transport address length */
            /* The address payload is written at ptr + 1; the length byte at
             * ptr is filled in afterwards so its LAST flag can be added */
            iface_attr = &worker->iface_attrs[i];
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
                tl_addr_len = iface_attr->iface_addr_len;
                status = uct_iface_get_address(worker->ifaces[i],
                                               (uct_iface_addr_t*)(ptr + 1));
            } else if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
                if (ep == NULL) {
                    /* No endpoint yet: emit a zero-length placeholder */
                    tl_addr_len = 0;
                    status      = UCS_OK;
                } else {
                    tl_addr_len = iface_attr->ep_addr_len;
                    status      = ucp_address_pack_ep_address(ep, i, ptr + 1);
                }
            } else {
                status = UCS_ERR_INVALID_ADDR;
            }
            if (status != UCS_OK) {
                return status;
            }

            ucp_address_memchek(ptr + 1, tl_addr_len,
                                &context->tl_rscs[dev->rsc_index].tl_rsc);

            /* Save the address index of this transport */
            if (order != NULL) {
                order[ucs_count_one_bits(tl_bitmap & UCS_MASK(i))] = index++;
            }

            /* LAST marks the final transport of this device: the highest set
             * bit of the device's tl_bitmap */
            ucs_assert(tl_addr_len < UCP_ADDRESS_FLAG_LAST);
            *(uint8_t*)ptr = tl_addr_len |
                             ((i == ucs_ilog2(dev->tl_bitmap)) ? UCP_ADDRESS_FLAG_LAST : 0);
            ptr += 1 + tl_addr_len;
        }
    }

out:
    /* The packed stream must exactly fill the pre-computed size */
    ucs_assertv(buffer + size == ptr, "buffer=%p size=%zu ptr=%p",
                buffer, size, ptr);
    return UCS_OK;
}
/*
 * Fill generic interface attributes for an IB interface: device address
 * length, estimated latency, and effective bandwidth derived from the
 * port's active width, speed, and MTU after subtracting per-packet
 * protocol overhead.
 *
 * @param iface          IB interface to query.
 * @param xport_hdr_len  Transport-specific header length added per packet.
 * @param iface_attr     Output attributes; fully overwritten.
 *
 * @return UCS_OK; UCS_ERR_UNSUPPORTED if the port link layer is not IB;
 *         UCS_ERR_IO_ERROR on an invalid active width/speed.
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    /* Lane counts, indexed by log2 of the IBTA active_width flag bit */
    static const unsigned lane_count[] = {
        [0] = 1,
        [1] = 4,
        [2] = 8,
        [3] = 12
    };
    /* Link parameters, indexed by log2 of the IBTA active_speed flag bit:
     * latency estimate, per-lane signal rate, line-code efficiency */
    static const struct {
        double latency;
        double signal_rate;
        double encoding;
    } link_speed[] = {
        [0] = { 5000e-9, 2.5e9,      8.0 / 10.0  }, /* SDR   */
        [1] = { 2500e-9, 5.0e9,      8.0 / 10.0  }, /* DDR   */
        [2] = { 1300e-9, 10.0e9,     8.0 / 10.0  }, /* QDR   */
        [3] = {  700e-9, 10.3125e9,  64.0 / 66.0 }, /* FDR10 */
        [4] = {  700e-9, 14.0625e9,  64.0 / 66.0 }, /* FDR   */
        [5] = {  600e-9, 25.78125e9, 64.0 / 66.0 }  /* EDR   */
    };
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    uint8_t active_width, active_speed, active_mtu;
    size_t mtu, extra_pkt_len;
    double wire_speed;
    unsigned speed_idx;

    if (!uct_ib_device_is_port_ib(dev, iface->port_num)) {
        return UCS_ERR_UNSUPPORTED;
    }

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* active_width must be exactly one of the 1x/4x/8x/12x flag bits */
    if (!ucs_is_pow2(active_width) || (active_width < 1) ||
        (ucs_ilog2(active_width) > 3))
    {
        ucs_error("Invalid active_width on %s:%d: %d",
                  uct_ib_device_name(dev), iface->port_num, active_width);
        return UCS_ERR_IO_ERROR;
    }

    memset(iface_attr, 0, sizeof(*iface_attr));
    iface_attr->device_addr_len = iface->addr_size;

    /* active_speed must likewise be a single recognized flag bit
     * (SDR..EDR, i.e. 1, 2, 4, 8, 16, or 32) */
    if (!ucs_is_pow2(active_speed) || (active_speed < 1) ||
        (ucs_ilog2(active_speed) > 5))
    {
        ucs_error("Invalid active_speed on %s:%d: %d",
                  uct_ib_device_name(dev), iface->port_num, active_speed);
        return UCS_ERR_IO_ERROR;
    }

    speed_idx           = ucs_ilog2(active_speed);
    iface_attr->latency = link_speed[speed_idx].latency;

    /* Wire speed calculation: Width * SignalRate * Encoding (bytes/sec) */
    wire_speed = (lane_count[ucs_ilog2(active_width)] *
                  link_speed[speed_idx].signal_rate *
                  link_speed[speed_idx].encoding) / 8.0;

    /* Calculate packet overhead */
    mtu           = ucs_min(uct_ib_mtu_value(active_mtu),
                            iface->config.seg_size);
    extra_pkt_len = UCT_IB_LRH_LEN + UCT_IB_BTH_LEN + xport_hdr_len +
                    UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;

    /* Effective bandwidth = wire speed scaled by payload/packet ratio */
    iface_attr->bandwidth = (wire_speed * mtu) / (mtu + extra_pkt_len);
    return UCS_OK;
}