static void
ucp_address_unpack_iface_attr(ucp_address_iface_attr_t *iface_attr,
                              const ucp_address_packed_iface_attr_t *packed)
{
    uint32_t packed_flag;
    uint64_t bit;

    iface_attr->cap_flags = 0;
    iface_attr->priority  = packed->prio_cap_flags & UCS_MASK(8);
    iface_attr->overhead  = packed->overhead;
    iface_attr->bandwidth = packed->bandwidth;
    iface_attr->lat_ovh   = packed->lat_ovh;

    /* Unpack iface capability flags. Only the bits present in
     * UCP_ADDRESS_IFACE_FLAGS were packed, compressed into consecutive
     * bits starting at bit 8 of prio_cap_flags. */
    packed_flag = UCS_BIT(8);
    bit         = 1;
    while (UCP_ADDRESS_IFACE_FLAGS & ~(bit - 1)) {
        if (UCP_ADDRESS_IFACE_FLAGS & bit) {
            if (packed->prio_cap_flags & packed_flag) {
                iface_attr->cap_flags |= bit;
            }
            packed_flag <<= 1;
        }
        bit <<= 1;
    }
}
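/*
 * A minimal standalone sketch of the flag-compaction scheme unpacked above,
 * written against a hypothetical DEMO_FLAGS mask (DEMO_BIT/DEMO_FLAGS are
 * illustrative, not UCX definitions). Only the bits that are set in the
 * compile-time mask consume slots in the packed word, so a sparse 64-bit
 * capability mask fits into a handful of packed bits.
 */
#include <stdint.h>

#define DEMO_BIT(i)  (1ull << (i))
#define DEMO_FLAGS   (DEMO_BIT(3) | DEMO_BIT(17) | DEMO_BIT(40))

static uint32_t demo_pack_flags(uint64_t cap_flags)
{
    uint32_t packed      = 0;
    uint32_t packed_flag = 1;
    uint64_t bit;

    for (bit = 1; DEMO_FLAGS & ~(bit - 1); bit <<= 1) {
        if (DEMO_FLAGS & bit) {
            if (cap_flags & bit) {
                packed |= packed_flag;
            }
            packed_flag <<= 1; /* a packed slot exists only for mask bits */
        }
    }
    return packed;
}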
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_t *iface)
{
    uct_rc_iface_send_op_t *op;
    struct mlx5_cqe64 *cqe;
    uct_rc_mlx5_ep_t *ep;
    unsigned qp_num;
    uint16_t hw_ci;

    cqe = uct_ib_mlx5_get_cqe(&iface->tx.cq, UCT_IB_MLX5_CQE64_SIZE_LOG);
    if (cqe == NULL) {
        return;
    }

    UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    ep     = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num),
                            uct_rc_mlx5_ep_t);
    ucs_assert(ep != NULL);

    hw_ci               = ntohs(cqe->wqe_counter);
    ep->super.available = uct_ib_mlx5_txwq_update_bb(&ep->tx.wq, hw_ci);
    ++iface->super.tx.cq_available;

    /* Process completions up to the hardware consumer index. The send serial
     * numbers are 16-bit and wrap around, hence the circular comparison. */
    ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                               UCS_CIRCULAR_COMPARE16(op->sn, <=, hw_ci)) {
        op->handler(op);
    }
}
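/*
 * Why the circular comparison above matters: op->sn and hw_ci are 16-bit
 * counters that wrap around, so a plain '<=' misorders values near the wrap.
 * A sketch of the signed-difference trick (DEMO_CIRC_CMP16 is illustrative,
 * not the UCX macro), valid while the counters stay less than 2^15 apart:
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_CIRC_CMP16(a, op, b) \
    ((int16_t)((uint16_t)(a) - (uint16_t)(b)) op 0)

static void demo_circular_compare(void)
{
    assert(DEMO_CIRC_CMP16(0x0001, <=, 0x0002));  /* ordinary case          */
    assert(DEMO_CIRC_CMP16(0xFFFF, <=, 0x0003));  /* across the wraparound  */
    assert(!DEMO_CIRC_CMP16(0x0003, <=, 0xFFFF)); /* and not the other way  */
}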
static void uct_dc_mlx5_iface_handle_failure(uct_ib_iface_t *ib_iface,
                                             void *arg)
{
    struct mlx5_cqe64 *cqe = arg;
    uint32_t qp_num        = ntohl(cqe->sop_drop_qpn) &
                             UCS_MASK(UCT_IB_QPN_ORDER);

    uct_ib_mlx5_completion_with_err(arg, UCS_LOG_LEVEL_ERROR);
    uct_dc_handle_failure(ib_iface, qp_num);
}
ucs_status_t uct_dc_mlx5_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare,
                                           uint32_t swap, uint64_t remote_addr,
                                           uct_rkey_t rkey, uint32_t *result,
                                           uct_completion_t *comp)
{
    return uct_dc_mlx5_ep_atomic_fop(ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t),
                                     MLX5_OPCODE_ATOMIC_MASKED_CS, result, 1,
                                     sizeof(uint32_t), remote_addr, rkey,
                                     UCS_MASK(32), htonl(compare), -1,
                                     htonl(swap), comp);
}
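/*
 * Host-side sketch of what a masked compare-and-swap computes (the HCA does
 * this atomically; this is an illustration of the semantics, not the device
 * implementation). Bits selected by compare_mask must match for the swap to
 * take effect, and only bits selected by swap_mask are replaced. Passing an
 * all-ones compare mask (UCS_MASK(32) for a 32-bit operand) and an all-ones
 * swap mask (-1), as above, reduces it to a plain cswap.
 */
#include <stdint.h>

static uint64_t demo_masked_cswap(uint64_t *dest, uint64_t compare,
                                  uint64_t compare_mask, uint64_t swap,
                                  uint64_t swap_mask)
{
    uint64_t old = *dest;

    if ((old & compare_mask) == (compare & compare_mask)) {
        *dest = (old & ~swap_mask) | (swap & swap_mask);
    }
    return old; /* the fetched value, delivered to *result by the FOP */
}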
static int ucp_is_resource_enabled(uct_tl_resource_desc_t *resource,
                                   const ucp_config_t *config,
                                   uint64_t *devices_mask_p)
{
    int device_enabled, tl_enabled;
    unsigned config_idx;

    ucs_assert(config->devices.count > 0);
    if (!strcmp(config->devices.names[0], "all")) {
        /* if the user's list is 'all', use all the available resources */
        device_enabled  = 1;
        *devices_mask_p = 1;
    } else {
        /* go over the device list from the user and check which entries can
         * be satisfied by the available resources */
        device_enabled  = 0;
        *devices_mask_p = 0;
        ucs_assert_always(config->devices.count <= 64); /* Using uint64_t bitmap */
        for (config_idx = 0; config_idx < config->devices.count; ++config_idx) {
            if (!strcmp(config->devices.names[config_idx], resource->dev_name)) {
                device_enabled   = 1;
                /* set the single bit of the matching config entry;
                 * UCS_MASK(config_idx) would wrongly set all the bits below
                 * it, and no bit at all for index 0 */
                *devices_mask_p |= UCS_BIT(config_idx);
            }
        }
    }

    /* Disable the posix mmap and xpmem 'devices' for now - use only sysv for
     * shared memory. This will be removed once multi-rail is supported */
    if (!strcmp(resource->dev_name, "posix") ||
        !strcmp(resource->dev_name, "xpmem")) {
        device_enabled = 0;
    }

    ucs_assert(config->tls.count > 0);
    if (!strcmp(config->tls.names[0], "all")) {
        /* if the user's list is 'all', use all the available tls */
        tl_enabled = 1;
    } else {
        /* go over the tls list from the user and compare it against the
         * available resources */
        tl_enabled = 0;
        for (config_idx = 0; config_idx < config->tls.count; ++config_idx) {
            if (!strcmp(config->tls.names[config_idx], resource->tl_name)) {
                tl_enabled = 1;
                break;
            }
        }
    }

    ucs_trace(UCT_TL_RESOURCE_DESC_FMT " is %sabled",
              UCT_TL_RESOURCE_DESC_ARG(resource),
              (device_enabled && tl_enabled) ? "en" : "dis");
    return device_enabled && tl_enabled;
}
static inline uct_rkey_t ucp_lookup_uct_rkey(ucp_ep_h ep, ucp_rkey_h rkey)
{
    unsigned rkey_index;

    /*
     * Calculate the rkey index inside the compact array. This is actually the
     * number of PDs in the map with an index lower than ours. So mask pd_map
     * to keep only the lower indices, and then count them using a popcount
     * operation.
     * TODO save the mask in ep->uct, to avoid the shift operation.
     */
    rkey_index = ucs_count_one_bits(rkey->pd_map &
                                    UCS_MASK(ep->uct.dst_pd_index));
    return rkey->uct[rkey_index].rkey;
}
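/*
 * A minimal sketch of the popcount indexing trick used above: entries for a
 * sparse set of up to 64 slots are stored as a compact array plus a bitmap,
 * and the array index of slot 'n' is the number of set bits below bit 'n'.
 * __builtin_popcountll (GCC/Clang) stands in for ucs_count_one_bits() here.
 */
#include <stdint.h>

static unsigned demo_compact_index(uint64_t bitmap, unsigned slot)
{
    /* bits 0..slot-1, equivalent to UCS_MASK(slot) for slot < 64 */
    uint64_t below = (1ull << slot) - 1;
    return (unsigned)__builtin_popcountll(bitmap & below);
}

/* e.g. with bitmap 0b101001 (slots 0, 3, 5 present):
 * slot 0 -> index 0, slot 3 -> index 1, slot 5 -> index 2 */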
static ucs_status_ptr_t ucp_disconnect_nb_internal(ucp_ep_h ep)
{
    ucs_status_t status;
    ucp_request_t *req;

    ucs_debug("disconnect ep %p", ep);

    req = ucs_mpool_get(&ep->worker->req_mp);
    if (req == NULL) {
        return UCS_STATUS_PTR(UCS_ERR_NO_MEMORY);
    }

    /*
     * Flush operation can be queued on the pending queue of only one of the
     * lanes (indicated by req->send.lane) and scheduled for completion on any
     * number of lanes. req->send.uct_comp.count keeps track of how many lanes
     * are not flushed yet, and when it reaches zero, it means all lanes are
     * flushed. req->send.flush.lanes keeps track of which lanes we still have
     * to start flush on.
     * If a flush is completed from a pending/completion callback, we need to
     * schedule slow-path callback to release the endpoint later, since a UCT
     * endpoint cannot be released from pending/completion callback context.
     */
    req->flags                  = 0;
    req->status                 = UCS_OK;
    req->send.ep                = ep;
    req->send.flush.flushed_cb  = ucp_ep_disconnected;
    req->send.flush.lanes       = UCS_MASK(ucp_ep_num_lanes(ep));
    req->send.flush.cbq_elem.cb = ucp_ep_flushed_slow_path_callback;
    req->send.flush.cbq_elem_on = 0;
    req->send.lane              = UCP_NULL_LANE;
    req->send.uct.func          = ucp_ep_flush_progress_pending;
    req->send.uct_comp.func     = ucp_ep_flush_completion;
    req->send.uct_comp.count    = ucp_ep_num_lanes(ep);

    ucp_ep_flush_progress(req);

    if (req->send.uct_comp.count == 0) {
        status = req->status;
        ucp_ep_disconnected(req);
        ucs_trace_req("ep %p: releasing flush request %p, returning status %s",
                      ep, req, ucs_status_string(status));
        ucs_mpool_put(req);
        return UCS_STATUS_PTR(status);
    }

    ucs_trace_req("ep %p: return inprogress flush request %p (%p)",
                  ep, req, req + 1);
    return req + 1;
}
static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super.super,
                              &iface->mlx5_common.tx.cq);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];
    hw_ci  = ntohs(cqe->wqe_counter);

    ucs_trace_poll("dc_mlx5 iface %p tx_cqe: dci[%d] qpn 0x%x txqp %p hw_ci %d",
                   iface, dci, qp_num, txqp, hw_ci);

    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_dc_iface_dci_put(&iface->super, dci);
    uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci);

    iface->super.super.tx.cq_available++;

    /* First dispatch requests waiting for a free DCI, then requests pending
     * on the TX queue itself */
    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(uct_dc_iface_dci_waitq(&iface->super), 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(uct_dc_iface_tx_waitq(&iface->super), 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_get_cqe(&iface->super.super.super,
                              &iface->mlx5_common.tx.cq,
                              iface->mlx5_common.tx.cq.cqe_size_log);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    ucs_assertv(!(cqe->op_own & (MLX5_INLINE_SCATTER_32 |
                                 MLX5_INLINE_SCATTER_64)),
                "tx inline scatter not supported");

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];
    hw_ci  = ntohs(cqe->wqe_counter);

    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_rc_txqp_completion(txqp, hw_ci);
    iface->super.super.tx.cq_available++;
    uct_dc_iface_dci_put(&iface->super, dci);

    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(&iface->super.super.tx.arbiter, 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(&iface->super.tx.dci_arbiter, 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
/* Map a TL resource index to its position in the packed address, using the
 * same popcount-on-masked-bitmap trick as ucp_lookup_uct_rkey() */
static unsigned ucp_wireup_address_index(const unsigned *order,
                                         uint64_t tl_bitmap,
                                         ucp_rsc_index_t tl_index)
{
    return order[ucs_count_one_bits(tl_bitmap & UCS_MASK(tl_index))];
}
static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
                                        void *buffer, size_t size,
                                        uint64_t tl_bitmap, unsigned *order,
                                        const ucp_address_packed_device_t *devices,
                                        ucp_rsc_index_t num_devices)
{
    ucp_context_h context = worker->context;
    const ucp_address_packed_device_t *dev;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t md_index;
    ucs_status_t status;
    ucp_rsc_index_t i;
    size_t iface_addr_len;
    size_t ep_addr_len;
    uint64_t md_flags;
    unsigned index;
    void *ptr;
    uint8_t *iface_addr_len_ptr;

    ptr   = buffer;
    index = 0;

    *(uint64_t*)ptr = worker->uuid;
    ptr += sizeof(uint64_t);
    ptr = ucp_address_pack_string(ucp_worker_get_name(worker), ptr);

    if (num_devices == 0) {
        *((uint8_t*)ptr) = UCP_NULL_RESOURCE;
        ++ptr;
        goto out;
    }

    for (dev = devices; dev < devices + num_devices; ++dev) {

        /* MD index */
        md_index = context->tl_rscs[dev->rsc_index].md_index;
        md_flags = context->tl_mds[md_index].attr.cap.flags;
        ucs_assert_always(!(md_index & ~UCP_ADDRESS_FLAG_MD_MASK));

        *(uint8_t*)ptr = md_index |
                         ((dev->tl_bitmap == 0)          ? UCP_ADDRESS_FLAG_EMPTY    : 0) |
                         ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) |
                         ((md_flags & UCT_MD_FLAG_REG)   ? UCP_ADDRESS_FLAG_MD_REG   : 0);
        ++ptr;

        /* Device address length */
        ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST);
        *(uint8_t*)ptr = dev->dev_addr_len |
                         ((dev == (devices + num_devices - 1)) ?
                          UCP_ADDRESS_FLAG_LAST : 0);
        ++ptr;

        /* Device address */
        status = uct_iface_get_device_address(worker->ifaces[dev->rsc_index].iface,
                                              (uct_device_addr_t*)ptr);
        if (status != UCS_OK) {
            return status;
        }

        ucp_address_memchek(ptr, dev->dev_addr_len,
                            &context->tl_rscs[dev->rsc_index].tl_rsc);
        ptr += dev->dev_addr_len;

        for (i = 0; i < context->num_tls; ++i) {

            if (!(UCS_BIT(i) & dev->tl_bitmap)) {
                continue;
            }

            /* Transport name checksum */
            *(uint16_t*)ptr = context->tl_rscs[i].tl_name_csum;
            ptr += sizeof(uint16_t);

            /* Transport information */
            ucp_address_pack_iface_attr(ptr, &worker->ifaces[i].attr,
                                        worker->atomic_tls & UCS_BIT(i));
            ucp_address_memchek(ptr, sizeof(ucp_address_packed_iface_attr_t),
                                &context->tl_rscs[dev->rsc_index].tl_rsc);
            ptr += sizeof(ucp_address_packed_iface_attr_t);

            iface_attr = &worker->ifaces[i].attr;

            if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
                !(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP)) {
                return UCS_ERR_INVALID_ADDR;
            }

            /* Pack iface address */
            iface_addr_len = iface_attr->iface_addr_len;
            ucs_assert(iface_addr_len < UCP_ADDRESS_FLAG_EP_ADDR);

            status = uct_iface_get_address(worker->ifaces[i].iface,
                                           (uct_iface_addr_t*)(ptr + 1));
            if (status != UCS_OK) {
                return status;
            }
            ucp_address_memchek(ptr + 1, iface_addr_len,
                                &context->tl_rscs[dev->rsc_index].tl_rsc);
            iface_addr_len_ptr  = ptr;
            *iface_addr_len_ptr = iface_addr_len |
                                  ((i == ucs_ilog2(dev->tl_bitmap)) ?
                                   UCP_ADDRESS_FLAG_LAST : 0);
            ptr += 1 + iface_addr_len;

            /* Pack ep address if present */
            if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
                (ep != NULL)) {
                *iface_addr_len_ptr |= UCP_ADDRESS_FLAG_EP_ADDR;

                ep_addr_len = iface_attr->ep_addr_len;
                ucs_assert(ep_addr_len < UINT8_MAX);
                *(uint8_t*)ptr = ep_addr_len;

                status = ucp_address_pack_ep_address(ep, i, ptr + 1);
                if (status != UCS_OK) {
                    return status;
                }
                ucp_address_memchek(ptr + 1, ep_addr_len,
                                    &context->tl_rscs[dev->rsc_index].tl_rsc);
                ptr += 1 + ep_addr_len;
            }

            /* Save the address index of this transport */
            if (order != NULL) {
                order[ucs_count_one_bits(tl_bitmap & UCS_MASK(i))] = index;
            }

            ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT
                      " md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
                      "lat_ovh: %e dev_priority %d",
                      index,
                      UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[i].tl_rsc),
                      md_flags, worker->ifaces[i].attr.cap.flags,
                      worker->ifaces[i].attr.bandwidth,
                      worker->ifaces[i].attr.overhead,
                      worker->ifaces[i].attr.latency.overhead,
                      worker->ifaces[i].attr.priority);
            ++index;
        }
    }

out:
    ucs_assertv(buffer + size == ptr, "buffer=%p size=%zu ptr=%p ptr-buffer=%zd",
                buffer, size, ptr, ptr - buffer);
    return UCS_OK;
}
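/*
 * Sketch of the one-byte length encoding used throughout the packed address:
 * the low bits carry an address length, and the top bits are flags (one
 * marking the final entry, one signaling that an EP address follows). The
 * DEMO_* values below are illustrative stand-ins for UCP_ADDRESS_FLAG_LAST
 * and UCP_ADDRESS_FLAG_EP_ADDR, not the actual UCX constants.
 */
#include <stdint.h>

#define DEMO_FLAG_LAST     0x80u
#define DEMO_FLAG_EP_ADDR  0x40u
#define DEMO_LEN_MASK      0x3Fu  /* lengths must stay below the first flag */

static uint8_t demo_pack_len(uint8_t len, int is_last, int has_ep_addr)
{
    return (uint8_t)((len & DEMO_LEN_MASK) |
                     (is_last     ? DEMO_FLAG_LAST    : 0) |
                     (has_ep_addr ? DEMO_FLAG_EP_ADDR : 0));
}

static uint8_t demo_unpack_len(uint8_t byte)
{
    return byte & DEMO_LEN_MASK; /* flags are tested separately */
}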
static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
                                        void *buffer, size_t size,
                                        uint64_t tl_bitmap, unsigned *order,
                                        const ucp_address_packed_device_t *devices,
                                        ucp_rsc_index_t num_devices)
{
    ucp_context_h context = worker->context;
    const ucp_address_packed_device_t *dev;
    uct_iface_attr_t *iface_attr;
    ucs_status_t status;
    ucp_rsc_index_t i;
    size_t tl_addr_len;
    unsigned index;
    void *ptr;

    ptr   = buffer;
    index = 0;

    *(uint64_t*)ptr = worker->uuid;
    ptr += sizeof(uint64_t);
    ptr = ucp_address_pack_string(ucp_worker_get_name(worker), ptr);

    if (num_devices == 0) {
        *((uint8_t*)ptr) = UCP_NULL_RESOURCE;
        ++ptr;
        goto out;
    }

    for (dev = devices; dev < devices + num_devices; ++dev) {

        /* PD index */
        *(uint8_t*)ptr = context->tl_rscs[dev->rsc_index].pd_index |
                         ((dev->tl_bitmap == 0) ? UCP_ADDRESS_FLAG_EMPTY : 0);
        ++ptr;

        /* Device address length */
        ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST);
        *(uint8_t*)ptr = dev->dev_addr_len |
                         ((dev == (devices + num_devices - 1)) ?
                          UCP_ADDRESS_FLAG_LAST : 0);
        ++ptr;

        /* Device address */
        status = uct_iface_get_device_address(worker->ifaces[dev->rsc_index],
                                              (uct_device_addr_t*)ptr);
        if (status != UCS_OK) {
            return status;
        }
        ptr += dev->dev_addr_len;

        for (i = 0; i < context->num_tls; ++i) {

            if (!(UCS_BIT(i) & dev->tl_bitmap)) {
                continue;
            }

            /* Transport name */
            ptr = ucp_address_pack_string(context->tl_rscs[i].tl_rsc.tl_name,
                                          ptr);

            /* Transport address length */
            iface_attr = &worker->iface_attrs[i];
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
                tl_addr_len = iface_attr->iface_addr_len;
                status = uct_iface_get_address(worker->ifaces[i],
                                               (uct_iface_addr_t*)(ptr + 1));
            } else if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
                if (ep == NULL) {
                    tl_addr_len = 0;
                    status      = UCS_OK;
                } else {
                    tl_addr_len = iface_attr->ep_addr_len;
                    status      = ucp_address_pack_ep_address(ep, i, ptr + 1);
                }
            } else {
                status = UCS_ERR_INVALID_ADDR;
            }
            if (status != UCS_OK) {
                return status;
            }

            ucp_address_memchek(ptr + 1, tl_addr_len,
                                &context->tl_rscs[dev->rsc_index].tl_rsc);

            /* Save the address index of this transport */
            if (order != NULL) {
                order[ucs_count_one_bits(tl_bitmap & UCS_MASK(i))] = index++;
            }

            ucs_assert(tl_addr_len < UCP_ADDRESS_FLAG_LAST);
            *(uint8_t*)ptr = tl_addr_len |
                             ((i == ucs_ilog2(dev->tl_bitmap)) ?
                              UCP_ADDRESS_FLAG_LAST : 0);
            ptr += 1 + tl_addr_len;
        }
    }

out:
    ucs_assertv(buffer + size == ptr, "buffer=%p size=%zu ptr=%p",
                buffer, size, ptr);
    return UCS_OK;
}
static int ucp_is_resource_enabled(uct_tl_resource_desc_t *resource,
                                   const ucp_config_t *config,
                                   uint64_t *devices_mask_p)
{
    int device_enabled, tl_enabled;
    ucp_tl_alias_t *alias;
    int config_idx;

    ucs_assert(config->devices.count > 0);
    if (!strcmp(config->devices.names[0], "all")) {
        /* if the user's list is 'all', use all the available resources */
        device_enabled  = 1;
        *devices_mask_p = 1;
    } else {
        /* go over the device list from the user and check which entries can
         * be satisfied by the available resources */
        device_enabled  = 0;
        *devices_mask_p = 0;
        ucs_assert_always(config->devices.count <= 64); /* Using uint64_t bitmap */
        config_idx = ucp_str_array_search((const char**)config->devices.names,
                                          config->devices.count,
                                          resource->dev_name);
        if (config_idx >= 0) {
            device_enabled   = 1;
            /* set the single bit of the matching config entry; UCS_MASK()
             * would wrongly set all the bits below it */
            *devices_mask_p |= UCS_BIT(config_idx);
        }
    }

    /* Disable the posix mmap and xpmem 'devices' for now - use only sysv for
     * shared memory. This will be removed once multi-rail is supported */
    if (!strcmp(resource->dev_name, "posix") ||
        !strcmp(resource->dev_name, "xpmem")) {
        device_enabled = 0;
    }

    ucs_assert(config->tls.count > 0);
    if (ucp_config_is_tl_enabled(config, resource->tl_name)) {
        tl_enabled = 1;
    } else {
        tl_enabled = 0;

        /* check aliases */
        for (alias = ucp_tl_aliases; alias->alias != NULL; ++alias) {
            /* If an alias is enabled, and the transport is part of this alias,
             * enable the transport.
             */
            if (ucp_config_is_tl_enabled(config, alias->alias) &&
                (ucp_str_array_search(alias->tls, ucp_tl_alias_count(alias),
                                      resource->tl_name) >= 0)) {
                tl_enabled = 1;
                ucs_trace("enabling tl '%s' for alias '%s'", resource->tl_name,
                          alias->alias);
                break;
            }
        }
    }

    ucs_trace(UCT_TL_RESOURCE_DESC_FMT " is %sabled",
              UCT_TL_RESOURCE_DESC_ARG(resource),
              (device_enabled && tl_enabled) ? "en" : "dis");
    return device_enabled && tl_enabled;
}
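/*
 * The fix above rests on the difference between the two helpers: UCS_BIT(i)
 * is the single bit at position 'i', while UCS_MASK(i) is all the bits
 * *below* position 'i'. Illustrative definitions (a sketch; the real ones
 * live in UCS):
 *
 *   DEMO_BIT_(3)  == 0b00001000
 *   DEMO_MASK_(3) == 0b00000111, and DEMO_MASK_(0) == 0 -- no bit at all,
 *   which is why UCS_MASK(config_idx) silently dropped a match on the first
 *   config entry.
 */
#define DEMO_BIT_(i)  (1ul << (i))
#define DEMO_MASK_(i) (DEMO_BIT_(i) - 1)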