ucs_status_t uct_ugni_ep_atomic32_fetch(uct_ep_h ep, uct_atomic_op_t opcode,
                                        uint32_t value, uint32_t *result,
                                        uint64_t remote_addr, uct_rkey_t rkey,
                                        uct_completion_t *comp)
{
    switch (opcode) {
    case UCT_ATOMIC_OP_ADD:
        return uct_ugni_ep_atomic_fop32(ep, value, remote_addr, rkey, result,
                                        comp, GNI_FMA_ATOMIC2_FIADD_S, "ADD");
    case UCT_ATOMIC_OP_SWAP:
        return uct_ugni_ep_atomic_fop32(ep, value, remote_addr, rkey, result,
                                        comp, GNI_FMA_ATOMIC2_FSWAP_S, "SWAP");
    case UCT_ATOMIC_OP_XOR:
        return uct_ugni_ep_atomic_fop32(ep, value, remote_addr, rkey, result,
                                        comp, GNI_FMA_ATOMIC2_FXOR_S, "XOR");
    case UCT_ATOMIC_OP_AND:
        return uct_ugni_ep_atomic_fop32(ep, value, remote_addr, rkey, result,
                                        comp, GNI_FMA_ATOMIC2_FAND_S, "AND");
    case UCT_ATOMIC_OP_OR:
        return uct_ugni_ep_atomic_fop32(ep, value, remote_addr, rkey, result,
                                        comp, GNI_FMA_ATOMIC2_FOR_S, "OR");
    default:
        ucs_assertv(0, "incorrect opcode for atomic: %d", opcode);
        return UCS_ERR_UNSUPPORTED;
    }
}
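/* A minimal, hypothetical usage sketch (not from the source): dispatching a
 * 32-bit fetch-and-add through the generic UCT entry point, which on the uGNI
 * transport lands in the switch above. The endpoint, remote address, rkey and
 * completion object are assumed to have been set up elsewhere. */
static ucs_status_t example_fadd32(uct_ep_h ep, uint64_t remote_addr,
                                   uct_rkey_t rkey, uint32_t *result,
                                   uct_completion_t *comp)
{
    /* Fetch-and-add 1 to a remote 32-bit counter; the previous value is
     * written to *result once the (possibly asynchronous) operation
     * completes. */
    return uct_ep_atomic32_fetch(ep, UCT_ATOMIC_OP_ADD, 1, result,
                                 remote_addr, rkey, comp);
}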
static ucs_status_t ucp_wireup_connect_local(ucp_ep_h ep, const uint8_t *tli,
                                             unsigned address_count,
                                             const ucp_address_entry_t *address_list)
{
    ucp_worker_h worker = ep->worker;
    const ucp_address_entry_t *address;
    ucp_rsc_index_t rsc_index;
    ucp_lane_index_t lane, amo_index;
    ucs_status_t status;
    ucp_md_map_t UCS_V_UNUSED md_map;

    ucs_trace("ep %p: connect local transports", ep);

    for (lane = 0; lane < ucp_ep_num_lanes(ep); ++lane) {
        rsc_index = ucp_ep_get_rsc_index(ep, lane);
        if (!ucp_worker_is_tl_p2p(worker, rsc_index)) {
            continue;
        }

        address = &address_list[tli[lane]];
        ucs_assert(address->tl_addr_len > 0);

        /* Check that if the lane is used for RMA/AMO, destination md index
         * matches */
        md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.rma_lane_map, lane);
        ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                    "lane=%d ai=%d md_map=0x%x md_index=%d",
                    lane, tli[lane], md_map, address->md_index);

        amo_index = ucp_ep_get_amo_lane_index(&ucp_ep_config(ep)->key, lane);
        if (amo_index != UCP_NULL_LANE) {
            md_map = ucp_lane_map_get_lane(ucp_ep_config(ep)->key.amo_lane_map,
                                           amo_index);
            ucs_assertv((md_map == 0) || (md_map == UCS_BIT(address->md_index)),
                        "lane=%d ai=%d md_map=0x%x md_index=%d",
                        lane, tli[lane], md_map, address->md_index);
        }

        status = uct_ep_connect_to_ep(ep->uct_eps[lane], address->dev_addr,
                                      address->ep_addr);
        if (status != UCS_OK) {
            return status;
        }
    }

    return UCS_OK;
}
void ucp_tag_eager_sync_completion(ucp_request_t *req, uint16_t flag)
{
    static const uint16_t all_completed = UCP_REQUEST_FLAG_LOCAL_COMPLETED |
                                          UCP_REQUEST_FLAG_REMOTE_COMPLETED;

    ucs_assertv(!(req->flags & flag), "req->flags=%d flag=%d", req->flags, flag);
    req->flags |= flag;
    if (ucs_test_all_flags(req->flags, all_completed)) {
        ucp_request_complete(req, req->cb.send, UCS_OK);
    }
}
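/* A self-contained sketch of the same two-flag completion idiom, with
 * hypothetical names (not part of the source): the operation is released only
 * after both the local and the remote completion event have arrived, in
 * either order, with each event allowed to fire exactly once. */
#define EXAMPLE_LOCAL_DONE  0x1u
#define EXAMPLE_REMOTE_DONE 0x2u

static void example_mark_done(unsigned *flags, unsigned flag)
{
    *flags |= flag;
    if ((*flags & (EXAMPLE_LOCAL_DONE | EXAMPLE_REMOTE_DONE)) ==
        (EXAMPLE_LOCAL_DONE | EXAMPLE_REMOTE_DONE)) {
        /* both events observed - safe to complete/release the request */
    }
}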
static inline void uct_ud_verbs_iface_tx_data(uct_ud_verbs_iface_t *iface,
                                              uct_ud_verbs_ep_t *ep)
{
    int UCS_V_UNUSED ret;
    struct ibv_send_wr *bad_wr;

    uct_ud_verbs_iface_fill_tx_wr(iface, ep, &iface->tx.wr_bcp, 0);
    UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t *)iface->tx.sge[0].addr);
    ret = ibv_post_send(iface->super.qp, &iface->tx.wr_bcp, &bad_wr);
    ucs_assertv(ret == 0, "ibv_post_send() returned %d (%m)", ret);
    uct_ib_log_post_send(iface->super.qp, &iface->tx.wr_bcp, NULL);
}
static size_t ucp_tag_pack_eager_last_generic(void *dest, void *arg)
{
    ucp_eager_hdr_t *hdr = dest;
    ucp_request_t *req   = arg;
    size_t max_length, length;

    max_length     = req->send.length - req->send.state.offset;
    hdr->super.tag = req->send.tag;
    length         = ucp_request_generic_dt_pack(req, hdr + 1, max_length);
    ucs_assertv(length == max_length, "length=%zu, max_length=%zu",
                length, max_length);
    return sizeof(*hdr) + length;
}
static inline void uct_ud_verbs_iface_tx_inl(uct_ud_verbs_iface_t *iface,
                                             uct_ud_verbs_ep_t *ep,
                                             const void *buffer, unsigned length)
{
    int UCS_V_UNUSED ret;
    struct ibv_send_wr *bad_wr;

    iface->tx.sge[1].addr   = (uintptr_t)buffer;
    iface->tx.sge[1].length = length;
    uct_ud_verbs_iface_fill_tx_wr(iface, ep, &iface->tx.wr_inl, IBV_SEND_INLINE);
    UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t *)iface->tx.sge[0].addr);
    ret = ibv_post_send(iface->super.qp, &iface->tx.wr_inl, &bad_wr);
    ucs_assertv(ret == 0, "ibv_post_send() returned %d (%m)", ret);
    uct_ib_log_post_send(iface->super.qp, &iface->tx.wr_inl, NULL);
}
/**
 * Dispatch requests waiting for tx resources
 */
ucs_arbiter_cb_result_t
uct_dc_mlx5_iface_dci_do_pending_tx(ucs_arbiter_t *arbiter,
                                    ucs_arbiter_elem_t *elem, void *arg)
{
    uct_dc_mlx5_ep_t *ep       = ucs_container_of(ucs_arbiter_elem_group(elem),
                                                  uct_dc_mlx5_ep_t, arb_group);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                                uct_dc_mlx5_iface_t);
    uct_pending_req_t *req     = ucs_container_of(elem, uct_pending_req_t, priv);
    ucs_status_t status;

    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ARBITER_CB_RESULT_STOP;
    }

    status = req->func(req);
    ucs_trace_data("progress pending request %p returned: %s", req,
                   ucs_status_string(status));

    if (status == UCS_OK) {
        /* For dcs* policies, release the dci if this is the last element in
         * the group and the dci has no outstanding operations, e.g. the
         * pending callback did not send anything (uct_ep_flush, or it just
         * returned OK). */
        if (ucs_arbiter_elem_is_last(&ep->arb_group, elem)) {
            uct_dc_mlx5_iface_dci_free(iface, ep);
        }
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM;
    }
    if (status == UCS_INPROGRESS) {
        return UCS_ARBITER_CB_RESULT_NEXT_GROUP;
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        /* Deschedule the group even if FC is the only missing resource; it
         * will be scheduled again when credits arrive. With the rand policy,
         * however, we cannot deschedule a group when non-FC resources are
         * missing, since it would never be scheduled again. */
        if (uct_dc_mlx5_iface_is_dci_rand(iface) &&
            uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) {
            return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;
        } else {
            return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
        }
    }

    ucs_assertv(!uct_rc_iface_has_tx_resources(&iface->super.super),
                "pending callback returned error but send resources are available");
    return UCS_ARBITER_CB_RESULT_STOP;
}
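/* A stripped-down sketch (hypothetical, not from the source) of the
 * ucs_arbiter callback contract that the function above implements: the
 * callback tries to progress one queued pending request and steers the
 * arbiter through its return value. The three-argument callback signature
 * mirrors the one used above. */
static ucs_arbiter_cb_result_t example_pending_cb(ucs_arbiter_t *arbiter,
                                                  ucs_arbiter_elem_t *elem,
                                                  void *arg)
{
    uct_pending_req_t *req = ucs_container_of(elem, uct_pending_req_t, priv);

    if (req->func(req) == UCS_OK) {
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM; /* done, drop from queue */
    }
    return UCS_ARBITER_CB_RESULT_STOP;            /* no resources, retry later */
}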
static size_t ucp_tag_pack_eager_last_dt(void *dest, void *arg)
{
    ucp_eager_middle_hdr_t *hdr = dest;
    ucp_request_t *req          = arg;
    size_t length, ret_length;

    length      = req->send.length - req->send.state.dt.offset;
    hdr->msg_id = req->send.tag.message_id;
    hdr->offset = req->send.state.dt.offset;
    ret_length  = ucp_dt_pack(req->send.datatype, hdr + 1, req->send.buffer,
                              &req->send.state.dt, length);
    ucs_assertv(ret_length == length, "ret_length=%zu, length=%zu",
                ret_length, length);
    return sizeof(*hdr) + ret_length;
}
static size_t ucp_tag_pack_eager_last_dt(void *dest, void *arg)
{
    ucp_eager_hdr_t *hdr = dest;
    ucp_request_t *req   = arg;
    size_t length, ret_length;

    length         = req->send.length - req->send.state.offset;
    hdr->super.tag = req->send.tag;
    ret_length     = ucp_dt_pack(req->send.datatype, hdr + 1, req->send.buffer,
                                 &req->send.state, length);
    ucs_debug("pack eager_last paylen %zu offset %zu", length,
              req->send.state.offset);
    ucs_assertv(ret_length == length, "ret_length=%zu, length=%zu",
                ret_length, length);
    return sizeof(*hdr) + ret_length;
}
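/* A hedged usage sketch: pack callbacks like the ones above are meant to be
 * handed to a bcopy-style send such as uct_ep_am_bcopy(), which invokes the
 * callback with the transport's staging buffer as `dest` and returns the
 * packed length (or a negative status). The caller names here, and the
 * trailing flags argument of uct_ep_am_bcopy() (present only in newer UCX
 * versions), are assumptions rather than taken from the source. */
static ucs_status_t example_send_eager_last(uct_ep_h ep, uint8_t am_id,
                                            ucp_request_t *req)
{
    ssize_t packed_len;

    /* The transport calls ucp_tag_pack_eager_last_dt(dest, req) to fill its
     * buffer with the header plus the remaining payload. */
    packed_len = uct_ep_am_bcopy(ep, am_id, ucp_tag_pack_eager_last_dt, req, 0);
    return (packed_len >= 0) ? UCS_OK : (ucs_status_t)packed_len;
}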
static inline void uct_ud_verbs_ep_tx_skb(uct_ud_verbs_iface_t *iface,
                                          uct_ud_verbs_ep_t *ep,
                                          uct_ud_send_skb_t *skb, unsigned flags)
{
    int UCS_V_UNUSED ret;
    struct ibv_send_wr *bad_wr;

    iface->tx.sge[0].lkey   = skb->lkey;
    iface->tx.sge[0].length = skb->len;
    iface->tx.sge[0].addr   = (uintptr_t)skb->neth;
    uct_ud_verbs_iface_fill_tx_wr(iface, ep, &iface->tx.wr_skb, flags);
    UCT_UD_EP_HOOK_CALL_TX(&ep->super, (uct_ud_neth_t *)iface->tx.sge[0].addr);
    ret = ibv_post_send(iface->super.qp, &iface->tx.wr_skb, &bad_wr);
    ucs_assertv(ret == 0, "ibv_post_send() returned %d (%m)", ret);
    uct_ib_log_post_send(&iface->super.super, iface->super.qp,
                         &iface->tx.wr_skb, NULL);
    --iface->super.tx.available;
}
ucs_status_t uct_ugni_ep_atomic64_post(uct_ep_h ep, unsigned opcode,
                                       uint64_t value, uint64_t remote_addr,
                                       uct_rkey_t rkey)
{
    switch (opcode) {
    case UCT_ATOMIC_OP_ADD:
        return uct_ugni_ep_atomic_op64(ep, value, remote_addr, rkey,
                                       GNI_FMA_ATOMIC_ADD, "ADD");
    case UCT_ATOMIC_OP_XOR:
        return uct_ugni_ep_atomic_op64(ep, value, remote_addr, rkey,
                                       GNI_FMA_ATOMIC_XOR, "XOR");
    case UCT_ATOMIC_OP_AND:
        return uct_ugni_ep_atomic_op64(ep, value, remote_addr, rkey,
                                       GNI_FMA_ATOMIC_AND, "AND");
    case UCT_ATOMIC_OP_OR:
        return uct_ugni_ep_atomic_op64(ep, value, remote_addr, rkey,
                                       GNI_FMA_ATOMIC_OR, "OR");
    default:
        ucs_assertv(0, "incorrect opcode for atomic: %d", opcode);
        return UCS_ERR_UNSUPPORTED;
    }
}
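/* A minimal usage sketch (hypothetical caller, not from the source): unlike
 * the fetch variant further above, a posted 64-bit atomic returns no fetched
 * value and takes no completion handle, so the caller only checks the
 * immediate return status. */
static ucs_status_t example_add64(uct_ep_h ep, uint64_t remote_addr,
                                  uct_rkey_t rkey)
{
    /* Atomically add 1 to a remote 64-bit counter; ordering with respect to
     * subsequent operations must be enforced separately, e.g. by a flush. */
    return uct_ep_atomic64_post(ep, UCT_ATOMIC_OP_ADD, 1, remote_addr, rkey);
}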
ucs_status_t ucp_ep_add_pending_uct(ucp_ep_h ep, uct_ep_h uct_ep,
                                    uct_pending_req_t *req)
{
    ucs_status_t status;

    ucs_assertv(req->func != NULL, "req=%p", req);

    status = uct_ep_pending_add(uct_ep, req);
    if (status != UCS_ERR_BUSY) {
        ucs_assert(status == UCS_OK);
        ucs_trace_data("ep %p: added pending uct request %p to uct_ep %p",
                       ep, req, uct_ep);
        return UCS_OK; /* Added to pending */
    }

    /* Forced progress */
    status = req->func(req);
    if (status == UCS_OK) {
        return UCS_OK; /* Completed the operation */
    }

    return UCS_ERR_NO_PROGRESS;
}
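/* A hypothetical caller-side sketch (names assumed, not from the source):
 * when ucp_ep_add_pending_uct() returns UCS_ERR_NO_PROGRESS, the request was
 * neither queued nor completed, so the caller is expected to retry until it
 * is one or the other. */
static void example_enqueue_or_retry(ucp_ep_h ep, uct_ep_h uct_ep,
                                     uct_pending_req_t *req)
{
    while (ucp_ep_add_pending_uct(ep, uct_ep, req) == UCS_ERR_NO_PROGRESS) {
        /* transient state: the pending queue was busy and forced progress
         * did not complete the request; try again */
    }
}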
static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_get_cqe(&iface->super.super.super,
                              &iface->mlx5_common.tx.cq,
                              iface->mlx5_common.tx.cq.cqe_size_log);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    ucs_assertv(!(cqe->op_own & (MLX5_INLINE_SCATTER_32|MLX5_INLINE_SCATTER_64)),
                "tx inline scatter not supported");

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];
    hw_ci  = ntohs(cqe->wqe_counter);

    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_rc_txqp_completion(txqp, hw_ci);
    iface->super.super.tx.cq_available++;

    uct_dc_iface_dci_put(&iface->super, dci);
    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(&iface->super.super.tx.arbiter, 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(&iface->super.tx.dci_arbiter, 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
                                        void *buffer, size_t size,
                                        uint64_t tl_bitmap, unsigned *order,
                                        const ucp_address_packed_device_t *devices,
                                        ucp_rsc_index_t num_devices)
{
    ucp_context_h context = worker->context;
    const ucp_address_packed_device_t *dev;
    uct_iface_attr_t *iface_attr;
    ucp_rsc_index_t md_index;
    ucs_status_t status;
    ucp_rsc_index_t i;
    size_t iface_addr_len;
    size_t ep_addr_len;
    uint64_t md_flags;
    unsigned index;
    void *ptr;
    uint8_t *iface_addr_len_ptr;

    ptr   = buffer;
    index = 0;

    *(uint64_t*)ptr = worker->uuid;
    ptr += sizeof(uint64_t);
    ptr = ucp_address_pack_string(ucp_worker_get_name(worker), ptr);

    if (num_devices == 0) {
        *((uint8_t*)ptr) = UCP_NULL_RESOURCE;
        ++ptr;
        goto out;
    }

    for (dev = devices; dev < devices + num_devices; ++dev) {

        /* MD index */
        md_index = context->tl_rscs[dev->rsc_index].md_index;
        md_flags = context->tl_mds[md_index].attr.cap.flags;
        ucs_assert_always(!(md_index & ~UCP_ADDRESS_FLAG_MD_MASK));

        *(uint8_t*)ptr = md_index |
                         ((dev->tl_bitmap == 0)          ? UCP_ADDRESS_FLAG_EMPTY    : 0) |
                         ((md_flags & UCT_MD_FLAG_ALLOC) ? UCP_ADDRESS_FLAG_MD_ALLOC : 0) |
                         ((md_flags & UCT_MD_FLAG_REG)   ? UCP_ADDRESS_FLAG_MD_REG   : 0);
        ++ptr;

        /* Device address length */
        ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST);
        *(uint8_t*)ptr = dev->dev_addr_len |
                         ((dev == (devices + num_devices - 1)) ? UCP_ADDRESS_FLAG_LAST : 0);
        ++ptr;

        /* Device address */
        status = uct_iface_get_device_address(worker->ifaces[dev->rsc_index].iface,
                                              (uct_device_addr_t*)ptr);
        if (status != UCS_OK) {
            return status;
        }

        ucp_address_memchek(ptr, dev->dev_addr_len,
                            &context->tl_rscs[dev->rsc_index].tl_rsc);
        ptr += dev->dev_addr_len;

        for (i = 0; i < context->num_tls; ++i) {

            if (!(UCS_BIT(i) & dev->tl_bitmap)) {
                continue;
            }

            /* Transport name checksum */
            *(uint16_t*)ptr = context->tl_rscs[i].tl_name_csum;
            ptr += sizeof(uint16_t);

            /* Transport information */
            ucp_address_pack_iface_attr(ptr, &worker->ifaces[i].attr,
                                        worker->atomic_tls & UCS_BIT(i));
            ucp_address_memchek(ptr, sizeof(ucp_address_packed_iface_attr_t),
                                &context->tl_rscs[dev->rsc_index].tl_rsc);
            ptr += sizeof(ucp_address_packed_iface_attr_t);

            iface_attr = &worker->ifaces[i].attr;

            if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
                !(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP)) {
                return UCS_ERR_INVALID_ADDR;
            }

            /* Pack iface address */
            iface_addr_len = iface_attr->iface_addr_len;
            ucs_assert(iface_addr_len < UCP_ADDRESS_FLAG_EP_ADDR);

            status = uct_iface_get_address(worker->ifaces[i].iface,
                                           (uct_iface_addr_t*)(ptr + 1));
            if (status != UCS_OK) {
                return status;
            }
            ucp_address_memchek(ptr + 1, iface_addr_len,
                                &context->tl_rscs[dev->rsc_index].tl_rsc);
            iface_addr_len_ptr  = ptr;
            *iface_addr_len_ptr = iface_addr_len |
                                  ((i == ucs_ilog2(dev->tl_bitmap)) ? UCP_ADDRESS_FLAG_LAST : 0);
            ptr += 1 + iface_addr_len;

            /* Pack ep address if present */
            if (!(iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) &&
                (ep != NULL)) {
                *iface_addr_len_ptr |= UCP_ADDRESS_FLAG_EP_ADDR;

                ep_addr_len = iface_attr->ep_addr_len;
                ucs_assert(ep_addr_len < UINT8_MAX);
                *(uint8_t*)ptr = ep_addr_len;

                status = ucp_address_pack_ep_address(ep, i, ptr + 1);
                if (status != UCS_OK) {
                    return status;
                }
                ucp_address_memchek(ptr + 1, ep_addr_len,
                                    &context->tl_rscs[dev->rsc_index].tl_rsc);
                ptr += 1 + ep_addr_len;
            }

            /* Save the address index of this transport */
            if (order != NULL) {
                order[ucs_count_one_bits(tl_bitmap & UCS_MASK(i))] = index;
            }

            ucs_trace("pack addr[%d] : "UCT_TL_RESOURCE_DESC_FMT
                      " md_flags 0x%"PRIx64" tl_flags 0x%"PRIx64" bw %e ovh %e "
                      "lat_ovh: %e dev_priority %d",
                      index,
                      UCT_TL_RESOURCE_DESC_ARG(&context->tl_rscs[i].tl_rsc),
                      md_flags, worker->ifaces[i].attr.cap.flags,
                      worker->ifaces[i].attr.bandwidth,
                      worker->ifaces[i].attr.overhead,
                      worker->ifaces[i].attr.latency.overhead,
                      worker->ifaces[i].attr.priority);
            ++index;
        }
    }

out:
    ucs_assertv(buffer + size == ptr,
                "buffer=%p size=%zu ptr=%p ptr-buffer=%zd",
                buffer, size, ptr, ptr - buffer);
    return UCS_OK;
}
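/* For reference, the wire layout produced by the function above, as read off
 * the packing code itself (field widths follow the casts used there):
 *
 *   uint64_t  worker uuid
 *   string    worker name (ucp_address_pack_string format)
 *   per device:
 *     uint8_t   md_index | EMPTY/MD_ALLOC/MD_REG flags
 *     uint8_t   dev_addr_len | LAST flag (set on the final device)
 *     bytes     device address
 *     per transport on the device:
 *       uint16_t  tl_name checksum
 *       struct    ucp_address_packed_iface_attr_t
 *       uint8_t   iface_addr_len | LAST/EP_ADDR flags
 *       bytes     iface address
 *       [uint8_t ep_addr_len + ep address bytes, only if EP_ADDR is set]
 */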
static ucs_status_t ucp_address_do_pack(ucp_worker_h worker, ucp_ep_h ep,
                                        void *buffer, size_t size,
                                        uint64_t tl_bitmap, unsigned *order,
                                        const ucp_address_packed_device_t *devices,
                                        ucp_rsc_index_t num_devices)
{
    ucp_context_h context = worker->context;
    const ucp_address_packed_device_t *dev;
    uct_iface_attr_t *iface_attr;
    ucs_status_t status;
    ucp_rsc_index_t i;
    size_t tl_addr_len;
    unsigned index;
    void *ptr;

    ptr   = buffer;
    index = 0;

    *(uint64_t*)ptr = worker->uuid;
    ptr += sizeof(uint64_t);
    ptr = ucp_address_pack_string(ucp_worker_get_name(worker), ptr);

    if (num_devices == 0) {
        *((uint8_t*)ptr) = UCP_NULL_RESOURCE;
        ++ptr;
        goto out;
    }

    for (dev = devices; dev < devices + num_devices; ++dev) {

        /* PD index */
        *(uint8_t*)ptr = context->tl_rscs[dev->rsc_index].pd_index |
                         ((dev->tl_bitmap == 0) ? UCP_ADDRESS_FLAG_EMPTY : 0);
        ++ptr;

        /* Device address length */
        ucs_assert(dev->dev_addr_len < UCP_ADDRESS_FLAG_LAST);
        *(uint8_t*)ptr = dev->dev_addr_len |
                         ((dev == (devices + num_devices - 1)) ? UCP_ADDRESS_FLAG_LAST : 0);
        ++ptr;

        /* Device address */
        status = uct_iface_get_device_address(worker->ifaces[dev->rsc_index],
                                              (uct_device_addr_t*)ptr);
        if (status != UCS_OK) {
            return status;
        }

        ptr += dev->dev_addr_len;

        for (i = 0; i < context->num_tls; ++i) {

            if (!(UCS_BIT(i) & dev->tl_bitmap)) {
                continue;
            }

            /* Transport name */
            ptr = ucp_address_pack_string(context->tl_rscs[i].tl_rsc.tl_name, ptr);

            /* Transport address length */
            iface_attr = &worker->iface_attrs[i];
            if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE) {
                tl_addr_len = iface_attr->iface_addr_len;
                status = uct_iface_get_address(worker->ifaces[i],
                                               (uct_iface_addr_t*)(ptr + 1));
            } else if (iface_attr->cap.flags & UCT_IFACE_FLAG_CONNECT_TO_EP) {
                if (ep == NULL) {
                    tl_addr_len = 0;
                    status      = UCS_OK;
                } else {
                    tl_addr_len = iface_attr->ep_addr_len;
                    status      = ucp_address_pack_ep_address(ep, i, ptr + 1);
                }
            } else {
                status = UCS_ERR_INVALID_ADDR;
            }
            if (status != UCS_OK) {
                return status;
            }

            ucp_address_memchek(ptr + 1, tl_addr_len,
                                &context->tl_rscs[dev->rsc_index].tl_rsc);

            /* Save the address index of this transport */
            if (order != NULL) {
                order[ucs_count_one_bits(tl_bitmap & UCS_MASK(i))] = index++;
            }

            ucs_assert(tl_addr_len < UCP_ADDRESS_FLAG_LAST);
            *(uint8_t*)ptr = tl_addr_len |
                             ((i == ucs_ilog2(dev->tl_bitmap)) ? UCP_ADDRESS_FLAG_LAST : 0);
            ptr += 1 + tl_addr_len;
        }
    }

out:
    ucs_assertv(buffer + size == ptr, "buffer=%p size=%zu ptr=%p",
                buffer, size, ptr);
    return UCS_OK;
}
/* Called if the request is completed internally before being returned to the
 * user */
static void ucp_tag_stub_send_completion(void *request, ucs_status_t status)
{
    ucs_assertv(status == UCS_OK, "status=%s", ucs_status_string(status));
}