static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_iface_poll_tx(uct_rc_mlx5_iface_t *iface)
{
    uct_rc_iface_send_op_t *op;
    struct mlx5_cqe64 *cqe;
    uct_rc_mlx5_ep_t *ep;
    unsigned qp_num;
    uint16_t hw_ci;

    cqe = uct_ib_mlx5_get_cqe(&iface->tx.cq, UCT_IB_MLX5_CQE64_SIZE_LOG);
    if (cqe == NULL) {
        return;
    }

    UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    ep     = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, qp_num),
                            uct_rc_mlx5_ep_t);
    ucs_assert(ep != NULL);

    hw_ci               = ntohs(cqe->wqe_counter);
    ep->super.available = uct_ib_mlx5_txwq_update_bb(&ep->tx.wq, hw_ci);
    ++iface->super.tx.cq_available;

    /* Process completions */
    ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                               UCS_CIRCULAR_COMPARE16(op->sn, <=, hw_ci)) {
        op->handler(op);
    }
}
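/*
 * Illustration (not from the original source): the completion loop above
 * relies on UCS_CIRCULAR_COMPARE16 to retire every operation whose serial
 * number is at or before hw_ci, even across 16-bit wraparound. A minimal
 * sketch of the assumed semantics - the difference is reduced modulo 2^16
 * and then interpreted as a signed 16-bit value:
 */
#include <stdint.h>
#include <assert.h>

/* hypothetical stand-in for UCS_CIRCULAR_COMPARE16(a, <=, b) */
static inline int circular_le16(uint16_t a, uint16_t b)
{
    return (int16_t)(a - b) <= 0;
}

static void circular_compare16_example(void)
{
    assert(circular_le16(100, 200));       /* plain ordering */
    assert(circular_le16(0xfff0, 0x0010)); /* 0xfff0 precedes 0x0010 across wrap */
    assert(!circular_le16(0x0010, 0xfff0));
}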
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_ugni_smsg_ep_am_common_send(uct_ugni_smsg_ep_t *ep, uct_ugni_smsg_iface_t *iface,
                                uint8_t am_id, unsigned header_length, void *header,
                                unsigned payload_length, void *payload,
                                uct_ugni_smsg_desc_t *desc)
{
    gni_return_t gni_rc;

    if (ucs_unlikely(!uct_ugni_ep_can_send(&ep->super))) {
        goto exit_no_res;
    }

    desc->msg_id      = iface->smsg_id++;
    desc->flush_group = ep->super.flush_group;

    uct_ugni_cdm_lock(&iface->super.cdm);
    gni_rc = GNI_SmsgSendWTag(ep->super.ep, header, header_length,
                              payload, payload_length, desc->msg_id, am_id);
    uct_ugni_cdm_unlock(&iface->super.cdm);
    if (GNI_RC_SUCCESS != gni_rc) {
        goto exit_no_res;
    }

    ++desc->flush_group->flush_comp.count;
    ++iface->super.outstanding;
    sglib_hashed_uct_ugni_smsg_desc_t_add(iface->smsg_list, desc);
    return UCS_OK;

exit_no_res:
    ucs_trace("Smsg send failed.");
    ucs_mpool_put(desc);
    UCS_STATS_UPDATE_COUNTER(ep->super.super.stats, UCT_EP_STAT_NO_RES, 1);
    return UCS_ERR_NO_RESOURCE;
}
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_verbs_iface_poll_rx(uct_rc_verbs_iface_t *iface)
{
    uct_ib_iface_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct ibv_wc wc[UCT_IB_MAX_WC];
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.recv_cq, UCT_IB_MAX_WC, wc);
    if (ret > 0) {
        for (i = 0; i < ret; ++i) {
            if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
                ucs_fatal("Receive completion with error: %s",
                          ibv_wc_status_str(wc[i].status));
            }
            UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                     UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

            desc = (void*)wc[i].wr_id;
            uct_ib_iface_desc_received(&iface->super.super, desc, wc[i].byte_len, 1);

            hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, desc);
            uct_ib_log_recv_completion(IBV_QPT_RC, &wc[i], hdr,
                                       uct_rc_ep_am_packet_dump);
            uct_rc_iface_invoke_am(&iface->super, hdr, wc[i].byte_len, desc);
        }
        iface->super.rx.available += ret;
        return UCS_OK;
    } else if (ret == 0) {
        uct_rc_verbs_iface_post_recv(iface, 0);
        return UCS_ERR_NO_PROGRESS;
    } else {
        ucs_fatal("Failed to poll receive CQ");
    }
}
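/*
 * Hypothetical usage sketch (assumption, not part of the original source):
 * a typical progress routine drains the receive CQ first and only falls
 * back to polling the send CQ when poll_rx reports no progress, giving RX
 * completions priority over TX bookkeeping.
 */

/* forward declaration for the sketch; the definition appears further below */
static UCS_F_ALWAYS_INLINE void uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface);

static void uct_rc_verbs_iface_progress_sketch(uct_rc_verbs_iface_t *iface)
{
    ucs_status_t status;

    status = uct_rc_verbs_iface_poll_rx(iface);
    if (status == UCS_ERR_NO_PROGRESS) {
        uct_rc_verbs_iface_poll_tx(iface);
    }
}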
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_iface_poll_rx(uct_rc_mlx5_iface_t *iface)
{
    struct mlx5_wqe_srq_next_seg *seg;
    uct_rc_mlx5_recv_desc_t *desc;
    uct_rc_hdr_t *hdr;
    struct mlx5_cqe64 *cqe;
    unsigned byte_len;
    uint16_t wqe_ctr_be;
    uint16_t max_batch;
    ucs_status_t status;

    cqe = uct_ib_mlx5_get_cqe(&iface->rx.cq, iface->rx.cq.cqe_size_log);
    if (cqe == NULL) {
        /* If no CQE - post receives */
        status = UCS_ERR_NO_PROGRESS;
        goto done;
    }

    UCS_STATS_UPDATE_COUNTER(iface->super.stats, UCT_RC_IFACE_STAT_RX_COMPLETION, 1);

    ucs_assert(!ucs_queue_is_empty(&iface->rx.desc_q));

    ucs_memory_cpu_load_fence();

    desc     = ucs_queue_pull_elem_non_empty(&iface->rx.desc_q,
                                             uct_rc_mlx5_recv_desc_t, queue);
    byte_len = ntohl(cqe->byte_cnt);

    uct_ib_iface_desc_received(&iface->super.super, &desc->super, byte_len,
                               !(cqe->op_own & (MLX5_INLINE_SCATTER_32 |
                                                MLX5_INLINE_SCATTER_64)));

    /* Get a pointer to the AM header (after which comes the payload).
     * Support inline scatter by pointing directly into the CQE. */
    if (cqe->op_own & MLX5_INLINE_SCATTER_32) {
        hdr = (uct_rc_hdr_t*)cqe;
        UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_RC_MLX5_IFACE_STAT_RX_INL_32, 1);
    } else if (cqe->op_own & MLX5_INLINE_SCATTER_64) {
        hdr = (uct_rc_hdr_t*)(cqe - 1);
        UCS_STATS_UPDATE_COUNTER(iface->stats, UCT_RC_MLX5_IFACE_STAT_RX_INL_64, 1);
    } else {
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_iface_poll_tx(uct_rc_verbs_iface_t *iface)
{
    struct ibv_wc wc[UCT_IB_MAX_WC];
    uct_rc_verbs_ep_t *ep;
    uct_rc_iface_send_op_t *op;
    unsigned count;
    uint16_t sn;
    int i, ret;

    ret = ibv_poll_cq(iface->super.super.send_cq, UCT_IB_MAX_WC, wc);
    if (ucs_unlikely(ret <= 0)) {
        if (ucs_unlikely(ret < 0)) {
            ucs_fatal("Failed to poll send CQ");
        }
        return;
    }

    for (i = 0; i < ret; ++i) {
        if (ucs_unlikely(wc[i].status != IBV_WC_SUCCESS)) {
            ucs_fatal("Send completion with error: %s",
                      ibv_wc_status_str(wc[i].status));
        }

        UCS_STATS_UPDATE_COUNTER(iface->super.stats,
                                 UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

        ep = ucs_derived_of(uct_rc_iface_lookup_ep(&iface->super, wc[i].qp_num),
                            uct_rc_verbs_ep_t);
        ucs_assert(ep != NULL);

        count = wc[i].wr_id + 1; /* number of sends this WC completes in a batch */
        ep->super.available     += count;
        ep->tx.completion_count += count;
        ++iface->super.tx.cq_available;

        sn = ep->tx.completion_count;
        ucs_queue_for_each_extract(op, &ep->super.outstanding, queue,
                                   UCS_CIRCULAR_COMPARE16(op->sn, <=, sn)) {
            op->handler(op);
        }
    }
}
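/*
 * Sender-side sketch (assumption, not from the original source) of the
 * batching convention that makes "wc.wr_id + 1" above meaningful: only
 * every Nth work request is posted signaled, and wr_id carries the number
 * of unsignaled sends that preceded it, so a single CQE retires the whole
 * batch. The helper name and the moderation parameter are hypothetical.
 */
static void post_send_batched_sketch(struct ibv_qp *qp, struct ibv_send_wr *wr,
                                     unsigned *unsignaled, unsigned moderation)
{
    struct ibv_send_wr *bad_wr;

    if (*unsignaled == moderation) {
        wr->send_flags = IBV_SEND_SIGNALED;
        wr->wr_id      = *unsignaled; /* completions carried by this CQE, minus one */
        *unsignaled    = 0;
    } else {
        wr->send_flags = 0;
        wr->wr_id      = 0;           /* no CQE expected for this WR */
        ++(*unsignaled);
    }

    if (ibv_post_send(qp, wr, &bad_wr)) {
        /* error handling elided in this sketch */
    }
}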
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super.super, &iface->mlx5_common.tx.cq);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];
    hw_ci  = ntohs(cqe->wqe_counter);

    ucs_trace_poll("dc_mlx5 iface %p tx_cqe: dci[%d] qpn 0x%x txqp %p hw_ci %d",
                   iface, dci, qp_num, txqp, hw_ci);

    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_dc_iface_dci_put(&iface->super, dci);
    uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci);

    iface->super.super.tx.cq_available++;

    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(uct_dc_iface_dci_waitq(&iface->super), 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(uct_dc_iface_tx_waitq(&iface->super), 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_get_cqe(&iface->super.super.super, &iface->mlx5_common.tx.cq,
                              iface->mlx5_common.tx.cq.cqe_size_log);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    ucs_assertv(!(cqe->op_own & (MLX5_INLINE_SCATTER_32 | MLX5_INLINE_SCATTER_64)),
                "tx inline scatter not supported");

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];

    hw_ci = ntohs(cqe->wqe_counter);
    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_rc_txqp_completion(txqp, hw_ci);
    iface->super.super.tx.cq_available++;

    uct_dc_iface_dci_put(&iface->super, dci);
    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(&iface->super.super.tx.arbiter, 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(&iface->super.tx.dci_arbiter, 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
/* A common mm active message sending function.
 * The first parameter indicates the origin of the call.
 * is_short = 1 - perform AM short sending
 * is_short = 0 - perform AM bcopy sending
 */
static UCS_F_ALWAYS_INLINE ssize_t
uct_mm_ep_am_common_send(const unsigned is_short, uct_mm_ep_t *ep,
                         uct_mm_iface_t *iface, uint8_t am_id, size_t length,
                         uint64_t header, const void *payload,
                         uct_pack_callback_t pack_cb, void *arg)
{
    uct_mm_fifo_element_t *elem;
    ucs_status_t status;
    void *base_address;
    uint64_t head;

    UCT_CHECK_AM_ID(am_id);

    head = ep->fifo_ctl->head;
    /* check if there is room in the remote process's receive FIFO to write */
    if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
        if (!ucs_arbiter_group_is_empty(&ep->arb_group)) {
            /* pending isn't empty. don't send now to prevent out-of-order sending */
            UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
            return UCS_ERR_NO_RESOURCE;
        } else {
            /* pending is empty.
             * update the local copy of the tail to its actual value on the
             * remote peer */
            uct_mm_ep_update_cached_tail(ep);
            if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail,
                                           iface->config.fifo_size)) {
                UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
                return UCS_ERR_NO_RESOURCE;
            }
        }
    }

    status = uct_mm_ep_get_remote_elem(ep, head, &elem);
    if (status != UCS_OK) {
        ucs_trace_poll("couldn't get an available FIFO element");
        UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
        return status;
    }

    if (is_short) {
        /* AM_SHORT: write to the remote FIFO */
        *(uint64_t*)(elem + 1) = header;
        memcpy((void*)(elem + 1) + sizeof(header), payload, length);

        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length + sizeof(header);

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           elem + 1, length + sizeof(header), "TX: AM_SHORT");
        UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, sizeof(header) + length);
    } else {
        /* AM_BCOPY: write to the remote descriptor.
         * base_address is the local pointer to the remote memory chunk,
         * obtained after attaching to it */
        base_address = uct_mm_ep_attach_remote_seg(ep, iface, elem);
        length       = pack_cb(base_address + elem->desc_offset, arg);

        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length;

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           base_address + elem->desc_offset, length,
                           "TX: AM_BCOPY");
        UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, length);
    }

    elem->am_id = am_id;

    /* memory barrier - make sure that the memory is flushed before setting the
     * 'writing is complete' flag which the reader checks */
    ucs_memory_cpu_store_fence();

    /* change the owner bit to indicate that the writing is complete.
     * the owner bit flips after every FIFO wraparound */
    if (head & iface->config.fifo_size) {
        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_OWNER;
    } else {
        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_OWNER;
    }

    if (is_short) {
        return UCS_OK;
    } else {
        return length;
    }
}
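/*
 * Reader-side sketch (assumption, not part of the original source): the
 * receiver recomputes the expected owner bit from its own read index, the
 * same way the sender derives it from 'head' above, so an element counts as
 * valid only once the sender's final flag update becomes visible. The store
 * fence before that update guarantees the payload is visible no later than
 * the flag. The helper name is hypothetical.
 */
static inline int uct_mm_fifo_elem_ready_sketch(const uct_mm_fifo_element_t *elem,
                                                uint64_t read_index,
                                                uint64_t fifo_size)
{
    unsigned expected = (read_index & fifo_size) ? UCT_MM_FIFO_ELEM_FLAG_OWNER : 0;

    return (elem->flags & UCT_MM_FIFO_ELEM_FLAG_OWNER) == expected;
}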
ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op,
                                    uct_rc_fc_request_t *req)
{
    uct_dc_mlx5_ep_t *dc_ep    = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);
    uct_ib_iface_t *ib_iface   = &iface->super.super.super;
    struct ibv_ah_attr ah_attr = {.is_global = 0};
    uct_dc_fc_sender_data_t sender;
    uct_dc_fc_request_t *dc_req;
    struct mlx5_wqe_av mlx5_av;
    uct_ib_mlx5_base_av_t av;
    ucs_status_t status;
    uintptr_t sender_ep;
    struct ibv_ah *ah;

    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    ucs_assert((sizeof(uint8_t) + sizeof(sender_ep)) <= UCT_IB_MLX5_AV_FULL_SIZE);

    UCT_DC_MLX5_CHECK_RES(iface, dc_ep);
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, dc_ep, txqp, txwq);

    dc_req = ucs_derived_of(req, uct_dc_fc_request_t);

    if (op == UCT_RC_EP_FC_PURE_GRANT) {
        ucs_assert(req != NULL);

        sender_ep = (uintptr_t)dc_req->sender.ep;

        /* TODO: look at common code with uct_ud_mlx5_iface_get_av */
        if (dc_req->sender.global.is_global) {
            uct_ib_iface_fill_ah_attr_from_gid_lid(ib_iface, dc_req->lid,
                                                   ucs_unaligned_ptr(&dc_req->sender.global.gid),
                                                   ib_iface->path_bits[0], &ah_attr);

            status = uct_ib_iface_create_ah(ib_iface, &ah_attr, &ah);
            if (status != UCS_OK) {
                return status;
            }

            uct_ib_mlx5_get_av(ah, &mlx5_av);
        }

        /* Note av initialization is copied from exp verbs */
        av.stat_rate_sl = ib_iface->config.sl; /* (attr->static_rate << 4) | attr->sl */
        av.fl_mlid      = ib_iface->path_bits[0] & 0x7f;

        /* lid in dc_req is in BE already */
        av.rlid    = uct_ib_iface_is_roce(ib_iface) ? 0 :
                     (dc_req->lid | htons(ib_iface->path_bits[0]));
        av.dqp_dct = htonl(dc_req->dct_num);
        uct_dc_mlx5_iface_set_av_sport(iface, &av, dc_req->dct_num);

        if (!iface->ud_common.config.compact_av || ah_attr.is_global) {
            av.dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV;
        }

        uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI,
                                     txqp, txwq, MLX5_OPCODE_SEND,
                                     &av /*dummy*/, 0, op, sender_ep, 0,
                                     0, 0,
                                     &av,
                                     ah_attr.is_global ? mlx5_av_grh(&mlx5_av) : NULL,
                                     uct_ib_mlx5_wqe_av_size(&av), 0, INT_MAX);
    } else {
        ucs_assert(op == UCT_RC_EP_FC_FLAG_HARD_REQ);

        sender.ep               = (uint64_t)dc_ep;
        sender.global.gid       = ib_iface->gid;
        sender.global.is_global = dc_ep->flags & UCT_DC_MLX5_EP_FLAG_GRH;

        UCS_STATS_UPDATE_COUNTER(dc_ep->fc.stats, UCT_RC_FC_STAT_TX_HARD_REQ, 1);

        uct_rc_mlx5_txqp_inline_post(&iface->super, UCT_IB_QPT_DCI,
                                     txqp, txwq, MLX5_OPCODE_SEND_IMM,
                                     &sender.global, sizeof(sender.global), op,
                                     sender.ep, uct_dc_mlx5_get_dct_num(iface),
                                     0, 0,
                                     &dc_ep->av,
                                     uct_dc_mlx5_ep_get_grh(dc_ep),
                                     uct_ib_mlx5_wqe_av_size(&dc_ep->av),
                                     MLX5_WQE_CTRL_SOLICITED, INT_MAX);
    }

    return UCS_OK;
}

UCS_CLASS_INIT_FUNC(uct_dc_mlx5_ep_t, uct_dc_mlx5_iface_t *iface,
                    const uct_dc_mlx5_iface_addr_t *if_addr,
                    uct_ib_mlx5_base_av_t *av)
{
    uint32_t remote_dctn;

    ucs_trace_func("");

    UCS_CLASS_CALL_SUPER_INIT(uct_base_ep_t, &iface->super.super.super.super);

    self->atomic_mr_offset = uct_ib_md_atomic_offset(if_addr->atomic_mr_id);
    remote_dctn            = uct_ib_unpack_uint24(if_addr->qp_num);

    memcpy(&self->av, av, sizeof(*av));
    self->av.dqp_dct |= htonl(remote_dctn);
    uct_dc_mlx5_iface_set_av_sport(iface, &self->av, remote_dctn);

    return uct_dc_mlx5_ep_basic_init(iface, self);
}
ucs_status_t uct_dc_mlx5_ep_fc_ctrl(uct_ep_t *tl_ep, unsigned op,
                                    uct_rc_fc_request_t *req)
{
    uintptr_t sender_ep;
    uct_ib_iface_t *ib_iface;
    uct_ib_mlx5_base_av_t av;
    uct_dc_fc_request_t *dc_req;
    uct_dc_mlx5_ep_t *dc_mlx5_ep;
    uct_dc_ep_t *dc_ep         = ucs_derived_of(tl_ep, uct_dc_ep_t);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_mlx5_iface_t);

    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    ucs_assert((sizeof(uint8_t) + sizeof(sender_ep)) <= UCT_IB_MLX5_AV_FULL_SIZE);

    UCT_DC_CHECK_RES(&iface->super, dc_ep);
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, dc_ep, txqp, txwq);

    if (op == UCT_RC_EP_FC_PURE_GRANT) {
        ucs_assert(req != NULL);

        dc_req    = ucs_derived_of(req, uct_dc_fc_request_t);
        sender_ep = (uintptr_t)dc_req->sender_ep;
        ib_iface  = &iface->super.super.super;

        /* Note av initialization is copied from exp verbs */
        av.stat_rate_sl = ib_iface->config.sl; /* (attr->static_rate << 4) | attr->sl */
        av.fl_mlid      = ib_iface->path_bits[0] & 0x7f;

        /* lid in dc_req is in BE already */
        av.rlid    = dc_req->lid | htons(ib_iface->path_bits[0]);
        av.dqp_dct = htonl(dc_req->dct_num);

        if (!iface->ud_common.config.compact_av) {
            av.dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV;
        }

        uct_rc_mlx5_txqp_inline_post(&iface->super.super, IBV_EXP_QPT_DC_INI,
                                     txqp, txwq, MLX5_OPCODE_SEND,
                                     NULL, 0, op, sender_ep, 0, 0, 0,
                                     &av, uct_ib_mlx5_wqe_av_size(&av));
    } else {
        ucs_assert(op == UCT_RC_EP_FC_FLAG_HARD_REQ);

        sender_ep  = (uintptr_t)dc_ep;
        dc_mlx5_ep = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);

        UCS_STATS_UPDATE_COUNTER(dc_ep->fc.stats, UCT_RC_FC_STAT_TX_HARD_REQ, 1);

        uct_rc_mlx5_txqp_inline_post(&iface->super.super, IBV_EXP_QPT_DC_INI,
                                     txqp, txwq, MLX5_OPCODE_SEND_IMM,
                                     NULL, 0, op, sender_ep,
                                     iface->super.rx.dct->dct_num, 0, 0,
                                     &dc_mlx5_ep->av,
                                     uct_ib_mlx5_wqe_av_size(&dc_mlx5_ep->av));
    }

    return UCS_OK;
}
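/*
 * Illustration (not from the original source) of the dqp_dct packing used in
 * both fc_ctrl variants above and in the endpoint init: the 24-bit remote
 * DCT number is converted to network byte order, and the extended-AV flag is
 * OR-ed in when the compact AV format cannot be used. The helper name is
 * hypothetical; the operations mirror the assignments in the code above.
 */
static inline uint32_t pack_dqp_dct_sketch(uint32_t remote_dctn, int extended_av)
{
    uint32_t dqp_dct = htonl(remote_dctn); /* 24-bit DCT number, big-endian */

    if (extended_av) {
        dqp_dct |= UCT_IB_MLX5_EXTENDED_UD_AV; /* request the extended AV format */
    }
    return dqp_dct;
}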