/**
 * Post an atomic work request on a DC endpoint and track its descriptor.
 *
 * The rkey is translated by uct_ib_resolve_atomic_rkey(), which also receives
 * a pointer to @a remote_addr and may therefore adjust it before it is used.
 * The descriptor is stamped with the work queue's current sw_pi before the
 * request is posted, and is added to the tx QP's send operations afterwards.
 *
 * @param iface         DC interface used to obtain the tx QP / work queue.
 * @param ep            DC endpoint (address vector, GRH, atomic MR offset).
 * @param opcode        Opcode forwarded to the post routine.
 * @param desc          Send descriptor; the payload starts right after it.
 * @param length        Payload length forwarded to the post routine.
 * @param remote_addr   Remote address of the atomic target.
 * @param rkey          UCT remote key.
 * @param compare_mask  Compare mask forwarded to the post routine.
 * @param compare       Compare value forwarded to the post routine.
 * @param swap_mask     Swap mask forwarded to the post routine.
 * @param swap_add      Swap/add value forwarded to the post routine.
 */
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_atomic_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep,
                              unsigned opcode, uct_rc_iface_send_desc_t *desc,
                              unsigned length, uint64_t remote_addr,
                              uct_rkey_t rkey, uint64_t compare_mask,
                              uint64_t compare, uint64_t swap_mask,
                              uint64_t swap_add)
{
    uint32_t atomic_rkey = uct_ib_resolve_atomic_rkey(rkey,
                                                      ep->atomic_mr_offset,
                                                      &remote_addr);
    void *payload        = desc + 1;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq);

    /* Record the work queue index before posting */
    desc->super.sn = txwq->sw_pi;

    uct_rc_mlx5_txqp_dptr_post(&iface->super, UCT_IB_QPT_DCI, txqp, txwq,
                               opcode, payload, length, &desc->lkey,
                               remote_addr, atomic_rkey, compare_mask, compare,
                               swap_mask, swap_add, &ep->av,
                               uct_dc_mlx5_ep_get_grh(ep),
                               uct_ib_mlx5_wqe_av_size(&ep->av),
                               MLX5_WQE_CTRL_CQ_UPDATE, 0, INT_MAX, NULL);

    UCT_TL_EP_STAT_ATOMIC(&ep->super);
    uct_rc_txqp_add_send_op(txqp, &desc->super);
}
/**
 * Add a value to a 64-bit word using ucs_atomic_add64().
 *
 * The target is the address obtained by adding @a rkey and @a remote_addr.
 *
 * @param tl_ep        Endpoint; used only for the atomic statistics counter.
 * @param add          Value to add.
 * @param remote_addr  Address component of the target.
 * @param rkey         Key component of the target; added to @a remote_addr.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_sm_ep_atomic_add64(uct_ep_h tl_ep, uint64_t add,
                                    uint64_t remote_addr, uct_rkey_t rkey)
{
    ucs_atomic_add64((uint64_t *)(rkey + remote_addr), add);

    uct_sm_ep_trace_data(remote_addr, rkey, "ATOMIC_ADD64 [add %"PRIu64"]",
                         add);
    UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t));

    return UCS_OK;
}
/**
 * Swap a 32-bit word using ucs_atomic_swap32() and report what it returned.
 *
 * The target is the address obtained by adding @a rkey and @a remote_addr.
 *
 * @param tl_ep        Endpoint; used only for the atomic statistics counter.
 * @param swap         Value passed to ucs_atomic_swap32().
 * @param remote_addr  Address component of the target.
 * @param rkey         Key component of the target; added to @a remote_addr.
 * @param result       Receives the value returned by ucs_atomic_swap32().
 * @param comp         Not used by this implementation.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_sm_ep_atomic_swap32(uct_ep_h tl_ep, uint32_t swap,
                                     uint64_t remote_addr, uct_rkey_t rkey,
                                     uint32_t *result, uct_completion_t *comp)
{
    uint32_t *target  = (uint32_t *)(rkey + remote_addr);
    uint32_t previous = ucs_atomic_swap32(target, swap);

    *result = previous;

    uct_sm_ep_trace_data(remote_addr, rkey,
                         "ATOMIC_SWAP32 [swap %"PRIu32 " result %"PRIu32"]",
                         swap, previous);
    UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t));

    return UCS_OK;
}
/**
 * Build an atomic send work request and post it together with a descriptor.
 *
 * The work request and its scatter/gather entry live on the stack; they are
 * filled by UCT_RC_VERBS_FILL_ATOMIC_WR() and handed to
 * uct_rc_verbs_ep_post_send_desc().
 *
 * @param ep           RC verbs endpoint to post on.
 * @param opcode       Opcode stored in the work request.
 * @param compare_add  Compare/add operand.
 * @param swap         Swap operand.
 * @param remote_addr  Remote address of the atomic target.
 * @param rkey         UCT remote key.
 * @param desc         Send descriptor posted with the request.
 * @param force_sig    Flag forwarded to uct_rc_verbs_ep_post_send_desc().
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_atomic_post(uct_rc_verbs_ep_t *ep, int opcode,
                            uint64_t compare_add, uint64_t swap,
                            uint64_t remote_addr, uct_rkey_t rkey,
                            uct_rc_iface_send_desc_t *desc, int force_sig)
{
    struct ibv_send_wr send_wr;
    struct ibv_sge sg_entry;

    UCT_RC_VERBS_FILL_ATOMIC_WR(send_wr, send_wr.opcode, sg_entry, opcode,
                                compare_add, swap, remote_addr, rkey,
                                ep->super.umr_offset);

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_verbs_ep_post_send_desc(ep, &send_wr, desc, force_sig);
}
/**
 * Compare-and-swap a 64-bit word using ucs_atomic_cswap64().
 *
 * The target is the address obtained by adding @a rkey and @a remote_addr.
 *
 * @param tl_ep        Endpoint; used only for the atomic statistics counter.
 * @param compare      Compare operand passed to ucs_atomic_cswap64().
 * @param swap         Swap operand passed to ucs_atomic_cswap64().
 * @param remote_addr  Address component of the target.
 * @param rkey         Key component of the target; added to @a remote_addr.
 * @param result       Receives the value returned by ucs_atomic_cswap64().
 * @param comp         Not used by this implementation.
 *
 * @return Always UCS_OK.
 */
ucs_status_t uct_sm_ep_atomic_cswap64(uct_ep_h tl_ep, uint64_t compare,
                                      uint64_t swap, uint64_t remote_addr,
                                      uct_rkey_t rkey, uint64_t *result,
                                      uct_completion_t *comp)
{
    uint64_t *target  = (uint64_t *)(rkey + remote_addr);
    uint64_t previous = ucs_atomic_cswap64(target, compare, swap);

    *result = previous;

    uct_sm_ep_trace_data(remote_addr, rkey,
                         "ATOMIC_CSWAP64 [compare %"PRIu64
                         " swap %"PRIu64" result %"PRIu64"]",
                         compare, swap, previous);
    UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t));

    return UCS_OK;
}
/**
 * Post an atomic operation on an RC mlx5 endpoint and queue its descriptor.
 *
 * The descriptor is stamped with the endpoint work queue's current sw_pi
 * before posting and is pushed onto the endpoint's outstanding queue after.
 *
 * @param ep            RC mlx5 endpoint to post on.
 * @param opcode        Opcode forwarded to uct_rc_mlx5_ep_dptr_post().
 * @param desc          Send descriptor; the payload starts right after it.
 * @param length        Payload length.
 * @param remote_addr   Remote address of the atomic target.
 * @param rkey          Remote key, forwarded unchanged.
 * @param compare_mask  Compare mask forwarded to the post routine.
 * @param compare       Compare value forwarded to the post routine.
 * @param swap_add      Swap/add value forwarded to the post routine.
 * @param signal        Signaling flag forwarded to the post routine.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_ep_atomic_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
                           uct_rc_iface_send_desc_t *desc, unsigned length,
                           uint64_t remote_addr, uct_rkey_t rkey,
                           uint64_t compare_mask, uint64_t compare,
                           uint64_t swap_add, int signal)
{
    void *payload = desc + 1;

    /* Record the work queue index before posting */
    desc->super.sn = ep->tx.wq.sw_pi;

    uct_rc_mlx5_ep_dptr_post(ep, opcode, payload, length, &desc->lkey,
                             0, NULL, 0,
                             remote_addr, rkey, compare_mask, compare,
                             swap_add, signal);

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    ucs_queue_push(&ep->super.outstanding, &desc->super.queue);
}
/**
 * Build and post an extended atomic work request on an RC verbs endpoint.
 *
 * The request is filled by uct_rc_verbs_fill_ext_atomic_wr() and
 * UCT_RC_VERBS_FILL_DESC_WR(), and posted with
 * IBV_EXP_SEND_EXT_ATOMIC_INLINE OR-ed into @a force_sig. After posting, the
 * descriptor is added to the tx QP's send operations with the endpoint's
 * txcnt.pi value as read at that point.
 *
 * @param ep            RC verbs endpoint to post on.
 * @param opcode        Extended atomic opcode.
 * @param length        Operand length.
 * @param compare_mask  Compare mask.
 * @param compare_add   Compare/add operand.
 * @param swap          Swap operand.
 * @param remote_addr   Remote address of the atomic target.
 * @param rkey          UCT remote key.
 * @param desc          Send descriptor attached to the request.
 * @param force_sig     Send flags supplied by the caller.
 */
static inline void
uct_rc_verbs_ep_ext_atomic_post(uct_rc_verbs_ep_t *ep, int opcode,
                                uint32_t length, uint64_t compare_mask,
                                uint64_t compare_add, uint64_t swap,
                                uint64_t remote_addr, uct_rkey_t rkey,
                                uct_rc_iface_send_desc_t *desc,
                                uint64_t force_sig)
{
    struct ibv_exp_send_wr send_wr;
    struct ibv_sge sg_entry;
    uint64_t send_flags = force_sig | IBV_EXP_SEND_EXT_ATOMIC_INLINE;

    uct_rc_verbs_fill_ext_atomic_wr(&send_wr, &sg_entry, opcode, length,
                                    compare_mask, compare_add, swap,
                                    remote_addr, rkey,
                                    ep->super.atomic_mr_offset);
    UCT_RC_VERBS_FILL_DESC_WR(&send_wr, desc);

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_verbs_exp_post_send(ep, &send_wr, send_flags);

    /* txcnt.pi is read only after the request has been posted */
    uct_rc_txqp_add_send_op_sn(&ep->super.txqp, &desc->super, ep->txcnt.pi);
}
/**
 * Fill an extended (masked) atomic work request by hand and post it.
 *
 * A single scatter/gather entry points at the payload that follows @a desc.
 * Operation-specific inline data is filled only for the masked
 * compare-and-swap and masked fetch-and-add opcodes; for any other opcode
 * that part of the request is left untouched.
 *
 * @param ep            RC verbs endpoint to post on.
 * @param opcode        Extended atomic opcode.
 * @param length        Operand length; its ucs_ilog2() goes into the request.
 * @param compare_mask  Compare mask (compare-and-swap only).
 * @param compare_add   Compare value, or the value to add for fetch-and-add.
 * @param swap          Swap value (compare-and-swap only).
 * @param remote_addr   Remote address of the atomic target.
 * @param rkey          Remote key stored in the request.
 * @param desc          Send descriptor; pushed on the endpoint after posting.
 * @param force_sig     Flag forwarded to uct_rc_verbs_exp_post_send().
 * @param success       Status to return.
 *
 * @return @a success, unconditionally.
 */
static inline ucs_status_t
uct_rc_verbs_ext_atomic_post(uct_rc_verbs_ep_t *ep, int opcode,
                             uint32_t length, uint64_t compare_mask,
                             uint64_t compare_add, uint64_t swap,
                             uint64_t remote_addr, uct_rkey_t rkey,
                             uct_rc_iface_send_desc_t *desc, int force_sig,
                             ucs_status_t success)
{
    struct ibv_exp_send_wr wr;
    struct ibv_sge sge;

    /* The payload buffer starts immediately after the descriptor header */
    sge.addr   = (uintptr_t)(desc + 1);
    sge.length = length;
    sge.lkey   = desc->lkey;

    /* Generic work request fields */
    wr.next           = NULL;
    wr.sg_list        = &sge;
    wr.num_sge        = 1;
    wr.exp_opcode     = opcode;
    wr.exp_send_flags = IBV_EXP_SEND_EXT_ATOMIC_INLINE;
    wr.comp_mask      = 0;

    /* Fields common to all masked atomics */
    wr.ext_op.masked_atomics.log_arg_sz  = ucs_ilog2(length);
    wr.ext_op.masked_atomics.remote_addr = remote_addr;
    wr.ext_op.masked_atomics.rkey        = rkey;

    /* Operation-specific inline operands */
    if (opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_CMP_AND_SWP) {
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.compare_mask =
                compare_mask;
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.compare_val  =
                compare_add;
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.swap_mask    =
                (uint64_t)(-1);
        wr.ext_op.masked_atomics.wr_data.inline_data.op.cmp_swap.swap_val     =
                swap;
    } else if (opcode == IBV_EXP_WR_EXT_MASKED_ATOMIC_FETCH_AND_ADD) {
        wr.ext_op.masked_atomics.wr_data.inline_data.op.fetch_add.add_val        =
                compare_add;
        wr.ext_op.masked_atomics.wr_data.inline_data.op.fetch_add.field_boundary =
                0;
    }

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_verbs_exp_post_send(ep, &wr, force_sig);
    uct_rc_verbs_ep_push_desc(ep, desc);

    return success;
}
/**
 * Fill an atomic send work request by hand and post it with a descriptor.
 *
 * Only the opcode, the atomic operands, the scatter/gather list pointer and
 * the entry length (one 64-bit word) are set here.
 * NOTE(review): the remaining work request / SGE fields are presumably
 * completed by uct_rc_verbs_ep_post_send_desc() - confirm against it.
 *
 * @param ep           RC verbs endpoint to post on.
 * @param opcode       Opcode stored in the work request.
 * @param compare_add  Compare/add operand.
 * @param swap         Swap operand.
 * @param remote_addr  Remote address of the atomic target.
 * @param rkey         Remote key stored in the work request.
 * @param desc         Send descriptor posted with the request.
 * @param force_sig    Flag forwarded to uct_rc_verbs_ep_post_send_desc().
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_atomic_post(uct_rc_verbs_ep_t *ep, int opcode,
                            uint64_t compare_add, uint64_t swap,
                            uint64_t remote_addr, uct_rkey_t rkey,
                            uct_rc_iface_send_desc_t *desc, int force_sig)
{
    struct ibv_send_wr send_wr;
    struct ibv_sge sg_entry;

    /* One scatter/gather entry sized for a single 64-bit operand */
    sg_entry.length = sizeof(uint64_t);
    send_wr.sg_list = &sg_entry;
    send_wr.num_sge = 1;

    /* Atomic operation description */
    send_wr.opcode                = opcode;
    send_wr.wr.atomic.remote_addr = remote_addr;
    send_wr.wr.atomic.rkey        = rkey;
    send_wr.wr.atomic.compare_add = compare_add;
    send_wr.wr.atomic.swap        = swap;

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_verbs_ep_post_send_desc(ep, &send_wr, desc, force_sig);
}
/**
 * Post an atomic operation on an RC mlx5 endpoint, propagating post failures.
 *
 * The descriptor is stamped with the endpoint's current tx.sw_pi before
 * posting. If uct_rc_mlx5_ep_dptr_post() does not return UCS_OK, that status
 * is returned and neither the statistics counter nor the outstanding queue
 * is touched. Otherwise the descriptor is pushed onto the outstanding queue.
 *
 * @param ep            RC mlx5 endpoint to post on.
 * @param opcode        Opcode forwarded to the post routine.
 * @param desc          Send descriptor; the payload starts right after it.
 * @param length        Payload length.
 * @param remote_addr   Remote address of the atomic target.
 * @param rkey          Remote key, forwarded unchanged.
 * @param compare_mask  Compare mask forwarded to the post routine.
 * @param compare       Compare value forwarded to the post routine.
 * @param swap_add      Swap/add value forwarded to the post routine.
 * @param signal        Signaling flag forwarded to the post routine.
 * @param success       Status to return when posting succeeds.
 *
 * @return @a success if the post returned UCS_OK, otherwise the post status.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_atomic_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
                           uct_rc_iface_send_desc_t *desc, unsigned length,
                           uint64_t remote_addr, uct_rkey_t rkey,
                           uint64_t compare_mask, uint64_t compare,
                           uint64_t swap_add, int signal,
                           ucs_status_t success)
{
    ucs_status_t status;
    void *payload = desc + 1;

    /* Record the producer index before posting */
    desc->super.sn = ep->tx.sw_pi;

    status = uct_rc_mlx5_ep_dptr_post(ep, opcode, payload, length,
                                      &desc->lkey, 0, NULL, 0, remote_addr,
                                      rkey, compare_mask, compare, swap_add,
                                      signal);
    if (status == UCS_OK) {
        UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
        ucs_queue_push(&ep->super.outstanding, &desc->super.queue);
        status = success;
    }

    return status;
}
/**
 * Post a 32-bit atomic add through uGNI FMA.
 *
 * A descriptor is taken from the interface's free_desc_famo pool, formatted
 * as a GNI_FMA_ATOMIC2_IADD_S AMO with no completion, unpack callback or
 * result buffer, traced, and handed to uct_ugni_post_fma().
 *
 * @param tl_ep        Endpoint to post on.
 * @param add          Value to add; widened to 64 bits for the descriptor.
 * @param remote_addr  Remote address of the atomic target.
 * @param rkey         Remote key.
 *
 * @return UCS_ERR_NO_RESOURCE if no descriptor could be obtained, otherwise
 *         the status returned by uct_ugni_post_fma(), which is given UCS_OK
 *         as its success status.
 */
ucs_status_t uct_ugni_ep_atomic_add32(uct_ep_h tl_ep, uint32_t add,
                                      uint64_t remote_addr, uct_rkey_t rkey)
{
    uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t);
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_rdma_iface_t);
    uct_ugni_rdma_fetch_desc_t *fma;

    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma,
                             return UCS_ERR_NO_RESOURCE);

    uct_ugni_format_fma_amo(fma, GNI_POST_AMO, GNI_FMA_ATOMIC2_IADD_S,
                            (uint64_t)add, 0, NULL, remote_addr, rkey, LEN_32,
                            ep, NULL, NULL, NULL);

    /* Keep a space between "value" and the number so the hex digits cannot
     * be misread as part of the word */
    ucs_trace_data("Posting AMO ADD, GNI_PostFma of size %"PRIx64" value "
                   "%"PRIx32" to %p, with [%"PRIx64" %"PRIx64"]",
                   fma->super.desc.length, add,
                   (void *)fma->super.desc.remote_addr,
                   fma->super.desc.remote_mem_hndl.qword1,
                   fma->super.desc.remote_mem_hndl.qword2);

    UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t));

    return uct_ugni_post_fma(iface, ep, &fma->super, UCS_OK);
}
/**
 * Post a 32-bit fetching compare-and-swap through uGNI FMA.
 *
 * A descriptor is taken from the interface's free_desc_famo pool and
 * formatted as a GNI_FMA_ATOMIC2_FCSWAP_S AMO. The memory right after the
 * descriptor is passed as the local buffer, and uct_ugni_amo_unpack32 with
 * @a result is registered as the unpack callback and its argument.
 *
 * @param tl_ep        Endpoint to post on.
 * @param compare      Compare operand; widened to 64 bits.
 * @param swap         Swap operand; widened to 64 bits.
 * @param remote_addr  Remote address of the atomic target.
 * @param rkey         Remote key.
 * @param result       Passed to the unpack callback as its argument.
 * @param comp         Completion handle stored in the descriptor.
 *
 * @return UCS_ERR_NO_RESOURCE if no descriptor could be obtained, otherwise
 *         the status returned by uct_ugni_post_fma(), which is given
 *         UCS_INPROGRESS as its success status.
 */
ucs_status_t uct_ugni_ep_atomic_cswap32(uct_ep_h tl_ep, uint32_t compare,
                                        uint32_t swap, uint64_t remote_addr,
                                        uct_rkey_t rkey, uint32_t *result,
                                        uct_completion_t *comp)
{
    uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t);
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_rdma_iface_t);
    uct_ugni_rdma_fetch_desc_t *fma;

    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma,
                             return UCS_ERR_NO_RESOURCE);

    uct_ugni_format_fma_amo(fma, GNI_POST_AMO, GNI_FMA_ATOMIC2_FCSWAP_S,
                            (uint64_t)compare, (uint64_t)swap, fma + 1,
                            remote_addr, rkey, LEN_32, ep, comp,
                            uct_ugni_amo_unpack32, (void *)result);

    /* Keep a space between "value" and the number so the hex digits cannot
     * be misread as part of the word */
    ucs_trace_data("Posting AMO CSWAP, GNI_PostFma of size %"PRIx64" value "
                   "%"PRIx32" compare %"PRIx32" to %p, with [%"PRIx64" %"PRIx64"]",
                   fma->super.desc.length, swap, compare,
                   (void *)fma->super.desc.remote_addr,
                   fma->super.desc.remote_mem_hndl.qword1,
                   fma->super.desc.remote_mem_hndl.qword2);

    UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t));

    return uct_ugni_post_fma(iface, ep, &fma->super, UCS_INPROGRESS);
}
/**
 * Post a 64-bit non-fetching atomic operation through uGNI FMA.
 *
 * A descriptor is taken from the interface's free_desc_famo pool, formatted
 * as an AMO of type @a op_type with no completion, unpack callback or result
 * buffer, traced, and handed to uct_ugni_post_fma().
 *
 * @param tl_ep        Endpoint to post on.
 * @param op           Operand of the atomic operation.
 * @param remote_addr  Remote address of the atomic target.
 * @param rkey         Remote key.
 * @param op_type      uGNI FMA command type to issue.
 * @param op_str       Operation name; used only in the trace message.
 *
 * @return UCS_ERR_NO_RESOURCE if no descriptor could be obtained, otherwise
 *         the status returned by uct_ugni_post_fma(), which is given UCS_OK
 *         as its success status.
 */
ucs_status_t uct_ugni_ep_atomic_op64(uct_ep_h tl_ep, uint64_t op,
                                     uint64_t remote_addr, uct_rkey_t rkey,
                                     gni_fma_cmd_type_t op_type, char *op_str)
{
    uct_ugni_ep_t *ep = ucs_derived_of(tl_ep, uct_ugni_ep_t);
    uct_ugni_rdma_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                  uct_ugni_rdma_iface_t);
    uct_ugni_rdma_fetch_desc_t *fma;

    UCT_TL_IFACE_GET_TX_DESC(&iface->super.super, &iface->free_desc_famo, fma,
                             return UCS_ERR_NO_RESOURCE);

    uct_ugni_format_fma_amo(fma, GNI_POST_AMO, op_type, op, 0, NULL,
                            remote_addr, rkey, LEN_64, ep, NULL, NULL, NULL);

    /* Keep a space between "value" and the number so the hex digits cannot
     * be misread as part of the word */
    ucs_trace_data("Posting AMO %s, GNI_PostFma of size %"PRIx64" value "
                   "%"PRIx64" to %p, with [%"PRIx64" %"PRIx64"]",
                   op_str, fma->super.desc.length, op,
                   (void *)fma->super.desc.remote_addr,
                   fma->super.desc.remote_mem_hndl.qword1,
                   fma->super.desc.remote_mem_hndl.qword2);

    UCT_TL_EP_STAT_ATOMIC(ucs_derived_of(tl_ep, uct_base_ep_t));

    return uct_ugni_post_fma(iface, ep, &fma->super, UCS_OK);
}
/**
 * Post an atomic work request on a DC endpoint and track its descriptor.
 *
 * The remote address is offset by the endpoint's umr_offset and the rkey is
 * translated with uct_ib_md_umr_rkey(). The descriptor is stamped with the
 * work queue's current sw_pi before posting, and is added to the tx QP's
 * send operations afterwards.
 *
 * @param iface         DC interface used to obtain the tx QP / work queue.
 * @param ep            DC endpoint (address vector, umr offset).
 * @param opcode        Opcode forwarded to the post routine.
 * @param desc          Send descriptor; the payload starts right after it.
 * @param length        Payload length.
 * @param remote_addr   Remote address, before the umr offset is applied.
 * @param rkey          UCT remote key.
 * @param compare_mask  Compare mask forwarded to the post routine.
 * @param compare       Compare value forwarded to the post routine.
 * @param swap_add      Swap/add value forwarded to the post routine.
 */
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_iface_atomic_post(uct_dc_mlx5_iface_t *iface, uct_dc_mlx5_ep_t *ep,
                              unsigned opcode, uct_rc_iface_send_desc_t *desc,
                              unsigned length, uint64_t remote_addr,
                              uct_rkey_t rkey, uint64_t compare_mask,
                              uint64_t compare, uint64_t swap_add)
{
    uint64_t target_addr = remote_addr + ep->super.umr_offset;
    void *payload        = desc + 1;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq);

    /* Record the work queue index before posting */
    desc->super.sn = txwq->sw_pi;

    uct_rc_mlx5_txqp_dptr_post(&iface->super.super, IBV_EXP_QPT_DC_INI, txqp,
                               txwq, opcode, payload, length, &desc->lkey,
                               0, NULL, 0,
                               target_addr, uct_ib_md_umr_rkey(rkey),
                               compare_mask, compare, swap_add, &ep->av,
                               uct_ib_mlx5_wqe_av_size(&ep->av),
                               MLX5_WQE_CTRL_CQ_UPDATE);

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_txqp_add_send_op(txqp, &desc->super);
}
/**
 * Post an atomic work request on an RC mlx5 endpoint and track its descriptor.
 *
 * The rkey is translated by uct_ib_resolve_atomic_rkey(), which also receives
 * a pointer to @a remote_addr and may therefore adjust it before it is used.
 * The descriptor is stamped with the endpoint work queue's current sw_pi
 * before posting, and is added to the tx QP's send operations afterwards.
 *
 * @param ep            RC mlx5 endpoint to post on.
 * @param opcode        Opcode forwarded to the post routine.
 * @param desc          Send descriptor; the payload starts right after it.
 * @param length        Payload length.
 * @param remote_addr   Remote address of the atomic target.
 * @param rkey          UCT remote key.
 * @param compare_mask  Compare mask forwarded to the post routine.
 * @param compare       Compare value forwarded to the post routine.
 * @param swap_add      Swap/add value forwarded to the post routine.
 * @param signal        Signaling flag forwarded to the post routine.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_ep_atomic_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
                           uct_rc_iface_send_desc_t *desc, unsigned length,
                           uint64_t remote_addr, uct_rkey_t rkey,
                           uint64_t compare_mask, uint64_t compare,
                           uint64_t swap_add, int signal)
{
    uct_rc_iface_t *rc_iface = ucs_derived_of(ep->super.super.super.iface,
                                              uct_rc_iface_t);
    uint32_t atomic_rkey     = uct_ib_resolve_atomic_rkey(rkey,
                                                          ep->super.atomic_mr_offset,
                                                          &remote_addr);
    void *payload            = desc + 1;

    /* Record the work queue index before posting */
    desc->super.sn = ep->tx.wq.sw_pi;

    uct_rc_mlx5_txqp_dptr_post(rc_iface, IBV_QPT_RC, &ep->super.txqp,
                               &ep->tx.wq, opcode, payload, length,
                               &desc->lkey, 0, NULL, 0,
                               remote_addr, atomic_rkey, compare_mask, compare,
                               swap_add, NULL, 0, signal);

    UCT_TL_EP_STAT_ATOMIC(&ep->super.super);
    uct_rc_txqp_add_send_op(&ep->super.txqp, &desc->super);
}