/* Post a single work request on the endpoint's RC send queue.
 *
 * If the caller did not request a completion, the interface's TX moderation
 * policy may still force IBV_SEND_SIGNALED to bound outstanding unsignaled
 * sends. A failing ibv_post_send() is treated as fatal.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t* iface, uct_rc_verbs_ep_t* ep,
                          struct ibv_send_wr *wr, int send_flags)
{
    struct ibv_send_wr *bad_wr;
    int err;

    uct_rc_txqp_check(&ep->super.txqp);

    /* Let moderation decide on signaling only when the caller did not ask */
    if (!(send_flags & IBV_SEND_SIGNALED)) {
        send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp,
                                                 IBV_SEND_SIGNALED);
    }

    wr->send_flags = send_flags;
    wr->wr_id      = uct_rc_txqp_unsignaled(&ep->super.txqp);

    uct_ib_log_post_send(&iface->super.super, ep->super.txqp.qp, wr,
                         (wr->opcode == IBV_WR_SEND) ? uct_rc_ep_am_packet_dump :
                         NULL);
    UCT_IB_INSTRUMENT_RECORD_SEND_WR_LEN("uct_rc_verbs_ep_post_send", wr);

    err = ibv_post_send(ep->super.txqp.qp, wr, &bad_wr);
    if (err != 0) {
        ucs_fatal("ibv_post_send() returned %d (%m)", err);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super,
                             send_flags & IBV_SEND_SIGNALED);
}
/* Post an extended-verbs (ibv_exp) work request on the endpoint's TX queue.
 *
 * The moderation flag is OR-ed into 'signal' before posting, so a request may
 * become signaled even if the caller passed 0. Posting failure is fatal.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr,
                           uint64_t signal)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
                                                 uct_rc_verbs_iface_t);
    struct ibv_exp_send_wr *bad_wr;
    int err;

    uct_rc_txqp_check(&ep->super.txqp);

    signal |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp,
                                         IBV_EXP_SEND_SIGNALED);

    wr->exp_send_flags = signal;
    wr->wr_id          = uct_rc_txqp_unsignaled(&ep->super.txqp);

    uct_ib_log_exp_post_send(&iface->super.super, ep->super.txqp.qp, wr,
                             (wr->exp_opcode == IBV_EXP_WR_SEND) ?
                             uct_rc_ep_am_packet_dump : NULL);
    UCT_IB_INSTRUMENT_RECORD_SEND_EXP_WR_LEN("uct_rc_verbs_exp_post_send", wr);

    err = ibv_exp_post_send(ep->super.txqp.qp, wr, &bad_wr);
    if (err != 0) {
        ucs_fatal("ibv_exp_post_send() returned %d (%m)", err);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super, signal);
}
/* Post a work request on the RC send queue (variant with SGE-logging limit).
 *
 * Applies TX signaling moderation when the caller did not request a
 * completion, and may add IBV_SEND_FENCE for RDMA_READ requests
 * (NOTE(review): presumably to order the read after outstanding atomics,
 * via the endpoint fence info 'ep->fi' — confirm against uct_rc_ep_atomic_fence).
 * A failing ibv_post_send() is fatal.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t* iface, uct_rc_verbs_ep_t* ep,
                          struct ibv_send_wr *wr, int send_flags,
                          int max_log_sge)
{
    struct ibv_send_wr *bad_wr;
    int err;

    uct_rc_txqp_check(&ep->super.txqp);

    /* Moderation applies only when the caller did not ask for a completion */
    if (!(send_flags & IBV_SEND_SIGNALED)) {
        send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super.txqp,
                                                 IBV_SEND_SIGNALED);
    }
    if (wr->opcode == IBV_WR_RDMA_READ) {
        send_flags |= uct_rc_ep_atomic_fence(&iface->super, &ep->fi,
                                             IBV_SEND_FENCE);
    }

    wr->send_flags = send_flags;
    wr->wr_id      = uct_rc_txqp_unsignaled(&ep->super.txqp);

    uct_ib_log_post_send(&iface->super.super, ep->super.txqp.qp, wr, max_log_sge,
                         (wr->opcode == IBV_WR_SEND) ? uct_rc_ep_packet_dump :
                         NULL);

    err = ibv_post_send(ep->super.txqp.qp, wr, &bad_wr);
    if (err != 0) {
        ucs_fatal("ibv_post_send() returned %d (%m)", err);
    }

    uct_rc_verbs_txqp_posted(&ep->super.txqp, &ep->txcnt, &iface->super,
                             send_flags & IBV_SEND_SIGNALED);
}
/* Post an extended-verbs (ibv_exp) work request for this endpoint.
 *
 * When the caller passes signal == 0, the TX moderation policy may still
 * choose to signal this request. Posting failure aborts the process.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_exp_post_send(uct_rc_verbs_ep_t *ep, struct ibv_exp_send_wr *wr,
                           int signal)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
                                                 uct_rc_verbs_iface_t);
    struct ibv_exp_send_wr *bad_wr;
    int err;

    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            IBV_EXP_SEND_SIGNALED);
    }

    wr->exp_send_flags |= signal;
    wr->wr_id           = ep->super.unsignaled;

    uct_ib_log_exp_post_send(ep->super.qp, wr,
                             (wr->exp_opcode == IBV_EXP_WR_SEND) ?
                             uct_rc_ep_am_packet_dump : NULL);

    err = ibv_exp_post_send(ep->super.qp, wr, &bad_wr);
    if (err != 0) {
        ucs_fatal("ibv_exp_post_send() returned %d (%m)", err);
    }

    uct_rc_verbs_ep_posted(ep, signal);
}
/* Post a work request on the endpoint's send queue (plain verbs variant).
 *
 * TX moderation may turn an unsignaled request into a signaled one; a
 * failing ibv_post_send() is fatal.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_post_send(uct_rc_verbs_iface_t* iface, uct_rc_verbs_ep_t* ep,
                          struct ibv_send_wr *wr, int send_flags)
{
    struct ibv_send_wr *bad_wr;
    int err;

    /* Only consult moderation when the caller did not request a completion */
    if (!(send_flags & IBV_SEND_SIGNALED)) {
        send_flags |= uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                                 IBV_SEND_SIGNALED);
    }

    wr->send_flags = send_flags;
    wr->wr_id      = ep->super.unsignaled;

    uct_ib_log_post_send(ep->super.qp, wr,
                         (wr->opcode == IBV_WR_SEND) ? uct_rc_ep_am_packet_dump :
                         NULL);

    err = ibv_post_send(ep->super.qp, wr, &bad_wr);
    if (err != 0) {
        ucs_fatal("ibv_post_send() returned %d (%m)", err);
    }

    uct_rc_verbs_ep_posted(ep, send_flags & IBV_SEND_SIGNALED);
}
/*
 * Generic data-pointer posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+-----+-------+--------+-------+
 * SEND       | CTRL   | INL | am_id | am_hdr | DPSEG |
 *            +--------+-----+---+---+----+----+------+
 * RDMA_WRITE | CTRL   | RADDR   | DPSEG  |
 *            +--------+---------+--------+-------+
 * ATOMIC     | CTRL   | RADDR   | ATOMIC | DPSEG |
 *            +--------+---------+--------+-------+
 *
 * Builds the WQE segments in-place at ep->tx.seg according to 'opcode_flags'
 * and posts them via uct_rc_mlx5_post_send(). Returns UCS_OK on success or
 * UCS_ERR_INVALID_PARAM for an unknown opcode; the UCT_CHECK_LENGTH macros
 * may also return early with an error status.
 *
 * 'signal' must be either 0 (moderation decides) or MLX5_WQE_CTRL_CQ_UPDATE.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_dptr_post(uct_rc_mlx5_ep_t *ep, unsigned opcode_flags,
                         const void *buffer, unsigned length, uint32_t *lkey_p,
                         /* SEND */ uint8_t am_id, const void *am_hdr,
                         unsigned am_hdr_len,
                         /* RDMA/ATOMIC */ uint64_t remote_addr, uct_rkey_t rkey,
                         /* ATOMIC */ uint64_t compare_mask, uint64_t compare,
                         uint64_t swap_add, int signal)
{
    struct mlx5_wqe_ctrl_seg                     *ctrl;
    struct mlx5_wqe_raddr_seg                    *raddr;
    struct mlx5_wqe_atomic_seg                   *atomic;
    struct mlx5_wqe_data_seg                     *dptr;
    struct mlx5_wqe_inl_data_seg                 *inl;
    struct uct_ib_mlx5_atomic_masked_cswap32_seg *masked_cswap32;
    struct uct_ib_mlx5_atomic_masked_fadd32_seg  *masked_fadd32;
    struct uct_ib_mlx5_atomic_masked_cswap64_seg *masked_cswap64;
    uct_rc_mlx5_iface_t *iface;
    uct_rc_hdr_t        *rch;
    unsigned wqe_size, inl_seg_size;
    uint8_t opmod;

    iface = ucs_derived_of(ep->super.super.super.iface, uct_rc_mlx5_iface_t);

    /* signal is either explicit CQ_UPDATE or left to the moderation policy */
    if (!signal) {
        signal = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                            MLX5_WQE_CTRL_CQ_UPDATE);
    } else {
        ucs_assert(signal == MLX5_WQE_CTRL_CQ_UPDATE);
    }

    opmod = 0;
    ctrl  = ep->tx.seg; /* control segment is always first in the WQE */
    switch (opcode_flags) {
    case MLX5_OPCODE_SEND:
        UCT_CHECK_LENGTH(length + sizeof(*rch) + am_hdr_len,
                         iface->super.super.config.seg_size, "am_zcopy payload");
        /* inline segment must be rounded up to a whole number of WQE segments */
        inl_seg_size = ucs_align_up_pow2(sizeof(*inl) + sizeof(*rch) + am_hdr_len,
                                         UCT_IB_MLX5_WQE_SEG_SIZE);
        UCT_CHECK_LENGTH(sizeof(*ctrl) + inl_seg_size + sizeof(*dptr),
                         UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB, "am_zcopy header");

        /* Inline segment with AM ID and header */
        inl             = (void*)(ctrl + 1);
        inl->byte_count = htonl((sizeof(*rch) + am_hdr_len) | MLX5_INLINE_SEG);
        rch             = (void*)(inl + 1);
        rch->am_id      = am_id;
        uct_rc_mlx5_inline_copy(rch + 1, am_hdr, am_hdr_len, ep);

        /* Data segment with payload */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + inl_seg_size;
        } else {
            wqe_size = sizeof(*ctrl) + inl_seg_size + sizeof(*dptr);
            dptr     = (void*)(ctrl + 1) + inl_seg_size;
            /* wrap the data segment pointer around the end of the SQ buffer */
            if (ucs_unlikely((void*)dptr >= ep->tx.qend)) {
                dptr = (void*)dptr - (ep->tx.qend - ep->tx.qstart);
            }
            ucs_assert((void*)dptr >= ep->tx.qstart);
            ucs_assert((void*)(dptr + 1) <= ep->tx.qend);
            uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        }
        break;

    case MLX5_OPCODE_SEND|UCT_RC_MLX5_OPCODE_FLAG_RAW:
        /* Data segment only */
        UCT_CHECK_LENGTH(length, iface->super.super.config.seg_size, "send");
        ucs_assert(length < (2ul << 30)); /* byte_count field width limit */
        wqe_size = sizeof(*ctrl) + sizeof(*dptr);
        uct_rc_mlx5_ep_set_dptr_seg((void*)(ctrl + 1), buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_RDMA_READ:
    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        UCT_CHECK_LENGTH(length, UCT_IB_MAX_MESSAGE_SIZE, "put/get");

        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* Data segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*dptr);
            uct_rc_mlx5_ep_set_dptr_seg((void*)(raddr + 1), buffer, length,
                                        *lkey_p);
        }
        break;

    case MLX5_OPCODE_ATOMIC_FA:
    case MLX5_OPCODE_ATOMIC_CS:
        ucs_assert(length == sizeof(uint64_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        atomic = (void*)(raddr + 1);
        /* 'compare' is only meaningful for compare-and-swap */
        if (opcode_flags == MLX5_OPCODE_ATOMIC_CS) {
            atomic->compare = compare;
        }
        atomic->swap_add = swap_add;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(atomic + 1), buffer, length, *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*atomic) +
                   sizeof(*dptr);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_CS:
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        /* opmod encodes the extended-atomic operand size as log2(bytes) */
        switch (length) {
        case sizeof(uint32_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
            masked_cswap32               = (void*)(raddr + 1);
            masked_cswap32->swap         = swap_add;
            masked_cswap32->compare      = compare;
            masked_cswap32->swap_mask    = (uint32_t)-1; /* swap all bits */
            masked_cswap32->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap32 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap32) + sizeof(*dptr);
            break;
        case sizeof(uint64_t):
            opmod                        = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(3); /* Ext. atomic, size 2**3 */
            masked_cswap64               = (void*)(raddr + 1);
            masked_cswap64->swap         = swap_add;
            masked_cswap64->compare      = compare;
            masked_cswap64->swap_mask    = (uint64_t)-1; /* swap all bits */
            masked_cswap64->compare_mask = compare_mask;
            dptr                         = (void*)(masked_cswap64 + 1);
            wqe_size                     = sizeof(*ctrl) + sizeof(*raddr) +
                                           sizeof(*masked_cswap64) + sizeof(*dptr);

            /* Handle QP wrap-around. It cannot happen in the middle of
             * masked-cswap segment, because it's still in the first BB.
             */
            ucs_assert((void*)dptr <= ep->tx.qend);
            if (dptr == ep->tx.qend) {
                dptr = ep->tx.qstart;
            } else {
                ucs_assert((void*)masked_cswap64 < ep->tx.qend);
            }
            break;
        default:
            ucs_assert(0); /* unsupported operand size */
        }
        uct_rc_mlx5_ep_set_dptr_seg(dptr, buffer, length, *lkey_p);
        break;

    case MLX5_OPCODE_ATOMIC_MASKED_FA:
        ucs_assert(length == sizeof(uint32_t));
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, remote_addr, rkey);

        opmod                         = UCT_IB_MLX5_OPMOD_EXT_ATOMIC(2);
        masked_fadd32                 = (void*)(raddr + 1);
        masked_fadd32->add            = swap_add;
        masked_fadd32->filed_boundary = 0;

        uct_rc_mlx5_ep_set_dptr_seg((void*)(masked_fadd32 + 1), buffer, length,
                                    *lkey_p);
        wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*masked_fadd32) +
                   sizeof(*dptr);
        break;

    default:
        return UCS_ERR_INVALID_PARAM;
    }

    /* strip internal flag bits before handing the opcode to the HW */
    uct_rc_mlx5_post_send(ep, ctrl, (opcode_flags & UCT_RC_MLX5_OPCODE_MASK),
                          opmod, signal, wqe_size);
    return UCS_OK;
}
/*
 * Generic inline posting function.
 * Parameters which are not relevant to the opcode are ignored.
 *
 *            +--------+-----+-------+--------+------------
 * SEND       | CTRL   | INL | am_id | am_hdr | payload ...
 *            +--------+-----+---+---+-+-------+-----------
 * RDMA_WRITE | CTRL   | RADDR   | INL | payload ...
 *            +--------+---------+-----+-------------------
 *
 * Builds a WQE whose payload is copied inline into the send queue (no memory
 * key needed) and posts it. Returns UCS_OK, UCS_ERR_INVALID_PARAM for an
 * unknown opcode, or an error from the resource/length check macros.
 */
static UCS_F_ALWAYS_INLINE ucs_status_t
uct_rc_mlx5_ep_inline_post(uct_rc_mlx5_ep_t *ep, unsigned opcode,
                           const void *buffer, unsigned length,
                           /* SEND */ uint8_t am_id, uint64_t am_hdr,
                           /* RDMA */ uint64_t rdma_raddr, uct_rkey_t rdma_rkey)
{
    uct_rc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.super.iface,
                                                uct_rc_mlx5_iface_t);
    struct mlx5_wqe_ctrl_seg *ctrl;
    struct mlx5_wqe_raddr_seg *raddr;
    struct mlx5_wqe_inl_data_seg *inl;
    uct_rc_am_short_hdr_t *am;
    unsigned wqe_size;
    unsigned sig_flag;

    ctrl = ep->tx.seg; /* control segment is always first in the WQE */
    UCT_RC_MLX5_CHECK_RES(iface, ep);

    switch (opcode) {
    case MLX5_OPCODE_SEND:
        /* Set inline segment which has AM id, AM header, and AM payload */
        wqe_size = sizeof(*ctrl) + sizeof(*inl) + sizeof(*am) + length;
        UCT_CHECK_LENGTH(wqe_size, UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB,
                         "am_short");
        inl = (void*)(ctrl + 1);
        inl->byte_count = htonl((length + sizeof(*am)) | MLX5_INLINE_SEG);
        am              = (void*)(inl + 1);
        am->rc_hdr.am_id = am_id;
        am->am_hdr       = am_hdr;
        /* inline_copy handles wrap-around of the SQ buffer */
        uct_rc_mlx5_inline_copy(am + 1, buffer, length, ep);
        /* short AM completion may be moderated */
        sig_flag = uct_rc_iface_tx_moderation(&iface->super, &ep->super,
                                              MLX5_WQE_CTRL_CQ_UPDATE);
        break;

    case MLX5_OPCODE_RDMA_WRITE:
        /* Set RDMA segment */
        if (length == 0) {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr);
        } else {
            wqe_size = sizeof(*ctrl) + sizeof(*raddr) + sizeof(*inl) + length;
        }
        UCT_CHECK_LENGTH(wqe_size, UCT_RC_MLX5_MAX_BB * MLX5_SEND_WQE_BB,
                         "put_short");
        raddr = (void*)(ctrl + 1);
        uct_rc_mlx5_ep_set_rdma_seg(raddr, rdma_raddr, rdma_rkey);
        inl = (void*)(raddr + 1);
        inl->byte_count = htonl(length | MLX5_INLINE_SEG);
        uct_rc_mlx5_inline_copy(inl + 1, buffer, length, ep);
        /* put_short is always signaled */
        sig_flag = MLX5_WQE_CTRL_CQ_UPDATE;
        break;

    case MLX5_OPCODE_NOP:
        /* Empty inline segment */
        wqe_size = sizeof(*ctrl);
        inl = (void*)(ctrl + 1);
        inl->byte_count = htonl(MLX5_INLINE_SEG);
        /* NOP is signaled and fenced (used to flush outstanding work) */
        sig_flag = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_WQE_CTRL_FENCE;
        break;

    default:
        return UCS_ERR_INVALID_PARAM;
    }

    uct_rc_mlx5_post_send(ep, ctrl, opcode, 0, sig_flag, wqe_size);
    return UCS_OK;
}