/**
 * Account for one send operation that was just posted on a verbs RC endpoint.
 *
 * @param ep        Endpoint whose TX counters are updated.
 * @param signaled  Forwarded unchanged to uct_rc_ep_tx_posted().
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_verbs_ep_posted(uct_rc_verbs_ep_t *ep, int signaled)
{
    /* Notify the parent-level bookkeeping kept in ep->super first */
    uct_rc_ep_tx_posted(&ep->super, signaled);

    /* One fewer TX slot available, one more post issued */
    ep->tx.available  -= 1;
    ep->tx.post_count += 1;
}
/**
 * Complete the WQE that starts at @a ctrl and post it on the endpoint's
 * send queue.
 *
 * The function performs, in this order:
 *  1. Fills the control segment via uct_rc_mlx5_set_ctrl_seg() using the
 *     current software producer index (ep->tx.sw_pi).
 *  2. Writes the advanced producer index to the doorbell record
 *     (*ep->tx.dbrec), converted with htonl().
 *  3. Copies the WQE, one MLX5_SEND_WQE_BB-sized block at a time, to
 *     ep->tx.bf_reg (the "BF copy").
 *  4. Updates ep->tx.seg, ep->tx.sw_pi and ep->tx.bf_reg, then calls
 *     uct_rc_ep_tx_posted().
 *
 * The fences between these steps are order-sensitive; do not reorder
 * statements in this function.
 *
 * @param ep        Endpoint to post on.
 * @param ctrl      First segment of the WQE; asserted to equal ep->tx.seg.
 * @param opcode    Passed to the control segment. MLX5_OPCODE_SEND also
 *                  selects uct_rc_ep_am_packet_dump as the logging callback.
 * @param opmod     Passed to the control segment.
 * @param sig_flag  Passed to the control segment; its
 *                  MLX5_WQE_CTRL_CQ_UPDATE bit is forwarded to
 *                  uct_rc_ep_tx_posted().
 * @param wqe_size  Size of the WQE, in the same units as
 *                  UCT_IB_MLX5_WQE_SEG_SIZE, MLX5_SEND_WQE_BB and
 *                  ep->tx.bf_size (presumably bytes - TODO confirm).
 *                  Asserted not to exceed ep->tx.bf_size.
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_post_send(uct_rc_mlx5_ep_t *ep, struct mlx5_wqe_ctrl_seg *ctrl,
                      uint8_t opcode, uint8_t opmod, unsigned sig_flag,
                      unsigned wqe_size)
{
    unsigned n, num_seg, num_bb;
    void *src, *dst;
    uint16_t sw_pi;

    /* The same size is rounded up twice: once to segments (for the control
     * segment) and once to basic blocks (for the producer index and copy). */
    num_seg = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE);
    num_bb = ucs_div_round_up(wqe_size, MLX5_SEND_WQE_BB);

    sw_pi = ep->tx.sw_pi;
    uct_rc_mlx5_set_ctrl_seg(ctrl, sw_pi, opcode, opmod, ep->qp_num, sig_flag, num_seg);

    /* The packet dump callback is only supplied for MLX5_OPCODE_SEND */
    uct_ib_mlx5_log_tx(IBV_QPT_RC, ctrl, ep->tx.qstart, ep->tx.qend,
                       (opcode == MLX5_OPCODE_SEND) ? uct_rc_ep_am_packet_dump : NULL);

    /* TODO Put memory store fence here too, to prevent WC being flushed after DBrec */
    ucs_memory_cpu_store_fence();

    /* Write doorbell record */
    /* The pre-increment index is remembered in prev_sw_pi; sw_pi itself is
     * advanced by num_bb as a side effect of the htonl() argument. */
    ep->tx.prev_sw_pi = sw_pi;
    *ep->tx.dbrec = htonl(sw_pi += num_bb);

    /* Make sure that doorbell record is written before ringing the doorbell */
    ucs_memory_bus_store_fence();

    /* Set up copy pointers */
    dst = ep->tx.bf_reg;
    src = ctrl;

    /* BF copy */
    /* TODO support DB without BF */
    ucs_assert(wqe_size <= ep->tx.bf_size);
    ucs_assert(num_bb <= UCT_RC_MLX5_MAX_BB);
    for (n = 0; n < num_bb; ++n) {
        uct_rc_mlx5_bf_copy_bb(dst, src);
        /* NOTE(review): dst and src are void *, so these increments rely on
         * the compiler allowing arithmetic on void pointers. */
        dst += MLX5_SEND_WQE_BB;
        src += MLX5_SEND_WQE_BB;
        /* Wrap the source pointer back to the start of the queue */
        if (ucs_unlikely(src == ep->tx.qend)) {
            src = ep->tx.qstart;
        }
    }

    /* We don't want the compiler to reorder instructions and hurt latency */
    ucs_compiler_fence();

    /* Advance queue pointer */
    /* After the loop, src points just past the copied WQE (wrapped if
     * needed), which becomes the next segment to build. */
    ucs_assert(ctrl == ep->tx.seg);
    ep->tx.seg = src;
    ep->tx.sw_pi = sw_pi;

    /* Flip BF register */
    /* XOR with bf_size toggles the register address between two values */
    ep->tx.bf_reg = (void*) ((uintptr_t) ep->tx.bf_reg ^ ep->tx.bf_size);

    /* Only the CQ-update bit of sig_flag is reported as "signaled" */
    uct_rc_ep_tx_posted(&ep->super, sig_flag & MLX5_WQE_CTRL_CQ_UPDATE);
}
/**
 * Fill in the control segment of the WQE at @a ctrl, post it on the
 * endpoint's work queue and update the endpoint's send accounting.
 *
 * @param ep        Endpoint to post on; the WQE goes to ep->tx.wq.
 * @param ctrl      Control segment at the start of the WQE.
 * @param opcode    Passed to the control segment. MLX5_OPCODE_SEND also
 *                  selects uct_rc_ep_am_packet_dump as the logging callback.
 * @param opmod     Passed to the control segment.
 * @param sig_flag  Passed to the control segment; its
 *                  MLX5_WQE_CTRL_CQ_UPDATE bit is forwarded to
 *                  uct_rc_ep_tx_posted().
 * @param wqe_size  WQE size, handed to both uct_ib_mlx5_set_ctrl_seg() and
 *                  uct_ib_mlx5_post_send().
 */
static UCS_F_ALWAYS_INLINE void
uct_rc_mlx5_post_send(uct_rc_mlx5_ep_t *ep, struct mlx5_wqe_ctrl_seg *ctrl,
                      uint8_t opcode, uint8_t opmod, unsigned sig_flag,
                      unsigned wqe_size)
{
    /* Only the CQ-update bit is reported as "signaled" further down */
    const unsigned signaled = sig_flag & MLX5_WQE_CTRL_CQ_UPDATE;
    uint16_t num_posted;

    uct_ib_mlx5_set_ctrl_seg(ctrl, ep->tx.wq.sw_pi, opcode, opmod,
                             ep->qp_num, sig_flag, wqe_size);

    /* The packet dump callback is only supplied for MLX5_OPCODE_SEND */
    uct_ib_mlx5_log_tx(ucs_derived_of(ep->super.super.super.iface,
                                      uct_ib_iface_t),
                       IBV_QPT_RC, ctrl, ep->tx.wq.qstart, ep->tx.wq.qend,
                       (opcode == MLX5_OPCODE_SEND) ? uct_rc_ep_am_packet_dump
                                                    : NULL);

    /* Whatever the work queue reports as posted is taken off the budget */
    num_posted = uct_ib_mlx5_post_send(&ep->tx.wq, ctrl, wqe_size);
    ep->super.available -= num_posted;

    uct_rc_ep_tx_posted(&ep->super, signaled);
}