/*
 * Submit a fully-built send WQE (whose control segment is @ctrl) to the HW on
 * an RC QP: fill the control segment, update the doorbell record, and ring the
 * doorbell by copying the WQE into the BlueFlame (BF) register.
 *
 * @param ep        Endpoint owning the send queue.
 * @param ctrl      Control segment of the WQE to post (must be ep->tx.seg).
 * @param opcode    mlx5 send opcode (e.g. MLX5_OPCODE_SEND).
 * @param opmod     Opcode modifier.
 * @param sig_flag  Control-segment flags; MLX5_WQE_CTRL_CQ_UPDATE requests a
 *                  completion.
 * @param wqe_size  Total WQE size in bytes.
 */
static UCS_F_ALWAYS_INLINE void uct_rc_mlx5_post_send(uct_rc_mlx5_ep_t *ep, struct mlx5_wqe_ctrl_seg *ctrl,
                                                      uint8_t opcode, uint8_t opmod, unsigned sig_flag,
                                                      unsigned wqe_size)
{
    unsigned n, num_seg, num_bb;
    void *src, *dst;
    uint16_t sw_pi;

    /* WQE size expressed in 16-byte data segments and in 64-byte basic
     * blocks (BBs); the producer index advances by BBs. */
    num_seg = ucs_div_round_up(wqe_size, UCT_IB_MLX5_WQE_SEG_SIZE);
    num_bb  = ucs_div_round_up(wqe_size, MLX5_SEND_WQE_BB);
    sw_pi   = ep->tx.sw_pi;

    uct_rc_mlx5_set_ctrl_seg(ctrl, sw_pi, opcode, opmod, ep->qp_num, sig_flag, num_seg);
    uct_ib_mlx5_log_tx(IBV_QPT_RC, ctrl, ep->tx.qstart, ep->tx.qend,
                       (opcode == MLX5_OPCODE_SEND) ? uct_rc_ep_am_packet_dump : NULL);

    /* TODO Put memory store fence here too, to prevent WC being flushed after DBrec */
    ucs_memory_cpu_store_fence();

    /* Write doorbell record */
    ep->tx.prev_sw_pi = sw_pi;
    *ep->tx.dbrec = htonl(sw_pi += num_bb);

    /* Make sure that doorbell record is written before ringing the doorbell */
    ucs_memory_bus_store_fence();

    /* Set up copy pointers */
    dst = ep->tx.bf_reg;
    src = ctrl;

    /* BF copy: write the WQE, one 64-byte BB at a time, into the BF register */
    /* TODO support DB without BF */
    ucs_assert(wqe_size <= ep->tx.bf_size);
    ucs_assert(num_bb <= UCT_RC_MLX5_MAX_BB);
    for (n = 0; n < num_bb; ++n) {
        uct_rc_mlx5_bf_copy_bb(dst, src);
        dst += MLX5_SEND_WQE_BB;
        src += MLX5_SEND_WQE_BB;
        if (ucs_unlikely(src == ep->tx.qend)) {
            /* the send queue is a ring buffer - wrap the source pointer */
            src = ep->tx.qstart;
        }
    }

    /* We don't want the compiler to reorder instructions and hurt latency */
    ucs_compiler_fence();

    /* Advance queue pointer: src now points at the segment past the WQE */
    ucs_assert(ctrl == ep->tx.seg);
    ep->tx.seg   = src;
    ep->tx.sw_pi = sw_pi;

    /* Flip BF register: alternate between the two BF halves so back-to-back
     * posts do not target the same register */
    ep->tx.bf_reg = (void*) ((uintptr_t) ep->tx.bf_reg ^ ep->tx.bf_size);

    uct_rc_ep_tx_posted(&ep->super, sig_flag & MLX5_WQE_CTRL_CQ_UPDATE);
}
/*
 * Flush the shared-memory interface.
 *
 * Completion callbacks are not supported on this transport, so a non-NULL
 * @comp is rejected. Otherwise a CPU store fence makes all previously
 * written data visible to peers, which is sufficient for shared memory.
 *
 * @return UCS_OK on success, UCS_ERR_UNSUPPORTED if @comp is given.
 */
ucs_status_t uct_mm_iface_flush(uct_iface_h tl_iface, unsigned flags,
                                uct_completion_t *comp)
{
    uct_base_iface_t *base_iface;

    if (comp) {
        return UCS_ERR_UNSUPPORTED;
    }

    /* Ensure all prior stores to shared memory are globally visible */
    ucs_memory_cpu_store_fence();

    base_iface = ucs_derived_of(tl_iface, uct_base_iface_t);
    UCT_TL_IFACE_STAT_FLUSH(base_iface);
    return UCS_OK;
}
/*
 * Flush the shared-memory endpoint.
 *
 * Refreshes the locally cached remote tail first; if the endpoint still has
 * no send resources, the flush cannot complete now and UCS_ERR_NO_RESOURCE
 * is returned. Otherwise a CPU store fence publishes all prior writes.
 *
 * @return UCS_OK on success, UCS_ERR_NO_RESOURCE if sends are outstanding.
 */
ucs_status_t uct_mm_ep_flush(uct_ep_h tl_ep, unsigned flags,
                             uct_completion_t *comp)
{
    uct_mm_ep_t *mm_ep = ucs_derived_of(tl_ep, uct_mm_ep_t);

    /* Pick up the latest tail value written by the remote peer */
    uct_mm_ep_update_cached_tail(mm_ep);

    if (!uct_mm_ep_has_tx_resources(mm_ep)) {
        return UCS_ERR_NO_RESOURCE;
    }

    /* Ensure all prior stores to shared memory are globally visible */
    ucs_memory_cpu_store_fence();

    UCT_TL_EP_STAT_FLUSH(&mm_ep->super);
    return UCS_OK;
}
/*
 * Mark the endpoint as connected to its remote peer and cache the selected
 * transport's size limits on the endpoint for fast-path access.
 *
 * The local side must already be connected (asserted below); afterwards the
 * REMOTE_CONNECTED bit is set, published to other threads by a store fence.
 */
static void ucp_ep_remote_connected(ucp_ep_h ep)
{
    ucp_worker_h      worker = ep->worker;
    uct_iface_attr_t *attrs  = &worker->iface_attrs[ep->uct.rsc_index];

    ucs_debug("connected 0x%"PRIx64"->0x%"PRIx64, worker->uuid, ep->dest_uuid);

    /* Cache per-transport message size limits on the endpoint.
     * AM short capacity is reduced by the 8-byte header carried with it. */
    ep->config.max_short_tag = attrs->cap.am.max_short - sizeof(uint64_t);
    ep->config.max_short_put = attrs->cap.put.max_short;
    ep->config.max_bcopy_put = attrs->cap.put.max_bcopy;
    ep->config.max_bcopy_get = attrs->cap.get.max_bcopy;

    /* Synchronize with other threads */
    ucs_memory_cpu_store_fence();

    ucs_assert_always(ep->state & UCP_EP_STATE_LOCAL_CONNECTED);
    ep->state |= UCP_EP_STATE_REMOTE_CONNECTED;
}
/*
 * Replenish the receive queue with up to @max receive WQEs.
 *
 * Walks the free list of SRQ WQEs starting at rx.head, attaches a freshly
 * allocated receive descriptor to each, and finally publishes the new
 * producer index to the hardware doorbell record.
 *
 * @param iface  RC mlx5 interface whose receive queue is refilled.
 * @param max    Maximum number of WQEs to post.
 * @return       Number of WQEs actually posted (may be less than @max if the
 *               descriptor memory pool runs out).
 */
static unsigned uct_rc_mlx5_iface_post_recv(uct_rc_mlx5_iface_t *iface, unsigned max)
{
    struct mlx5_wqe_srq_next_seg *seg;
    uct_rc_mlx5_recv_desc_t *desc;
    unsigned count, head;
    uct_rc_hdr_t *hdr;
    unsigned length;

    head   = iface->rx.head;
    length = iface->super.super.config.seg_size;
    count  = 0;
    while (count < max) {
        ucs_assert(head != iface->rx.tail);

        /* Take a descriptor from the memory pool; stop early if it is empty */
        UCT_TL_IFACE_GET_RX_DESC(&iface->super.super.super, iface->super.rx.mp,
                                 desc, break);

        /* Point the WQE's data segment at the descriptor's receive buffer */
        seg = uct_rc_mlx5_iface_get_srq_wqe(iface, head);
        hdr = uct_ib_iface_recv_desc_hdr(&iface->super.super, &desc->super);
        uct_ib_mlx5_set_data_seg((void*)(seg + 1), hdr, length, /* TODO pre-init length */
                                 desc->super.lkey);

        /* Buffer is owned by HW until completion - catch premature reads */
        VALGRIND_MAKE_MEM_NOACCESS(hdr, length);

        ucs_queue_push(&iface->rx.desc_q, &desc->queue);
        head = uct_rc_mlx5_srq_next_wqe_ind(seg);
        ++count;
    }

    if (count > 0) {
        iface->rx.head             = head;
        iface->rx.sw_pi           += count;
        iface->super.rx.available -= count;

        /* Make the WQE writes visible before updating the doorbell record */
        ucs_memory_cpu_store_fence();
        *iface->rx.db = htonl(iface->rx.sw_pi);
    }
    return count;
}
/* A common mm active message sending function.
 * The first parameter indicates the origin of the call.
 * is_short = 1 - perform AM short sending
 * is_short = 0 - perform AM bcopy sending
 *
 * Writes one element into the remote peer's receive FIFO. For AM short the
 * header and payload are written inline into the element; for AM bcopy the
 * payload is packed by @pack_cb into the element's attached descriptor.
 *
 * Returns UCS_OK (short) or the packed length (bcopy) on success, or a
 * negative ucs_status_t (UCS_ERR_NO_RESOURCE when the FIFO is full) -
 * error statuses are negative, so they fit the ssize_t return type.
 */
static UCS_F_ALWAYS_INLINE ssize_t
uct_mm_ep_am_common_send(const unsigned is_short, uct_mm_ep_t *ep, uct_mm_iface_t *iface,
                         uint8_t am_id, size_t length, uint64_t header,
                         const void *payload, uct_pack_callback_t pack_cb, void *arg)
{
    uct_mm_fifo_element_t *elem;
    ucs_status_t status;
    void *base_address;
    uint64_t head;

    UCT_CHECK_AM_ID(am_id);

    head = ep->fifo_ctl->head;
    /* check if there is room in the remote process's receive FIFO to write */
    if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
        if (!ucs_arbiter_group_is_empty(&ep->arb_group)) {
            /* pending isn't empty. don't send now to prevent out-of-order sending */
            UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
            return UCS_ERR_NO_RESOURCE;
        } else {
            /* pending is empty */
            /* update the local copy of the tail to its actual value on the remote peer */
            uct_mm_ep_update_cached_tail(ep);
            if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
                UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
                return UCS_ERR_NO_RESOURCE;
            }
        }
    }

    status = uct_mm_ep_get_remote_elem(ep, head, &elem);
    if (status != UCS_OK) {
        ucs_trace_poll("couldn't get an available FIFO element");
        UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
        return status;
    }

    if (is_short) {
        /* AM_SHORT */
        /* write to the remote FIFO: 8-byte header followed by the payload,
         * both inline right after the element header */
        *(uint64_t*) (elem + 1) = header;
        memcpy((void*) (elem + 1) + sizeof(header), payload, length);

        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length + sizeof(header);

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           elem + 1, length + sizeof(header), "TX: AM_SHORT");
        UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, sizeof(header) + length);
    } else {
        /* AM_BCOPY */
        /* write to the remote descriptor */
        /* get the base_address: local ptr to remote memory chunk after attaching to it */
        base_address = uct_mm_ep_attach_remote_seg(ep, iface, elem);
        length = pack_cb(base_address + elem->desc_offset, arg);

        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length;

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           base_address + elem->desc_offset, length, "TX: AM_BCOPY");
        UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, length);
    }

    elem->am_id = am_id;

    /* memory barrier - make sure that the memory is flushed before setting the
     * 'writing is complete' flag which the reader checks */
    ucs_memory_cpu_store_fence();

    /* change the owner bit to indicate that the writing is complete.
     * the owner bit flips after every FIFO wraparound */
    if (head & iface->config.fifo_size) {
        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_OWNER;
    } else {
        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_OWNER;
    }

    if (is_short) {
        return UCS_OK;
    } else {
        /* bcopy returns the number of bytes actually packed */
        return length;
    }
}