/* Poll the DC mlx5 TX completion queue and process at most one send
 * completion: credit the owning DCI's QP/WQ, release the DCI, and dispatch
 * any pending operations that the freed resources now allow.
 *
 * iface - DC mlx5 interface whose shared TX CQ is polled.
 */
static UCS_F_ALWAYS_INLINE void uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface) {
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super.super, &iface->mlx5_common.tx.cq);
    if (cqe == NULL) {
        /* No completion available - nothing to do */
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats, UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    /* Ensure the CQE ownership check in poll_cq is ordered before reading
     * the CQE payload fields below */
    ucs_memory_cpu_load_fence();

    /* Map the completed QP number back to the DCI that owns it */
    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp = &iface->super.tx.dcis[dci].txqp;
    txwq = &iface->dci_wqs[dci];
    hw_ci = ntohs(cqe->wqe_counter);

    ucs_trace_poll("dc_mlx5 iface %p tx_cqe: dci[%d] qpn 0x%x txqp %p hw_ci %d",
                   iface, dci, qp_num, txqp, hw_ci);

    /* Return the building blocks consumed up to hw_ci to the QP's
     * available-send budget */
    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_dc_iface_dci_put(&iface->super, dci);
    uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci);

    iface->super.super.tx.cq_available++;

    /* A DCI and a CQ slot were just freed - give waiters a chance to run.
     * First endpoints waiting for a DCI allocation, then endpoints waiting
     * for TX resources on a DCI they already hold. */
    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(uct_dc_iface_dci_waitq(&iface->super), 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(uct_dc_iface_tx_waitq(&iface->super), 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
/* Progress up to max_events completed CUDA events from the head of
 * event_queue. Events are checked in FIFO order and processing stops at the
 * first event that has not completed yet, preserving completion ordering.
 *
 * event_queue - queue of uct_cuda_copy_event_desc_t awaiting completion.
 * max_events  - upper bound on the number of events to reap in this call.
 *
 * Returns the number of events that were completed and released.
 *
 * Fix: the function is declared to return 'unsigned' but the original body
 * ended after the loop with no return statement (falling off the end of a
 * non-void function is undefined behavior when the result is used).
 */
static UCS_F_ALWAYS_INLINE unsigned
uct_cuda_copy_progress_event_queue(ucs_queue_head_t *event_queue, unsigned max_events)
{
    unsigned count = 0;
    cudaError_t result = cudaSuccess;
    uct_cuda_copy_event_desc_t *cuda_event;
    ucs_queue_iter_t iter;

    ucs_queue_for_each_safe(cuda_event, iter, event_queue, queue) {
        result = cudaEventQuery(cuda_event->event);
        if (cudaSuccess != result) {
            /* Head event still in flight (or query error) - later events
             * cannot be reaped ahead of it, so stop here */
            break;
        }
        ucs_queue_del_iter(event_queue, iter);
        if (cuda_event->comp != NULL) {
            uct_invoke_completion(cuda_event->comp, UCS_OK);
        }
        ucs_trace_poll("CUDA Event Done :%p", cuda_event);
        ucs_mpool_put(cuda_event);
        count++;
        if (count >= max_events) {
            break;
        }
    }

    return count;
}
/* A common mm active message sending function.
 * The first parameter indicates the origin of the call.
 * is_short = 1 - perform AM short sending
 * is_short = 0 - perform AM bcopy sending
 *
 * On success returns UCS_OK for AM short, or the packed length for AM bcopy.
 * Returns UCS_ERR_NO_RESOURCE when the remote FIFO has no room (caller is
 * expected to queue the operation as pending and retry later).
 */
static UCS_F_ALWAYS_INLINE ssize_t
uct_mm_ep_am_common_send(const unsigned is_short, uct_mm_ep_t *ep, uct_mm_iface_t *iface,
                         uint8_t am_id, size_t length, uint64_t header,
                         const void *payload, uct_pack_callback_t pack_cb, void *arg)
{
    uct_mm_fifo_element_t *elem;
    ucs_status_t status;
    void *base_address;
    uint64_t head;

    UCT_CHECK_AM_ID(am_id);

    head = ep->fifo_ctl->head;
    /* check if there is room in the remote process's receive FIFO to write */
    if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
        if (!ucs_arbiter_group_is_empty(&ep->arb_group)) {
            /* pending isn't empty. don't send now to prevent out-of-order sending */
            UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
            return UCS_ERR_NO_RESOURCE;
        } else {
            /* pending is empty */
            /* update the local copy of the tail to its actual value on the remote peer */
            uct_mm_ep_update_cached_tail(ep);
            if (!UCT_MM_EP_IS_ABLE_TO_SEND(head, ep->cached_tail, iface->config.fifo_size)) {
                UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
                return UCS_ERR_NO_RESOURCE;
            }
        }
    }

    /* Claim the FIFO element at 'head' (advances the shared head pointer) */
    status = uct_mm_ep_get_remote_elem(ep, head, &elem);
    if (status != UCS_OK) {
        ucs_trace_poll("couldn't get an available FIFO element");
        UCS_STATS_UPDATE_COUNTER(ep->super.stats, UCT_EP_STAT_NO_RES, 1);
        return status;
    }

    if (is_short) {
        /* AM_SHORT */
        /* write to the remote FIFO: 8-byte header followed by the inline payload */
        *(uint64_t*) (elem + 1) = header;
        memcpy((void*) (elem + 1) + sizeof(header), payload, length);

        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length + sizeof(header);

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           elem + 1, length + sizeof(header), "TX: AM_SHORT");
        UCT_TL_EP_STAT_OP(&ep->super, AM, SHORT, sizeof(header) + length);
    } else {
        /* AM_BCOPY */
        /* write to the remote descriptor */
        /* get the base_address: local ptr to remote memory chunk after attaching to it */
        base_address = uct_mm_ep_attach_remote_seg(ep, iface, elem);
        length = pack_cb(base_address + elem->desc_offset, arg);

        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_INLINE;
        elem->length = length;

        uct_iface_trace_am(&iface->super, UCT_AM_TRACE_TYPE_SEND, am_id,
                           base_address + elem->desc_offset, length, "TX: AM_BCOPY");
        UCT_TL_EP_STAT_OP(&ep->super, AM, BCOPY, length);
    }

    elem->am_id = am_id;

    /* memory barrier - make sure that the memory is flushed before setting the
     * 'writing is complete' flag which the reader checks */
    ucs_memory_cpu_store_fence();

    /* change the owner bit to indicate that the writing is complete.
     * the owner bit flips after every FIFO wraparound */
    if (head & iface->config.fifo_size) {
        elem->flags |= UCT_MM_FIFO_ELEM_FLAG_OWNER;
    } else {
        elem->flags &= ~UCT_MM_FIFO_ELEM_FLAG_OWNER;
    }

    if (is_short) {
        return UCS_OK;
    } else {
        /* the packed length is the bcopy return value */
        return length;
    }
}