/* TODO: the pending code currently supports only the dcs selection policy;
 * add support for the hash/random policies */
ucs_status_t uct_dc_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r)
{
    uct_dc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_iface_t);
    uct_dc_ep_t    *ep    = ucs_derived_of(tl_ep, uct_dc_ep_t);

    /* ep can tx iff
     * - iface has resources: cqe and tx skb
     * - dci is either assigned or can be assigned
     * - dci has resources
     */
    if (uct_rc_iface_has_tx_resources(&iface->super)) {
        if (ep->dci == UCT_DC_EP_NO_DCI) {
            if (uct_dc_iface_dci_can_alloc(iface) && (ep->fc.fc_wnd > 0)) {
                return UCS_ERR_BUSY;
            }
        } else {
            if (uct_dc_iface_dci_ep_can_send(ep)) {
                return UCS_ERR_BUSY;
            }
        }
    }

    UCS_STATIC_ASSERT(sizeof(ucs_arbiter_elem_t) <= UCT_PENDING_REQ_PRIV_LEN);
    ucs_arbiter_elem_init((ucs_arbiter_elem_t*)r->priv);

    /* no dci:
     * Do not grab a dci here. Instead, put the group on the dci allocation
     * arbiter. This way we can ensure fairness between all eps waiting for
     * dci allocation.
     */
    if (ep->dci == UCT_DC_EP_NO_DCI) {
        ucs_arbiter_group_push_elem(&ep->arb_group,
                                    (ucs_arbiter_elem_t*)r->priv);
        uct_dc_iface_schedule_dci_alloc(iface, ep);
        return UCS_OK;
    }

    ucs_arbiter_group_push_elem(&ep->arb_group, (ucs_arbiter_elem_t*)r->priv);
    uct_dc_iface_dci_sched_tx(iface, ep);
    return UCS_OK;
}
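For context, here is a caller-side sketch of the contract this function implements: a send that fails with UCS_ERR_NO_RESOURCE is queued via uct_ep_pending_add, and a UCS_ERR_BUSY return (taken above when resources turn out to be available after all) tells the caller to simply retry the send. The wrapper struct and callback names below are hypothetical, and the two-argument uct_ep_pending_add matches this code's UCX vintage:

/* Hedged usage sketch; my_pending_send_t and my_pending_cb are hypothetical. */
#include <uct/api/uct.h>

typedef struct {
    uct_pending_req_t super;   /* must be first: the transport uses super.priv */
    uct_ep_h          ep;
    uint64_t          hdr;
} my_pending_send_t;

static ucs_status_t my_pending_cb(uct_pending_req_t *self)
{
    my_pending_send_t *req = (my_pending_send_t*)self;
    /* re-issued by the arbiter once the dci/iface has resources again */
    return uct_ep_am_short(req->ep, 0, req->hdr, NULL, 0);
}

static void send_or_queue(uct_ep_h ep, my_pending_send_t *req)
{
    ucs_status_t status;

    do {
        status = uct_ep_am_short(ep, 0, req->hdr, NULL, 0);
        if (status != UCS_ERR_NO_RESOURCE) {
            return;                        /* sent, or a fatal error */
        }
        req->super.func = my_pending_cb;
        status = uct_ep_pending_add(ep, &req->super);
    } while (status == UCS_ERR_BUSY);      /* resources came back: retry send */
}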
/**
 * Dispatch requests waiting for dci allocation.
 */
ucs_arbiter_cb_result_t
uct_dc_iface_dci_do_pending_wait(ucs_arbiter_t *arbiter,
                                 ucs_arbiter_elem_t *elem, void *arg)
{
    uct_dc_ep_t    *ep    = ucs_container_of(ucs_arbiter_elem_group(elem),
                                             uct_dc_ep_t, arb_group);
    uct_dc_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                           uct_dc_iface_t);

    /* stop if a dci cannot be allocated;
     * otherwise, move the group to the dci arbiter */
    ucs_assert_always(ep->dci == UCT_DC_EP_NO_DCI);

    if (!uct_dc_iface_dci_can_alloc(iface)) {
        return UCS_ARBITER_CB_RESULT_STOP;
    }

    uct_dc_iface_dci_alloc(iface, ep);
    ucs_assert_always(ep->dci != UCT_DC_EP_NO_DCI);
    uct_dc_iface_dci_sched_tx(iface, ep);
    return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
}
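The two return values drive the arbiter's behavior: UCS_ARBITER_CB_RESULT_STOP halts the whole dispatch with the current group still at the head of the wait queue, which is what preserves FIFO fairness for dci allocation, while UCS_ARBITER_CB_RESULT_DESCHED_GROUP removes the group from this arbiter so it can proceed on the tx arbiter. A minimal, self-contained model of that contract (plain C, not the real ucs_arbiter API):

/* Hypothetical stand-in for the dispatch contract above. */
#include <stdio.h>

enum cb_result { CB_STOP, CB_DESCHED_GROUP };

static int free_dcis = 1;              /* pretend resource pool */

static enum cb_result try_alloc_dci(int group_id)
{
    if (free_dcis == 0) {
        return CB_STOP;                /* keep group queued; retry later */
    }
    --free_dcis;                       /* "allocate" a dci to this group */
    printf("group %d got a dci\n", group_id);
    return CB_DESCHED_GROUP;           /* group proceeds on the tx arbiter */
}

int main(void)
{
    int waitq[] = {7, 9};              /* two ep groups waiting for a dci */
    for (int i = 0; i < 2; ++i) {
        if (try_alloc_dci(waitq[i]) == CB_STOP) {
            printf("group %d still waiting\n", waitq[i]);
            break;                     /* STOP halts dispatch, order preserved */
        }
    }
    return 0;
}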
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_poll_cq(&iface->super.super.super,
                              &iface->mlx5_common.tx.cq);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];

    hw_ci = ntohs(cqe->wqe_counter);
    ucs_trace_poll("dc_mlx5 iface %p tx_cqe: dci[%d] qpn 0x%x txqp %p hw_ci %d",
                   iface, dci, qp_num, txqp, hw_ci);

    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_dc_iface_dci_put(&iface->super, dci);
    uct_rc_mlx5_txqp_process_tx_cqe(txqp, cqe, hw_ci);

    iface->super.super.tx.cq_available++;

    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(uct_dc_iface_dci_waitq(&iface->super), 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(uct_dc_iface_tx_waitq(&iface->super), 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}
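One detail worth unpacking is the QP-number extraction: the CQE's sop_drop_qpn word carries the opcode in its top byte and the 24-bit QPN below it, so after the ntohl byte swap the code masks with UCS_MASK(UCT_IB_QPN_ORDER). A self-contained illustration with stand-in macros (QPN_ORDER assumed to be 24, matching UCT_IB_QPN_ORDER):

/* Hedged sketch; QPN_ORDER and MASK stand in for the UCX definitions. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define QPN_ORDER 24                  /* stands in for UCT_IB_QPN_ORDER */
#define MASK(i)   ((1u << (i)) - 1)   /* stands in for UCS_MASK */

int main(void)
{
    /* opcode 0x0a in the top byte, qpn 0x000123 in the low 24 bits */
    uint32_t sop_drop_qpn = htonl((0x0au << QPN_ORDER) | 0x000123);
    uint32_t qp_num       = ntohl(sop_drop_qpn) & MASK(QPN_ORDER);

    printf("qp_num = 0x%06x\n", (unsigned)qp_num); /* prints 0x000123 */
    return 0;
}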
ucs_status_t uct_dc_ep_flush(uct_ep_h tl_ep, unsigned flags,
                             uct_completion_t *comp)
{
    uct_dc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_iface_t);
    uct_dc_ep_t    *ep    = ucs_derived_of(tl_ep, uct_dc_ep_t);
    ucs_status_t status;

    /* If waiting for an FC grant, return NO_RESOURCE to prevent ep
     * destruction. Otherwise, a grant for the destroyed ep would arrive
     * and we would segfault when trying to access the ep by the address
     * carried in the grant message. */
    if (!uct_rc_iface_has_tx_resources(&iface->super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    if (ep->dci == UCT_DC_EP_NO_DCI) {
        if (!uct_dc_iface_dci_can_alloc(iface)) {
            return UCS_ERR_NO_RESOURCE; /* waiting for dci */
        } else {
            UCT_TL_EP_STAT_FLUSH(&ep->super); /* no sends */
            return UCS_OK;
        }
    }

    if (!uct_dc_iface_dci_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE; /* cannot send */
    }

    status = uct_dc_iface_flush_dci(iface, ep->dci);
    if (status == UCS_OK) {
        UCT_TL_EP_STAT_FLUSH(&ep->super);
        return UCS_OK; /* all sends completed */
    }

    ucs_assert(status == UCS_INPROGRESS);
    UCT_TL_EP_STAT_FLUSH_WAIT(&ep->super);
    return UCS_INPROGRESS;
}
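From the caller's perspective, both UCS_ERR_NO_RESOURCE and UCS_INPROGRESS mean "progress the worker and retry". A hedged usage sketch of that loop (the worker and ep handles are assumed to exist; comp == NULL gives a busy-wait flush):

/* Hedged sketch of the caller's side of the flush contract above. */
#include <uct/api/uct.h>

static void flush_ep_blocking(uct_worker_h worker, uct_ep_h ep)
{
    ucs_status_t status;

    do {
        status = uct_ep_flush(ep, 0, NULL);
        uct_worker_progress(worker); /* lets grants/completions arrive */
    } while ((status == UCS_ERR_NO_RESOURCE) || (status == UCS_INPROGRESS));

    /* status is now UCS_OK or a fatal error */
}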
static UCS_F_ALWAYS_INLINE void
uct_dc_mlx5_poll_tx(uct_dc_mlx5_iface_t *iface)
{
    uint8_t dci;
    struct mlx5_cqe64 *cqe;
    uint32_t qp_num;
    uint16_t hw_ci;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    cqe = uct_ib_mlx5_get_cqe(&iface->super.super.super,
                              &iface->mlx5_common.tx.cq,
                              iface->mlx5_common.tx.cq.cqe_size_log);
    if (cqe == NULL) {
        return;
    }
    UCS_STATS_UPDATE_COUNTER(iface->super.super.stats,
                             UCT_RC_IFACE_STAT_TX_COMPLETION, 1);

    ucs_memory_cpu_load_fence();
    ucs_assertv(!(cqe->op_own & (MLX5_INLINE_SCATTER_32 |
                                 MLX5_INLINE_SCATTER_64)),
                "tx inline scatter not supported");

    qp_num = ntohl(cqe->sop_drop_qpn) & UCS_MASK(UCT_IB_QPN_ORDER);
    dci    = uct_dc_iface_dci_find(&iface->super, qp_num);
    txqp   = &iface->super.tx.dcis[dci].txqp;
    txwq   = &iface->dci_wqs[dci];

    hw_ci = ntohs(cqe->wqe_counter);
    uct_rc_txqp_available_set(txqp, uct_ib_mlx5_txwq_update_bb(txwq, hw_ci));
    uct_rc_txqp_completion(txqp, hw_ci);
    iface->super.super.tx.cq_available++;

    uct_dc_iface_dci_put(&iface->super, dci);
    if (uct_dc_iface_dci_can_alloc(&iface->super)) {
        ucs_arbiter_dispatch(&iface->super.super.tx.arbiter, 1,
                             uct_dc_iface_dci_do_pending_wait, NULL);
    }
    ucs_arbiter_dispatch(&iface->super.tx.dci_arbiter, 1,
                         uct_dc_iface_dci_do_pending_tx, NULL);
}