/**
 * Dispatch requests waiting for tx resources.
 *
 * Arbiter callback invoked per pending element on an endpoint's group.
 * Runs the pending request callback and translates its status into an
 * arbiter action (remove / next group / deschedule / stop).
 *
 * @param arbiter  The tx-pending arbiter (unused directly; groups carry state).
 * @param elem     Arbiter element embedded in a uct_pending_req_t.
 * @param arg      Unused callback argument.
 *
 * @return Arbiter result code controlling further dispatch.
 */
ucs_arbiter_cb_result_t
uct_dc_mlx5_iface_dci_do_pending_tx(ucs_arbiter_t *arbiter,
                                    ucs_arbiter_elem_t *elem,
                                    void *arg)
{
    /* Recover ep and iface from the arbiter element's enclosing structures */
    uct_dc_mlx5_ep_t *ep       = ucs_container_of(ucs_arbiter_elem_group(elem),
                                                  uct_dc_mlx5_ep_t, arb_group);
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                                uct_dc_mlx5_iface_t);
    uct_pending_req_t *req     = ucs_container_of(elem, uct_pending_req_t, priv);
    ucs_status_t status;

    /* Iface-wide tx resources are shared by all dcis - if exhausted, no
     * group can make progress, so stop the whole dispatch. */
    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ARBITER_CB_RESULT_STOP;
    }

    status = req->func(req);
    ucs_trace_data("progress pending request %p returned: %s", req,
                   ucs_status_string(status));

    if (status == UCS_OK) {
        /* For dcs* policies release dci if this is the last elem in the group
         * and the dci has no outstanding operations. For example pending
         * callback did not send anything. (uct_ep_flush or just return ok)
         */
        if (ucs_arbiter_elem_is_last(&ep->arb_group, elem)) {
            uct_dc_mlx5_iface_dci_free(iface, ep);
        }
        return UCS_ARBITER_CB_RESULT_REMOVE_ELEM;
    }
    if (status == UCS_INPROGRESS) {
        /* Request keeps its element; move on to the next group */
        return UCS_ARBITER_CB_RESULT_NEXT_GROUP;
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        /* Deschedule the group even if FC is the only resource, which
         * is missing. It will be scheduled again when credits arrive.
         * We can't desched group with rand policy if non FC resources are
         * missing, since it's never scheduled again. */
        if (uct_dc_mlx5_iface_is_dci_rand(iface) &&
            uct_rc_fc_has_resources(&iface->super.super, &ep->fc)) {
            return UCS_ARBITER_CB_RESULT_RESCHED_GROUP;
        } else {
            return UCS_ARBITER_CB_RESULT_DESCHED_GROUP;
        }
    }

    /* The callback failed even though the ep can send - the only remaining
     * explanation is that iface-wide tx resources ran out during dispatch. */
    ucs_assertv(!uct_rc_iface_has_tx_resources(&iface->super.super),
                "pending callback returned error but send resources are available");
    return UCS_ARBITER_CB_RESULT_STOP;
}
/**
 * Flush an RC verbs endpoint.
 *
 * @param tl_ep  Endpoint to flush.
 * @param flags  Flush flags (unused here).
 * @param comp   Completion to invoke once outstanding sends finish (may be NULL).
 *
 * @return UCS_OK               - queue already fully drained;
 *         UCS_ERR_NO_RESOURCE  - cannot make progress now, retry later;
 *         UCS_INPROGRESS       - completion registered for the last send;
 *         other                - error from posting the forced completion.
 */
ucs_status_t uct_rc_verbs_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                   uct_completion_t *comp)
{
    uct_rc_verbs_ep_t    *ep    = ucs_derived_of(tl_ep, uct_rc_verbs_ep_t);
    uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                 uct_rc_verbs_iface_t);
    ucs_status_t status;

    /* No iface-wide tx resources - caller has to retry */
    if (!uct_rc_iface_has_tx_resources(&iface->super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    /* All work requests completed - nothing to wait for */
    if (uct_rc_txqp_available(&ep->super.txqp) == iface->config.tx_max_wr) {
        UCT_TL_EP_STAT_FLUSH(&ep->super.super);
        return UCS_OK;
    }

    if (uct_rc_txqp_unsignaled(&ep->super.txqp) == 0) {
        /* Last posted send is already signaled; just need ep tx resources
         * to be able to track it */
        if (!uct_rc_ep_has_tx_resources(&ep->super)) {
            return UCS_ERR_NO_RESOURCE;
        }
    } else {
        /* Outstanding unsignaled sends - force a completion by posting a
         * NOP if the device supports it, otherwise a zero-length put */
        status = IBV_DEVICE_HAS_NOP(&uct_ib_iface_device(&iface->super.super)->dev_attr) ?
                 uct_rc_verbs_ep_nop(ep) :
                 uct_rc_verbs_ep_put_short(tl_ep, NULL, 0, 0, 0);
        if (status != UCS_OK) {
            return status;
        }
    }

    /* Attach the user completion to the current producer index */
    uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, comp,
                              ep->txcnt.pi);
    UCT_TL_EP_STAT_FLUSH_WAIT(&ep->super.super);
    return UCS_INPROGRESS;
}
/**
 * Flush an RC mlx5 endpoint.
 *
 * @param tl_ep  Endpoint to flush.
 * @param flags  Flush flags (unused here).
 * @param comp   Completion to invoke once outstanding sends finish (may be NULL).
 *
 * @return UCS_OK when the send queue is fully drained, UCS_ERR_NO_RESOURCE
 *         when progress is not currently possible, UCS_INPROGRESS when a
 *         completion was registered for the last signaled send.
 */
ucs_status_t uct_rc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                  uct_completion_t *comp)
{
    uct_rc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_rc_mlx5_ep_t);
    uct_rc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_rc_mlx5_iface_t);
    uint16_t sn; /* sequence number the completion will wait on */

    if (!uct_rc_iface_has_tx_resources(&iface->super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    /* All building blocks available - send queue is empty */
    if (uct_rc_txqp_available(&ep->super.txqp) == ep->tx.wq.bb_max) {
        UCT_TL_EP_STAT_FLUSH(&ep->super.super);
        return UCS_OK;
    }

    if (uct_rc_txqp_unsignaled(&ep->super.txqp) != 0) {
        /* Outstanding unsignaled sends - post a NOP to force a completion.
         * sn is captured before the post, i.e. the NOP's own slot.
         * NOTE(review): UCT_RC_CHECK_RES may return early; sn is assigned
         * first but only used on the fall-through path below. */
        sn = ep->tx.wq.sw_pi;
        UCT_RC_CHECK_RES(&iface->super, &ep->super);
        uct_rc_mlx5_txqp_inline_post(&iface->super, IBV_QPT_RC,
                                     &ep->super.txqp, &ep->tx.wq,
                                     MLX5_OPCODE_NOP, NULL, 0,
                                     0, 0, 0,
                                     0, 0,
                                     NULL, 0);
    } else if (!uct_rc_ep_has_tx_resources(&ep->super)) {
        return UCS_ERR_NO_RESOURCE;
    } else {
        /* Last send was already signaled - wait on its index */
        sn = ep->tx.wq.sig_pi;
    }

    uct_rc_txqp_add_send_comp(&iface->super, &ep->super.txqp, comp, sn);
    UCT_TL_EP_STAT_FLUSH_WAIT(&ep->super.super);
    return UCS_INPROGRESS;
}
/**
 * Flush a DC mlx5 endpoint.
 *
 * With UCT_FLUSH_FLAG_CANCEL, purges all outstanding operations on the
 * endpoint's dci (if assigned) with UCS_ERR_CANCELED and drops its pending
 * queue. Otherwise, checks resources and either completes immediately or
 * registers @a comp on the dci's last signaled send.
 *
 * @param tl_ep  Endpoint to flush.
 * @param flags  UCT flush flags; UCT_FLUSH_FLAG_CANCEL is handled specially.
 * @param comp   Completion to invoke once outstanding sends finish (may be NULL).
 *
 * @return UCS_OK, UCS_ERR_NO_RESOURCE, or the status of
 *         uct_rc_txqp_add_flush_comp() (UCS_INPROGRESS on success).
 */
ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                  uct_completion_t *comp)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    ucs_status_t status;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
        if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) {
            /* Cancel all operations in flight on this ep's dci */
            uct_rc_txqp_purge_outstanding(&iface->tx.dcis[ep->dci].txqp,
                                          UCS_ERR_CANCELED, 0);
#if ENABLE_ASSERT
            /* Debug-only marker: this dci saw an ep cancel */
            iface->tx.dcis[ep->dci].flags |= UCT_DC_DCI_FLAG_EP_CANCELED;
#endif
        }
        uct_ep_pending_purge(tl_ep, NULL, 0);
        return UCS_OK;
    }

    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
        if (!uct_dc_mlx5_iface_dci_can_alloc(iface)) {
            return UCS_ERR_NO_RESOURCE; /* waiting for dci */
        } else {
            /* No dci assigned and one is available - nothing was ever sent */
            UCT_TL_EP_STAT_FLUSH(&ep->super); /* no sends */
            return UCS_OK;
        }
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE; /* cannot send */
    }

    status = uct_dc_mlx5_iface_flush_dci(iface, ep->dci);
    if (status == UCS_OK) {
        UCT_TL_EP_STAT_FLUSH(&ep->super);
        return UCS_OK; /* all sends completed */
    }

    ucs_assert(status == UCS_INPROGRESS);
    ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI);

    /* Register the completion on the dci's last signaled producer index */
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq);
    return uct_rc_txqp_add_flush_comp(&iface->super.super, &ep->super, txqp,
                                      comp, txwq->sig_pi);
}
/* TODO: currently pending code supports only dcs policy;
 *       add support for hash/random policies */
/**
 * Add a pending request to a DC mlx5 endpoint.
 *
 * Returns UCS_ERR_BUSY (request NOT queued) when the endpoint could actually
 * send right now - the caller should retry the send instead. Otherwise
 * pushes the request onto the ep's arbiter group and schedules the group.
 *
 * @param tl_ep  Endpoint to queue on.
 * @param r      Pending request; its priv area stores the arbiter element.
 * @param flags  Pending-add flags (unused here).
 *
 * @return UCS_ERR_BUSY if the ep can send, UCS_OK if the request was queued.
 */
ucs_status_t uct_dc_mlx5_ep_pending_add(uct_ep_h tl_ep, uct_pending_req_t *r,
                                        unsigned flags)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);

    /* ep can tx iff
     * - iface has resources: cqe and tx skb
     * - dci is either assigned or can be assigned
     * - dci has resources
     */
    if (uct_rc_iface_has_tx_resources(&iface->super.super)) {
        if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
            /* No dci yet: busy only if one can be allocated AND FC window
             * allows sending */
            if (uct_dc_mlx5_iface_dci_can_alloc(iface) && (ep->fc.fc_wnd > 0)) {
                return UCS_ERR_BUSY;
            }
        } else {
            if (uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
                return UCS_ERR_BUSY;
            }
        }
    }

    /* The request's priv area must be large enough to hold the arbiter elem */
    UCS_STATIC_ASSERT(sizeof(uct_pending_req_priv_arb_t) <=
                      UCT_PENDING_REQ_PRIV_LEN);
    uct_pending_req_arb_group_push(&ep->arb_group, r);

    /* no dci:
     *  Do not grab dci here. Instead put the group on dci allocation arbiter.
     *  This way we can assure fairness between all eps waiting for
     *  dci allocation. Relevant for dcs and dcs_quota policies.
     */
    if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
        uct_dc_mlx5_iface_schedule_dci_alloc(iface, ep);
        UCT_TL_EP_STAT_PEND(&ep->super);
        return UCS_OK;
    }

    uct_dc_mlx5_iface_dci_sched_tx(iface, ep);
    UCT_TL_EP_STAT_PEND(&ep->super);
    return UCS_OK;
}
/**
 * Flush a DC endpoint.
 *
 * @param tl_ep  Endpoint to flush.
 * @param flags  Flush flags (unused here).
 * @param comp   Completion argument (unused in this variant).
 *
 * @return UCS_OK when no sends are outstanding, UCS_ERR_NO_RESOURCE when
 *         progress is not currently possible, UCS_INPROGRESS otherwise.
 */
ucs_status_t uct_dc_ep_flush(uct_ep_h tl_ep, unsigned flags,
                             uct_completion_t *comp)
{
    uct_dc_ep_t    *ep    = ucs_derived_of(tl_ep, uct_dc_ep_t);
    uct_dc_iface_t *iface = ucs_derived_of(tl_ep->iface, uct_dc_iface_t);
    ucs_status_t status;

    /* NOTE(review): the original comment says this guard also protects an ep
     * waiting for an FC grant from destruction (a grant arriving for a freed
     * ep would be a use-after-free), though the visible check is only for
     * iface tx resources - confirm against the FC handling code. */
    if (!uct_rc_iface_has_tx_resources(&iface->super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    if (ep->dci == UCT_DC_EP_NO_DCI) {
        if (uct_dc_iface_dci_can_alloc(iface)) {
            /* No dci assigned and one is available - nothing was ever sent */
            UCT_TL_EP_STAT_FLUSH(&ep->super);
            return UCS_OK;
        }
        return UCS_ERR_NO_RESOURCE; /* waiting for dci */
    }

    if (!uct_dc_iface_dci_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE; /* cannot send */
    }

    status = uct_dc_iface_flush_dci(iface, ep->dci);
    if (status != UCS_OK) {
        /* Sends still in flight on the dci */
        ucs_assert(status == UCS_INPROGRESS);
        UCT_TL_EP_STAT_FLUSH_WAIT(&ep->super);
        return UCS_INPROGRESS;
    }

    UCT_TL_EP_STAT_FLUSH(&ep->super);
    return UCS_OK; /* all sends completed */
}