/* Class cleanup for a DC endpoint: drop queued work, release flow-control
 * state, and - if a DCI is still assigned to this ep - cancel its
 * outstanding sends and detach the ep from the DCI. */
static UCS_CLASS_CLEANUP_FUNC(uct_dc_ep_t)
{
    uct_dc_iface_t *iface = ucs_derived_of(self->super.super.iface,
                                           uct_dc_iface_t);

    /* Remove requests still queued on this ep and its arbiter group,
     * then release flow-control resources */
    uct_dc_ep_pending_purge(&self->super.super, NULL, NULL);
    ucs_arbiter_group_cleanup(&self->arb_group);
    uct_rc_fc_cleanup(&self->fc);

    ucs_assert_always(self->state != UCT_DC_EP_INVALID);

    /* No DCI assigned - nothing outstanding and nothing left to release */
    if (self->dci == UCT_DC_EP_NO_DCI) {
        return;
    }

    /* TODO: this is good for dcs policy only.
     * Need to change if eps share dci */
    /* A DCI still assigned with no outstanding ops means it should already
     * have been returned to the pool - treat that as a leak */
    ucs_assertv_always(uct_dc_iface_dci_has_outstanding(iface, self->dci),
                       "iface (%p) ep (%p) dci leak detected: dci=%d",
                       iface, self, self->dci);

    /* we can handle it but well behaving app should not do this */
    ucs_warn("ep (%p) is destroyed with %d outstanding ops", self,
             (int16_t)iface->super.config.tx_qp_len -
             uct_rc_txqp_available(&iface->tx.dcis[self->dci].txqp));

    /* Complete all in-flight sends with UCS_ERR_CANCELED and unbind the ep
     * from the DCI so the iface can reuse it */
    uct_rc_txqp_purge_outstanding(&iface->tx.dcis[self->dci].txqp,
                                  UCS_ERR_CANCELED, 1);
    iface->tx.dcis[self->dci].ep = NULL;
}
/**
 * Flush outstanding operations on a DC mlx5 endpoint.
 *
 * @param tl_ep  Endpoint to flush.
 * @param flags  UCT_FLUSH_FLAG_* modifiers. With UCT_FLUSH_FLAG_CANCEL,
 *               outstanding and pending operations are purged with
 *               UCS_ERR_CANCELED instead of being completed.
 * @param comp   Completion to invoke when the flush finishes (used only on
 *               the UCS_INPROGRESS path).
 *
 * @return UCS_OK              - all sends completed (or nothing to flush)
 *         UCS_INPROGRESS      - flush pending; @a comp will be notified
 *         UCS_ERR_NO_RESOURCE - no TX/DCI resource available; retry later
 */
ucs_status_t uct_dc_mlx5_ep_flush(uct_ep_h tl_ep, unsigned flags,
                                  uct_completion_t *comp)
{
    uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_ep->iface,
                                                uct_dc_mlx5_iface_t);
    uct_dc_mlx5_ep_t *ep       = ucs_derived_of(tl_ep, uct_dc_mlx5_ep_t);
    ucs_status_t status;
    UCT_DC_MLX5_TXQP_DECL(txqp, txwq);

    if (ucs_unlikely(flags & UCT_FLUSH_FLAG_CANCEL)) {
        if (ep->dci != UCT_DC_MLX5_EP_NO_DCI) {
            /* Cancel every outstanding send on this ep's DCI */
            uct_rc_txqp_purge_outstanding(&iface->tx.dcis[ep->dci].txqp,
                                          UCS_ERR_CANCELED, 0);
#if ENABLE_ASSERT
            iface->tx.dcis[ep->dci].flags |= UCT_DC_DCI_FLAG_EP_CANCELED;
#endif
        }
        /* Fix: pass NULL (not the integer literal 0) for the void* callback
         * argument of uct_ep_pending_purge */
        uct_ep_pending_purge(tl_ep, NULL, NULL);
        return UCS_OK;
    }

    if (!uct_rc_iface_has_tx_resources(&iface->super.super)) {
        return UCS_ERR_NO_RESOURCE;
    }

    if (ep->dci == UCT_DC_MLX5_EP_NO_DCI) {
        if (!uct_dc_mlx5_iface_dci_can_alloc(iface)) {
            return UCS_ERR_NO_RESOURCE; /* waiting for dci */
        } else {
            UCT_TL_EP_STAT_FLUSH(&ep->super); /* no sends */
            return UCS_OK;
        }
    }

    if (!uct_dc_mlx5_iface_dci_ep_can_send(ep)) {
        return UCS_ERR_NO_RESOURCE; /* cannot send */
    }

    status = uct_dc_mlx5_iface_flush_dci(iface, ep->dci);
    if (status == UCS_OK) {
        UCT_TL_EP_STAT_FLUSH(&ep->super);
        return UCS_OK; /* all sends completed */
    }

    ucs_assert(status == UCS_INPROGRESS);
    ucs_assert(ep->dci != UCT_DC_MLX5_EP_NO_DCI);

    /* Flush still in progress - attach the completion to the DCI's txqp so it
     * fires once the last posted send (sig_pi) completes */
    UCT_DC_MLX5_IFACE_TXQP_GET(iface, ep, txqp, txwq);
    return uct_rc_txqp_add_flush_comp(&iface->super.super, &ep->super, txqp,
                                      comp, txwq->sig_pi);
}
/* Handle a transport error on an RC verbs endpoint: return the CQ credits
 * held by its in-flight sends, fail those sends with @a status, and hand the
 * endpoint to the transport-specific failure handler. */
ucs_status_t uct_rc_verbs_ep_handle_failure(uct_rc_verbs_ep_t *ep,
                                            ucs_status_t status)
{
    uct_rc_iface_t *rc_iface = ucs_derived_of(ep->super.super.super.iface,
                                              uct_rc_iface_t);

    /* Credits consumed by ops between CI and PI go back to the CQ */
    rc_iface->tx.cq_available += ep->txcnt.pi - ep->txcnt.ci;

    /* Align CI with PI so a later ep_destroy cannot credit the CQ twice */
    ep->txcnt.ci = ep->txcnt.pi;

    /* Complete every outstanding send with the failure status */
    uct_rc_txqp_purge_outstanding(&ep->super.txqp, status, 0);

    /* Delegate the endpoint-failed notification to the iface ops table */
    return rc_iface->super.ops->set_ep_failed(&rc_iface->super,
                                              &ep->super.super.super, status);
}
void uct_dc_ep_set_failed(ucs_class_t *ep_cls, uct_dc_iface_t *iface, uint32_t qp_num) { uint8_t dci = uct_dc_iface_dci_find(iface, qp_num); uct_dc_ep_t *ep = iface->tx.dcis[dci].ep; if (!ep) { return; } uct_rc_txqp_purge_outstanding(&iface->tx.dcis[dci].txqp, UCS_ERR_ENDPOINT_TIMEOUT, 0); uct_set_ep_failed(ep_cls, &ep->super.super, &iface->super.super.super.super); if (UCS_OK != uct_dc_iface_dci_reconnect(iface, &iface->tx.dcis[dci].txqp)) { ucs_fatal("Unsuccessful reconnect of DC QP #%u", qp_num); } uct_rc_txqp_available_set(&iface->tx.dcis[dci].txqp, iface->super.config.tx_qp_len); }