static void __nic_get_completed_txd(struct gnix_nic *nic, gni_cq_handle_t hw_cq, struct gnix_tx_descriptor **txd, gni_return_t *tx_status) { gni_post_descriptor_t *gni_desc; struct gnix_tx_descriptor *txd_p = NULL; struct gnix_fab_req *req; gni_return_t status; int msg_id; gni_cq_entry_t cqe; uint32_t recov = 1; if (__gnix_nic_txd_err_get(nic, &txd_p)) { *txd = txd_p; *tx_status = GNI_RC_TRANSACTION_ERROR; return; } status = GNI_CqGetEvent(hw_cq, &cqe); if (status == GNI_RC_NOT_DONE) { *txd = NULL; *tx_status = GNI_RC_NOT_DONE; return; } assert(status == GNI_RC_SUCCESS || status == GNI_RC_TRANSACTION_ERROR); if (unlikely(status == GNI_RC_TRANSACTION_ERROR)) { status = GNI_CqErrorRecoverable(cqe, &recov); if (status == GNI_RC_SUCCESS) { if (!recov) { char ebuf[512]; GNI_CqErrorStr(cqe, ebuf, sizeof(ebuf)); GNIX_WARN(FI_LOG_EP_DATA, "CQ error status: %s\n", ebuf); } } else { GNIX_WARN(FI_LOG_EP_DATA, "GNI_CqErrorRecover returned: %s\n", gni_err_str[status]); recov = 0; /* assume something bad has happened */ } } if (GNI_CQ_GET_TYPE(cqe) == GNI_CQ_EVENT_TYPE_POST) { status = GNI_GetCompleted(hw_cq, cqe, &gni_desc); assert(status == GNI_RC_SUCCESS || status == GNI_RC_TRANSACTION_ERROR); txd_p = container_of(gni_desc, struct gnix_tx_descriptor, gni_desc); } else if (GNI_CQ_GET_TYPE(cqe) == GNI_CQ_EVENT_TYPE_SMSG) {
/**
 * Poll the module's local RDMA completion queue for a single event and act
 * on the fragment whose post descriptor finished.
 *
 * @param ugni_module  module whose rdma_local_cq is polled
 *
 * @return 0 if no event was pending or the transaction was reposted,
 *         1 if the fragment completed and its callback was invoked,
 *         OMPI_ERROR if the fragment was given up on, otherwise the value
 *         of ompi_common_rc_ugni_to_ompi() for the failing uGNI call.
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
{
    gni_cq_entry_t cq_entry = 0;
    ompi_common_ugni_post_desc_t *completed;
    mca_btl_ugni_base_frag_t *frag;
    uint32_t can_recover = 1;
    gni_return_t grc;
    int no_event;

    grc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &cq_entry);
    if (GNI_RC_NOT_DONE == grc) {
        /* nothing has completed yet */
        return 0;
    }

    no_event = (GNI_RC_SUCCESS != grc) && !cq_entry;
    if (OPAL_UNLIKELY(no_event || GNI_CQ_OVERRUN(cq_entry))) {
        /* TODO -- overrun is not handled. Without an event there is nothing
         * to act on; it is unknown whether the event will show up later.
         * Ask Cray. */
        BTL_ERROR(("unhandled post error! ugni rc = %d", grc));

        assert (0);

        return ompi_common_rc_ugni_to_ompi (grc);
    }

    grc = GNI_GetCompleted (ugni_module->rdma_local_cq, cq_entry,
                            (gni_post_descriptor_t **) &completed);
    switch (grc) {
    case GNI_RC_SUCCESS:
    case GNI_RC_TRANSACTION_ERROR:
        /* both codes are handled below */
        break;
    default:
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
        return ompi_common_rc_ugni_to_ompi (grc);
    }

    frag = MCA_BTL_UGNI_DESC_TO_FRAG(completed);

    if (GNI_RC_SUCCESS == grc && GNI_CQ_STATUS_OK(cq_entry)) {
        BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) frag));

        frag->cbfunc (frag, ompi_common_rc_ugni_to_ompi (grc));

        return 1;
    }

    /* the transaction failed: find out whether it may be retried */
    (void) GNI_CqErrorRecoverable (cq_entry, &can_recover);

    /* the attempt counter is bumped on every failure, recoverable or not */
    ++frag->post_desc.tries;

    if (frag->post_desc.tries < mca_btl_ugni_component.rdma_max_retries && can_recover) {
        /* repost transaction */
        mca_btl_ugni_repost (frag, OMPI_SUCCESS);

        return 0;
    }

    /* give up */
    BTL_ERROR(("giving up on frag %p", (void *) frag));
    frag->cbfunc (frag, OMPI_ERROR);

    return OMPI_ERROR;
}
/**
 * Retrieve a batch of completed RDMA post descriptors from a completion
 * queue and complete, retry, or fail each one.
 *
 * Up to MCA_BTL_UGNI_COMPLETIONS_PER_LOOP descriptors are fetched with
 * mca_btl_ugni_cq_get_completed_desc(). Every fetched descriptor is
 * processed: an error on one descriptor no longer aborts the loop, because
 * the remaining descriptors of the batch have already been handed to this
 * function and would otherwise never be completed or reposted.
 *
 * @param ugni_module  module the descriptors belong to
 * @param device       device passed to the completion fetch and to the
 *                     pending-post progress call
 * @param cq           completion queue to fetch from
 *
 * @return the (non-positive) value of mca_btl_ugni_cq_get_completed_desc()
 *         when nothing was fetched; OPAL_ERROR if at least one descriptor
 *         was given up on; otherwise the number of descriptors completed
 *         successfully (reposted descriptors are not counted).
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device,
                                              mca_btl_ugni_cq_t *cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    gni_cq_entry_t event_data[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    int completed = 0;
    int failed = 0;
    int count;

    count = mca_btl_ugni_cq_get_completed_desc (device, cq, event_data, post_desc,
                                                MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
    if (0 >= count) {
        return count;
    }

    BTL_VERBOSE(("got %d completed rdma descriptors", count));

    for (int i = 0 ; i < count ; ++i) {
        BTL_VERBOSE(("post descriptor %p complete. GNI_CQ_STATUS_OK(): %d", (void *) post_desc[i],
                     GNI_CQ_STATUS_OK(event_data[i])));

        if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data[i]))) {
            uint32_t recoverable = 1;

            (void) GNI_CqErrorRecoverable (event_data[i], &recoverable);

            if (OPAL_UNLIKELY(++post_desc[i]->tries >= mca_btl_ugni_component.rdma_max_retries ||
                              !recoverable)) {
                char char_buffer[1024];

                GNI_CqErrorStr (event_data[i], char_buffer, sizeof (char_buffer));

                /* give up */
                BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc[i],
                           recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
                btl_ugni_dump_post_desc (post_desc[i]);
#endif
                mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_ERROR);

                /* remember the failure but keep going: the rest of the
                 * batch still has to be handled */
                ++failed;
                continue;
            }

            /* retry this transaction and move on to the next descriptor */
            mca_btl_ugni_repost (ugni_module, post_desc[i]);
            continue;
        }

        mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_SUCCESS);
        ++completed;
    }

    /* should be resources to progress the pending post list */
    (void) mca_btl_ugni_post_pending (ugni_module, device);

    return failed ? OPAL_ERROR : completed;
}
/**
 * Poll one of the module's local RDMA completion queues for a single event
 * and complete, retry, or fail the post descriptor it refers to.
 *
 * The device lock is held from before GNI_CqGetEvent() until after
 * GNI_GetCompleted() (or until an early exit); descriptor handling runs
 * with the lock released.
 *
 * @param ugni_module  module owning the completion queues
 * @param which_cq     0 selects rdma_local_cq, any other value selects
 *                     rdma_local_irq_cq
 *
 * @return 0 if no event was pending or the descriptor was reposted,
 *         1 if the descriptor was completed,
 *         OPAL_ERROR if the descriptor was given up on, otherwise the value
 *         of opal_common_rc_ugni_to_opal() for the failing uGNI call.
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    mca_btl_ugni_post_descriptor_t *pdesc = NULL;
    gni_post_descriptor_t *gni_desc;
    gni_cq_entry_t cq_entry = 0;
    uint32_t can_recover = 1;
    gni_cq_handle_t cq_handle;
    gni_return_t status;
    int no_event;

    if (0 == which_cq) {
        cq_handle = ugni_module->rdma_local_cq;
    } else {
        cq_handle = ugni_module->rdma_local_irq_cq;
    }

    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);

    status = GNI_CqGetEvent (cq_handle, &cq_entry);
    if (GNI_RC_NOT_DONE == status) {
        /* nothing has completed yet */
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    no_event = (GNI_RC_SUCCESS != status) && !cq_entry;
    if (OPAL_UNLIKELY(no_event || GNI_CQ_OVERRUN(cq_entry))) {
        /* TODO -- overrun is not handled. Without an event there is nothing
         * to act on; it is unknown whether the event will show up later.
         * Ask Cray. */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", status, gni_err_str[status]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

        return opal_common_rc_ugni_to_opal (status);
    }

    status = GNI_GetCompleted (cq_handle, cq_entry, &gni_desc);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

    switch (status) {
    case GNI_RC_SUCCESS:
    case GNI_RC_TRANSACTION_ERROR:
        /* both codes are handled below */
        break;
    default:
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[status]));
        return opal_common_rc_ugni_to_opal (status);
    }

    pdesc = MCA_BTL_UGNI_DESC_TO_PDESC(gni_desc);

    if (GNI_RC_SUCCESS == status && GNI_CQ_STATUS_OK(cq_entry)) {
        mca_btl_ugni_post_desc_complete (ugni_module, pdesc, opal_common_rc_ugni_to_opal (status));

        return 1;
    }

    /* the transaction failed: find out whether it may be retried */
    (void) GNI_CqErrorRecoverable (cq_entry, &can_recover);

    /* the attempt counter is bumped on every failure, recoverable or not */
    ++pdesc->desc.tries;

    if (pdesc->desc.tries < mca_btl_ugni_component.rdma_max_retries && can_recover) {
        mca_btl_ugni_repost (ugni_module, pdesc);

        return 0;
    }

    {
        char err_text[1024];

        GNI_CqErrorStr (cq_entry, err_text, 1024);

        /* give up */
        BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) pdesc,
                   can_recover, err_text));
#if OPAL_ENABLE_DEBUG
        btl_ugni_dump_post_desc (pdesc);
#endif
        mca_btl_ugni_post_desc_complete (ugni_module, pdesc, OPAL_ERROR);
    }

    return OPAL_ERROR;
}