/**
 * Retry posts that are queued on a device's pending_post list.
 *
 * The list length is sampled once on entry and at most that many descriptors
 * are retried. Each descriptor is removed from the head of the list under the
 * device lock and handed to mca_btl_ugni_repost(). The walk stops at the first
 * descriptor that cannot be reposted; that descriptor is put back at the head
 * of the list.
 *
 * @param ugni_module  module passed through to mca_btl_ugni_repost()
 * @param device       device whose pending_post list is processed
 *
 * @return 0 if the list was empty on entry, 1 otherwise (regardless of how
 *         many reposts succeeded).
 */
static inline int mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device)
{
    int n_pending = opal_list_get_size (&device->pending_post);
    int remaining;

    /* common case: nothing is waiting for resources */
    if (OPAL_LIKELY(0 == n_pending)) {
        return 0;
    }

    BTL_VERBOSE(("progressing %d pending FMA/RDMA operations", n_pending));

    for (remaining = n_pending ; remaining > 0 ; --remaining) {
        mca_btl_ugni_post_descriptor_t *next;

        /* only the list manipulation is done under the device lock */
        mca_btl_ugni_device_lock (device);
        next = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&device->pending_post);
        mca_btl_ugni_device_unlock (device);

        /* the list was sized without the lock held, so it may have emptied */
        if (NULL == next) {
            break;
        }

        if (OPAL_SUCCESS == mca_btl_ugni_repost (ugni_module, next)) {
            continue;
        }

        /* could not repost: return the descriptor to the head of the list
         * so ordering is preserved, and stop for now */
        mca_btl_ugni_device_lock (device);
        opal_list_prepend (&device->pending_post, (opal_list_item_t *) next);
        mca_btl_ugni_device_unlock (device);
        break;
    }

    return 1;
}
/**
 * Process at most one event from the module's local RDMA completion queue.
 *
 * @param ugni_module  module whose rdma_local_cq is polled
 *
 * @return 0 if no event was available (GNI_RC_NOT_DONE) or the failed
 *         transaction was reposted; 1 if a fragment completed and its
 *         callback was invoked; OMPI_ERROR if the fragment was given up on;
 *         otherwise the uGNI return code converted by
 *         ompi_common_rc_ugni_to_ompi().
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module)
{
    ompi_common_ugni_post_desc_t *desc;
    mca_btl_ugni_base_frag_t *done_frag;
    gni_cq_entry_t cq_event = 0;
    uint32_t can_recover = 1;
    gni_return_t grc;
    int failed;

    grc = GNI_CqGetEvent (ugni_module->rdma_local_cq, &cq_event);
    if (GNI_RC_NOT_DONE == grc) {
        /* nothing to do */
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !cq_event) || GNI_CQ_OVERRUN(cq_event))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d", grc));
        assert (0);
        return ompi_common_rc_ugni_to_ompi (grc);
    }

    grc = GNI_GetCompleted (ugni_module->rdma_local_cq, cq_event, (gni_post_descriptor_t **) &desc);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
        return ompi_common_rc_ugni_to_ompi (grc);
    }

    /* recover the fragment that owns the completed post descriptor */
    done_frag = MCA_BTL_UGNI_DESC_TO_FRAG(desc);

    /* a transaction error or a bad CQ status both count as a failed post */
    failed = (GNI_RC_SUCCESS != grc) || !GNI_CQ_STATUS_OK(cq_event);

    if (OPAL_UNLIKELY(failed)) {
        (void) GNI_CqErrorRecoverable (cq_event, &can_recover);

        /* every failure counts as a try, whether or not it is recoverable */
        ++done_frag->post_desc.tries;

        if (OPAL_UNLIKELY(done_frag->post_desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !can_recover)) {
            /* give up */
            BTL_ERROR(("giving up on frag %p", (void *) done_frag));
            done_frag->cbfunc (done_frag, OMPI_ERROR);

            return OMPI_ERROR;
        }

        /* repost transaction */
        mca_btl_ugni_repost (done_frag, OMPI_SUCCESS);

        return 0;
    }

    BTL_VERBOSE(("RDMA/FMA complete for frag %p", (void *) done_frag));

    done_frag->cbfunc (done_frag, ompi_common_rc_ugni_to_ompi (grc));

    return 1;
}
/**
 * Process a batch of completed RDMA/FMA post descriptors from a completion
 * queue.
 *
 * Up to MCA_BTL_UGNI_COMPLETIONS_PER_LOOP descriptors are fetched with
 * mca_btl_ugni_cq_get_completed_desc(). Every fetched descriptor is handled:
 *  - status OK: completed with OPAL_SUCCESS;
 *  - failed, retries left and the error is recoverable: reposted;
 *  - failed, out of retries or not recoverable: completed with OPAL_ERROR.
 *
 * The whole batch is always processed. The descriptors have already been
 * taken off the completion queue, so bailing out on the first failure would
 * leave the rest of the batch neither completed nor reposted.
 *
 * @param ugni_module  module the descriptors belong to
 * @param device       device the completion queue is read on
 * @param cq           completion queue to read
 *
 * @return the value returned by mca_btl_ugni_cq_get_completed_desc() if it is
 *         <= 0; OPAL_ERROR if at least one descriptor was given up on;
 *         otherwise the number of descriptors completed successfully.
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, mca_btl_ugni_device_t *device, mca_btl_ugni_cq_t *cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    gni_cq_entry_t event_data[MCA_BTL_UGNI_COMPLETIONS_PER_LOOP];
    int status = OPAL_SUCCESS;
    int completed = 0;
    int rc;

    rc = mca_btl_ugni_cq_get_completed_desc (device, cq, event_data, post_desc, MCA_BTL_UGNI_COMPLETIONS_PER_LOOP);
    if (0 >= rc) {
        return rc;
    }

    BTL_VERBOSE(("got %d completed rdma descriptors", rc));

    for (int i = 0 ; i < rc ; ++i) {
        BTL_VERBOSE(("post descriptor %p complete. GNI_CQ_STATUS_OK(): %d", (void *) post_desc[i],
                     GNI_CQ_STATUS_OK(event_data[i])));

        if (OPAL_UNLIKELY(!GNI_CQ_STATUS_OK(event_data[i]))) {
            uint32_t recoverable = 1;

            (void) GNI_CqErrorRecoverable (event_data[i], &recoverable);

            if (OPAL_UNLIKELY(++post_desc[i]->tries >= mca_btl_ugni_component.rdma_max_retries ||
                              !recoverable)) {
                char char_buffer[1024];

                GNI_CqErrorStr (event_data[i], char_buffer, sizeof (char_buffer));
                /* give up */
                BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc[i],
                           recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
                btl_ugni_dump_post_desc (post_desc[i]);
#endif
                mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_ERROR);

                /* remember the failure but keep going: the remaining
                 * descriptors in this batch still have to be handled */
                status = OPAL_ERROR;
                continue;
            }

            /* NOTE(review): the result of the repost is ignored, as before;
             * presumably a failed repost is queued elsewhere -- confirm. */
            mca_btl_ugni_repost (ugni_module, post_desc[i]);
            continue;
        }

        mca_btl_ugni_post_desc_complete (ugni_module, post_desc[i], OPAL_SUCCESS);
        ++completed;
    }

    /* should be resources to progress the pending post list */
    (void) mca_btl_ugni_post_pending (ugni_module, device);

    return (OPAL_SUCCESS == status) ? completed : status;
}
/**
 * Retry posts queued on the module's pending_descriptors list.
 *
 * The list length is sampled once on entry and at most that many descriptors
 * are removed from the head of the list (under pending_descriptors_lock) and
 * handed to mca_btl_ugni_repost(). Processing stops at the first repost that
 * does not return OPAL_SUCCESS, or as soon as the list turns out to be empty.
 *
 * NOTE(review): a descriptor whose repost fails is not put back on the list
 * by this function; presumably mca_btl_ugni_repost() re-queues it -- confirm.
 *
 * @param ugni_module  module whose pending descriptors are retried
 *
 * @return number of descriptors that were reposted successfully.
 */
static inline int mca_btl_ugni_post_pending (mca_btl_ugni_module_t *ugni_module)
{
    int count = opal_list_get_size (&ugni_module->pending_descriptors);
    int i;

    for (i = 0 ; i < count ; ++i) {
        mca_btl_ugni_post_descriptor_t *post_desc;

        OPAL_THREAD_LOCK(&ugni_module->pending_descriptors_lock);
        post_desc = (mca_btl_ugni_post_descriptor_t *) opal_list_remove_first (&ugni_module->pending_descriptors);
        OPAL_THREAD_UNLOCK(&ugni_module->pending_descriptors_lock);

        /* the size was read without the lock held, so the list may have been
         * drained in the meantime; never hand a NULL descriptor to repost */
        if (NULL == post_desc) {
            break;
        }

        if (OPAL_SUCCESS != mca_btl_ugni_repost (ugni_module, post_desc)) {
            break;
        }
    }

    return i;
}
/**
 * Process at most one event from one of the module's local RDMA completion
 * queues.
 *
 * The device lock (ugni_module->device->dev_lock) is held around the
 * GNI_CqGetEvent()/GNI_GetCompleted() calls and released before the
 * completed descriptor is acted on.
 *
 * @param ugni_module  module whose completion queue is polled
 * @param which_cq     0 selects rdma_local_cq, any other value selects
 *                     rdma_local_irq_cq
 *
 * @return 0 if no event was available (GNI_RC_NOT_DONE) or the failed
 *         descriptor was reposted; 1 if a descriptor completed; OPAL_ERROR
 *         if the descriptor was given up on; otherwise the uGNI return code
 *         converted by opal_common_rc_ugni_to_opal().
 */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    mca_btl_ugni_post_descriptor_t *pdesc = NULL;
    gni_post_descriptor_t *desc;
    gni_cq_entry_t cq_event = 0;
    uint32_t can_recover = 1;
    gni_cq_handle_t cq_handle;
    gni_return_t status;
    int failed;

    if (0 == which_cq) {
        cq_handle = ugni_module->rdma_local_cq;
    } else {
        cq_handle = ugni_module->rdma_local_irq_cq;
    }

    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);

    status = GNI_CqGetEvent (cq_handle, &cq_event);
    if (GNI_RC_NOT_DONE == status) {
        /* queue is empty */
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != status && !cq_event) || GNI_CQ_OVERRUN(cq_event))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", status, gni_err_str[status]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return opal_common_rc_ugni_to_opal (status);
    }

    status = GNI_GetCompleted (cq_handle, cq_event, &desc);

    /* done talking to the device for this event */
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != status && GNI_RC_TRANSACTION_ERROR != status)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[status]));
        return opal_common_rc_ugni_to_opal (status);
    }

    pdesc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);

    /* a transaction error or a bad CQ status both count as a failed post */
    failed = (GNI_RC_SUCCESS != status) || !GNI_CQ_STATUS_OK(cq_event);

    if (OPAL_UNLIKELY(failed)) {
        (void) GNI_CqErrorRecoverable (cq_event, &can_recover);

        /* every failure counts as a try, whether or not it is recoverable */
        ++pdesc->desc.tries;

        if (OPAL_UNLIKELY(pdesc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !can_recover)) {
            char err_text[1024];

            GNI_CqErrorStr (cq_event, err_text, sizeof (err_text));
            /* give up */
            BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) pdesc,
                       can_recover, err_text));
#if OPAL_ENABLE_DEBUG
            btl_ugni_dump_post_desc (pdesc);
#endif
            mca_btl_ugni_post_desc_complete (ugni_module, pdesc, OPAL_ERROR);

            return OPAL_ERROR;
        }

        /* try the transaction again */
        mca_btl_ugni_repost (ugni_module, pdesc);

        return 0;
    }

    mca_btl_ugni_post_desc_complete (ugni_module, pdesc, opal_common_rc_ugni_to_opal (status));

    return 1;
}