/* Create a uGNI endpoint handle on the device associated with cep, attach it
 * to the completion queue cq, and bind it to the remote peer's NIC address
 * and id. On success the new handle is returned through ep_handle.
 *
 * Returns OPAL_SUCCESS, OPAL_ERR_BAD_PARAM on a NULL cep, or the OPAL
 * translation of the failing uGNI return code. */
int opal_common_ugni_ep_create (opal_common_ugni_endpoint_t *cep, gni_cq_handle_t cq, gni_ep_handle_t *ep_handle)
{
    gni_return_t gni_rc;

    if (OPAL_UNLIKELY(NULL == cep)) {
        /* programming error -- trip the assert in debug builds */
        assert (0);
        return OPAL_ERR_BAD_PARAM;
    }

    /* the device handle is not thread-safe -- serialize all uGNI calls on it */
    OPAL_THREAD_LOCK(&cep->dev->dev_lock);
    gni_rc = GNI_EpCreate (cep->dev->dev_handle, cq, ep_handle);
    OPAL_THREAD_UNLOCK(&cep->dev->dev_lock);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != gni_rc)) {
        return opal_common_rc_ugni_to_opal (gni_rc);
    }

    /* bind the new handle to the remote peer */
    OPAL_THREAD_LOCK(&cep->dev->dev_lock);
    gni_rc = GNI_EpBind (*ep_handle, cep->ep_rem_addr, cep->ep_rem_id);
    OPAL_THREAD_UNLOCK(&cep->dev->dev_lock);

    if (GNI_RC_SUCCESS == gni_rc) {
        return OPAL_SUCCESS;
    }

    /* bind failed -- tear the endpoint handle back down before reporting */
    OPAL_THREAD_LOCK(&cep->dev->dev_lock);
    GNI_EpDestroy (*ep_handle);
    OPAL_THREAD_UNLOCK(&cep->dev->dev_lock);

    return opal_common_rc_ugni_to_opal (gni_rc);
}
/* Post a directed connection datagram to the remote peer associated with ep.
 * The datagram carries this endpoint's mailbox attributes and is tagged with
 * the endpoint's index so the completion can be matched back to it.
 *
 * Returns the OPAL translation of the GNI_EpPostDataWId return code. */
static inline int mca_btl_ugni_directed_ep_post (mca_btl_base_endpoint_t *ep)
{
    gni_return_t gni_rc;

    BTL_VERBOSE(("posting directed datagram to remote id: %d for endpoint %p",
                 ep->common->ep_rem_id, (void *)ep));

    /* advertise the IRQ memory handle of the first (only) device's SMSG setup */
    ep->mailbox->attr.rmt_irq_mem_hndl =
        mca_btl_ugni_component.modules[0].device->smsg_irq_mhndl;

    gni_rc = GNI_EpPostDataWId (ep->smsg_ep_handle,
                                &ep->mailbox->attr, sizeof (ep->mailbox->attr),
                                &ep->remote_attr, sizeof (ep->remote_attr),
                                MCA_BTL_UGNI_CONNECT_DIRECTED_ID | ep->index);

    return opal_common_rc_ugni_to_opal (gni_rc);
}
/* Finish establishing an SMSG connection on ep once the remote peer's mailbox
 * attributes have been received (ep->remote_attr is assumed to be populated by
 * the datagram exchange before this is called -- TODO confirm against callers).
 *
 * Initializes the SMSG channel, programs the endpoint event data so CQ events
 * resolve to endpoint-array indices, marks the endpoint connected, and flushes
 * any sends that were queued while the connection was pending.
 *
 * Returns OPAL_SUCCESS (even if flushing pending sends fails -- those frags
 * are parked on the module wait list instead) or the OPAL translation of a
 * failed GNI_SmsgInit. */
static inline int mca_btl_ugni_ep_connect_finish (mca_btl_base_endpoint_t *ep) {
    gni_return_t grc;
    int rc;

    /* dump both sides' SMSG attributes for connection debugging */
    BTL_VERBOSE(("finishing connection. remote attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, "
                 "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, "
                 "msg_maxsize = %d", ep->remote_attr.smsg_attr.msg_type, ep->remote_attr.smsg_attr.msg_buffer,
                 ep->remote_attr.smsg_attr.buff_size, ep->remote_attr.smsg_attr.mem_hndl.qword1,
                 ep->remote_attr.smsg_attr.mem_hndl.qword2, ep->remote_attr.smsg_attr.mbox_offset,
                 ep->remote_attr.smsg_attr.mbox_maxcredit, ep->remote_attr.smsg_attr.msg_maxsize));

    BTL_VERBOSE(("finishing connection. local attributes: msg_type = %d, msg_buffer = %p, buff_size = %d, "
                 "mem_hndl = {qword1 = %" PRIu64 ", qword2 = %" PRIu64 "}, mbox = %d, mbox_maxcredit = %d, "
                 "msg_maxsize = %d", ep->mailbox->attr.smsg_attr.msg_type, ep->mailbox->attr.smsg_attr.msg_buffer,
                 ep->mailbox->attr.smsg_attr.buff_size, ep->mailbox->attr.smsg_attr.mem_hndl.qword1,
                 ep->mailbox->attr.smsg_attr.mem_hndl.qword2, ep->mailbox->attr.smsg_attr.mbox_offset,
                 ep->mailbox->attr.smsg_attr.mbox_maxcredit, ep->mailbox->attr.smsg_attr.msg_maxsize));

    /* pair the local and remote mailboxes to bring up the SMSG channel */
    grc = GNI_SmsgInit (ep->smsg_ep_handle, &ep->mailbox->attr.smsg_attr, &ep->remote_attr.smsg_attr);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc)) {
        BTL_ERROR(("error initializing SMSG protocol. rc = %d", grc));

        return opal_common_rc_ugni_to_opal (grc);
    }

    /* set the local event data to the local index and the remote event data to my
     * index on the remote peer. This makes lookup of endpoints on completion take
     * a single lookup in the endpoints array. we will not be able to change the
     * remote peer's index in the endpoint's array after this point. */
    GNI_EpSetEventData (ep->rdma_ep_handle, ep->index, ep->remote_attr.index);
    GNI_EpSetEventData (ep->smsg_ep_handle, ep->index, ep->remote_attr.index);

    /* remember the peer's IRQ memory handle for later use */
    ep->rmt_irq_mem_hndl = ep->remote_attr.rmt_irq_mem_hndl;
    ep->state = MCA_BTL_UGNI_EP_STATE_CONNECTED;

    /* atomically bump the module's connected-peer count */
    (void) opal_atomic_add_64 (&ep->btl->connected_peer_count, 1);

    /* send all pending messages */
    BTL_VERBOSE(("endpoint connected. posting %u sends", (unsigned int) opal_list_get_size (&ep->frag_wait_list)));

    rc = mca_btl_ugni_progress_send_wait_list (ep);
    if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) {
        /* could not drain the frag wait list -- park this endpoint on the
         * module wait list (once) so progress will retry later */
        OPAL_THREAD_LOCK(&ep->btl->ep_wait_list_lock);
        if (false == ep->wait_listed) {
            opal_list_append (&ep->btl->ep_wait_list, &ep->super);
            ep->wait_listed = true;
        }
        OPAL_THREAD_UNLOCK(&ep->btl->ep_wait_list_lock);
    }

    return OPAL_SUCCESS;
}
/* Initialize a single uGNI device: record its id, look up its NIC address,
 * construct its lock, and attach it to the already-created communication
 * domain.
 *
 * @param device     device structure to fill in (must be non-NULL)
 * @param device_id  minor number of the Gemini NIC to use
 *
 * @return OPAL_SUCCESS or the OPAL translation of the failing uGNI call. */
static int opal_common_ugni_device_init (opal_common_ugni_device_t *device, int device_id)
{
    int rc;

    /* Create a NIC Address */
    device->dev_id = device_id; /* Minor number of the Gemini NIC */
    device->dev_addr = opal_common_ugni_get_nic_address (device->dev_id);

    OPAL_OUTPUT((-1, "Got NIC Addr: 0x%08x, CPU ID: %d", device->dev_addr, device->dev_id));

    /* serializes uGNI calls that share this device handle */
    OBJ_CONSTRUCT(&device->dev_lock, opal_mutex_t);

    /* Attach device to the communication domain */
    rc = GNI_CdmAttach (opal_common_ugni_module.cd_handle, device->dev_id,
                        &device->dev_pe_addr, &device->dev_handle);
    if (GNI_RC_SUCCESS != rc) {
        /* bug fix: the previous message said "Creating communication domain",
         * which describes GNI_CdmCreate, not the GNI_CdmAttach call that
         * actually failed here */
        OPAL_OUTPUT((0, "Error: Attaching device to communication domain %d\n", rc));
        return opal_common_rc_ugni_to_opal (rc);
    }

    return OPAL_SUCCESS;
}
/* Drain at most one event from one of the module's local RDMA completion
 * queues and complete -- or, on a recoverable transaction error, repost --
 * the post descriptor that generated it.
 *
 * @param ugni_module  the btl module whose CQs to poll
 * @param which_cq     0 selects rdma_local_cq, anything else rdma_local_irq_cq
 *
 * @return 1 if a descriptor completed successfully, 0 if no event was ready
 *         (or the descriptor was reposted), a negative OPAL error code on an
 *         unrecoverable or unexpected error. */
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc = NULL;
    gni_cq_entry_t event_data = 0;
    gni_post_descriptor_t *desc;
    uint32_t recoverable = 1;
    gni_return_t grc;
    gni_cq_handle_t the_cq;

    the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq : ugni_module->rdma_local_irq_cq;

    /* device access is serialized by the device lock */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
    grc = GNI_CqGetEvent (the_cq, &event_data);
    if (GNI_RC_NOT_DONE == grc) {
        /* nothing pending on this CQ */
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    /* an error with no event data (or an overrun) cannot be matched to a
     * descriptor, so it cannot be retried -- report it */
    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) || GNI_CQ_OVERRUN(event_data))) {
        /* TODO -- need to handle overrun -- how do we do this without an event?
           will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc, gni_err_str[grc]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);

        return opal_common_rc_ugni_to_opal (grc);
    }

    /* map the CQ event back to the uGNI post descriptor that produced it */
    grc = GNI_GetCompleted (the_cq, event_data, &desc);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
        BTL_ERROR(("Error in GNI_GetComplete %s", gni_err_str[grc]));
        return opal_common_rc_ugni_to_opal (grc);
    }

    /* recover the btl-level descriptor wrapping the uGNI descriptor */
    post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
        /* transaction error: repost if uGNI reports it recoverable and the
         * retry budget is not exhausted; otherwise fail the descriptor */
        (void) GNI_CqErrorRecoverable (event_data, &recoverable);

        if (OPAL_UNLIKELY(++post_desc->desc.tries >= mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            char char_buffer[1024];
            GNI_CqErrorStr (event_data, char_buffer, 1024);

            /* give up */
            BTL_ERROR(("giving up on desciptor %p, recoverable %d: %s", (void *) post_desc,
                       recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
            btl_ugni_dump_post_desc (post_desc);
#endif

            mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);

            return OPAL_ERROR;
        }

        mca_btl_ugni_repost (ugni_module, post_desc);

        return 0;
    }

    /* grc is GNI_RC_SUCCESS on this path, so the descriptor completes with
     * OPAL_SUCCESS */
    mca_btl_ugni_post_desc_complete (ugni_module, post_desc, opal_common_rc_ugni_to_opal (grc));

    return 1;
}
/* Poll for a completed connection datagram (directed or wildcard) and advance
 * the connection state machine of the endpoint it belongs to.
 *
 * For a directed datagram the endpoint is recovered from the id encoded in
 * the datagram; for a wildcard datagram the peer is looked up by the proc
 * name it sent (and an endpoint is created on the fly for unknown peers).
 * Once connected, any messages waiting in the endpoint's SMSG mailbox are
 * processed, and the wildcard datagram is reposted if it was consumed.
 *
 * @return the number of SMSG messages processed (>= 0), or a negative OPAL
 *         error code on failure. */
static inline int mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint64_t datagram_id, data, proc_id;
    uint32_t remote_addr, remote_id;
    mca_btl_base_endpoint_t *ep;
    gni_post_state_t post_state;
    gni_ep_handle_t handle;
    gni_return_t grc;
    int count = 0, rc;

    /* check for datagram completion */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock); /* TODO: may not need lock for this function */
    grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
        /* no datagram has completed (the common case) */
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    /* the low bits carry the endpoint index for directed datagrams; the high
     * bits (the mask) distinguish directed from wildcard */
    data = datagram_id & ~(MCA_BTL_UGNI_DATAGRAM_MASK);

    BTL_VERBOSE(("datgram_id: %" PRIx64 ", mask: %" PRIx64, datagram_id,
                 (uint64_t) (datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK)));

    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID) {
        /* directed datagram -- the id is this module's endpoint-array index */
        ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&ugni_module->endpoints, data);
        handle = ep->smsg_ep_handle;
    } else {
        handle = ugni_module->wildcard_ep;
    }

    /* wait for the incoming datagram to complete (in case it isn't) */
    grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
                                  &remote_addr, &remote_id);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (GNI_RC_SUCCESS != grc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
        return opal_common_rc_ugni_to_opal (grc);
    }

    /* if this is a wildcard endpoint lookup the remote peer by the proc id we received */
    if (handle == ugni_module->wildcard_ep) {
        proc_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name);

        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64,
                     proc_id));

        OPAL_THREAD_LOCK(&ugni_module->endpoint_lock);
        rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep);
        OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);

        /* check if the endpoint is known */
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) {
            /* unknown peer -- resolve its proc and create an endpoint now */
            struct opal_proc_t *remote_proc = opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);

            BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vid = 0x%x}",
                         ugni_module->wc_remote_attr.proc_name.jobid,
                         ugni_module->wc_remote_attr.proc_name.vpid));

            ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
            if (OPAL_UNLIKELY(NULL == ep)) {
                /* NOTE(review): rc may still be OPAL_SUCCESS here (hash hit
                 * with NULL value) -- confirm the intended error code */
                return rc;
            }
        }
    } else {
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
    }

    /* should not have gotten a NULL endpoint */
    assert (NULL != ep);

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "data = 0x%" PRIx64 ", ep = %p, remote id: %d", datagram_id, post_state,
                 data, (void *) ep, remote_id));

    /* NTH: TODO -- error handling */
    opal_mutex_lock (&ep->lock);
    if (handle != ugni_module->wildcard_ep) {
        /* directed post complete */
        ep->dg_posted = false;
    }

    /* advance the endpoint's connection state machine */
    (void) mca_btl_ugni_ep_connect_progress (ep);
    opal_mutex_unlock (&ep->lock);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        /* process messages waiting in the endpoint's smsg mailbox */
        count = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if (handle == ugni_module->wildcard_ep) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return count;
}
int opal_common_ugni_init (void) { int modes, rc, i; uint32_t my_cdm_id; opal_common_ugni_module_ref_count ++; if (opal_common_ugni_module_ref_count > 1) { return OPAL_SUCCESS; } /* use pid for my_cdm_id. Although its not stated in the uGNI documentation, the cdm_id only needs to be unique within a node for a given ptag/cookie tuple */ my_cdm_id = getpid(); /*TODO: eventually need something else for thread-hot support */ /* pull settings from ugni btl */ opal_common_ugni_module.rdma_max_retries = mca_btl_ugni_component.rdma_max_retries; /* Create a communication domain */ modes = GNI_CDM_MODE_FORK_FULLCOPY | GNI_CDM_MODE_CACHED_AMO_ENABLED | GNI_CDM_MODE_ERR_NO_KILL | GNI_CDM_MODE_FAST_DATAGRAM_POLL; /* collect uGNI information */ rc = get_ptag(&opal_common_ugni_module.ptag); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return rc; } rc = get_cookie(&opal_common_ugni_module.cookie); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { return rc; } /* create a communication domain */ rc = GNI_CdmCreate (my_cdm_id, opal_common_ugni_module.ptag, opal_common_ugni_module.cookie, modes, &opal_common_ugni_module.cd_handle); if (OPAL_UNLIKELY(GNI_RC_SUCCESS != rc)) { OPAL_OUTPUT((0, "Error: Creating communication domain %d\n",rc)); return opal_common_rc_ugni_to_opal (rc); } /* setup uGNI devices. we only support one device atm */ opal_common_ugni_module.device_count = 1; opal_common_ugni_module.devices = calloc (opal_common_ugni_module.device_count, sizeof (opal_common_ugni_device_t)); for (i = 0 ; i < opal_common_ugni_module.device_count ; ++i) { rc = opal_common_ugni_device_init (opal_common_ugni_module.devices + i, i); if (OPAL_UNLIKELY(OPAL_SUCCESS != rc)) { OPAL_OUTPUT((-1, "error initializing uGNI device")); return rc; } } /* send ugni modex */ opal_common_ugni_send_modex (my_cdm_id); return OPAL_SUCCESS; }