static void mca_btl_openib_endpoint_destruct(mca_btl_base_endpoint_t* endpoint)
{
    bool pval_clean = false;
    int qp;

    /* If the CPC has an endpoint_finalize function, call it */
    if (NULL != endpoint->endpoint_local_cpc->cbm_endpoint_finalize) {
        endpoint->endpoint_local_cpc->cbm_endpoint_finalize(endpoint);
    }

    /* Release CTS buffer */
    ompi_btl_openib_connect_base_free_cts(endpoint);

    /* Release memory resources */
    do {
        /* Make sure that mca_btl_openib_endpoint_connect_eager_rdma()
         * was not in the "connect" or "bad" flow (failed to allocate memory)
         * and changed the pointer back to NULL */
        if(!opal_atomic_cmpset_ptr(&endpoint->eager_rdma_local.base.pval, NULL,
                    (void*)1)) {
            if ((void*)1 != endpoint->eager_rdma_local.base.pval &&
                    NULL != endpoint->eager_rdma_local.base.pval) {
                endpoint->endpoint_btl->super.btl_mpool->mpool_free(endpoint->endpoint_btl->super.btl_mpool,
                        endpoint->eager_rdma_local.base.pval,
                        (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
                pval_clean = true;
            }
        } else {
            pval_clean = true;
        }
    } while (!pval_clean);

    /* Close the QPs if we have them */
    for (qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[0]);
        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_credits_pending_frags[1]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[0]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_credits_pending_frags[1]);

        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_wqe_pending_frags[0]);
        MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->qps[qp].no_wqe_pending_frags[1]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[0]);
        OBJ_DESTRUCT(&endpoint->qps[qp].no_wqe_pending_frags[1]);

        if (--endpoint->qps[qp].qp->users != 0)
            continue;

        if (endpoint->qps[qp].qp->lcl_qp != NULL)
            if (ibv_destroy_qp(endpoint->qps[qp].qp->lcl_qp))
                BTL_ERROR(("Failed to destroy QP:%d\n", qp));

        free(endpoint->qps[qp].qp);
    }

    /* free the qps */
    free(endpoint->qps);
    endpoint->qps = NULL;

    free(endpoint->rem_info.rem_qps);
    free(endpoint->rem_info.rem_srqs);

    /* unregister xrc recv qp */
#if HAVE_XRC
    if (0 != endpoint->xrc_recv_qp_num) {
        if (ibv_unreg_xrc_rcv_qp(endpoint->endpoint_btl->device->xrc_domain,
                    endpoint->xrc_recv_qp_num)) {
            BTL_ERROR(("Failed to unregister XRC recv QP:%d\n",
                       endpoint->xrc_recv_qp_num));
        }
    }
#endif

    OBJ_DESTRUCT(&endpoint->endpoint_lock);

    /* Clean pending lists */
    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_lazy_frags);
    OBJ_DESTRUCT(&endpoint->pending_lazy_frags);

    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_get_frags);
    OBJ_DESTRUCT(&endpoint->pending_get_frags);

    MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(&endpoint->pending_put_frags);
    OBJ_DESTRUCT(&endpoint->pending_put_frags);
}
/*
 * Start a connection to the endpoint. This will likely not complete,
 * as the socket is set to non-blocking, so register for event
 * notification of connect completion. On connection we send
 * our globally unique process identifier to the endpoint and wait for
 * the endpoint's response.
 */
static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpoint)
{
    int rc, flags;
    struct sockaddr_storage endpoint_addr;
    /* By default consider an IPv4 connection */
    uint16_t af_family = AF_INET;
    opal_socklen_t addrlen = sizeof(struct sockaddr_in);

#if OPAL_WANT_IPV6
    if (AF_INET6 == btl_endpoint->endpoint_addr->addr_family) {
        af_family = AF_INET6;
        addrlen = sizeof(struct sockaddr_in6);
    }
#endif

    btl_endpoint->endpoint_sd = socket(af_family, SOCK_STREAM, 0);
    if (btl_endpoint->endpoint_sd < 0) {
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* setup socket buffer sizes */
    mca_btl_tcp_set_socket_options(btl_endpoint->endpoint_sd);

    /* setup event callbacks */
    mca_btl_tcp_endpoint_event_init(btl_endpoint);

    /* setup the socket as non-blocking */
    if ((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    } else {
        flags |= O_NONBLOCK;
        if (fcntl(btl_endpoint->endpoint_sd, F_SETFL, flags) < 0)
            BTL_ERROR(("fcntl(F_SETFL) failed: %s (%d)",
                       strerror(opal_socket_errno), opal_socket_errno));
    }

    /* start the connect - will likely fail with EINPROGRESS */
    mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

    opal_output_verbose(20, mca_btl_base_output,
                        "btl: tcp: attempting to connect() to %s address %s on port %d",
                        ORTE_NAME_PRINT(&btl_endpoint->endpoint_proc->proc_ompi->proc_name),
                        opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
                        ntohs(btl_endpoint->endpoint_addr->addr_port));

    if (connect(btl_endpoint->endpoint_sd,
                (struct sockaddr*)&endpoint_addr, addrlen) < 0) {
        /* non-blocking so wait for completion */
        if (opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
            btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
            opal_event_add(&btl_endpoint->endpoint_send_event, 0);
            return OMPI_SUCCESS;
        }
        {
            char *address;
            address = opal_net_get_hostname((struct sockaddr*) &endpoint_addr);
            BTL_PEER_ERROR( btl_endpoint->endpoint_proc->proc_ompi,
                          ( "Unable to connect to the peer %s on port %d: %s\n",
                            address,
                            btl_endpoint->endpoint_addr->addr_port,
                            strerror(opal_socket_errno) ) );
        }
        mca_btl_tcp_endpoint_close(btl_endpoint);
        btl_endpoint->endpoint_retries++;
        return OMPI_ERR_UNREACH;
    }

    /* send our globally unique process identifier to the endpoint */
    if ((rc = mca_btl_tcp_endpoint_send_connect_ack(btl_endpoint)) == OMPI_SUCCESS) {
        btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECT_ACK;
        opal_event_add(&btl_endpoint->endpoint_recv_event, 0);
    } else {
        mca_btl_tcp_endpoint_close(btl_endpoint);
    }
    return rc;
}
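/*
 * Illustrative sketch (not part of the original source) of the non-blocking
 * connect pattern used above: set O_NONBLOCK, call connect(), and treat
 * EINPROGRESS as "register for writability and finish the handshake later".
 * The helper name is hypothetical.
 */
#include <fcntl.h>
#include <errno.h>
#include <sys/socket.h>

static int example_nonblocking_connect(int sd, const struct sockaddr *addr,
                                       socklen_t addrlen)
{
    int flags = fcntl(sd, F_GETFL, 0);
    if (flags < 0 || fcntl(sd, F_SETFL, flags | O_NONBLOCK) < 0) {
        return -1;
    }
    if (connect(sd, addr, addrlen) < 0) {
        if (EINPROGRESS == errno || EWOULDBLOCK == errno) {
            return 0;   /* in progress: poll for POLLOUT, then check SO_ERROR */
        }
        return -1;      /* immediate failure */
    }
    return 1;           /* connected immediately (e.g., loopback) */
}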
/* This async event thread handles all async events of
 * all btls/devices in the openib component */
void* btl_openib_async_thread(void *async)
{
    int rc;
    int i;
    struct mca_btl_openib_async_poll devices_poll;

    if (OMPI_SUCCESS != btl_openib_async_poll_init(&devices_poll)) {
        BTL_ERROR(("Fatal error, stopping async event thread"));
        pthread_exit(&return_status);
    }

    while(1) {
        rc = poll(devices_poll.async_pollfd, devices_poll.active_poll_size, -1);
        if (rc < 0) {
            if (errno != EINTR) {
                BTL_ERROR(("Poll failed. Fatal error, stopping async event thread"));
                pthread_exit(&return_status);
            } else {
                /* EINTR - we got an interrupt */
                continue;
            }
        }

        for (i = 0; i < devices_poll.active_poll_size; i++) {
            switch (devices_poll.async_pollfd[i].revents) {
            case 0:
                /* no events */
                break;
            case POLLIN:
#if defined(__SVR4) && defined(__sun)
            /*
             * Need a workaround for Solaris IB user verbs since
             * "Poll on IB async fd returns POLLRDNORM revent even though it is masked out"
             */
            case POLLIN | POLLRDNORM:
#endif
                /* Process our event */
                if (0 == i) {
                    /* Poll index 0 is used for communication with the main thread */
                    if (OMPI_SUCCESS != btl_openib_async_commandh(&devices_poll)) {
                        free(devices_poll.async_pollfd);
                        BTL_ERROR(("Failed to process async thread command. "
                                   "Fatal error, stopping async event thread"));
                        pthread_exit(&return_status);
                    }
                } else {
                    /* We got a device event */
                    if (btl_openib_async_deviceh(&devices_poll, i)) {
                        free(devices_poll.async_pollfd);
                        BTL_ERROR(("Failed to process async device event. "
                                   "Fatal error, stopping async event thread"));
                        pthread_exit(&return_status);
                    }
                }
                break;
            default:
                /* Got an event other than POLLIN;
                 * this case should never happen */
                BTL_ERROR(("Got unexpected event %d. "
                           "Fatal error, stopping async event thread",
                           devices_poll.async_pollfd[i].revents));
                free(devices_poll.async_pollfd);
                pthread_exit(&return_status);
            }
        }
    }
    return PTHREAD_CANCELED;
}
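/*
 * Illustrative sketch (not part of the original source): slot 0 of the pollfd
 * array above is a command channel from the main thread, a variant of the
 * classic self-pipe pattern. A minimal, self-contained version with
 * hypothetical fd names:
 */
#include <poll.h>
#include <errno.h>

static int example_wait_command_or_event(int cmd_pipe_rd, int device_fd)
{
    struct pollfd fds[2] = {
        { .fd = cmd_pipe_rd, .events = POLLIN },  /* commands from main thread */
        { .fd = device_fd,   .events = POLLIN },  /* device async events */
    };
    int rc;
    do {
        rc = poll(fds, 2, -1);          /* block until something is readable */
    } while (rc < 0 && EINTR == errno); /* retry if interrupted by a signal */
    if (rc < 0) return -1;
    return (fds[0].revents & POLLIN) ? 0 /* command */ : 1 /* device event */;
}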
bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
{
    int cnt, dont_copy_data = 0;
    size_t i, num_vecs;
    mca_btl_base_endpoint_t* btl_endpoint = frag->endpoint;

 repeat:
    num_vecs = frag->iov_cnt;
#if MCA_BTL_TCP_ENDPOINT_CACHE
    if( 0 != btl_endpoint->endpoint_cache_length ) {
        size_t length;
        /* It looks strange at first, but cnt has to be set to the full amount of
         * data available. After going to advance_iov_position we will use cnt to
         * detect if there is still some data pending. */
        cnt = length = btl_endpoint->endpoint_cache_length;
        for( i = 0; i < frag->iov_cnt; i++ ) {
            if( length > frag->iov_ptr[i].iov_len )
                length = frag->iov_ptr[i].iov_len;
            if( (0 == dont_copy_data) || (length < frag->iov_ptr[i].iov_len) ) {
                memcpy( frag->iov_ptr[i].iov_base,
                        btl_endpoint->endpoint_cache_pos, length );
            } else {
                frag->segments[0].seg_addr.pval = btl_endpoint->endpoint_cache_pos;
                frag->iov_ptr[i].iov_base = btl_endpoint->endpoint_cache_pos;
            }
            btl_endpoint->endpoint_cache_pos += length;
            btl_endpoint->endpoint_cache_length -= length;
            length = btl_endpoint->endpoint_cache_length;
            if( 0 == length ) {
                btl_endpoint->endpoint_cache_pos = btl_endpoint->endpoint_cache;
                break;
            }
        }
        goto advance_iov_position;
    }

    /* What happens if all iovecs are used by the fragment? It still works, as we
     * reserve one iovec for the caching in the fragment structure (the +1). */
    frag->iov_ptr[num_vecs].iov_base = btl_endpoint->endpoint_cache_pos;
    frag->iov_ptr[num_vecs].iov_len  =
        mca_btl_tcp_component.tcp_endpoint_cache - btl_endpoint->endpoint_cache_length;
    num_vecs++;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* non-blocking read, but continue if interrupted */
    cnt = -1;
    while( cnt < 0 ) {
        cnt = readv(sd, frag->iov_ptr, num_vecs);
        if( 0 < cnt ) goto advance_iov_position;
        if( cnt == 0 ) {
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        }
        switch(opal_socket_errno) {
        case EINTR:
            continue;
        case EWOULDBLOCK:
            return false;
        case EFAULT:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %d)\n\t%s(%d)\n",
                       frag->iov_ptr[0].iov_base, frag->iov_ptr[0].iov_len,
                       strerror(opal_socket_errno), frag->iov_cnt));
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        default:
            BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
                       strerror(opal_socket_errno), opal_socket_errno));
            mca_btl_tcp_endpoint_close(btl_endpoint);
            return false;
        }
    }

 advance_iov_position:
    /* if the read didn't complete - update the iovec state */
    num_vecs = frag->iov_cnt;
    for( i = 0; i < num_vecs; i++ ) {
        if( cnt < (int)frag->iov_ptr->iov_len ) {
            frag->iov_ptr->iov_base = (ompi_iov_base_ptr_t)
                (((unsigned char*)frag->iov_ptr->iov_base) + cnt);
            frag->iov_ptr->iov_len -= cnt;
            cnt = 0;
            break;
        }
        cnt -= frag->iov_ptr->iov_len;
        frag->iov_idx++;
        frag->iov_ptr++;
        frag->iov_cnt--;
    }
#if MCA_BTL_TCP_ENDPOINT_CACHE
    btl_endpoint->endpoint_cache_length = cnt;
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */

    /* read header */
    if( frag->iov_cnt == 0 ) {
        if (btl_endpoint->endpoint_nbo && frag->iov_idx == 1)
            MCA_BTL_TCP_HDR_NTOH(frag->hdr);
        switch(frag->hdr.type) {
        case MCA_BTL_TCP_HDR_TYPE_SEND:
            if( frag->iov_idx == 1 && frag->hdr.size ) {
                frag->segments[0].seg_addr.pval = frag+1;
                frag->segments[0].seg_len = frag->hdr.size;
                frag->iov[1].iov_base = (IOVBASE_TYPE*)(frag->segments[0].seg_addr.pval);
                frag->iov[1].iov_len = frag->hdr.size;
                frag->iov_cnt++;
#ifndef __sparc
                /* The following cannot be done for sparc code
                 * because it causes alignment errors when accessing
                 * structures later on in the btl and pml code. */
                dont_copy_data = 1;
#endif
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_PUT:
            if( frag->iov_idx == 1 ) {
                frag->iov[1].iov_base = (IOVBASE_TYPE*)frag->segments;
                frag->iov[1].iov_len = frag->hdr.count * sizeof(mca_btl_base_segment_t);
                frag->iov_cnt++;
                goto repeat;
            } else if( frag->iov_idx == 2 ) {
                for( i = 0; i < frag->hdr.count; i++ ) {
                    frag->iov[i+2].iov_base =
                        (IOVBASE_TYPE*)ompi_ptr_ltop(frag->segments[i].seg_addr.lval);
                    frag->iov[i+2].iov_len = frag->segments[i].seg_len;
                }
                frag->iov_cnt += frag->hdr.count;
                goto repeat;
            }
            break;
        case MCA_BTL_TCP_HDR_TYPE_GET:
        default:
            break;
        }
        return true;
    }
    return false;
}
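/*
 * Illustrative sketch (not part of the original source) of the iovec-advance
 * arithmetic used in advance_iov_position above: consume `cnt` bytes from the
 * front of an iovec array, trimming the first partially-consumed entry.
 */
#include <sys/uio.h>
#include <stddef.h>

static void example_advance_iovecs(struct iovec **iov, size_t *iov_cnt, size_t cnt)
{
    while (*iov_cnt > 0 && cnt >= (*iov)[0].iov_len) {
        cnt -= (*iov)[0].iov_len;   /* this entry was fully consumed */
        (*iov)++;
        (*iov_cnt)--;
    }
    if (*iov_cnt > 0 && cnt > 0) {  /* partially consumed entry: trim the front */
        (*iov)[0].iov_base = (char *)(*iov)[0].iov_base + cnt;
        (*iov)[0].iov_len -= cnt;
    }
}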
static inline int mca_btl_ugni_progress_datagram (mca_btl_ugni_module_t *ugni_module)
{
    uint64_t datagram_id, data, proc_id;
    uint32_t remote_addr, remote_id;
    mca_btl_base_endpoint_t *ep;
    gni_post_state_t post_state;
    gni_ep_handle_t handle;
    gni_return_t grc;
    int count = 0, rc;

    /* check for datagram completion */
    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);  /* TODO: may not need lock for this function */
    grc = GNI_PostDataProbeById (ugni_module->device->dev_handle, &datagram_id);
    if (OPAL_LIKELY(GNI_RC_SUCCESS != grc)) {
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    data = datagram_id & ~(MCA_BTL_UGNI_DATAGRAM_MASK);

    BTL_VERBOSE(("datagram_id: %" PRIx64 ", mask: %" PRIx64, datagram_id,
                 (uint64_t) (datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK)));

    if ((datagram_id & MCA_BTL_UGNI_DATAGRAM_MASK) == MCA_BTL_UGNI_CONNECT_DIRECTED_ID) {
        ep = (mca_btl_base_endpoint_t *) opal_pointer_array_get_item (&ugni_module->endpoints, data);
        handle = ep->smsg_ep_handle;
    } else {
        handle = ugni_module->wildcard_ep;
    }

    /* wait for the incoming datagram to complete (in case it isn't) */
    grc = GNI_EpPostDataWaitById (handle, datagram_id, -1, &post_state,
                                  &remote_addr, &remote_id);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (GNI_RC_SUCCESS != grc) {
        BTL_ERROR(("GNI_EpPostDataWaitById failed with rc = %d", grc));
        return opal_common_rc_ugni_to_opal (grc);
    }

    /* if this is a wildcard endpoint look up the remote peer by the proc id we received */
    if (handle == ugni_module->wildcard_ep) {
        proc_id = mca_btl_ugni_proc_name_to_id (ugni_module->wc_remote_attr.proc_name);

        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc id: %" PRIx64,
                     proc_id));

        OPAL_THREAD_LOCK(&ugni_module->endpoint_lock);
        rc = opal_hash_table_get_value_uint64 (&ugni_module->id_to_endpoint, proc_id, (void **) &ep);
        OPAL_THREAD_UNLOCK(&ugni_module->endpoint_lock);

        /* check if the endpoint is known */
        if (OPAL_UNLIKELY(OPAL_SUCCESS != rc || NULL == ep)) {
            struct opal_proc_t *remote_proc =
                opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);
            BTL_VERBOSE(("Got connection request from an unknown peer {jobid = 0x%x, vid = 0x%x}",
                         ugni_module->wc_remote_attr.proc_name.jobid,
                         ugni_module->wc_remote_attr.proc_name.vpid));
            ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
            if (OPAL_UNLIKELY(NULL == ep)) {
                return rc;
            }
        }
    } else {
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
    }

    /* should not have gotten a NULL endpoint */
    assert (NULL != ep);

    BTL_VERBOSE(("got a datagram completion: id = %" PRIx64 ", state = %d, "
                 "data = 0x%" PRIx64 ", ep = %p, remote id: %d",
                 datagram_id, post_state, data, (void *) ep, remote_id));

    /* NTH: TODO -- error handling */
    opal_mutex_lock (&ep->lock);
    if (handle != ugni_module->wildcard_ep) {
        /* directed post complete */
        ep->dg_posted = false;
    }

    (void) mca_btl_ugni_ep_connect_progress (ep);
    opal_mutex_unlock (&ep->lock);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        /* process messages waiting in the endpoint's smsg mailbox */
        count = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if (handle == ugni_module->wildcard_ep) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return count;
}
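/*
 * Illustrative sketch (not part of the original source): the datagram id above
 * multiplexes two values in one 64-bit word. The bits selected by
 * MCA_BTL_UGNI_DATAGRAM_MASK tag the datagram type (directed vs. wildcard) and
 * the remaining low bits carry the endpoint index. The mask values below are
 * hypothetical, chosen only to demonstrate the arithmetic.
 */
#include <stdint.h>

#define EXAMPLE_DATAGRAM_MASK        0xff00000000000000ull  /* hypothetical */
#define EXAMPLE_CONNECT_DIRECTED_ID  0x8000000000000000ull  /* hypothetical */

static uint64_t example_make_directed_id(uint64_t endpoint_index)
{
    return EXAMPLE_CONNECT_DIRECTED_ID | (endpoint_index & ~EXAMPLE_DATAGRAM_MASK);
}

static int example_is_directed(uint64_t datagram_id)
{
    return (datagram_id & EXAMPLE_DATAGRAM_MASK) == EXAMPLE_CONNECT_DIRECTED_ID;
}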
/* this function is called with endpoint->endpoint_lock held */
int mca_btl_openib_endpoint_post_send(mca_btl_openib_endpoint_t *endpoint,
                                      mca_btl_openib_send_frag_t *frag)
{
    mca_btl_openib_header_t *hdr = frag->hdr;
    mca_btl_base_descriptor_t *des = &to_base_frag(frag)->base;
    int qp, ib_rc;
    int32_t cm_return;
    bool do_rdma = false;
    size_t eager_limit;

    if(OPAL_LIKELY(des->order == MCA_BTL_NO_ORDER))
        des->order = frag->qp_idx;

    qp = des->order;

    if(acquire_wqe(endpoint, frag) != OMPI_SUCCESS)
        return OMPI_ERR_RESOURCE_BUSY;

    eager_limit = mca_btl_openib_component.eager_limit +
        sizeof(mca_btl_openib_header_coalesced_t) +
        sizeof(mca_btl_openib_control_header_t);
    if(des->des_src->seg_len + frag->coalesced_length <= eager_limit &&
            (des->des_flags & MCA_BTL_DES_FLAGS_PRIORITY)) {
        /* High priority frag. Try to send over eager RDMA */
        if(acquire_eager_rdma_send_credit(endpoint) == OMPI_SUCCESS)
            do_rdma = true;
    }

    if(!do_rdma && acquire_send_credit(endpoint, frag) != OMPI_SUCCESS) {
        qp_put_wqe(endpoint, qp);
        return OMPI_ERR_RESOURCE_BUSY;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits, hdr->credits);
    if(hdr->credits)
        hdr->credits |= BTL_OPENIB_RDMA_CREDITS_FLAG;

    if(!do_rdma) {
        if(BTL_OPENIB_QP_TYPE_PP(qp) && 0 == hdr->credits) {
            BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits,
                                   hdr->credits);
        }
    } else {
        hdr->credits |= (qp << 11);
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    /* cm_seen is only 8 bits wide, but cm_return is 32 bits */
    if(cm_return > 255) {
        hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        hdr->cm_seen = cm_return;
    }

    qp_reset_signal_count(endpoint, qp);
    ib_rc = post_send(endpoint, frag, do_rdma, 1);

    if(!ib_rc)
        return OMPI_SUCCESS;

    /* the post failed: roll the credit accounting back */
    if(endpoint->nbo)
        BTL_OPENIB_HEADER_NTOH(*hdr);

    if(BTL_OPENIB_IS_RDMA_CREDITS(hdr->credits)) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                          BTL_OPENIB_CREDITS(hdr->credits));
    }

    qp_put_wqe(endpoint, qp);

    if(do_rdma) {
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    } else {
        if(BTL_OPENIB_QP_TYPE_PP(qp)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                              hdr->credits);
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.sd_credits, 1);
        } else if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
            mca_btl_openib_module_t *openib_btl = endpoint->endpoint_btl;
            OPAL_THREAD_ADD32(&openib_btl->qps[qp].u.srq_qp.sd_credits, 1);
        }
    }
    BTL_ERROR(("error posting send request error %d: %s\n",
               ib_rc, strerror(ib_rc)));
    return OMPI_ERROR;
}
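/*
 * Illustrative sketch (not part of the original source) of how the credits
 * field above multiplexes several values. Judging from the code,
 * BTL_OPENIB_RDMA_CREDITS_FLAG marks the count as eager-RDMA credits, the low
 * bits hold the credit count, and for RDMA sends the QP index is packed in
 * starting at bit 11. The exact widths below are assumptions.
 */
#include <stdint.h>

#define EX_RDMA_CREDITS_FLAG  (1 << 15)        /* assumed flag bit */
#define EX_CREDITS(v)         ((v) & 0x07ff)   /* assumed: low 11 bits = count */
#define EX_CREDITS_QP(v)      (((v) >> 11) & 0x7) /* assumed: 3-bit QP index */

static uint16_t example_pack_rdma_credits(uint16_t count, int qp)
{
    return (uint16_t)(EX_RDMA_CREDITS_FLAG | (qp << 11) | EX_CREDITS(count));
}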
mca_btl_openib_proc_t* mca_btl_openib_proc_create(opal_proc_t* proc)
{
    mca_btl_openib_proc_t* module_proc = NULL;
    size_t msg_size;
    uint32_t size;
    int rc, i, j;
    void *message;
    char *offset;
    int modex_message_size;
    mca_btl_openib_modex_message_t dummy;

    /* Check if we have already created an IB proc
     * structure for this ompi process */
    module_proc = mca_btl_openib_proc_lookup_proc(proc);
    if (NULL != module_proc) {
        /* Gotcha! */
        return module_proc;
    }

    /* Oops! First time, gotta create a new IB proc
     * out of the opal_proc ... */
    module_proc = OBJ_NEW(mca_btl_openib_proc_t);
    /* Initialize number of peers */
    module_proc->proc_endpoint_count = 0;
    module_proc->proc_opal = proc;

    /* query for the peer address info */
    OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version,
                    proc, &message, &msg_size);
    if (OPAL_SUCCESS != rc) {
        BTL_ERROR(("[%s:%d] opal_modex_recv failed for peer %s",
                   __FILE__, __LINE__,
                   OPAL_NAME_PRINT(proc->proc_name)));
        OBJ_RELEASE(module_proc);
        return NULL;
    }
    if (0 == msg_size) {
        return NULL;
    }

    /* Message was packed in btl_openib_component.c; the format is
       listed in a comment in that file */
    modex_message_size = ((char *) &(dummy.end)) - ((char*) &dummy);

    /* Unpack the number of modules in the message */
    offset = (char *) message;
    unpack8(&offset, &(module_proc->proc_port_count));
    BTL_VERBOSE(("unpack: %d btls", module_proc->proc_port_count));
    if (module_proc->proc_port_count > 0) {
        module_proc->proc_ports = (mca_btl_openib_proc_modex_t *)
            malloc(sizeof(mca_btl_openib_proc_modex_t) *
                   module_proc->proc_port_count);
    } else {
        module_proc->proc_ports = NULL;
    }

    /* Loop over unpacking all the ports */
    for (i = 0; i < module_proc->proc_port_count; i++) {

        /* Unpack the modex comment message struct */
        size = modex_message_size;
        memcpy(&(module_proc->proc_ports[i].pm_port_info), offset, size);
#if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
        MCA_BTL_OPENIB_MODEX_MSG_NTOH(module_proc->proc_ports[i].pm_port_info);
#endif
        offset += size;
        BTL_VERBOSE(("unpacked btl %d: modex message, offset now %d",
                     i, (int)(offset-((char*)message))));

        /* Unpack the number of CPCs that follow */
        unpack8(&offset, &(module_proc->proc_ports[i].pm_cpc_data_count));
        BTL_VERBOSE(("unpacked btl %d: number of cpcs to follow %d (offset now %d)",
                     i, module_proc->proc_ports[i].pm_cpc_data_count,
                     (int)(offset-((char*)message))));
        module_proc->proc_ports[i].pm_cpc_data =
            (opal_btl_openib_connect_base_module_data_t *)
            calloc(module_proc->proc_ports[i].pm_cpc_data_count,
                   sizeof(opal_btl_openib_connect_base_module_data_t));
        if (NULL == module_proc->proc_ports[i].pm_cpc_data) {
            return NULL;
        }

        /* Unpack the CPCs */
        for (j = 0; j < module_proc->proc_ports[i].pm_cpc_data_count; ++j) {
            uint8_t u8;
            opal_btl_openib_connect_base_module_data_t *cpcd;
            cpcd = module_proc->proc_ports[i].pm_cpc_data + j;
            unpack8(&offset, &u8);
            BTL_VERBOSE(("unpacked btl %d: cpc %d: index %d (offset now %d)",
                         i, j, u8, (int)(offset-(char*)message)));
            cpcd->cbm_component =
                opal_btl_openib_connect_base_get_cpc_byindex(u8);
            BTL_VERBOSE(("unpacked btl %d: cpc %d: component %s",
                         i, j, cpcd->cbm_component->cbc_name));

            unpack8(&offset, &cpcd->cbm_priority);
            unpack8(&offset, &cpcd->cbm_modex_message_len);
            BTL_VERBOSE(("unpacked btl %d: cpc %d: priority %d, msg len %d (offset now %d)",
                         i, j, cpcd->cbm_priority, cpcd->cbm_modex_message_len,
                         (int)(offset-(char*)message)));
            if (cpcd->cbm_modex_message_len > 0) {
                cpcd->cbm_modex_message = malloc(cpcd->cbm_modex_message_len);
                if (NULL == cpcd->cbm_modex_message) {
                    BTL_ERROR(("Failed to malloc"));
                    return NULL;
                }
                memcpy(cpcd->cbm_modex_message, offset,
                       cpcd->cbm_modex_message_len);
                offset += cpcd->cbm_modex_message_len;
                BTL_VERBOSE(("unpacked btl %d: cpc %d: blob unpacked %d %x (offset now %d)",
                             i, j,
                             ((uint32_t*)cpcd->cbm_modex_message)[0],
                             ((uint32_t*)cpcd->cbm_modex_message)[1],
                             (int)(offset-((char*)message))));
            }
        }
    }

    if (0 == module_proc->proc_port_count) {
        module_proc->proc_endpoints = NULL;
    } else {
        module_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc(module_proc->proc_port_count *
                   sizeof(mca_btl_base_endpoint_t*));
    }
    if (NULL == module_proc->proc_endpoints) {
        OBJ_RELEASE(module_proc);
        return NULL;
    }

    BTL_VERBOSE(("unpacking done!"));
    return module_proc;
}
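/*
 * Illustrative sketch (not part of the original source): the unpack8()
 * helper used above presumably reads one byte from the modex blob and
 * advances the cursor, along these lines:
 */
#include <stdint.h>

static void example_unpack8(char **offset, uint8_t *value)
{
    *value = *(uint8_t *)*offset;  /* a single byte: no endianness concerns */
    *offset += 1;
}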
int mca_btl_ofi_context_progress(mca_btl_ofi_context_t *context)
{
    int ret = 0;
    int events_read;
    int events = 0;
    struct fi_cq_entry cq_entry[MCA_BTL_OFI_DEFAULT_MAX_CQE];
    struct fi_cq_err_entry cqerr = {0};

    mca_btl_ofi_completion_context_t *c_ctx;
    mca_btl_ofi_base_completion_t *comp;
    mca_btl_ofi_rdma_completion_t *rdma_comp;
    mca_btl_ofi_frag_completion_t *frag_comp;

    ret = fi_cq_read(context->cq, &cq_entry, mca_btl_ofi_component.num_cqe_read);

    if (0 < ret) {
        events_read = ret;
        for (int i = 0; i < events_read; i++) {
            if (NULL != cq_entry[i].op_context) {
                ++events;

                c_ctx = (mca_btl_ofi_completion_context_t*) cq_entry[i].op_context;

                /* We are casting to every type here just for simplicity. */
                comp = (mca_btl_ofi_base_completion_t*) c_ctx->comp;
                frag_comp = (mca_btl_ofi_frag_completion_t*) c_ctx->comp;
                rdma_comp = (mca_btl_ofi_rdma_completion_t*) c_ctx->comp;

                switch (comp->type) {
                case MCA_BTL_OFI_TYPE_GET:
                case MCA_BTL_OFI_TYPE_PUT:
                case MCA_BTL_OFI_TYPE_AOP:
                case MCA_BTL_OFI_TYPE_AFOP:
                case MCA_BTL_OFI_TYPE_CSWAP:
                    /* call the callback */
                    if (rdma_comp->cbfunc) {
                        rdma_comp->cbfunc (comp->btl, comp->endpoint,
                                           rdma_comp->local_address,
                                           rdma_comp->local_handle,
                                           rdma_comp->cbcontext, rdma_comp->cbdata,
                                           OPAL_SUCCESS);
                    }
                    MCA_BTL_OFI_NUM_RDMA_DEC((mca_btl_ofi_module_t*) comp->btl);
                    break;

                case MCA_BTL_OFI_TYPE_RECV:
                    mca_btl_ofi_recv_frag((mca_btl_ofi_module_t*) comp->btl,
                                          (mca_btl_ofi_endpoint_t*) comp->endpoint,
                                          context, frag_comp->frag);
                    break;

                case MCA_BTL_OFI_TYPE_SEND:
                    MCA_BTL_OFI_NUM_SEND_DEC((mca_btl_ofi_module_t*) comp->btl);
                    mca_btl_ofi_frag_complete(frag_comp->frag, OPAL_SUCCESS);
                    break;

                default:
                    /* catastrophic */
                    BTL_ERROR(("unknown completion type"));
                    MCA_BTL_OFI_ABORT();
                }

                /* return the completion handler */
                opal_free_list_return(comp->my_list, (opal_free_list_item_t*) comp);
            }
        }
    } else if (OPAL_UNLIKELY(ret == -FI_EAVAIL)) {
        ret = fi_cq_readerr(context->cq, &cqerr, 0);

        /* cq readerr failed!? */
        if (0 > ret) {
            BTL_ERROR(("%s:%d: Error returned from fi_cq_readerr: %s(%d)",
                       __FILE__, __LINE__, fi_strerror(-ret), ret));
        } else {
            BTL_ERROR(("fi_cq_readerr: (provider err_code = %d)\n",
                       cqerr.prov_errno));
        }
        MCA_BTL_OFI_ABORT();
    }
#ifdef FI_EINTR
    /* sometimes the sockets provider complains about an interrupt. We do nothing. */
    else if (OPAL_UNLIKELY(ret == -FI_EINTR)) {

    }
#endif
    /* If the error is not FI_EAGAIN, report the error and abort. */
    else if (OPAL_UNLIKELY(ret != -FI_EAGAIN)) {
        BTL_ERROR(("fi_cq_read returned error %d:%s", ret, fi_strerror(-ret)));
        MCA_BTL_OFI_ABORT();
    }

    return events;
}
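/*
 * Illustrative sketch (not part of the original source) of the completion
 * "downcast" pattern above: each specific completion type embeds the base
 * struct as its first member, so one pointer can be read through the base
 * type to dispatch on `type` before casting down. All names are hypothetical.
 */
typedef enum { EX_COMP_SEND, EX_COMP_RDMA } ex_comp_type_t;

typedef struct { ex_comp_type_t type; } ex_base_completion_t;

typedef struct {
    ex_base_completion_t base;   /* must be first: makes the cast valid */
    void (*cbfunc)(void *);
    void *cbdata;
} ex_rdma_completion_t;

static void ex_dispatch(void *comp_ptr)
{
    ex_base_completion_t *base = (ex_base_completion_t *) comp_ptr;
    if (EX_COMP_RDMA == base->type) {
        ex_rdma_completion_t *rdma = (ex_rdma_completion_t *) comp_ptr;
        if (rdma->cbfunc) rdma->cbfunc(rdma->cbdata);
    }
}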
static int init_ud_qp(struct ibv_context *context_arg,
                      struct mca_btl_openib_sa_qp_cache *cache)
{
    struct ibv_qp_init_attr iattr;
    struct ibv_qp_attr mattr;
    int rc;

    /* create cq */
    cache->cq = ibv_create_cq(cache->context, 4, NULL, NULL, 0);
    if (NULL == cache->cq) {
        BTL_ERROR(("error creating cq, errno says %s", strerror(errno)));
        opal_show_help("help-mpi-btl-openib.txt", "init-fail-create-q",
                       true, opal_process_info.nodename,
                       __FILE__, __LINE__, "ibv_create_cq",
                       strerror(errno), errno,
                       ibv_get_device_name(context_arg->device));
        return OPAL_ERROR;
    }

    /* create qp */
    memset(&iattr, 0, sizeof(iattr));
    iattr.send_cq = cache->cq;
    iattr.recv_cq = cache->cq;
    iattr.cap.max_send_wr = 1;
    iattr.cap.max_recv_wr = 1;
    iattr.cap.max_send_sge = 1;
    iattr.cap.max_recv_sge = 1;
    iattr.qp_type = IBV_QPT_UD;
    cache->qp = ibv_create_qp(cache->pd, &iattr);
    if (NULL == cache->qp) {
        BTL_ERROR(("error creating qp %s (%d)", strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_INIT */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_INIT;
    mattr.port_num = cache->port_num;
    mattr.qkey = ntohl(IB_QP1_WELL_KNOWN_Q_KEY);
    rc = ibv_modify_qp(cache->qp, &mattr,
                       IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_QKEY);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]",
                   cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTR */
    memset(&mattr, 0, sizeof(mattr));
    mattr.qp_state = IBV_QPS_RTR;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]",
                   cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }

    /* modify qp to IBV_QPS_RTS */
    mattr.qp_state = IBV_QPS_RTS;
    rc = ibv_modify_qp(cache->qp, &mattr, IBV_QP_STATE | IBV_QP_SQ_PSN);
    if (rc) {
        BTL_ERROR(("Error modifying QP[%x] to IBV_QPS_RTS errno says: %s [%d]",
                   cache->qp->qp_num, strerror(errno), errno));
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
int mca_btl_tcp_component_open(void)
{
    char* message;

#ifdef __WINDOWS__
    WSADATA win_sock_data;
    if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) {
        BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError()));
        return OMPI_ERROR;
    }
#endif

    /* initialize state */
    mca_btl_tcp_component.tcp_listen_sd = -1;
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_listen_sd = -1;
#endif
    mca_btl_tcp_component.tcp_num_btls = 0;
    mca_btl_tcp_component.tcp_addr_count = 0;
    mca_btl_tcp_component.tcp_btls = NULL;

    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t);
    OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t);
    opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256);

    /* register TCP component parameters */
    mca_btl_tcp_component.tcp_num_links =
        mca_btl_tcp_param_register_int("links", NULL, 1);
    mca_btl_tcp_component.tcp_if_include =
        mca_btl_tcp_param_register_string("if_include", NULL, "");
    mca_btl_tcp_component.tcp_if_exclude =
        mca_btl_tcp_param_register_string("if_exclude", NULL, "lo");
    mca_btl_tcp_component.tcp_free_list_num =
        mca_btl_tcp_param_register_int("free_list_num", NULL, 8);
    mca_btl_tcp_component.tcp_free_list_max =
        mca_btl_tcp_param_register_int("free_list_max", NULL, -1);
    mca_btl_tcp_component.tcp_free_list_inc =
        mca_btl_tcp_param_register_int("free_list_inc", NULL, 32);
    mca_btl_tcp_component.tcp_sndbuf =
        mca_btl_tcp_param_register_int("sndbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_rcvbuf =
        mca_btl_tcp_param_register_int("rcvbuf", NULL, 128*1024);
    mca_btl_tcp_component.tcp_endpoint_cache =
        mca_btl_tcp_param_register_int("endpoint_cache",
            "The size of the internal cache for each TCP connection. This cache is"
            " used to reduce the number of syscalls, by replacing them with memcpy."
            " Every read will read the expected data plus the amount of the"
            " endpoint_cache", 30*1024);
    mca_btl_tcp_component.tcp_use_nodelay =
        !mca_btl_tcp_param_register_int("use_nagle",
            "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)",
            0);
    mca_btl_tcp_component.tcp_port_min =
        mca_btl_tcp_param_register_int("port_min_v4",
            "The minimum port where the TCP BTL will try to bind (default 1024)",
            1024);
    if( mca_btl_tcp_component.tcp_port_min > USHRT_MAX ) {
        orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port",
                       true, "v4", orte_process_info.nodename,
                       mca_btl_tcp_component.tcp_port_min );
        mca_btl_tcp_component.tcp_port_min = 1024;
    }
    asprintf( &message,
              "The number of ports where the TCP BTL will try to bind (default %d)."
              " This parameter together with the port min, defines a range of ports"
              " where Open MPI will open sockets.",
              (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 );
    mca_btl_tcp_component.tcp_port_range =
        mca_btl_tcp_param_register_int("port_range_v4", message,
                                       (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1);
    free(message);
#if OPAL_WANT_IPV6
    mca_btl_tcp_component.tcp6_port_min =
        mca_btl_tcp_param_register_int("port_min_v6",
            "The minimum port where the TCP BTL will try to bind (default 1024)",
            1024);
    if( mca_btl_tcp_component.tcp6_port_min > USHRT_MAX ) {
        orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port",
                       true, "v6", orte_process_info.nodename,
                       mca_btl_tcp_component.tcp6_port_min );
        mca_btl_tcp_component.tcp6_port_min = 1024;
    }
    asprintf( &message,
              "The number of ports where the TCP BTL will try to bind (default %d)."
              " This parameter together with the port min, defines a range of ports"
              " where Open MPI will open sockets.",
              (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 );
    mca_btl_tcp_component.tcp6_port_range =
        mca_btl_tcp_param_register_int("port_range_v6", message,
                                       (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1);
    free(message);
#endif

    mca_btl_tcp_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100;
    mca_btl_tcp_module.super.btl_eager_limit = 64*1024;
    mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024;
    mca_btl_tcp_module.super.btl_max_send_size = 128*1024;
    mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024;
    mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX;
    mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0;
    mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT |
                                         MCA_BTL_FLAGS_SEND_INPLACE |
                                         MCA_BTL_FLAGS_NEED_CSUM |
                                         MCA_BTL_FLAGS_NEED_ACK |
                                         MCA_BTL_FLAGS_HETEROGENEOUS_RDMA;
    mca_btl_tcp_module.super.btl_bandwidth = 100;
    mca_btl_tcp_module.super.btl_latency = 100;
    mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version,
                                &mca_btl_tcp_module.super);

    mca_btl_tcp_component.tcp_disable_family =
        mca_btl_tcp_param_register_int("disable_family", NULL, 0);

    return OMPI_SUCCESS;
}
mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc)
{
    mca_btl_tcp_proc_t* btl_proc;
    size_t size;
    int rc;

    OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock);
    rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs,
                                   proc->proc_name, (void**)&btl_proc);
    if(OPAL_SUCCESS == rc) {
        OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);
        return btl_proc;
    }

    do {
        btl_proc = OBJ_NEW(mca_btl_tcp_proc_t);
        if(NULL == btl_proc) {
            rc = OPAL_ERR_OUT_OF_RESOURCE;
            break;
        }

        btl_proc->proc_opal = proc;
        OBJ_RETAIN(btl_proc->proc_opal);

        /* lookup tcp parameters exported by this proc */
        OPAL_MODEX_RECV(rc, &mca_btl_tcp_component.super.btl_version,
                        &proc->proc_name, (uint8_t**)&btl_proc->proc_addrs, &size);
        if(rc != OPAL_SUCCESS) {
            if(OPAL_ERR_NOT_FOUND != rc)
                BTL_ERROR(("opal_modex_recv: failed with return value=%d", rc));
            break;
        }

        if(0 != (size % sizeof(mca_btl_tcp_addr_t))) {
            BTL_ERROR(("opal_modex_recv: invalid size %lu: btl-size: %lu\n",
                       (unsigned long) size, (unsigned long)sizeof(mca_btl_tcp_addr_t)));
            rc = OPAL_ERROR;
            break;
        }
        btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp_addr_t);

        /* allocate space for the endpoint array - one for each exported address */
        btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
            malloc((1 + btl_proc->proc_addr_count) *
                   sizeof(mca_btl_base_endpoint_t*));
        if(NULL == btl_proc->proc_endpoints) {
            rc = OPAL_ERR_OUT_OF_RESOURCE;
            break;
        }

        if(NULL == mca_btl_tcp_component.tcp_local && (proc == opal_proc_local_get())) {
            mca_btl_tcp_component.tcp_local = btl_proc;
        }

        /* convert the OPAL addr_family field to OS constants,
         * so we can check for AF_INET (or AF_INET6) and don't have
         * to deal with byte ordering anymore.
         */
        for (unsigned int i = 0; i < btl_proc->proc_addr_count; i++) {
            if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET;
            }
#if OPAL_ENABLE_IPV6
            if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET6;
            }
#endif
        }
    } while (0);

    if (OPAL_SUCCESS == rc) {
        /* add to the hash table of all proc instances */
        opal_proc_table_set_value(&mca_btl_tcp_component.tcp_procs,
                                  proc->proc_name, btl_proc);
    } else {
        if (btl_proc) {
            OBJ_RELEASE(btl_proc);
            btl_proc = NULL;
        }
    }
    OPAL_THREAD_UNLOCK(&mca_btl_tcp_component.tcp_lock);

    return btl_proc;
}
/* Function to handle async thread commands */
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_poll,
                                     opal_list_t *ignore_qp_err_list)
{
    struct pollfd *async_pollfd_tmp;
    mca_btl_openib_async_cmd_t cmd;
    int fd, flags, j, ret;

    /* Got a command from the main thread */
    ret = read(devices_poll->async_pollfd[0].fd, &cmd,
               sizeof(mca_btl_openib_async_cmd_t));
    if (sizeof(mca_btl_openib_async_cmd_t) != ret) {
        BTL_ERROR(("Read failed [%d]", errno));
        return OPAL_ERROR;
    }
    BTL_VERBOSE(("Got cmd %d", cmd.a_cmd));
    if (OPENIB_ASYNC_CMD_FD_ADD == cmd.a_cmd) {
        fd = cmd.fd;
        BTL_VERBOSE(("Got fd %d", fd));
        BTL_VERBOSE(("Adding device [%d] to async event poll[%d]",
                     fd, devices_poll->active_poll_size));
        flags = fcntl(fd, F_GETFL);
        if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
            BTL_ERROR(("Failed to change file descriptor of async event"));
            return OPAL_ERROR;
        }
        if ((devices_poll->active_poll_size + 1) > devices_poll->poll_size) {
            /* grow the poll array by doubling it */
            devices_poll->poll_size += devices_poll->poll_size;
            async_pollfd_tmp = malloc(sizeof(struct pollfd) * devices_poll->poll_size);
            if (NULL == async_pollfd_tmp) {
                BTL_ERROR(("Failed malloc: %s:%d. "
                           "Fatal error, stopping async event thread",
                           __FILE__, __LINE__));
                return OPAL_ERROR;
            }
            memcpy(async_pollfd_tmp, devices_poll->async_pollfd,
                   sizeof(struct pollfd) * (devices_poll->active_poll_size));
            free(devices_poll->async_pollfd);
            devices_poll->async_pollfd = async_pollfd_tmp;
        }
        devices_poll->async_pollfd[devices_poll->active_poll_size].fd = fd;
        devices_poll->async_pollfd[devices_poll->active_poll_size].events = POLLIN;
        devices_poll->async_pollfd[devices_poll->active_poll_size].revents = 0;
        devices_poll->active_poll_size++;
        if (OPAL_SUCCESS != send_command_comp(fd)) {
            return OPAL_ERROR;
        }
    } else if (OPENIB_ASYNC_CMD_FD_REMOVE == cmd.a_cmd) {
        bool fd_found = false;
        fd = cmd.fd;
        BTL_VERBOSE(("Got fd %d", fd));
        /* Removing the device from the poll array */
        BTL_VERBOSE(("Removing device [%d] from async event poll [%d]",
                     fd, devices_poll->active_poll_size));
        if (devices_poll->active_poll_size > 1) {
            /* scan until the fd is found or the array is exhausted */
            for (j = 0; j < devices_poll->active_poll_size && !fd_found; j++) {
                if (devices_poll->async_pollfd[j].fd == fd) {
                    devices_poll->async_pollfd[j].fd =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].fd;
                    devices_poll->async_pollfd[j].events =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].events;
                    devices_poll->async_pollfd[j].revents =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].revents;
                    fd_found = true;
                }
            }
            if (!fd_found) {
                BTL_ERROR(("Requested FD[%d] was not found in poll array", fd));
                return OPAL_ERROR;
            }
        }
        devices_poll->active_poll_size--;
        if (OPAL_SUCCESS != send_command_comp(fd)) {
            return OPAL_ERROR;
        }
    } else if (OPENIB_ASYNC_IGNORE_QP_ERR == cmd.a_cmd) {
        mca_btl_openib_qp_list *new_qp;
        new_qp = OBJ_NEW(mca_btl_openib_qp_list);
        BTL_VERBOSE(("Ignore errors on QP %p", (void *)cmd.qp));
        new_qp->qp = cmd.qp;
        opal_list_append(ignore_qp_err_list, (opal_list_item_t *)new_qp);
        send_command_comp(OPENIB_ASYNC_IGNORE_QP_ERR);
    } else if (OPENIB_ASYNC_THREAD_EXIT == cmd.a_cmd) {
        /* Got the command to close the thread */
        opal_list_item_t *item;
        BTL_VERBOSE(("Async event thread exit"));
        free(devices_poll->async_pollfd);
        return_status = OPAL_SUCCESS;
        while ((item = opal_list_remove_first(ignore_qp_err_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(ignore_qp_err_list);
        pthread_exit(&return_status);
    }

    return OPAL_SUCCESS;
}
int mca_btl_ugni_progress_datagram (mca_btl_ugni_device_t *device)
{
    mca_btl_ugni_module_t *ugni_module = mca_btl_ugni_component.modules;
    mca_btl_base_endpoint_t *ep;
    gni_ep_handle_t handle;
    int count = 0, rc;

    rc = mca_btl_ugni_get_datagram (ugni_module, device, &handle, &ep);
    if (1 != rc) {
        return rc;
    }

    BTL_VERBOSE(("remote datagram completion on handle %p", (void*)handle));

    /* if this is a wildcard endpoint look up the remote peer by the proc id we received */
    if (handle == ugni_module->wildcard_ep) {
        struct opal_proc_t *remote_proc =
            opal_proc_for_name (ugni_module->wc_remote_attr.proc_name);

        BTL_VERBOSE(("received connection attempt on wildcard endpoint from proc: %s",
                     OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));

        ep = mca_btl_ugni_get_ep (&ugni_module->super, remote_proc);
        if (OPAL_UNLIKELY(NULL == ep)) {
            /* there is no way to recover from this error so just abort() */
            BTL_ERROR(("could not find/allocate a btl endpoint for peer %s",
                       OPAL_NAME_PRINT(ugni_module->wc_remote_attr.proc_name)));
            abort ();
            return OPAL_ERR_NOT_FOUND;
        }
    }

    /* should not have gotten a NULL endpoint */
    assert (NULL != ep);

    BTL_VERBOSE(("got a datagram completion: ep = %p. wc = %d",
                 (void *) ep, handle == ugni_module->wildcard_ep));

    /* NTH: TODO -- error handling */
    opal_mutex_lock (&ep->lock);
    if (handle != ugni_module->wildcard_ep) {
        /* directed post complete */
        BTL_VERBOSE(("directed datagram complete for endpoint %p", (void *) ep));
        ep->dg_posted = false;
        (void) opal_atomic_add_32 (&ugni_module->active_datagrams, -1);
    }

    (void) mca_btl_ugni_ep_connect_progress (ep);
    opal_mutex_unlock (&ep->lock);

    if (MCA_BTL_UGNI_EP_STATE_CONNECTED == ep->state) {
        /* process messages waiting in the endpoint's smsg mailbox */
        count = mca_btl_ugni_smsg_process (ep);
    }

    /* repost the wildcard datagram */
    if (handle == ugni_module->wildcard_ep) {
        mca_btl_ugni_wildcard_ep_post (ugni_module);
    }

    return count;
}
static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
{
    mca_btl_base_endpoint_t* btl_endpoint = (mca_btl_base_endpoint_t *)user;

    /* Make sure we don't have a race between a thread that removes the
     * recv event, and one event already scheduled. */
    if( sd != btl_endpoint->endpoint_sd )
        return;

    OPAL_THREAD_LOCK(&btl_endpoint->endpoint_recv_lock);
    switch(btl_endpoint->endpoint_state) {
    case MCA_BTL_TCP_CONNECT_ACK:
        {
            int rc = OMPI_ERROR;
            rc = mca_btl_tcp_endpoint_recv_connect_ack(btl_endpoint);
            if( OMPI_SUCCESS == rc ) {
                /* we are now connected. Start sending the data */
                OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
                mca_btl_tcp_endpoint_connected(btl_endpoint);
                OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
#if OPAL_ENABLE_DEBUG && WANT_PEER_DUMP
                mca_btl_tcp_endpoint_dump(btl_endpoint, "connected");
#endif
            }
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            return;
        }
    case MCA_BTL_TCP_CONNECTED:
        {
            mca_btl_tcp_frag_t* frag;

            frag = btl_endpoint->endpoint_recv_frag;
            if(NULL == frag) {
                int rc;
                if(mca_btl_tcp_module.super.btl_max_send_size >
                   mca_btl_tcp_module.super.btl_eager_limit) {
                    MCA_BTL_TCP_FRAG_ALLOC_MAX(frag, rc);
                } else {
                    MCA_BTL_TCP_FRAG_ALLOC_EAGER(frag, rc);
                }
                if(NULL == frag) {
                    OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
                    return;
                }
                MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
            }

#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
        data_still_pending_on_endpoint:
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
            /* check for completion of non-blocking recv on the current fragment */
            if(mca_btl_tcp_frag_recv(frag, btl_endpoint->endpoint_sd) == false) {
                btl_endpoint->endpoint_recv_frag = frag;
            } else {
                btl_endpoint->endpoint_recv_frag = NULL;
                if( MCA_BTL_TCP_HDR_TYPE_SEND == frag->hdr.type ) {
                    mca_btl_active_message_callback_t* reg;
                    reg = mca_btl_base_active_message_trigger + frag->hdr.base.tag;
                    reg->cbfunc(&frag->btl->super, frag->hdr.base.tag,
                                &frag->base, reg->cbdata);
                }
#if MCA_BTL_TCP_ENDPOINT_CACHE
                if( 0 != btl_endpoint->endpoint_cache_length ) {
                    /* If the cache still contains some data we can reuse the same
                     * fragment until we flush it completely. */
                    MCA_BTL_TCP_FRAG_INIT_DST(frag, btl_endpoint);
                    goto data_still_pending_on_endpoint;
                }
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
                MCA_BTL_TCP_FRAG_RETURN(frag);
            }
#if MCA_BTL_TCP_ENDPOINT_CACHE
            assert( 0 == btl_endpoint->endpoint_cache_length );
#endif  /* MCA_BTL_TCP_ENDPOINT_CACHE */
            OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
            break;
        }
    case MCA_BTL_TCP_CLOSED:
        /* This is a thread-safety issue. As multiple threads are allowed
         * to generate events (in the lib event) we end up with several
         * threads executing the receive callback, when we reach the end
         * of MPI_Finalize. The first one will close the connections,
         * and all others will complain. */
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        break;
    default:
        OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_recv_lock);
        BTL_ERROR(("invalid socket state(%d)", btl_endpoint->endpoint_state));
        mca_btl_tcp_endpoint_close(btl_endpoint);
        break;
    }
}
/*
 * called when the connect module has completed setup of an endpoint
 */
void mca_btl_openib_endpoint_connected(mca_btl_openib_endpoint_t *endpoint)
{
    opal_list_item_t *frag_item, *ep_item;
    mca_btl_openib_send_frag_t *frag;
    mca_btl_openib_endpoint_t *ep;
    bool master = false;

    opal_output(-1, "Now we are CONNECTED");
    if (MCA_BTL_XRC_ENABLED) {
        OPAL_THREAD_LOCK(&endpoint->ib_addr->addr_lock);
        if (MCA_BTL_IB_ADDR_CONNECTED == endpoint->ib_addr->status) {
            /* We are not the xrc master; set our qp pointer to the master qp */
            master = false;
        } else {
            /* I'm the master of the XRC connection */
            endpoint->ib_addr->status = MCA_BTL_IB_ADDR_CONNECTED;
            master = true;
        }
    }

    /* Run over all qps and load an alternative path */
#if OPAL_HAVE_THREADS
    if (APM_ENABLED) {
        int i;
        if (MCA_BTL_XRC_ENABLED) {
            if (master) {
                mca_btl_openib_load_apm(endpoint->ib_addr->qp->lcl_qp, endpoint);
            }
        } else {
            for(i = 0; i < mca_btl_openib_component.num_qps; i++) {
                mca_btl_openib_load_apm(endpoint->qps[i].qp->lcl_qp, endpoint);
            }
        }
    }
#endif

    endpoint->endpoint_state = MCA_BTL_IB_CONNECTED;
    endpoint->endpoint_btl->device->non_eager_rdma_endpoints++;

    /* The connection is correctly set up. Now we can decrease the event trigger. */
    opal_progress_event_users_decrement();

    if(MCA_BTL_XRC_ENABLED) {
        while(master && !opal_list_is_empty(&endpoint->ib_addr->pending_ep)) {
            ep_item = opal_list_remove_first(&endpoint->ib_addr->pending_ep);
            ep = (mca_btl_openib_endpoint_t *)ep_item;
            if (OMPI_SUCCESS !=
                ompi_btl_openib_connect_base_start(endpoint->endpoint_local_cpc, ep)) {
                BTL_ERROR(("Failed to connect pending endpoint\n"));
            }
        }
        OPAL_THREAD_UNLOCK(&endpoint->ib_addr->addr_lock);
    }

    /* Process pending packets on the endpoint:
     * while there are frags in the list, post them */
    while (!opal_list_is_empty(&(endpoint->pending_lazy_frags))) {
        frag_item = opal_list_remove_first(&(endpoint->pending_lazy_frags));
        frag = to_send_frag(frag_item);
        /* We need to post this one */
        if (OMPI_ERROR == mca_btl_openib_endpoint_post_send(endpoint, frag)) {
            BTL_ERROR(("Error posting send"));
        }
    }
    OPAL_THREAD_UNLOCK(&endpoint->endpoint_lock);

    /* if the upper layer called put or get before the connection moved to the
     * connected state then we restart them here */
    mca_btl_openib_frag_progress_pending_put_get(endpoint,
                                                 mca_btl_openib_component.rdma_qp);
}
static int get_pathrecord_info(struct mca_btl_openib_sa_qp_cache *cache,
                               ib_sa_mad_t *req_mad, ib_sa_mad_t *resp_mad,
                               struct ibv_send_wr *swr, uint16_t lid, uint16_t rem_lid)
{
    struct ibv_send_wr *bswr;
    struct ibv_wc wc;
    struct timeval get_sl_rec_last_sent, get_sl_rec_last_poll;
    struct ibv_recv_wr *brwr;
    int got_sl_value, get_sl_rec_retries, rc, ne, i;
    ib_path_rec_t *req_path_record = ib_sa_mad_get_payload_ptr(req_mad);
    ib_path_rec_t *resp_path_record = ib_sa_mad_get_payload_ptr(resp_mad);

    got_sl_value = 0;
    get_sl_rec_retries = 0;

    rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr);
    if (0 != rc) {
        BTL_ERROR(("error posting receive on QP [0x%x] rc says: %s [%d]",
                   cache->qp->qp_num, strerror(rc), rc));
        return OPAL_ERROR;
    }

    while (0 == got_sl_value) {
        rc = ibv_post_send(cache->qp, swr, &bswr);
        if (0 != rc) {
            BTL_ERROR(("error posting send on QP [0x%x] rc says: %s [%d]",
                       cache->qp->qp_num, strerror(rc), rc));
            return OPAL_ERROR;
        }
        gettimeofday(&get_sl_rec_last_sent, NULL);

        while (0 == got_sl_value) {
            ne = ibv_poll_cq(cache->cq, 1, &wc);
            if (ne > 0 && IBV_WC_RECV == wc.opcode) {
                /* We only care about the status of receive work requests. */
                /* If the status of the send work request was anything other */
                /* than success, we'll eventually retransmit, so ignore them. */
                if (0 == resp_mad->status &&
                    req_path_record->slid == htons(lid) &&
                    req_path_record->dlid == htons(rem_lid) &&
                    IBV_WC_SUCCESS == wc.status &&
                    wc.byte_len >= MAD_BLOCK_SIZE &&
                    resp_mad->trans_id == req_mad->trans_id) {
                    /* Everything matches, so we have the desired SL */
                    cache->sl_values[rem_lid] = ib_path_rec_sl(resp_path_record);
                    got_sl_value = 1;
                    break;
                }
                /* Probably bad status, unlikely bad lid match. We will */
                /* ignore the response and let it time out so that we do a */
                /* retry, but after a delay. Need to repost the receive WR. */
                rc = ibv_post_recv(cache->qp, &(cache->rwr), &brwr);
                if (0 != rc) {
                    BTL_ERROR(("error posting receive on QP [0x%x] rc says: %s [%d]",
                               cache->qp->qp_num, strerror(rc), rc));
                    return OPAL_ERROR;
                }
            } else if (0 == ne) {
                /* poll did not find anything */
                gettimeofday(&get_sl_rec_last_poll, NULL);
                i = get_sl_rec_last_poll.tv_sec - get_sl_rec_last_sent.tv_sec;
                i = (i * 1000000) +
                    get_sl_rec_last_poll.tv_usec - get_sl_rec_last_sent.tv_usec;
                if (i > GET_SL_REC_RETRIES_TIMEOUT_MS) {
                    get_sl_rec_retries++;
                    BTL_VERBOSE(("[%d/%d] retries to get PathRecord",
                                 get_sl_rec_retries, MAX_GET_SL_REC_RETRIES));
                    if (get_sl_rec_retries > MAX_GET_SL_REC_RETRIES) {
                        BTL_ERROR(("No response from SA after %d retries",
                                   MAX_GET_SL_REC_RETRIES));
                        return OPAL_ERROR;
                    }
                    /* Need to retransmit the request. We must make a new TID */
                    /* so the SM doesn't see it as the same request. */
                    req_mad->trans_id += hton64(1);
                    break;
                }
                usleep(100);  /* otherwise pause before polling again */
            } else if (ne < 0) {
                BTL_ERROR(("error polling CQ returned %d\n", ne));
                return OPAL_ERROR;
            }
        }
    }
    return 0;
}
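/*
 * Illustrative sketch (not part of the original source) of the elapsed-time
 * arithmetic used above: two gettimeofday() samples reduced to a delta in
 * microseconds, which is then compared against the retry timeout.
 */
#include <sys/time.h>

static long example_elapsed_usec(const struct timeval *start,
                                 const struct timeval *now)
{
    long sec = now->tv_sec - start->tv_sec;
    return sec * 1000000L + (now->tv_usec - start->tv_usec);
}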
void mca_btl_openib_endpoint_send_credits(mca_btl_openib_endpoint_t* endpoint,
                                          const int qp)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_openib_rdma_credits_header_t *credits_hdr;
    int rc;
    bool do_rdma = false;
    int32_t cm_return;

    frag = endpoint->qps[qp].credit_frag;

    if(OPAL_UNLIKELY(NULL == frag)) {
        frag = alloc_control_frag(openib_btl);
        frag->qp_idx = qp;
        endpoint->qps[qp].credit_frag = frag;
        /* set those once and forever */
        to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
        to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_credits;
        to_base_frag(frag)->base.des_cbdata = NULL;
        to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
        to_com_frag(frag)->endpoint = endpoint;
        frag->hdr->tag = MCA_BTL_TAG_BTL;
        to_base_frag(frag)->segment.base.seg_len =
            sizeof(mca_btl_openib_rdma_credits_header_t);
    }

    assert(frag->qp_idx == qp);
    credits_hdr = (mca_btl_openib_rdma_credits_header_t*)
        to_base_frag(frag)->segment.base.seg_addr.pval;
    if(OMPI_SUCCESS == acquire_eager_rdma_send_credit(endpoint)) {
        do_rdma = true;
    } else {
        if(OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, 1) >
                (mca_btl_openib_component.qp_infos[qp].u.pp_qp.rd_rsv - 1)) {
            OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);
            BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
            return;
        }
    }

    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.rd_credits,
                           frag->hdr->credits);

    frag->hdr->cm_seen = 0;
    BTL_OPENIB_GET_CREDITS(endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    if(cm_return > 255) {
        frag->hdr->cm_seen = 255;
        cm_return -= 255;
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_return, cm_return);
    } else {
        frag->hdr->cm_seen = cm_return;
    }

    BTL_OPENIB_GET_CREDITS(endpoint->eager_rdma_local.credits,
                           credits_hdr->rdma_credits);
    credits_hdr->qpn = qp;
    credits_hdr->control.type = MCA_BTL_OPENIB_CONTROL_CREDITS;

    if(endpoint->nbo)
        BTL_OPENIB_RDMA_CREDITS_HEADER_HTON(*credits_hdr);

    qp_reset_signal_count(endpoint, qp);
    if((rc = post_send(endpoint, frag, do_rdma, 1)) == 0)
        return;

    /* the post failed: undo the header conversion and credit accounting */
    if(endpoint->nbo) {
        BTL_OPENIB_HEADER_NTOH(*frag->hdr);
        BTL_OPENIB_RDMA_CREDITS_HEADER_NTOH(*credits_hdr);
    }
    BTL_OPENIB_CREDITS_SEND_UNLOCK(endpoint, qp);
    OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.rd_credits,
                      frag->hdr->credits);
    OPAL_THREAD_ADD32(&endpoint->eager_rdma_local.credits,
                      credits_hdr->rdma_credits);
    if(do_rdma)
        OPAL_THREAD_ADD32(&endpoint->eager_rdma_remote.tokens, 1);
    else
        OPAL_THREAD_ADD32(&endpoint->qps[qp].u.pp_qp.cm_sent, -1);

    BTL_ERROR(("error posting send request errno %d says %s",
               rc, strerror(errno)));
}
static int init_device(struct ibv_context *context_arg,
                       struct mca_btl_openib_sa_qp_cache *cache,
                       uint32_t port_num)
{
    struct ibv_ah_attr aattr;
    struct ibv_port_attr pattr;
    int rc;

    cache->context = ibv_open_device(context_arg->device);
    if (NULL == cache->context) {
        BTL_ERROR(("error obtaining device context for %s errno says %s",
                   ibv_get_device_name(context_arg->device), strerror(errno)));
        return OPAL_ERROR;
    }
    cache->device_name = strdup(ibv_get_device_name(cache->context->device));
    cache->port_num = port_num;

    /* init all sl_values to be SL_NOT_PRESENT */
    memset(&cache->sl_values, SL_NOT_PRESENT, sizeof(cache->sl_values));

    cache->next = sa_qp_cache;
    sa_qp_cache = cache;

    /* allocate the protection domain for the device */
    cache->pd = ibv_alloc_pd(cache->context);
    if (NULL == cache->pd) {
        BTL_ERROR(("error allocating protection domain for %s errno says %s",
                   ibv_get_device_name(context_arg->device), strerror(errno)));
        return OPAL_ERROR;
    }

    /* register the memory region */
    cache->mr = ibv_reg_mr(cache->pd, cache->send_recv_buffer,
                           sizeof(cache->send_recv_buffer),
                           IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE);
    if (NULL == cache->mr) {
        BTL_ERROR(("error registering memory region, errno says %s",
                   strerror(errno)));
        return OPAL_ERROR;
    }

    /* init the ud qp */
    rc = init_ud_qp(context_arg, cache);
    if (OPAL_ERROR == rc) {
        return OPAL_ERROR;
    }

    rc = ibv_query_port(cache->context, cache->port_num, &pattr);
    if (rc) {
        BTL_ERROR(("error getting port attributes for device %s "
                   "port number %d errno says %s",
                   ibv_get_device_name(context_arg->device),
                   cache->port_num, strerror(errno)));
        return OPAL_ERROR;
    }

    /* create the address handle, aimed at the subnet manager */
    memset(&aattr, 0, sizeof(aattr));
    aattr.dlid = pattr.sm_lid;
    aattr.sl = pattr.sm_sl;
    aattr.port_num = cache->port_num;
    cache->ah = ibv_create_ah(cache->pd, &aattr);
    if (NULL == cache->ah) {
        BTL_ERROR(("error creating address handle: %s", strerror(errno)));
        return OPAL_ERROR;
    }

    memset(&(cache->rwr), 0, sizeof(cache->rwr));
    cache->rwr.num_sge = 1;
    cache->rwr.sg_list = &(cache->rsge);
    memset(&(cache->rsge), 0, sizeof(cache->rsge));
    cache->rsge.addr = (uint64_t)(void *)
        (cache->send_recv_buffer + MAD_BLOCK_SIZE);
    cache->rsge.length = MAD_BLOCK_SIZE + 40;
    cache->rsge.lkey = cache->mr->lkey;

    return 0;
}
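/*
 * Note (added commentary, not in the original source): the extra 40 bytes in
 * rsge.length above leave room for the Global Routing Header (GRH) that the
 * HCA prepends to every message received on a UD QP, so the MAD payload
 * itself starts 40 bytes into the posted receive buffer.
 */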
mca_btl_tcp2_proc_t* mca_btl_tcp2_proc_create(ompi_proc_t* ompi_proc)
{
    int rc;
    size_t size;
    mca_btl_tcp2_proc_t* btl_proc;
    uint64_t hash = orte_util_hash_name(&ompi_proc->proc_name);

    OPAL_THREAD_LOCK(&mca_btl_tcp2_component.tcp_lock);
    rc = opal_hash_table_get_value_uint64(&mca_btl_tcp2_component.tcp_procs,
                                          hash, (void**)&btl_proc);
    if(OMPI_SUCCESS == rc) {
        OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);
        return btl_proc;
    }

    btl_proc = OBJ_NEW(mca_btl_tcp2_proc_t);
    if(NULL == btl_proc)
        return NULL;
    btl_proc->proc_ompi = ompi_proc;

    /* add to hash table of all proc instances */
    opal_hash_table_set_value_uint64(&mca_btl_tcp2_component.tcp_procs,
                                     hash, btl_proc);
    OPAL_THREAD_UNLOCK(&mca_btl_tcp2_component.tcp_lock);

    /* lookup tcp parameters exported by this proc */
    rc = ompi_modex_recv( &mca_btl_tcp2_component.super.btl_version,
                          ompi_proc,
                          (void**)&btl_proc->proc_addrs,
                          &size );
    if(rc != OMPI_SUCCESS) {
        BTL_ERROR(("mca_base_modex_recv: failed with return value=%d", rc));
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(0 != (size % sizeof(mca_btl_tcp2_addr_t))) {
        BTL_ERROR(("mca_base_modex_recv: invalid size %lu: btl-size: %lu\n",
                   (unsigned long) size, (unsigned long)sizeof(mca_btl_tcp2_addr_t)));
        return NULL;
    }
    btl_proc->proc_addr_count = size / sizeof(mca_btl_tcp2_addr_t);

    /* allocate space for the endpoint array - one for each exported address */
    btl_proc->proc_endpoints = (mca_btl_base_endpoint_t**)
        malloc((1 + btl_proc->proc_addr_count) *
               sizeof(mca_btl_base_endpoint_t*));
    if(NULL == btl_proc->proc_endpoints) {
        OBJ_RELEASE(btl_proc);
        return NULL;
    }
    if(NULL == mca_btl_tcp2_component.tcp_local && ompi_proc == ompi_proc_local()) {
        mca_btl_tcp2_component.tcp_local = btl_proc;
    }
    {
        /* convert the OMPI addr_family field to OS constants,
         * so we can check for AF_INET (or AF_INET6) and don't have
         * to deal with byte ordering anymore.
         */
        unsigned int i;
        for (i = 0; i < btl_proc->proc_addr_count; i++) {
            if (MCA_BTL_TCP_AF_INET == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET;
            }
#if OPAL_WANT_IPV6
            if (MCA_BTL_TCP_AF_INET6 == btl_proc->proc_addrs[i].addr_family) {
                btl_proc->proc_addrs[i].addr_family = AF_INET6;
            }
#endif
        }
    }
    return btl_proc;
}
/* This func. opens XRC domain */ int mca_btl_openib_open_xrc_domain(struct mca_btl_openib_device_t *device) { int len; char *xrc_file_name; const char *dev_name; #if OPAL_HAVE_CONNECTX_XRC_DOMAINS struct ibv_xrcd_init_attr xrcd_attr; #endif dev_name = ibv_get_device_name(device->ib_dev); len = asprintf(&xrc_file_name, "%s"OPAL_PATH_SEP"openib_xrc_domain_%s", opal_process_info.job_session_dir, dev_name); if (0 > len) { BTL_ERROR(("Failed to allocate memomry for XRC file name: %s\n", strerror(errno))); return OPAL_ERROR; } device->xrc_fd = open(xrc_file_name, O_CREAT, S_IWUSR|S_IRUSR); if (0 > device->xrc_fd) { BTL_ERROR(("Failed to open XRC domain file %s, errno says %s\n", xrc_file_name,strerror(errno))); free(xrc_file_name); return OPAL_ERROR; } #if OPAL_HAVE_CONNECTX_XRC_DOMAINS memset(&xrcd_attr, 0, sizeof xrcd_attr); xrcd_attr.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS; xrcd_attr.fd = device->xrc_fd; xrcd_attr.oflags = O_CREAT; device->xrcd = ibv_open_xrcd(device->ib_dev_context, &xrcd_attr); if (NULL == device->xrcd) { #else device->xrc_domain = ibv_open_xrc_domain(device->ib_dev_context, device->xrc_fd, O_CREAT); if (NULL == device->xrc_domain) { #endif BTL_ERROR(("Failed to open XRC domain\n")); close(device->xrc_fd); free(xrc_file_name); return OPAL_ERROR; } return OPAL_SUCCESS; } /* This func. closes XRC domain */ int mca_btl_openib_close_xrc_domain(struct mca_btl_openib_device_t *device) { #if OPAL_HAVE_CONNECTX_XRC_DOMAINS if (NULL == device->xrcd) { #else if (NULL == device->xrc_domain) { #endif /* No XRC domain, just exit */ return OPAL_SUCCESS; } #if OPAL_HAVE_CONNECTX_XRC_DOMAINS if (ibv_close_xrcd(device->xrcd)) { #else if (ibv_close_xrc_domain(device->xrc_domain)) { #endif BTL_ERROR(("Failed to close XRC domain, errno %d says %s\n", device->xrc_fd, strerror(errno))); return OPAL_ERROR; } /* do we need to check exit status */ if (close(device->xrc_fd)) { BTL_ERROR(("Failed to close XRC file descriptor, errno %d says %s\n", device->xrc_fd, strerror(errno))); return OPAL_ERROR; } return OPAL_SUCCESS; } static void ib_address_constructor(ib_address_t *ib_addr) { ib_addr->key = NULL; ib_addr->subnet_id = 0; ib_addr->lid = 0; ib_addr->status = MCA_BTL_IB_ADDR_CLOSED; ib_addr->qp = NULL; OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t); OBJ_CONSTRUCT(&ib_addr->pending_ep, opal_list_t); } static void ib_address_destructor(ib_address_t *ib_addr) { if (NULL != ib_addr->key) { free(ib_addr->key); } OBJ_DESTRUCT(&ib_addr->addr_lock); OBJ_DESTRUCT(&ib_addr->pending_ep); } static int ib_address_init(ib_address_t *ib_addr, uint16_t lid, uint64_t s_id, opal_jobid_t ep_jobid) { ib_addr->key = malloc(SIZE_OF3(s_id, lid, ep_jobid)); if (NULL == ib_addr->key) { BTL_ERROR(("Failed to allocate memory for key\n")); return OPAL_ERROR; } memset(ib_addr->key, 0, SIZE_OF3(s_id, lid, ep_jobid)); /* creating the key = lid + s_id + ep_jobid */ memcpy(ib_addr->key, &lid, sizeof(lid)); memcpy((void*)((char*)ib_addr->key + sizeof(lid)), &s_id, sizeof(s_id)); memcpy((void*)((char*)ib_addr->key + sizeof(lid) + sizeof(s_id)), &ep_jobid, sizeof(ep_jobid)); /* caching lid and subnet id */ ib_addr->subnet_id = s_id; ib_addr->lid = lid; return OPAL_SUCCESS; } /* Create new entry in hash table for subnet_id and lid, * update the endpoint pointer. 
/* Create a new entry in the hash table keyed by subnet_id, lid, and jobid,
 * and update the endpoint pointer. The table is protected by
 * mca_btl_openib_component.ib_lock, which this function takes internally. */
int mca_btl_openib_ib_address_add_new(uint16_t lid, uint64_t s_id,
                                      opal_jobid_t ep_jobid,
                                      mca_btl_openib_endpoint_t *ep)
{
    void *tmp;
    int ret = OPAL_SUCCESS;
    struct ib_address_t *ib_addr = OBJ_NEW(ib_address_t);

    if (NULL == ib_addr) {
        BTL_ERROR(("XRC Internal error. Failed to allocate ib_addr\n"));
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    ret = ib_address_init(ib_addr, lid, s_id, ep_jobid);
    if (OPAL_SUCCESS != ret) {
        BTL_ERROR(("XRC Internal error. Failed to init ib_addr\n"));
        OBJ_RELEASE(ib_addr);  /* heap object from OBJ_NEW: release, not destruct */
        return ret;
    }

    /* is it already in the table? */
    OPAL_THREAD_LOCK(&mca_btl_openib_component.ib_lock);
    if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(
                            &mca_btl_openib_component.ib_addr_table,
                            ib_addr->key,
                            SIZE_OF3(s_id, lid, ep_jobid), &tmp)) {
        /* It is a new one; put it in the table */
        ret = opal_hash_table_set_value_ptr(
                  &mca_btl_openib_component.ib_addr_table,
                  ib_addr->key,
                  SIZE_OF3(s_id, lid, ep_jobid), (void*)ib_addr);
        if (OPAL_SUCCESS != ret) {
            BTL_ERROR(("XRC Internal error."
                       " Failed to add element to mca_btl_openib_component.ib_addr_table\n"));
            OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
            OBJ_RELEASE(ib_addr);
            return ret;
        }
        /* update the endpoint with the pointer to the ib address */
        ep->ib_addr = ib_addr;
    } else {
        /* we already have this one in the table; just point the endpoint
         * at the existing entry and drop the duplicate we just built */
        ep->ib_addr = (ib_address_t *)tmp;
        assert(lid == ep->ib_addr->lid && s_id == ep->ib_addr->subnet_id);
        OBJ_RELEASE(ib_addr);
    }
    OPAL_THREAD_UNLOCK(&mca_btl_openib_component.ib_lock);
    return ret;
}
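/* Illustrative sketch (assumption, not from the source): binding a freshly
 * created endpoint to its shared ib_address entry during connection setup.
 * The helper name is hypothetical; OPAL_PROC_MY_NAME.jobid scopes the key
 * to the current job, matching the ep_jobid parameter above. */
static int example_bind_endpoint(mca_btl_openib_endpoint_t *ep,
                                 uint16_t lid, uint64_t subnet_id)
{
    return mca_btl_openib_ib_address_add_new(lid, subnet_id,
                                             OPAL_PROC_MY_NAME.jobid, ep);
}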
/**
 * This function is used to send a message to the remote side indicating
 * that the endpoint is broken and telling the remote side to bring its
 * endpoint down as well. This is needed because there are cases where
 * only one side of the connection determines that there was a problem.
 *
 * @param endpoint Pointer to endpoint with error
 * @param type Type of message to be sent, can be one of two types
 * @param index When sending an RDMA error message, index is non-zero
 */
static void mca_btl_openib_endpoint_notify(mca_btl_base_endpoint_t* endpoint,
                                           uint8_t type, int index)
{
    mca_btl_openib_module_t* openib_btl = endpoint->endpoint_btl;
    mca_btl_openib_module_t* newbtl = NULL;
    bool found = false;
    mca_btl_openib_broken_connection_header_t *bc_hdr;
    mca_btl_openib_send_control_frag_t* frag;
    mca_btl_base_endpoint_t* newep;
    int i, rc;
    opal_proc_t* remote_proc = endpoint->endpoint_proc->proc_opal;

    /* First, find a different BTL than the one that got the error to
     * send the message over. */
    for (i = 0; i < mca_btl_openib_component.ib_num_btls; i++) {
        if (mca_btl_openib_component.openib_btls[i] != openib_btl) {
            newbtl = mca_btl_openib_component.openib_btls[i];
            break;
        }
    }
    if (NULL == newbtl) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No BTL found");
        /* If we cannot find one, then just return. */
        return;
    }

    /* Now, find the endpoint associated with it. The device associated
     * with the BTL has the list of all the endpoints. */
    for (i = 0; i < opal_pointer_array_get_size(newbtl->device->endpoints); i++) {
        newep = (mca_btl_openib_endpoint_t*)
            opal_pointer_array_get_item(newbtl->device->endpoints, i);
        if (NULL == newep) {
            continue;
        }
        if (newep->endpoint_proc->proc_opal == remote_proc) {
            found = true;
            break;
        }
    }
    if (false == found) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No endpoint found");
        /* If we cannot find a match, then just return. */
        return;
    }

    frag = alloc_control_frag(newbtl);
    if (NULL == frag) {
        opal_output_verbose(20, mca_btl_openib_component.verbose_failover,
                            "IB: Endpoint Notify: No frag space");
        /* If no frag is available, then just return. */
        return;
    }

    to_base_frag(frag)->base.des_cbfunc = mca_btl_openib_endpoint_notify_cb;
    to_base_frag(frag)->base.des_cbdata = NULL;
    to_base_frag(frag)->base.des_flags |= MCA_BTL_DES_FLAGS_PRIORITY |
                                          MCA_BTL_DES_SEND_ALWAYS_CALLBACK;
    to_base_frag(frag)->base.order = mca_btl_openib_component.credits_qp;
    to_base_frag(frag)->segment.base.seg_len =
        sizeof(mca_btl_openib_broken_connection_header_t);
    to_com_frag(frag)->endpoint = newep;

    frag->hdr->tag = MCA_BTL_TAG_IB;
    bc_hdr = (mca_btl_openib_broken_connection_header_t*)
        to_base_frag(frag)->segment.base.seg_addr.pval;
    bc_hdr->control.type = type;
    bc_hdr->lid = endpoint->endpoint_btl->port_info.lid;
    bc_hdr->subnet_id = endpoint->endpoint_btl->port_info.subnet_id;
    bc_hdr->vpid = OPAL_PROC_MY_NAME.vpid;
    bc_hdr->index = index;

    if (newep->nbo) {
        BTL_OPENIB_BROKEN_CONNECTION_HEADER_HTON((*bc_hdr));
    }
    rc = mca_btl_openib_endpoint_send(newep, frag);
    if (OPAL_SUCCESS == rc || OPAL_ERR_RESOURCE_BUSY == rc) {
        return;
    }

    MCA_BTL_IB_FRAG_RETURN(frag);
    BTL_ERROR(("Error sending BROKEN CONNECTION buffer (%s)",
               strerror(errno)));
    return;
}
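/* For reference, the code above implies a broken-connection header shaped
 * roughly as sketched below. The field names are taken directly from the
 * accesses above; the field types are assumptions, and the authoritative
 * definition lives in the openib failover headers.
 *
 *     struct mca_btl_openib_broken_connection_header_t {
 *         mca_btl_openib_control_header_t control; // control.type = msg type
 *         uint16_t lid;        // LID of the failed port
 *         uint64_t subnet_id;  // subnet of the failed port
 *         uint32_t vpid;       // sender's vpid
 *         uint32_t index;      // RDMA index, non-zero for RDMA errors
 *     };
 */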
/* Function to handle async thread commands */
static int btl_openib_async_commandh(struct mca_btl_openib_async_poll *devices_poll)
{
    struct pollfd *async_pollfd_tmp;
    int fd, flags, j;

    /* Got command from main thread */
    if (read(devices_poll->async_pollfd[0].fd, &fd, sizeof(int)) < 0) {
        BTL_ERROR(("Read failed [%d]", errno));
        return OMPI_ERROR;
    }
    BTL_VERBOSE(("GOT event from -> %d", fd));

    if (fd > 0) {
        /* Positive fd: add the device to the async event poll set */
        BTL_VERBOSE(("Adding device [%d] to async event poll[%d]",
                     fd, devices_poll->active_poll_size));
        flags = fcntl(fd, F_GETFL);
        if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
            BTL_ERROR(("Failed to change file descriptor of async event"));
            return OMPI_ERROR;
        }
        if ((devices_poll->active_poll_size + 1) > devices_poll->poll_size) {
            /* grow the poll array by doubling it */
            devices_poll->poll_size += devices_poll->poll_size;
            async_pollfd_tmp = malloc(sizeof(struct pollfd) *
                                      devices_poll->poll_size);
            if (NULL == async_pollfd_tmp) {
                BTL_ERROR(("Failed malloc: %s:%d. "
                           "Fatal error, stopping async event thread",
                           __FILE__, __LINE__));
                return OMPI_ERROR;
            }
            memcpy(async_pollfd_tmp, devices_poll->async_pollfd,
                   sizeof(struct pollfd) * devices_poll->active_poll_size);
            free(devices_poll->async_pollfd);
            devices_poll->async_pollfd = async_pollfd_tmp;
        }
        devices_poll->async_pollfd[devices_poll->active_poll_size].fd = fd;
        devices_poll->async_pollfd[devices_poll->active_poll_size].events = POLLIN;
        devices_poll->async_pollfd[devices_poll->active_poll_size].revents = 0;
        devices_poll->active_poll_size++;
        if (OMPI_SUCCESS != send_command_comp(fd)) {
            return OMPI_ERROR;
        }
    } else if (fd < 0) {
        /* Negative fd: remove the device from the poll set */
        bool fd_found = false;

        fd = -fd;
        BTL_VERBOSE(("Removing device [%d] from async event poll [%d]",
                     fd, devices_poll->active_poll_size));
        if (devices_poll->active_poll_size > 1) {
            /* note: the original loop condition used ||, which could walk
             * past the end of the array when the fd was absent; && is the
             * intended termination condition */
            for (j = 0; j < devices_poll->active_poll_size && !fd_found; j++) {
                if (devices_poll->async_pollfd[j].fd == fd) {
                    /* overwrite this entry with the last active entry */
                    devices_poll->async_pollfd[j].fd =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].fd;
                    devices_poll->async_pollfd[j].events =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].events;
                    devices_poll->async_pollfd[j].revents =
                        devices_poll->async_pollfd[devices_poll->active_poll_size-1].revents;
                    fd_found = true;
                }
            }
            if (!fd_found) {
                BTL_ERROR(("Requested FD[%d] was not found in poll array", fd));
                return OMPI_ERROR;
            }
        }
        devices_poll->active_poll_size--;
        if (OMPI_SUCCESS != send_command_comp(-fd)) {
            return OMPI_ERROR;
        }
    } else {
        /* Got 0 - command to close the thread */
        BTL_VERBOSE(("Async event thread exit"));
        free(devices_poll->async_pollfd);
        return_status = OMPI_SUCCESS;
        pthread_exit(&return_status);
    }
    return OMPI_SUCCESS;
}
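/* Illustrative sketch (assumption): the command protocol handled above is
 * driven by the main thread writing a single int down the command pipe --
 * a positive fd to add a device, the negated fd to remove it, and 0 to
 * shut the thread down. The pipe-fd parameter name is hypothetical. */
static int example_send_async_command(int command_pipe_wr_fd, int fd)
{
    if (write(command_pipe_wr_fd, &fd, sizeof(int)) != sizeof(int)) {
        BTL_ERROR(("Failed to write async command [%d]", errno));
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}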
/*
 * Construct/destruct an endpoint structure.
 */
static void endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    int i;

    endpoint->endpoint_module = NULL;
    endpoint->endpoint_proc = NULL;
    endpoint->endpoint_proc_index = -1;
    endpoint->endpoint_exiting = false;
    endpoint->endpoint_connectivity_checked = false;

    for (i = 0; i < USNIC_NUM_CHANNELS; ++i) {
        endpoint->endpoint_remote_addr.qp_num[i] = 0;
    }
    endpoint->endpoint_remote_addr.gid.global.subnet_prefix = 0;
    endpoint->endpoint_remote_addr.gid.global.interface_id = 0;
    endpoint->endpoint_remote_ah = NULL;

    endpoint->endpoint_send_credits = 8;

    /* list of fragments queued to be sent */
    OBJ_CONSTRUCT(&endpoint->endpoint_frag_send_queue, opal_list_t);

    endpoint->endpoint_next_frag_id = 1;
    endpoint->endpoint_acktime = 0;

    /* endpoint starts not-ready-to-send */
    endpoint->endpoint_ready_to_send = 0;
    endpoint->endpoint_ack_needed = false;

    /* clear sent/received sequence number arrays */
    memset(endpoint->endpoint_sent_segs, 0,
           sizeof(endpoint->endpoint_sent_segs));
    memset(endpoint->endpoint_rcvd_segs, 0,
           sizeof(endpoint->endpoint_rcvd_segs));

    /*
     * Make a new OPAL hotel for this module.
     * A "hotel" is a construct used for triggering segment retransmission
     * due to timeout.
     */
    OBJ_CONSTRUCT(&endpoint->endpoint_hotel, opal_hotel_t);
    opal_hotel_init(&endpoint->endpoint_hotel,
                    WINDOW_SIZE,
                    mca_btl_usnic_component.retrans_timeout,
                    0,
                    ompi_btl_usnic_ack_timeout);

    /* Set up this endpoint's list links */
    OBJ_CONSTRUCT(&(endpoint->endpoint_ack_li), opal_list_item_t);
    OBJ_CONSTRUCT(&(endpoint->endpoint_endpoint_li), opal_list_item_t);

    /* fragment reassembly info */
    endpoint->endpoint_rx_frag_info =
        calloc(MAX_ACTIVE_FRAGS,
               sizeof(struct ompi_btl_usnic_rx_frag_info_t));
    if (OPAL_UNLIKELY(NULL == endpoint->endpoint_rx_frag_info)) {
        BTL_ERROR(("calloc returned NULL -- this should not happen!"));
        ompi_btl_usnic_exit(); /* Does not return */
    }
}
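/* Illustrative sketch (assumption): how a send path would typically use
 * the hotel constructed above -- check a segment in when it is posted and
 * check it out when its ACK arrives. If neither happens within
 * retrans_timeout, the eviction callback (ompi_btl_usnic_ack_timeout
 * above) fires and the segment is retransmitted. The occupant here is an
 * opaque pointer; the helper name is hypothetical. */
static int example_track_segment(mca_btl_base_endpoint_t *endpoint,
                                 void *segment, int *room_num)
{
    /* opal_hotel_checkin returns OPAL_SUCCESS, or an error when all
     * WINDOW_SIZE rooms are occupied (i.e. the send window is full) */
    return opal_hotel_checkin(&endpoint->endpoint_hotel, segment, room_num);
}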
/* Function to handle async device events */
static int btl_openib_async_deviceh(struct mca_btl_openib_async_poll *devices_poll,
                                    int index)
{
    int j;
    mca_btl_openib_device_t *device = NULL;
    struct ibv_async_event event;
    bool xrc_event = false;
    int event_type;

    /* We need to find the correct device to process this event */
    for (j = 0; j < mca_btl_openib_component.ib_num_btls; j++) {
        if (mca_btl_openib_component.openib_btls[j]->device->ib_dev_context->async_fd ==
            devices_poll->async_pollfd[index].fd) {
            device = mca_btl_openib_component.openib_btls[j]->device;
            break;
        }
    }
    if (NULL != device) {
        if (ibv_get_async_event((struct ibv_context *)device->ib_dev_context,
                                &event) < 0) {
            if (EWOULDBLOCK == errno) {
                /* No event found? It was already handled by someone else. */
                return OMPI_SUCCESS;
            } else {
                BTL_ERROR(("Failed to get async event"));
                return OMPI_ERROR;
            }
        }
        event_type = event.event_type;
#if HAVE_XRC
        /* Is it an XRC event? */
        if (IBV_XRC_QP_EVENT_FLAG & event.event_type) {
            xrc_event = true;
            /* Clear the XRC bit and handle as usual */
            event_type ^= IBV_XRC_QP_EVENT_FLAG;
        }
#endif
        switch (event_type) {
        case IBV_EVENT_PATH_MIG:
            BTL_ERROR(("Alternative path migration event reported"));
            if (APM_ENABLED) {
                BTL_ERROR(("Trying to find additional path..."));
                if (!xrc_event)
                    mca_btl_openib_load_apm(event.element.qp,
                                            qp2endpoint(event.element.qp, device));
#if HAVE_XRC
                else
                    mca_btl_openib_load_apm_xrc_rcv(event.element.xrc_qp_num,
                                                    xrc_qp2endpoint(event.element.xrc_qp_num, device));
#endif
            }
            break;
        case IBV_EVENT_DEVICE_FATAL:
            /* Set the flag to fatal */
            device->got_fatal_event = true;
            /* It is not critical to protect the counter */
            OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
            /* deliberate fall-through: a fatal device event is also
             * reported like the other error events below */
        case IBV_EVENT_CQ_ERR:
        case IBV_EVENT_QP_FATAL:
        case IBV_EVENT_QP_REQ_ERR:
        case IBV_EVENT_QP_ACCESS_ERR:
        case IBV_EVENT_PATH_MIG_ERR:
        case IBV_EVENT_SRQ_ERR:
            orte_show_help("help-mpi-btl-openib.txt", "of error event",
                           true, orte_process_info.nodename,
                           orte_process_info.pid, event_type,
                           openib_event_to_str((enum ibv_event_type)event_type),
                           xrc_event ? "true" : "false");
            break;
        case IBV_EVENT_PORT_ERR:
            orte_show_help("help-mpi-btl-openib.txt", "of error event",
                           true, orte_process_info.nodename,
                           orte_process_info.pid, event_type,
                           openib_event_to_str((enum ibv_event_type)event_type),
                           xrc_event ? "true" : "false");
            /* Set the flag to indicate a port error */
            device->got_port_event = true;
            OPAL_THREAD_ADD32(&mca_btl_openib_component.error_counter, 1);
            break;
        case IBV_EVENT_COMM_EST:
        case IBV_EVENT_PORT_ACTIVE:
        case IBV_EVENT_SQ_DRAINED:
        case IBV_EVENT_LID_CHANGE:
        case IBV_EVENT_PKEY_CHANGE:
        case IBV_EVENT_SM_CHANGE:
        case IBV_EVENT_QP_LAST_WQE_REACHED:
#if HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER
        case IBV_EVENT_CLIENT_REREGISTER:
#endif
            break;
        /* This event is signaled when the number of preposted receive WQEs
         * drops below the predefined threshold (srq_limit) */
        case IBV_EVENT_SRQ_LIMIT_REACHED:
            if (OMPI_SUCCESS !=
                btl_openib_async_srq_limit_event(event.element.srq)) {
                return OMPI_ERROR;
            }
            break;
        default:
            orte_show_help("help-mpi-btl-openib.txt", "of unknown event",
                           true, orte_process_info.nodename,
                           orte_process_info.pid, event_type,
                           xrc_event ? "true" : "false");
        }
        ibv_ack_async_event(&event);
    } else {
        /* We failed to locate the device for this fd; this should never
         * happen... */
        BTL_ERROR(("Failed to find device with FD %d. "
                   "Fatal error, stopping async event thread",
                   devices_poll->async_pollfd[index].fd));
        return OMPI_ERROR;
    }
    return OMPI_SUCCESS;
}
static inline int mca_btl_ugni_progress_rdma (mca_btl_ugni_module_t *ugni_module, int which_cq)
{
    mca_btl_ugni_post_descriptor_t *post_desc = NULL;
    gni_cq_entry_t event_data = 0;
    gni_post_descriptor_t *desc;
    uint32_t recoverable = 1;
    gni_return_t grc;
    gni_cq_handle_t the_cq;

    the_cq = (which_cq == 0) ? ugni_module->rdma_local_cq
                             : ugni_module->rdma_local_irq_cq;

    OPAL_THREAD_LOCK(&ugni_module->device->dev_lock);
    grc = GNI_CqGetEvent (the_cq, &event_data);
    if (GNI_RC_NOT_DONE == grc) {
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return 0;
    }

    if (OPAL_UNLIKELY((GNI_RC_SUCCESS != grc && !event_data) ||
                      GNI_CQ_OVERRUN(event_data))) {
        /* TODO -- need to handle overrun -- how do we do this without an
           event? will the event eventually come back? Ask Cray */
        BTL_ERROR(("unhandled post error! ugni rc = %d %s", grc,
                   gni_err_str[grc]));
        OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
        return opal_common_rc_ugni_to_opal (grc);
    }

    grc = GNI_GetCompleted (the_cq, event_data, &desc);
    OPAL_THREAD_UNLOCK(&ugni_module->device->dev_lock);
    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc && GNI_RC_TRANSACTION_ERROR != grc)) {
        BTL_ERROR(("Error in GNI_GetCompleted %s", gni_err_str[grc]));
        return opal_common_rc_ugni_to_opal (grc);
    }

    post_desc = MCA_BTL_UGNI_DESC_TO_PDESC(desc);

    if (OPAL_UNLIKELY(GNI_RC_SUCCESS != grc || !GNI_CQ_STATUS_OK(event_data))) {
        (void) GNI_CqErrorRecoverable (event_data, &recoverable);

        if (OPAL_UNLIKELY(++post_desc->desc.tries >=
                          mca_btl_ugni_component.rdma_max_retries ||
                          !recoverable)) {
            char char_buffer[1024];
            GNI_CqErrorStr (event_data, char_buffer, 1024);
            /* give up */
            BTL_ERROR(("giving up on descriptor %p, recoverable %d: %s",
                       (void *) post_desc, recoverable, char_buffer));
#if OPAL_ENABLE_DEBUG
            btl_ugni_dump_post_desc (post_desc);
#endif
            mca_btl_ugni_post_desc_complete (ugni_module, post_desc, OPAL_ERROR);

            return OPAL_ERROR;
        }

        /* the transaction is recoverable: repost it and try again */
        mca_btl_ugni_repost (ugni_module, post_desc);

        return 0;
    }

    mca_btl_ugni_post_desc_complete (ugni_module, post_desc,
                                     opal_common_rc_ugni_to_opal (grc));

    return 1;
}
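/* Illustrative sketch (assumption, not from the source): the function above
 * returns 1 when it retires a completion, 0 when the CQ is empty or a
 * descriptor was reposted, and a negative OPAL error code on failure. A
 * component progress loop would typically drain both local CQs like this;
 * the helper name is hypothetical. */
static int example_progress_all_rdma(mca_btl_ugni_module_t *ugni_module)
{
    int count = 0, rc, cq;

    for (cq = 0; cq < 2; ++cq) {
        /* keep draining until the CQ reports no more completions */
        while ((rc = mca_btl_ugni_progress_rdma(ugni_module, cq)) > 0) {
            count += rc;
        }
    }
    return count;
}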
static void mca_btl_tcp_endpoint_dump(mca_btl_base_endpoint_t* btl_endpoint,
                                      const char* msg)
{
    char src[64];
    char dst[64];
    int sndbuf, rcvbuf, nodelay, flags;
#if OPAL_WANT_IPV6
    struct sockaddr_storage inaddr;
#else
    struct sockaddr_in inaddr;
#endif
    opal_socklen_t obtlen;
    opal_socklen_t addrlen = sizeof(inaddr);

    getsockname(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen);
#if OPAL_WANT_IPV6
    {
        char *address;
        address = (char *) opal_net_get_hostname((struct sockaddr*) &inaddr);
        if (NULL != address) {
            /* snprintf: hostnames can exceed the 64-byte buffers */
            snprintf(src, sizeof(src), "%s", address);
        }
    }
#else
    snprintf(src, sizeof(src), "%s", inet_ntoa(inaddr.sin_addr));
#endif
    getpeername(btl_endpoint->endpoint_sd, (struct sockaddr*)&inaddr, &addrlen);
#if OPAL_WANT_IPV6
    {
        char *address;
        address = (char *) opal_net_get_hostname((struct sockaddr*) &inaddr);
        if (NULL != address) {
            snprintf(dst, sizeof(dst), "%s", address);
        }
    }
#else
    snprintf(dst, sizeof(dst), "%s", inet_ntoa(inaddr.sin_addr));
#endif

    if ((flags = fcntl(btl_endpoint->endpoint_sd, F_GETFL, 0)) < 0) {
        BTL_ERROR(("fcntl(F_GETFL) failed: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }

#if defined(SO_SNDBUF)
    obtlen = sizeof(sndbuf);
    if (getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_SNDBUF,
                   (char *)&sndbuf, &obtlen) < 0) {
        BTL_ERROR(("SO_SNDBUF option: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }
#else
    sndbuf = -1;
#endif
#if defined(SO_RCVBUF)
    obtlen = sizeof(rcvbuf);
    if (getsockopt(btl_endpoint->endpoint_sd, SOL_SOCKET, SO_RCVBUF,
                   (char *)&rcvbuf, &obtlen) < 0) {
        BTL_ERROR(("SO_RCVBUF option: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }
#else
    rcvbuf = -1;
#endif
#if defined(TCP_NODELAY)
    obtlen = sizeof(nodelay);
    if (getsockopt(btl_endpoint->endpoint_sd, IPPROTO_TCP, TCP_NODELAY,
                   (char *)&nodelay, &obtlen) < 0) {
        BTL_ERROR(("TCP_NODELAY option: %s (%d)",
                   strerror(opal_socket_errno), opal_socket_errno));
    }
#else
    nodelay = 0;
#endif

    BTL_VERBOSE(("%s: %s - %s nodelay %d sndbuf %d rcvbuf %d flags %08x",
                 msg, src, dst, nodelay, sndbuf, rcvbuf, flags));
}