int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs) { #if MXM_API < MXM_VERSION(2,0) ompi_mtl_mxm_ep_conn_info_t *ep_info; mxm_conn_req_t *conn_reqs; size_t ep_index = 0; #endif void *ep_address; size_t ep_address_len; mxm_error_t err; size_t i; int rc; mca_mtl_mxm_endpoint_t *endpoint; assert(mtl == &ompi_mtl_mxm.super); #if MXM_API < MXM_VERSION(2,0) /* Allocate connection requests */ conn_reqs = calloc(nprocs, sizeof(mxm_conn_req_t)); ep_info = calloc(nprocs, sizeof(ompi_mtl_mxm_ep_conn_info_t)); if (NULL == conn_reqs || NULL == ep_info) { rc = OMPI_ERR_OUT_OF_RESOURCE; goto bail; } #endif /* Get the EP connection requests for all the processes from modex */ for (i = 0; i < nprocs; ++i) { if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) { continue; /* already connected to this endpoint */ } rc = ompi_mtl_mxm_recv_ep_address(procs[i], &ep_address, &ep_address_len); if (rc != OMPI_SUCCESS) { goto bail; } #if MXM_API < MXM_VERSION(2,0) if (ep_address_len != sizeof(ep_info[i])) { MXM_ERROR("Invalid endpoint address length"); rc = OMPI_ERROR; goto bail; } memcpy(&ep_info[i], ep_address, ep_address_len); conn_reqs[ep_index].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SELF]); conn_reqs[ep_index].ptl_addr[MXM_PTL_SHM] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SHM]); conn_reqs[ep_index].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_RDMA]); ep_index++; #else endpoint = OBJ_NEW(mca_mtl_mxm_endpoint_t); endpoint->mtl_mxm_module = &ompi_mtl_mxm; err = mxm_ep_connect(ompi_mtl_mxm.ep, ep_address, &endpoint->mxm_conn); if (err != MXM_OK) { MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); rc = OMPI_ERROR; goto bail; } procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint; #endif free(ep_address); } #if MXM_API < MXM_VERSION(2,0) /* Connect to remote peers */ err = mxm_ep_connect(ompi_mtl_mxm.ep, conn_reqs, ep_index, -1); if (MXM_OK != err) { MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); for (i = 0; i < ep_index; ++i) { if (MXM_OK != conn_reqs[i].error) { MXM_ERROR("MXM EP connect to %s error: %s\n", (NULL == procs[i]->proc_hostname) ? "unknown" : procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error)); } } rc = OMPI_ERROR; goto bail; } /* Save returned connections */ for (i = 0; i < ep_index; ++i) { endpoint = OBJ_NEW(mca_mtl_mxm_endpoint_t); endpoint->mtl_mxm_module = &ompi_mtl_mxm; endpoint->mxm_conn = conn_reqs[i].conn; procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint; } #endif rc = OMPI_SUCCESS; bail: #if MXM_API < MXM_VERSION(2,0) free(conn_reqs); free(ep_info); #endif return rc; }
int ompi_mtl_mxm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs, /*const*/ struct mca_mtl_base_endpoint_t **mtl_peer_data) { #if MXM_API < MXM_VERSION(2,0) ompi_mtl_mxm_ep_conn_info_t *ep_info; mxm_conn_req_t *conn_reqs; int timeout; #endif void *ep_address; size_t ep_address_len; mxm_error_t err; size_t i; int rc; assert(mtl == &ompi_mtl_mxm.super); #if MXM_API < MXM_VERSION(2,0) /* Allocate connection requests */ conn_reqs = calloc(nprocs, sizeof(mxm_conn_req_t)); ep_info = calloc(nprocs, sizeof(ompi_mtl_mxm_ep_conn_info_t)); if (NULL == conn_reqs || NULL == ep_info) { rc = OMPI_ERR_OUT_OF_RESOURCE; goto bail; } #endif /* Get the EP connection requests for all the processes from modex */ for (i = 0; i < nprocs; ++i) { rc = ompi_mtl_mxm_recv_ep_address(procs[i], &ep_address, &ep_address_len); if (rc != OMPI_SUCCESS) { goto bail; } #if MXM_API < MXM_VERSION(2,0) if (ep_address_len != sizeof(ep_info[i])) { MXM_ERROR("Invalid endpoint address length"); rc = OMPI_ERROR; goto bail; } memcpy(&ep_info[i], ep_address, ep_address_len); conn_reqs[i].ptl_addr[MXM_PTL_SELF] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SELF]); conn_reqs[i].ptl_addr[MXM_PTL_SHM] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_SHM]); conn_reqs[i].ptl_addr[MXM_PTL_RDMA] = (struct sockaddr *)&(ep_info[i].ptl_addr[MXM_PTL_RDMA]); #else mtl_peer_data[i] = (mca_mtl_mxm_endpoint_t *) OBJ_NEW(mca_mtl_mxm_endpoint_t); mtl_peer_data[i]->mtl_mxm_module = &ompi_mtl_mxm; err = mxm_ep_connect(ompi_mtl_mxm.ep, ep_address, &mtl_peer_data[i]->mxm_conn); if (err != MXM_OK) { MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); rc = OMPI_ERROR; goto bail; } #endif free(ep_address); } #if MXM_API < MXM_VERSION(2,0) /* Connect to remote peers */ timeout = (mxm_get_version() < MXM_VERSION(1,5)) ? 1000 : -1; err = mxm_ep_connect(ompi_mtl_mxm.ep, conn_reqs, nprocs, timeout); if (MXM_OK != err) { MXM_ERROR("MXM returned connect error: %s\n", mxm_error_string(err)); for (i = 0; i < nprocs; ++i) { if (MXM_OK != conn_reqs[i].error) { MXM_ERROR("MXM EP connect to %s error: %s\n", procs[i]->proc_hostname, mxm_error_string(conn_reqs[i].error)); } } rc = OMPI_ERROR; goto bail; } /* Save returned connections */ for (i = 0; i < nprocs; ++i) { mtl_peer_data[i] = (mca_mtl_mxm_endpoint_t *) OBJ_NEW(mca_mtl_mxm_endpoint_t); mtl_peer_data[i]->mtl_mxm_module = &ompi_mtl_mxm; mtl_peer_data[i]->mxm_conn = conn_reqs[i].conn; } #endif rc = OMPI_SUCCESS; bail: #if MXM_API < MXM_VERSION(2,0) free(conn_reqs); free(ep_info); #endif return rc; }