/**
 * Create a UCX endpoint to every process in the job.
 *
 * Allocates the per-peer table mca_spml_ucx.ucp_peers, exchanges the local
 * worker address with all peers through oshmem_shmem_xchng(), and then calls
 * ucp_ep_create() once per peer, walking the peers starting at the caller's
 * own rank.  Each visited procs[] entry gets num_transports and
 * transport_ids filled in.
 *
 * On failure every resource acquired by this call is released and
 * mca_spml_ucx.ucp_peers is reset to NULL so that no dangling pointer is
 * left behind in the component state.
 *
 * @param procs   array of nprocs process descriptors to update
 * @param nprocs  number of processes
 *
 * @return OSHMEM_SUCCESS on success; OSHMEM_ERR_OUT_OF_RESOURCE on any
 *         failure (the specific cause is not propagated).
 */
int mca_spml_ucx_add_procs(oshmem_proc_t** procs, size_t nprocs)
{
    size_t i, n;
    int rc = OSHMEM_ERROR;
    int my_rank = oshmem_my_proc_id();
    ucs_status_t err;
    ucp_address_t *wk_local_addr = NULL;
    size_t wk_addr_len;
    int *wk_roffs = NULL;
    int *wk_rsizes = NULL;
    char *wk_raddrs = NULL;

    mca_spml_ucx.ucp_peers = (ucp_peer_t *) calloc(nprocs, sizeof(*(mca_spml_ucx.ucp_peers)));
    if (NULL == mca_spml_ucx.ucp_peers) {
        goto error;
    }

    err = ucp_worker_get_address(mca_spml_ucx.ucp_worker, &wk_local_addr, &wk_addr_len);
    if (err != UCS_OK) {
        goto error_free_peers;
    }
    dump_address(my_rank, (char *)wk_local_addr, wk_addr_len);

    rc = oshmem_shmem_xchng(wk_local_addr, wk_addr_len, nprocs,
                            (void **)&wk_raddrs, &wk_roffs, &wk_rsizes);
    if (rc != OSHMEM_SUCCESS) {
        /* The exchange buffers are not touched here: whether the helper
         * leaves them allocated on failure is not visible from this file. */
        goto error_release_addr;
    }

    /* NOTE(review): the progress callback stays registered if a later step
     * fails -- confirm whether it should be unregistered on the error path. */
    opal_progress_register(spml_ucx_progress);

    /* Get the EP connection requests for all the processes from modex */
    for (n = 0; n < nprocs; ++n) {
        /* Start at our own rank and wrap around. */
        i = (my_rank + n) % nprocs;
        dump_address(i, (char *)(wk_raddrs + wk_roffs[i]), wk_rsizes[i]);
        err = ucp_ep_create(mca_spml_ucx.ucp_worker,
                            (ucp_address_t *)(wk_raddrs + wk_roffs[i]),
                            &mca_spml_ucx.ucp_peers[i].ucp_conn);
        if (UCS_OK != err) {
            SPML_ERROR("ucp_ep_create failed!!!\n");
            goto error2;
        }
        procs[i]->num_transports = 1;
        procs[i]->transport_ids = spml_ucx_transport_ids;
    }

    ucp_worker_release_address(mca_spml_ucx.ucp_worker, wk_local_addr);
    free(wk_raddrs);
    free(wk_rsizes);
    free(wk_roffs);

    SPML_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

error2:
    /* The peer table was zero-filled by calloc, so entries whose endpoint
     * was never created are still NULL and are skipped. */
    for (i = 0; i < nprocs; ++i) {
        if (mca_spml_ucx.ucp_peers[i].ucp_conn) {
            ucp_ep_destroy(mca_spml_ucx.ucp_peers[i].ucp_conn);
        }
    }
    free(wk_raddrs);
    free(wk_rsizes);
    free(wk_roffs);
error_release_addr:
    ucp_worker_release_address(mca_spml_ucx.ucp_worker, wk_local_addr);
error_free_peers:
    /* Free exactly once and clear the global so it cannot be reused. */
    free(mca_spml_ucx.ucp_peers);
    mca_spml_ucx.ucp_peers = NULL;
error:
    rc = OSHMEM_ERR_OUT_OF_RESOURCE;
    SPML_ERROR("add procs FAILED rc=%d", rc);
    return rc;
}
/**
 * Create a UCX endpoint to every process in the job and cache each peer's
 * worker address.
 *
 * Allocates the per-peer table mca_spml_ucx_ctx_default.ucp_peers, exchanges
 * the local worker address with all peers through oshmem_shmem_xchng(), and
 * then, walking the peers starting at the caller's own rank:
 *   - creates an endpoint with ucp_ep_create(),
 *   - fills in num_transports / transport_ids on the proc's OSHMEM data,
 *   - clears the rkey of every memheap segment key for that peer,
 *   - stores a private copy of the peer's address in
 *     mca_spml_ucx.remote_addrs_tbl[i].
 *
 * On failure every resource acquired by this call is released and the
 * global ucp_peers / remote_addrs_tbl pointers are reset to NULL so that no
 * dangling pointer is left behind in the component state.
 *
 * @param procs   array of nprocs process descriptors to update
 * @param nprocs  number of processes
 *
 * @return OSHMEM_SUCCESS on success; OSHMEM_ERR_OUT_OF_RESOURCE on any
 *         failure (the specific cause is not propagated).
 */
int mca_spml_ucx_add_procs(ompi_proc_t** procs, size_t nprocs)
{
    size_t i, j, n;
    int rc = OSHMEM_ERROR;
    int my_rank = oshmem_my_proc_id();
    ucs_status_t err;
    ucp_address_t *wk_local_addr = NULL;
    size_t wk_addr_len;
    int *wk_roffs = NULL;
    int *wk_rsizes = NULL;
    char *wk_raddrs = NULL;
    ucp_ep_params_t ep_params;

    mca_spml_ucx_ctx_default.ucp_peers = (ucp_peer_t *) calloc(nprocs, sizeof(*(mca_spml_ucx_ctx_default.ucp_peers)));
    if (NULL == mca_spml_ucx_ctx_default.ucp_peers) {
        goto error;
    }

    err = ucp_worker_get_address(mca_spml_ucx_ctx_default.ucp_worker, &wk_local_addr, &wk_addr_len);
    if (err != UCS_OK) {
        goto error_free_peers;
    }
    dump_address(my_rank, (char *)wk_local_addr, wk_addr_len);

    rc = oshmem_shmem_xchng(wk_local_addr, wk_addr_len, nprocs,
                            (void **)&wk_raddrs, &wk_roffs, &wk_rsizes);
    if (rc != OSHMEM_SUCCESS) {
        /* The exchange buffers are not touched here: whether the helper
         * leaves them allocated on failure is not visible from this file. */
        goto error_release_addr;
    }

    /* NOTE(review): the progress callback stays registered if a later step
     * fails -- confirm whether it should be unregistered on the error path. */
    opal_progress_register(spml_ucx_default_progress);

    /* calloc already zero-fills the table; a NULL slot means "no address
     * cached for this peer" and is what the cleanup path relies on. */
    mca_spml_ucx.remote_addrs_tbl = (char **)calloc(nprocs, sizeof(char *));
    if (NULL == mca_spml_ucx.remote_addrs_tbl) {
        goto error2;
    }

    /* Get the EP connection requests for all the processes from modex */
    for (n = 0; n < nprocs; ++n) {
        /* Start at our own rank and wrap around. */
        i = (my_rank + n) % nprocs;
        dump_address(i, (char *)(wk_raddrs + wk_roffs[i]), wk_rsizes[i]);

        ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
        ep_params.address    = (ucp_address_t *)(wk_raddrs + wk_roffs[i]);

        err = ucp_ep_create(mca_spml_ucx_ctx_default.ucp_worker, &ep_params,
                            &mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn);
        if (UCS_OK != err) {
            SPML_UCX_ERROR("ucp_ep_create(proc=%zu/%zu) failed: %s", n, nprocs,
                           ucs_status_string(err));
            goto error2;
        }

        OSHMEM_PROC_DATA(procs[i])->num_transports = 1;
        OSHMEM_PROC_DATA(procs[i])->transport_ids = spml_ucx_transport_ids;

        for (j = 0; j < MCA_MEMHEAP_SEG_COUNT; j++) {
            mca_spml_ucx_ctx_default.ucp_peers[i].mkeys[j].key.rkey = NULL;
        }

        /* Keep a private copy of the peer address; the exchange buffer is
         * freed before this function returns. */
        mca_spml_ucx.remote_addrs_tbl[i] = (char *)malloc(wk_rsizes[i]);
        if (NULL == mca_spml_ucx.remote_addrs_tbl[i]) {
            SPML_UCX_ERROR("failed to allocate %d bytes for remote address of proc %zu",
                           wk_rsizes[i], i);
            goto error2;
        }
        memcpy(mca_spml_ucx.remote_addrs_tbl[i], (char *)(wk_raddrs + wk_roffs[i]), wk_rsizes[i]);
    }

    ucp_worker_release_address(mca_spml_ucx_ctx_default.ucp_worker, wk_local_addr);
    free(wk_raddrs);
    free(wk_rsizes);
    free(wk_roffs);

    SPML_UCX_VERBOSE(50, "*** ADDED PROCS ***");
    return OSHMEM_SUCCESS;

error2:
    /* Both tables were zero-filled by calloc, so slots that were never
     * populated are NULL: endpoints are skipped and free(NULL) is a no-op.
     * The address table itself may be NULL if its allocation failed. */
    for (i = 0; i < nprocs; ++i) {
        if (mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn) {
            ucp_ep_destroy(mca_spml_ucx_ctx_default.ucp_peers[i].ucp_conn);
        }
        if (NULL != mca_spml_ucx.remote_addrs_tbl) {
            free(mca_spml_ucx.remote_addrs_tbl[i]);
        }
    }
    free(mca_spml_ucx.remote_addrs_tbl);
    mca_spml_ucx.remote_addrs_tbl = NULL;
    free(wk_raddrs);
    free(wk_rsizes);
    free(wk_roffs);
error_release_addr:
    ucp_worker_release_address(mca_spml_ucx_ctx_default.ucp_worker, wk_local_addr);
error_free_peers:
    /* Clear the global after freeing so it cannot be reused. */
    free(mca_spml_ucx_ctx_default.ucp_peers);
    mca_spml_ucx_ctx_default.ucp_peers = NULL;
error:
    rc = OSHMEM_ERR_OUT_OF_RESOURCE;
    SPML_UCX_ERROR("add procs FAILED rc=%d", rc);
    return rc;
}