/*
 * Translate a PSM2 endpoint id into an endpoint address.
 *
 * The epid is first looked up with psm2_ep_epid_lookup(). When the lookup
 * succeeds and the context attached to the returned address records the
 * same epid, that address is handed back directly. Otherwise a single-peer
 * psm2_ep_connect() is issued on the domain's endpoint and, on success,
 * psmx2_set_epaddr_context() is called for the resulting address.
 *
 * domain: domain whose psm2_ep is used for the connect.
 * epid:   endpoint id to resolve.
 * epaddr: out parameter receiving the resolved endpoint address.
 *
 * Returns 0 on success, or psmx2_errno() of the psm2_ep_connect() status
 * when the connect fails.
 */
int psmx2_epid_to_epaddr(struct psmx2_fid_domain *domain,
			 psm2_epid_t epid, psm2_epaddr_t *epaddr)
{
	struct psmx2_epaddr_context *peer_ctxt;
	psm2_epconn_t known_conn;
	psm2_error_t peer_error;
	int status;

	/* Reuse the existing address only if its context matches this epid. */
	status = psm2_ep_epid_lookup(epid, &known_conn);
	if (status == PSM2_OK) {
		peer_ctxt = psm2_epaddr_getctxt(known_conn.addr);
		if (peer_ctxt && peer_ctxt->epid == epid) {
			*epaddr = known_conn.addr;
			return 0;
		}
	}

	/* No usable address yet: connect to this one peer. */
	status = psm2_ep_connect(domain->psm2_ep, 1, &epid, NULL, &peer_error,
				 epaddr, psmx2_conn_timeout(1));
	if (status == PSM2_OK) {
		psmx2_set_epaddr_context(domain, epid, *epaddr);
		return 0;
	}

	return psmx2_errno(status);
}
int ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs) { int i,j; int rc; psm2_epid_t *epids_in = NULL; int *mask_in = NULL; psm2_epid_t *epid; psm2_epaddr_t *epaddrs_out = NULL; psm2_error_t *errs_out = NULL, err; size_t size; int proc_errors[PSM2_ERROR_LAST] = { 0 }; int timeout_in_secs; assert(mtl == &ompi_mtl_psm2.super); rc = OMPI_ERR_OUT_OF_RESOURCE; errs_out = (psm2_error_t *) malloc(nprocs * sizeof(psm2_error_t)); if (errs_out == NULL) { goto bail; } epids_in = (psm2_epid_t *) malloc(nprocs * sizeof(psm2_epid_t)); if (epids_in == NULL) { goto bail; } mask_in = (int *) malloc(nprocs * sizeof(int)); if (mask_in == NULL) { goto bail; } epaddrs_out = (psm2_epaddr_t *) malloc(nprocs * sizeof(psm2_epaddr_t)); if (epaddrs_out == NULL) { goto bail; } rc = OMPI_SUCCESS; /* Get the epids for all the processes from modex */ for (i = 0; i < (int) nprocs; i++) { if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) { /* Already connected: don't connect again */ mask_in[i] = 0; continue; } OPAL_MODEX_RECV(rc, &mca_mtl_psm2_component.super.mtl_version, &procs[i]->super.proc_name, (void**)&epid, &size); if (rc != OMPI_SUCCESS || size != sizeof(psm2_epid_t)) { return OMPI_ERROR; } epids_in[i] = *epid; mask_in[i] = 1; } timeout_in_secs = max(ompi_mtl_psm2.connect_timeout, 0.5 * nprocs); psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_NOP); err = psm2_ep_connect(ompi_mtl_psm2.ep, nprocs, epids_in, mask_in, errs_out, epaddrs_out, timeout_in_secs * 1e9); if (err) { char *errstr = (char *) ompi_mtl_psm2_connect_error_msg(err); if (errstr == NULL) { opal_output(0, "PSM2 returned unhandled/unknown connect error: %s\n", psm2_error_get_string(err)); } for (i = 0; i < (int) nprocs; i++) { if (0 == mask_in[i]) { continue; } psm2_error_t thiserr = errs_out[i]; errstr = (char *) ompi_mtl_psm2_connect_error_msg(thiserr); if (proc_errors[thiserr] == 0) { proc_errors[thiserr] = 1; opal_output(0, "PSM2 EP connect 
error (%s):", errstr ? errstr : "unknown connect error"); for (j = 0; j < (int) nprocs; j++) { if (errs_out[j] == thiserr) { opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ? "unknown" : procs[j]->super.proc_hostname); } } opal_output(0, "\n"); } } rc = OMPI_ERROR; } else { /* Default error handling is enabled, errors will not be returned to * user. PSM2 prints the error and the offending endpoint's hostname * and exits with -1 */ psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_DEFAULT); /* Fill in endpoint data */ for (i = 0; i < (int) nprocs; i++) { if (0 == mask_in[i]) { continue; } mca_mtl_psm2_endpoint_t *endpoint = (mca_mtl_psm2_endpoint_t *) OBJ_NEW(mca_mtl_psm2_endpoint_t); endpoint->peer_epid = epids_in[i]; endpoint->peer_addr = epaddrs_out[i]; procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint; } rc = OMPI_SUCCESS; } bail: if (epids_in != NULL) { free(epids_in); } if (mask_in != NULL) { free(mask_in); } if (errs_out != NULL) { free(errs_out); } if (epaddrs_out != NULL) { free(epaddrs_out); } return rc; }
/*
 * Connect the AV's domain endpoint to a batch of peers.
 *
 * Peers that psm2_ep_epid_lookup() already knows, and whose address context
 * records the same epid, get their existing address stored in epaddrs[] and
 * are not marked for connection. All other peers get mask[i] = 1 and are
 * passed to a single psm2_ep_connect() call.
 *
 * av:      address vector; av->domain->psm2_ep is the local endpoint and
 *          av->flags decides whether failures are posted as events.
 * count:   number of entries in epids, mask, errors and epaddrs.
 * epids:   peer endpoint ids.
 * mask:    per-peer connect mask; entries are only ever set to 1 here.
 *          NOTE(review): presumably the caller passes it zero-initialised --
 *          confirm against the call site.
 * errors:  per-peer results written by psm2_ep_connect().
 * epaddrs: out array of resolved addresses; FI_ADDR_NOTAVAIL on failure.
 * context: passed to psmx2_av_post_completion() for failed entries.
 *
 * Returns the number of peers that could not be resolved.
 */
static int psmx2_av_connet_eps(struct psmx2_fid_av *av, size_t count,
			       psm2_epid_t *epids, int *mask,
			       psm2_error_t *errors,
			       psm2_epaddr_t *epaddrs,
			       void *context)
{
	size_t i;
	psm2_epconn_t epconn;
	struct psmx2_epaddr_context *epaddr_context;
	int error_count = 0;

	/* set up mask to prevent connecting to an already connected ep */
	for (i = 0; i < count; i++) {
		if (psm2_ep_epid_lookup(epids[i], &epconn) == PSM2_OK) {
			epaddr_context = psm2_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid == epids[i])
				epaddrs[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	/* The aggregate return value is deliberately not used: the outcome
	 * of every requested peer is examined individually in errors[]. */
	(void) psm2_ep_connect(av->domain->psm2_ep, count, epids, mask, errors,
			       epaddrs, psmx2_conn_timeout(count));

	for (i = 0; i < count; i++) {
		if (!mask[i])
			continue;

		if (errors[i] == PSM2_OK ||
		    errors[i] == PSM2_EPID_ALREADY_CONNECTED) {
			psmx2_set_epaddr_context(av->domain, epids[i], epaddrs[i]);
			continue;
		}

		/* If duplicated addrs are passed to psm2_ep_connect(),
		 * all but one will fail with error "Endpoint could not
		 * be reached". This should be treated the same as
		 * "Endpoint already connected".
		 */
		if (psm2_ep_epid_lookup(epids[i], &epconn) == PSM2_OK) {
			epaddr_context = psm2_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid == epids[i]) {
				epaddrs[i] = epconn.addr;
				continue;
			}
		}

		FI_INFO(&psmx2_prov, FI_LOG_AV,
			"%zu: psm2_ep_connect returned %s. remote epid=%lx.\n",
			i, psm2_error_get_string(errors[i]), epids[i]);
		if (epids[i] == 0)
			FI_INFO(&psmx2_prov, FI_LOG_AV,
				"does the application depend on the provider "
				"to resolve IP address into endpoint id? if so "
				"check if the name server has started correctly "
				"at the other side.\n");

		epaddrs[i] = (void *)FI_ADDR_NOTAVAIL;
		error_count++;

		if (av->flags & FI_EVENT)
			psmx2_av_post_completion(av, context, i, errors[i]);
	}

	return error_count;
}