int ompi_mtl_psm2_finalize(struct mca_mtl_base_module_t* mtl) { psm2_error_t err; opal_progress_unregister(ompi_mtl_psm2_progress); /* free resources */ err = psm2_mq_finalize(ompi_mtl_psm2.mq); if (err) { opal_output(0, "Error in psm2_mq_finalize (error %s)\n", psm2_error_get_string(err)); return OMPI_ERROR; } err = psm2_ep_close(ompi_mtl_psm2.ep, PSM2_EP_CLOSE_GRACEFUL, 1*1e9); if (err) { opal_output(0, "Error in psm2_ep_close (error %s)\n", psm2_error_get_string(err)); return OMPI_ERROR; } err = psm2_finalize(); if (err) { opal_output(0, "Error in psm2_finalize (error %s)\n", psm2_error_get_string(err)); return OMPI_ERROR; } return OMPI_SUCCESS; }
static psm2_error_t ompi_mtl_psm2_errhandler(psm2_ep_t ep, const psm2_error_t error, const char *error_string, psm2_error_token_t token) { switch (error) { /* We don't want PSM2 to default to exiting when the following errors occur */ case PSM2_EP_DEVICE_FAILURE: case PSM2_EP_NO_DEVICE: case PSM2_EP_NO_PORTS_AVAIL: case PSM2_EP_NO_NETWORK: case PSM2_EP_INVALID_UUID_KEY: opal_show_help("help-mtl-psm2.txt", "unable to open endpoint", true, psm2_error_get_string(error)); break; /* We can't handle any other errors than the ones above */ default: opal_output(0, "Open MPI detected an unexpected PSM2 error in opening " "an endpoint: %s\n", error_string); return psm2_error_defer(token); break; } return error; }
/*
 * Map a per-peer connect error to a printable message.
 *
 * For the connect errors listed below the PSM2 error string is returned;
 * PSM2_EPID_UNKNOWN gets a fixed explanatory text.  Every other value
 * yields NULL, which callers can use to detect an unexpected error code.
 */
static const char *
ompi_mtl_psm2_connect_error_msg(psm2_error_t err)
{
    const char *message = NULL;

    switch (err) {
    case PSM2_EPID_UNKNOWN:
        message = "Connect status could not be determined "
                  "because of other errors";
        break;

    /* Errors we expect to see from a failed connect. */
    case PSM2_EPID_UNREACHABLE:
    case PSM2_EPID_INVALID_NODE:
    case PSM2_EPID_INVALID_MTU:
    case PSM2_EPID_INVALID_UUID_KEY:
    case PSM2_EPID_INVALID_VERSION:
    case PSM2_EPID_INVALID_CONNECT:
        message = psm2_error_get_string(err);
        break;

    default:
        break;
    }

    return message;
}
int ompi_mtl_psm2_module_init(int local_rank, int num_local_procs) { psm2_error_t err; psm2_ep_t ep; /* endpoint handle */ psm2_mq_t mq; psm2_epid_t epid; /* unique lid+port identifier */ psm2_uuid_t unique_job_key; struct psm2_ep_open_opts ep_opt; unsigned long long *uu = (unsigned long long *) unique_job_key; char *generated_key; char env_string[256]; int rc; generated_key = getenv("OMPI_MCA_orte_precondition_transports"); memset(uu, 0, sizeof(psm2_uuid_t)); if (!generated_key || (strlen(generated_key) != 33) || sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2) { opal_show_help("help-mtl-psm2.txt", "no uuid present", true, generated_key ? "could not be parsed from" : "not present in", ompi_process_info.nodename); return OMPI_ERROR; } /* Handle our own errors for opening endpoints */ psm2_error_register_handler(ompi_mtl_psm2.ep, ompi_mtl_psm2_errhandler); /* Setup MPI_LOCALRANKID and MPI_LOCALNRANKS so PSM2 can allocate hardware * contexts correctly. */ snprintf(env_string, sizeof(env_string), "%d", local_rank); setenv("MPI_LOCALRANKID", env_string, 0); snprintf(env_string, sizeof(env_string), "%d", num_local_procs); setenv("MPI_LOCALNRANKS", env_string, 0); /* Setup the endpoint options. 
*/ psm2_ep_open_opts_get_defaults(&ep_opt); ep_opt.timeout = ompi_mtl_psm2.connect_timeout * 1e9; ep_opt.affinity = PSM2_EP_OPEN_AFFINITY_SKIP; /* do not let PSM2 set affinity */ /* Open PSM2 endpoint */ err = psm2_ep_open(unique_job_key, &ep_opt, &ep, &epid); if (err) { opal_show_help("help-mtl-psm2.txt", "unable to open endpoint", true, psm2_error_get_string(err)); return OMPI_ERROR; } /* Future errors are handled by the default error handler */ psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_DEFAULT); err = psm2_mq_init(ep, 0xffff000000000000ULL, NULL, 0, &mq); if (err) { opal_show_help("help-mtl-psm2.txt", "psm2 init", true, psm2_error_get_string(err)); return OMPI_ERROR; } ompi_mtl_psm2.ep = ep; ompi_mtl_psm2.epid = epid; ompi_mtl_psm2.mq = mq; OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, &mca_mtl_psm2_component.super.mtl_version, &ompi_mtl_psm2.epid, sizeof(psm2_epid_t)); if (OMPI_SUCCESS != rc) { opal_output(0, "Open MPI couldn't send PSM2 epid to head node process"); return OMPI_ERROR; } /* register the psm2 progress function */ opal_progress_register(ompi_mtl_psm2_progress); return OMPI_SUCCESS; }
int ompi_mtl_psm2_progress( void ) { psm2_error_t err; mca_mtl_psm2_request_t* mtl_psm2_request; psm2_mq_status2_t psm2_status; psm2_mq_req_t req; int completed = 1; do { err = psm2_mq_ipeek2(ompi_mtl_psm2.mq, &req, NULL); if (err == PSM2_MQ_INCOMPLETE) { return completed; } else if (err != PSM2_OK) { goto error; } completed++; err = psm2_mq_test2(&req, &psm2_status); if (err != PSM2_OK) { goto error; } mtl_psm2_request = (mca_mtl_psm2_request_t*) psm2_status.context; if (mtl_psm2_request->type == OMPI_mtl_psm2_IRECV) { mtl_psm2_request->super.ompi_req->req_status.MPI_SOURCE = psm2_status.msg_tag.tag1; mtl_psm2_request->super.ompi_req->req_status.MPI_TAG = psm2_status.msg_tag.tag0; mtl_psm2_request->super.ompi_req->req_status._ucount = psm2_status.nbytes; ompi_mtl_datatype_unpack(mtl_psm2_request->convertor, mtl_psm2_request->buf, psm2_status.msg_length); } if(mtl_psm2_request->type == OMPI_mtl_psm2_ISEND) { if (mtl_psm2_request->free_after) { free(mtl_psm2_request->buf); } } switch (psm2_status.error_code) { case PSM2_OK: mtl_psm2_request->super.ompi_req->req_status.MPI_ERROR = OMPI_SUCCESS; break; case PSM2_MQ_TRUNCATION: mtl_psm2_request->super.ompi_req->req_status.MPI_ERROR = MPI_ERR_TRUNCATE; break; default: mtl_psm2_request->super.ompi_req->req_status.MPI_ERROR = MPI_ERR_INTERN; } mtl_psm2_request->super.completion_callback(&mtl_psm2_request->super); } while (1); error: opal_show_help("help-mtl-psm2.txt", "error polling network", true, psm2_error_get_string(err)); return 1; }
int ompi_mtl_psm2_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs) { int i,j; int rc; psm2_epid_t *epids_in = NULL; int *mask_in = NULL; psm2_epid_t *epid; psm2_epaddr_t *epaddrs_out = NULL; psm2_error_t *errs_out = NULL, err; size_t size; int proc_errors[PSM2_ERROR_LAST] = { 0 }; int timeout_in_secs; assert(mtl == &ompi_mtl_psm2.super); rc = OMPI_ERR_OUT_OF_RESOURCE; errs_out = (psm2_error_t *) malloc(nprocs * sizeof(psm2_error_t)); if (errs_out == NULL) { goto bail; } epids_in = (psm2_epid_t *) malloc(nprocs * sizeof(psm2_epid_t)); if (epids_in == NULL) { goto bail; } mask_in = (int *) malloc(nprocs * sizeof(int)); if (mask_in == NULL) { goto bail; } epaddrs_out = (psm2_epaddr_t *) malloc(nprocs * sizeof(psm2_epaddr_t)); if (epaddrs_out == NULL) { goto bail; } rc = OMPI_SUCCESS; /* Get the epids for all the processes from modex */ for (i = 0; i < (int) nprocs; i++) { if (NULL != procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL]) { /* Already connected: don't connect again */ mask_in[i] = 0; continue; } OPAL_MODEX_RECV(rc, &mca_mtl_psm2_component.super.mtl_version, &procs[i]->super.proc_name, (void**)&epid, &size); if (rc != OMPI_SUCCESS || size != sizeof(psm2_epid_t)) { return OMPI_ERROR; } epids_in[i] = *epid; mask_in[i] = 1; } timeout_in_secs = max(ompi_mtl_psm2.connect_timeout, 0.5 * nprocs); psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_NOP); err = psm2_ep_connect(ompi_mtl_psm2.ep, nprocs, epids_in, mask_in, errs_out, epaddrs_out, timeout_in_secs * 1e9); if (err) { char *errstr = (char *) ompi_mtl_psm2_connect_error_msg(err); if (errstr == NULL) { opal_output(0, "PSM2 returned unhandled/unknown connect error: %s\n", psm2_error_get_string(err)); } for (i = 0; i < (int) nprocs; i++) { if (0 == mask_in[i]) { continue; } psm2_error_t thiserr = errs_out[i]; errstr = (char *) ompi_mtl_psm2_connect_error_msg(thiserr); if (proc_errors[thiserr] == 0) { proc_errors[thiserr] = 1; opal_output(0, "PSM2 EP connect 
error (%s):", errstr ? errstr : "unknown connect error"); for (j = 0; j < (int) nprocs; j++) { if (errs_out[j] == thiserr) { opal_output(0, " %s", (NULL == procs[j]->super.proc_hostname) ? "unknown" : procs[j]->super.proc_hostname); } } opal_output(0, "\n"); } } rc = OMPI_ERROR; } else { /* Default error handling is enabled, errors will not be returned to * user. PSM2 prints the error and the offending endpoint's hostname * and exits with -1 */ psm2_error_register_handler(ompi_mtl_psm2.ep, PSM2_ERRHANDLER_DEFAULT); /* Fill in endpoint data */ for (i = 0; i < (int) nprocs; i++) { if (0 == mask_in[i]) { continue; } mca_mtl_psm2_endpoint_t *endpoint = (mca_mtl_psm2_endpoint_t *) OBJ_NEW(mca_mtl_psm2_endpoint_t); endpoint->peer_epid = epids_in[i]; endpoint->peer_addr = epaddrs_out[i]; procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint; } rc = OMPI_SUCCESS; } bail: if (epids_in != NULL) { free(epids_in); } if (mask_in != NULL) { free(mask_in); } if (errs_out != NULL) { free(errs_out); } if (epaddrs_out != NULL) { free(epaddrs_out); } return rc; }
/*
 * Connect endpoint `ep` to the remote endpoints listed in array_of_epid.
 *
 * ep                  local endpoint.
 * num_of_epid         number of entries in every array; must be >= 1.
 * array_of_epid       remote endpoint ids.
 * array_of_epid_mask  optional (may be NULL).  When given, entries whose
 *                     mask is 0 are skipped and their output slots are
 *                     left untouched.
 * array_of_errors     per-entry result; reset to PSM2_EPID_UNKNOWN for
 *                     every entry that is to be connected.
 * array_of_epaddr     per-entry address; reset to NULL for every entry
 *                     that is to be connected.
 * timeout             when > 0 it is a minimum that is scaled up with the
 *                     number of endpoints and clamped from below by
 *                     PSMI_MIN_EP_CONNECT_TIMEOUT.  If the environment
 *                     variable PSM2_CONNECT_TIMEOUT is set, it replaces
 *                     the argument.  The debug message divides the value
 *                     by 1e9 to print seconds, so the unit appears to be
 *                     nanoseconds.
 *
 * Duplicate epids in the input are connected once; the duplicates
 * receive a copy of the first occurrence's address and error.  Each
 * enabled PTL device in ep->devid_enabled[] is tried in turn; entries
 * that connected on one device are masked out for the next one.
 *
 * Returns PSM2_OK on success, PSM2_NO_MEMORY if a work mask cannot be
 * allocated, otherwise the value returned by psmi_handle_error().
 *
 * NOTE(review): PSMI_ERR_UNLESS_INITIALIZED(ep) runs before the
 * `ep == NULL` parameter check, and array_of_errors is written without a
 * NULL check -- confirm both are safe for all callers.
 */
psm2_error_t
__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid,
		  int const *array_of_epid_mask,	/* can be NULL */
		  psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr,
		  int64_t timeout)
{
	psm2_error_t err = PSM2_OK;
	ptl_ctl_t *ptlctl;
	ptl_t *ptl;
	int i, j, dup_idx;
	int num_toconnect = 0;
	int *epid_mask = NULL;		/* working copy of the mask, mutated below */
	int *epid_mask_isdupof = NULL;	/* index of the first occurrence, or -1 */
	char *device;
	uint64_t t_start = get_cycles();
	uint64_t t_left;
	union psmi_envvar_val timeout_intval;

	PSM2_LOG_MSG("entering");
	PSMI_ERR_UNLESS_INITIALIZED(ep);

	PSMI_PLOCK();

	/*
	 * Normally we would lock here, but instead each implemented ptl component
	 * does its own locking.  This is mostly because the ptl components are
	 * ahead of the PSM interface in that they can disconnect their peers.
	 */
	if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL ||
	    num_of_epid < 1) {
		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
					"Invalid psm2_ep_connect parameters");
		goto fail;
	}

	/* We need two of these masks to detect duplicates */
	/* err is preset so that a failed allocation can jump straight to fail. */
	err = PSM2_NO_MEMORY;
	epid_mask =
	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
	if (epid_mask == NULL)
		goto fail;
	epid_mask_isdupof =
	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
	if (epid_mask_isdupof == NULL)
		goto fail;
	err = PSM2_OK;

	/* Eventually handle timeouts across all connects. */
	/* Build the working mask and reset the outputs of every entry that
	 * will be connected. */
	for (j = 0; j < num_of_epid; j++) {
		if (array_of_epid_mask != NULL && !array_of_epid_mask[j])
			epid_mask[j] = 0;
		else {
			epid_mask[j] = 1;
			array_of_errors[j] = PSM2_EPID_UNKNOWN;
			array_of_epaddr[j] = NULL;
			num_toconnect++;
		}
		epid_mask_isdupof[j] = -1;
	}

	psmi_getenv("PSM2_CONNECT_TIMEOUT",
		    "End-point connection timeout over-ride. 0 for no time-out.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	/* The environment variable, when present, wins over the argument. */
	if (getenv("PSM2_CONNECT_TIMEOUT")) {
		timeout = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout > 0) {
		/* The timeout parameter provides the minimum timeout. A heuristic
		 * is used to scale up the timeout linearly with the number of
		 * endpoints, and we allow one second per 100 endpoints. */
		timeout = max(timeout, (num_toconnect * SEC_ULL) / 100);
	}

	/* A timeout of 0 is left alone; positive values are clamped from below. */
	if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
		timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
	_HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n",
		   num_toconnect, (double)timeout / 1e9);

	/* Look for duplicates in input array */
	for (i = 0; i < num_of_epid; i++) {
		for (j = i + 1; j < num_of_epid; j++) {
			if (array_of_epid[i] == array_of_epid[j] &&
			    epid_mask[i] && epid_mask[j]) {
				epid_mask[j] = 0;	/* don't connect more than once */
				epid_mask_isdupof[j] = i;
			}
		}
	}

	/* Try every enabled PTL device in turn. */
	for (i = 0; i < PTL_MAX_INIT; i++) {
		if (ep->devid_enabled[i] == -1)
			continue;
		/* Set up the right connect ptrs */
		switch (ep->devid_enabled[i]) {
		case PTL_DEVID_IPS:
			ptlctl = &ep->ptl_ips;
			ptl = ep->ptl_ips.ptl;
			device = "ips";
			break;
		case PTL_DEVID_AMSH:
			ptlctl = &ep->ptl_amsh;
			ptl = ep->ptl_amsh.ptl;
			device = "amsh";
			break;
		case PTL_DEVID_SELF:
			ptlctl = &ep->ptl_self;
			ptl = ep->ptl_self.ptl;
			device = "self";
			break;
		default:
			/* NOTE(review): this first assignment is immediately
			 * overwritten by device = "ips" below. */
			device = "unknown";
			ptlctl = &ep->ptl_ips;	/*no-unused */
			ptl = ep->ptl_ips.ptl;	/*no-unused */
			device = "ips";	/*no-unused */
			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
					  "Unknown/unhandled PTL id %d\n",
					  ep->devid_enabled[i]);
			break;
		}
		/* Time remaining of the overall budget, measured from t_start. */
		t_left = psmi_cycles_left(t_start, timeout);

		_HFI_VDBG("Trying to connect with device %s\n", device);
		if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid,
					      epid_mask, array_of_errors,
					      array_of_epaddr,
					      cycles_to_nanosecs(t_left)))) {
			_HFI_PRDBG("Connect failure in device %s err=%d\n",
				   device, err);
			goto connect_fail;
		}

		/* Now process what's been connected */
		for (j = 0; j < num_of_epid; j++) {
			dup_idx = epid_mask_isdupof[j];
			if (!epid_mask[j] && dup_idx == -1)
				continue;

			if (dup_idx != -1) {	/* dup */
				/* Copy the first occurrence's result and stop
				 * treating this entry as a duplicate. */
				array_of_epaddr[j] = array_of_epaddr[dup_idx];
				array_of_errors[j] = array_of_errors[dup_idx];
				epid_mask_isdupof[j] = -1;
			}

			if (array_of_errors[j] == PSM2_OK) {
				epid_mask[j] = 0;	/* don't try on next ptl */
				ep->connections++;
			}
		}
	}

	/* Final check over the entries the caller asked for. */
	for (i = 0; i < num_of_epid; i++) {
		ptl_ctl_t *c = NULL;
		if (array_of_epid_mask != NULL && !array_of_epid_mask[i])
			continue;
		/* If we see unreachable here, that means some PTLs were not enabled */
		if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
			err = PSM2_EPID_UNREACHABLE;
			break;
		}

		psmi_assert_always(array_of_epaddr[i] != NULL);
		c = array_of_epaddr[i]->ptlctl;
		psmi_assert_always(c != NULL);
		_HFI_VDBG("%-20s DEVICE %s (%p)\n",
			  psmi_epaddr_get_name(array_of_epid[i]),
			  c == &ep->ptl_ips ? "hfi" :
			  (c == &ep->ptl_amsh ? "amsh" : "self"),
			  (void *)array_of_epaddr[i]->ptlctl->ptl);
	}

connect_fail:
	/* If the error is a timeout (at worst) and the client is OPA MPI,
	 * just return timeout to let OPA MPI handle the hostnames that
	 * timed out */
	/* Also reached on the success path, where err == PSM2_OK and this
	 * block is skipped. */
	if (err != PSM2_OK) {
		char errbuf[PSM2_ERRSTRING_MAXLEN];
		size_t len;
		int j = 0;	/* shadows the outer j; counts hostnames appended */

		if (err == PSM2_EPID_UNREACHABLE) {
			char *deverr = "of an incorrect setting";
			char *eperr = " ";
			char *devname = NULL;
			if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
				deverr =
				    "there is no shared memory PSM device (shm)";
				eperr = " shared memory ";
			} else
			    if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
				deverr =
				    "there is no OPA PSM device (hfi)";
				eperr = " OPA ";
			}

			len = snprintf(errbuf, sizeof(errbuf) - 1,
				       "Some%sendpoints could not be connected because %s "
				       "in the currently enabled PSM_DEVICES (",
				       eperr, deverr);
			/* Append the device list.  NOTE(review): a disabled
			 * slot (-1) falls into the default case and is listed
			 * as "self". */
			for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1;
			     i++) {
				switch (ep->devid_enabled[i]) {
				case PTL_DEVID_IPS:
					devname = "hfi";
					break;
				case PTL_DEVID_AMSH:
					devname = "shm";
					break;
				case PTL_DEVID_SELF:
				default:
					devname = "self";
					break;
				}
				len +=
				    snprintf(errbuf + len,
					     sizeof(errbuf) - len - 1, "%s,",
					     devname);
			}
			if (len < sizeof(errbuf) - 1 && devname != NULL)
				/* parsed something, remove trailing comma */
				errbuf[len - 1] = ')';
		} else
			len = snprintf(errbuf, sizeof(errbuf) - 1,
				       "%s", err == PSM2_TIMEOUT ?
				       "Dectected connection timeout" :
				       psm2_error_get_string(err));

		/* first pass, look for all nodes with the error */
		/* Hostnames are appended as ": host, host, ..." until the
		 * buffer is full. */
		for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
			if (array_of_epid_mask != NULL
			    && !array_of_epid_mask[i])
				continue;
			if (array_of_errors[i] == PSM2_OK)
				continue;
			if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
			    err != PSM2_EPID_UNREACHABLE)
				continue;
			if (err == array_of_errors[i]) {
				len +=
				    snprintf(errbuf + len,
					     sizeof(errbuf) - len - 1, "%c %s",
					     j == 0 ? ':' : ',',
					     psmi_epaddr_get_hostname
					     (array_of_epid[i]));
				j++;
			}
		}
		errbuf[sizeof(errbuf) - 1] = '\0';
		err = psmi_handle_error(ep, err, errbuf);
	}

fail:
	PSMI_PUNLOCK();

	if (epid_mask != NULL)
		psmi_free(epid_mask);
	if (epid_mask_isdupof != NULL)
		psmi_free(epid_mask_isdupof);

	PSM2_LOG_MSG("leaving");
	return err;
}
/*
 * Connect the address vector's PSM2 endpoint to `count` remote epids.
 *
 * av       address vector; av->domain->psm2_ep is the endpoint used.
 * count    number of entries in epids/mask/errors/epaddrs.
 * epids    remote endpoint ids.
 * mask     work array: set to 1 for every entry that still has to be
 *          connected.  Entries that are already connected are left as
 *          passed in.  NOTE(review): this relies on the caller handing
 *          in a zeroed mask -- confirm at the call sites.
 * errors   per-entry result written by psm2_ep_connect().
 * epaddrs  per-entry address; set to FI_ADDR_NOTAVAIL on failure.
 * context  passed to psmx2_av_post_completion() when av->flags has
 *          FI_EVENT set.
 *
 * The return value of psm2_ep_connect() itself is not examined; only the
 * per-entry errors are.
 *
 * Returns the number of entries that could not be connected.
 */
static int psmx2_av_connet_eps(struct psmx2_fid_av *av, size_t count,
			       psm2_epid_t *epids, int *mask,
			       psm2_error_t *errors,
			       psm2_epaddr_t *epaddrs,
			       void *context)
{
	int i;
	psm2_epconn_t epconn;
	struct psmx2_epaddr_context *epaddr_context;
	int error_count = 0;

	/* set up mask to prevent connecting to an already connected ep */
	for (i=0; i<count; i++) {
		if (psm2_ep_epid_lookup(epids[i], &epconn) == PSM2_OK) {
			epaddr_context = psm2_epaddr_getctxt(epconn.addr);
			if (epaddr_context && epaddr_context->epid == epids[i])
				epaddrs[i] = epconn.addr;
			else
				mask[i] = 1;
		} else {
			mask[i] = 1;
		}
	}

	psm2_ep_connect(av->domain->psm2_ep, count, epids, mask, errors,
			epaddrs, psmx2_conn_timeout(count));

	for (i=0; i<count; i++){
		if (!mask[i])
			continue;

		if (errors[i] == PSM2_OK ||
		    errors[i] == PSM2_EPID_ALREADY_CONNECTED) {
			psmx2_set_epaddr_context(av->domain, epids[i], epaddrs[i]);
		} else {
			/* If duplicated addrs are passed to psm2_ep_connect(),
			 * all but one will fail with error "Endpoint could not
			 * be reached". This should be treated the same as
			 * "Endpoint already connected".
			 */
			if (psm2_ep_epid_lookup(epids[i], &epconn) == PSM2_OK) {
				epaddr_context = psm2_epaddr_getctxt(epconn.addr);
				if (epaddr_context &&
				    epaddr_context->epid == epids[i]) {
					epaddrs[i] = epconn.addr;
					continue;
				}
			}

			FI_INFO(&psmx2_prov, FI_LOG_AV,
				"%d: psm2_ep_connect returned %s. remote epid=%lx.\n",
				i, psm2_error_get_string(errors[i]), epids[i]);
			/* Adjacent string literals are concatenated verbatim,
			 * so each piece must carry its own trailing space. */
			if (epids[i] == 0)
				FI_INFO(&psmx2_prov, FI_LOG_AV,
					"does the application depend on the provider "
					"to resolve IP address into endpoint id? if so "
					"check if the name server has started correctly "
					"at the other side.\n");
			epaddrs[i] = (void *)FI_ADDR_NOTAVAIL;
			error_count++;

			if (av->flags & FI_EVENT)
				psmx2_av_post_completion(av, context, i, errors[i]);
		}
	}

	return error_count;
}