int ompi_mtl_psm_finalize(struct mca_mtl_base_module_t* mtl) { psm_error_t err; opal_progress_unregister(ompi_mtl_psm_progress); /* free resources */ err = psm_mq_finalize(ompi_mtl_psm.mq); if (err) { opal_output(0, "Error in psm_mq_finalize (error %s)\n", psm_error_get_string(err)); return OMPI_ERROR; } err = psm_ep_close(ompi_mtl_psm.ep, PSM_EP_CLOSE_GRACEFUL, 1*1e9); if (err) { opal_output(0, "Error in psm_ep_close (error %s)\n", psm_error_get_string(err)); return OMPI_ERROR; } err = psm_finalize(); if (err) { opal_output(0, "Error in psm_finalize (error %s)\n", psm_error_get_string(err)); return OMPI_ERROR; } return OMPI_SUCCESS; }
int main(void) { int verno_major = PSM_VERNO_MAJOR; int verno_minor = PSM_VERNO_MINOR; int err = psm_error_register_handler(NULL, // Global handler PSM_ERRHANDLER_NO_HANDLER); // return errors if (err) { fprintf(stderr, "t register global handler: %s\n", psm_error_get_string(err)); return -1; } err = psm_init(&verno_major, &verno_minor); if (err || verno_major > PSM_VERNO_MAJOR) { if (err) fprintf(stderr, "PSM initialization failure: %s\n", psm_error_get_string(err)); else fprintf(stderr, "PSM loaded an unexpected/unsupported " "version (%d.%d)\n", verno_major, verno_minor); return -1; } // We were able to initialize PSM but will defer all further error // handling since most of the errors beyond this point will be fatal. err = psm_error_register_handler(NULL, // Global handler PSM_ERRHANDLER_PSM_HANDLER); // if (err) { fprintf(stderr, "t register global errhandler: %s\n", psm_error_get_string(err)); return -1; } return 0; }
/*
 * MTL component initialization for the PSM2 path.  Gathers the local and
 * global process counts (PSM uses them to partition hardware contexts),
 * installs a no-op error handler so init errors are returned rather than
 * fatal, initializes the library, and completes module setup.
 * Returns the module pointer, or NULL on any failure.
 */
static mca_mtl_base_module_t *
ompi_mtl_psm2_component_init(bool enable_progress_threads,
                             bool enable_mpi_threads)
{
    psm_error_t err;
    int verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    int local_rank = -1, num_local_procs = 0;
    int num_total_procs = 0;

    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can
     * allocate hardware contexts appropriately across processes.
     */
    if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
        opal_output(0, "Cannot determine number of local processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
        opal_output(0, "Cannot determine local rank. Cannot continue.\n");
        return NULL;
    }

    if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
        opal_output(0, "Cannot determine total number of processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    /* Keep errors non-fatal while probing/initializing the library. */
    err = psm_error_register_handler(NULL /* no ep */, PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(err));
        return NULL;
    }

    /* All ranks are on this node: self+shm suffice (no overwrite of a
     * user-provided PSM_DEVICES setting — third setenv arg is 0). */
    if (num_local_procs == num_total_procs) {
        setenv("PSM_DEVICES", "self,shm", 0);
    }

    err = psm_init(&verno_major, &verno_minor);
    if (err) {
        opal_show_help("help-mtl-psm.txt", "psm init", true,
                       psm_error_get_string(err));
        return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm2_module_init(local_rank, num_local_procs);

    /* Extra bytes the PSM2 request needs beyond the base MTL request. */
    ompi_mtl_psm2.super.mtl_request_size =
        sizeof(mca_mtl_psm2_request_t) - sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm2.super;
}
static int pscom_psm_peek() { unsigned read_progress = 0; psm_mq_req_t req; psm_mq_status_t status; psm_error_t ret; do { ret = psm_mq_ipeek(pspsm_mq, &req, /* status */ NULL); if (ret == PSM_MQ_INCOMPLETE) return read_progress; if (ret != PSM_OK) goto err; ret = psm_mq_test(&req, &status); if (ret != PSM_OK) goto err; read_progress += pscom_psm_process(&status); } while (1); err: pspsm_err(psm_error_get_string(ret)); pspsm_dprint(1, "pscom_psm_peek: %s", pspsm_err_str); return read_progress; }
/*
 * Connect to the remote endpoint described by info_msg.  The peer's
 * protocol version string is verified first; then psm_ep_connect() is
 * called with an immediate (0) timeout.  Returns 0 on success, -1 on error.
 */
static int pspsm_con_connect(pspsm_con_info_t *con_info, pspsm_info_msg_t *info_msg)
{
    psm_error_t ret, ret1;

    /* Refuse to talk to a peer running a different wire protocol. */
    if (memcmp(info_msg->protocol_version, PSPSM_PROTOCOL_VERSION,
               sizeof(info_msg->protocol_version))) {
        goto err_protocol;
    }

    ret = psm_ep_connect(pspsm_ep, 1, &info_msg->epid, NULL, &ret1,
                         &con_info->epaddr, 0);

    con_info->send_id = info_msg->id;

    if (ret != PSM_OK) goto err_connect;

    pspsm_dprint(2, "pspsm_con_connect: OK");
    pspsm_dprint(2, "sending with %"PRIx64", receiving %"PRIx64,
                 con_info->send_id, con_info->recv_id);
    return 0;

err_connect:
    pspsm_err(psm_error_get_string(ret));
    pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
    return -1;

err_protocol:
    {
        char str[80];
        snprintf(str, sizeof(str), "protocol error : '%.8s' != '%.8s'",
                 info_msg->protocol_version, PSPSM_PROTOCOL_VERSION);
        pspsm_err(str);
        pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
    }
    return -1;
}
/*
 * PSM error handler installed while opening the endpoint.  For the known
 * endpoint-open failures we print a help message and return the error so
 * PSM does not exit the process; for anything else the message is logged
 * and handling is deferred back to PSM via psm_error_defer().
 */
static psm_error_t
ompi_mtl_psm_errhandler(psm_ep_t ep, const psm_error_t error,
                        const char *error_string, psm_error_token_t token)
{
    switch (error) {
        /* We don't want PSM to default to exiting when the following errors occur */
        case PSM_EP_DEVICE_FAILURE:
        case PSM_EP_NO_DEVICE:
        case PSM_EP_NO_PORTS_AVAIL:
        case PSM_EP_NO_NETWORK:
        case PSM_EP_INVALID_UUID_KEY:
            opal_show_help("help-mtl-psm.txt", "unable to open endpoint",
                           true, psm_error_get_string(error));
            break;

        /* We can't handle any other errors than the ones above */
        default:
            opal_output(0, "Open MPI detected an unexpected PSM error in opening "
                        "an endpoint: %s\n", error_string);
            return psm_error_defer(token);
            break; /* unreachable (after return) */
    }
    return error;
}
static int pspsm_open_endpoint(void) { psm_error_t ret; if (!pspsm_ep){ struct psm_ep_open_opts opts; ret = psm_ep_open_opts_get_defaults(&opts); if (ret != PSM_OK) goto err; ret = psm_ep_open(pspsm_uuid.as_uuid, &opts, &pspsm_ep, &pspsm_epid); if (ret != PSM_OK) goto err; sendbuf = malloc(pscom.env.readahead); pspsm_dprint(2, "pspsm_open_endpoint: OK"); } return 0; err: pspsm_err(psm_error_get_string(ret)); pspsm_dprint(1, "pspsm_open_endpoint: %s", pspsm_err_str); return -1; }
/*
 * Close the PSM endpoint.  The real close logic is compiled out as a
 * workaround for a crash in the vendor library (see below); the function
 * then simply reports success.
 */
static int pspsm_close_endpoint(void)
{
#if 1
    /* psm_ep_close() SegFaults. A sleep(1) before sometimes helps, disabling
       the cleanup always helps.
       (Seen with infinipath-libs-3.2-32129.1162_rhel6_qlc.x86_64) */
    return 0;
#else
    psm_error_t ret;

    if (pspsm_ep){
        ret = psm_ep_close(pspsm_ep, PSM_EP_CLOSE_GRACEFUL, 0);
        pspsm_ep = NULL;       /* mark closed even if the close failed */
        if (ret != PSM_OK) goto err;

        if (sendbuf) free(sendbuf);
        pspsm_dprint(2, "pspsm_close_endpoint: OK");
    }
    return 0;

err:
    pspsm_err(psm_error_get_string(ret));
    pspsm_dprint(1, "pspsm_close_endpoint: %s", pspsm_err_str);
    return -1;
#endif
}
/*
 * Post a non-blocking PSM receive backing an MPI irecv.  Prepares a
 * contiguous receive buffer via the datatype engine, encodes
 * (src, tag, context id) into the 64-bit PSM tag plus selection mask, and
 * hands the request to psm_mq_irecv().  Returns OMPI_SUCCESS or an error.
 */
int ompi_mtl_psm_irecv(struct mca_mtl_base_module_t* mtl,
                       struct ompi_communicator_t *comm,
                       int src,
                       int tag,
                       struct opal_convertor_t *convertor,
                       struct mca_mtl_request_t *mtl_request)
{
    int ret;
    psm_error_t err;
    mca_mtl_psm_request_t * mtl_psm_request = (mca_mtl_psm_request_t*) mtl_request;
    uint64_t mqtag;
    uint64_t tagsel;
    size_t length;

    /* free_after records whether the progress path must free buf after
     * unpacking (i.e. a temporary was allocated). */
    ret = ompi_mtl_datatype_recv_buf(convertor,
                                     &mtl_psm_request->buf,
                                     &length,
                                     &mtl_psm_request->free_after);
    if (OMPI_SUCCESS != ret) return ret;

    mtl_psm_request->length = length;
    mtl_psm_request->convertor = convertor;
    mtl_psm_request->type = OMPI_MTL_PSM_IRECV;

    /* Build the PSM matching tag and wildcard-aware selection mask. */
    PSM_MAKE_TAGSEL(src, tag, comm->c_contextid, mqtag, tagsel);

#if 0
    printf("recv bits: 0x%016llx 0x%016llx\n", mqtag, tagsel);
#endif

    err = psm_mq_irecv(ompi_mtl_psm.mq,
                       mqtag,
                       tagsel,
                       0,
                       mtl_psm_request->buf,
                       length,
                       mtl_psm_request,
                       &mtl_psm_request->psm_request);

    if (err) {
        /* NOTE(review): when free_after is set, mtl_psm_request->buf looks
         * like it leaks on this path — confirm against the release path. */
        orte_show_help("help-mtl-psm.txt",
                       "error posting receive", true,
                       psm_error_get_string(err),
                       mtl_psm_request->buf, length);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
static int psmx_init_lib(void) { int major, minor; int ret = 0, err; if (psmx_lib_initialized) return 0; pthread_mutex_lock(&psmx_lib_mutex); if (psmx_lib_initialized) goto out; psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER); major = PSM_VERNO_MAJOR; minor = PSM_VERNO_MINOR; err = psm_init(&major, &minor); if (err != PSM_OK) { FI_WARN(&psmx_prov, FI_LOG_CORE, "psm_init failed: %s\n", psm_error_get_string(err)); ret = err; goto out; } FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM header version = (%d, %d)\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR); FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM library version = (%d, %d)\n", major, minor); if (major != PSM_VERNO_MAJOR) { psmx_am_compat_mode = 1; FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM AM compat mode enabled: appliation %d.%d, library %d.%d.\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor); } if (major > 1) { psmx_compat_lib = 1; FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM is supported via the psm2-compat library over PSM2.\n"); } psmx_lib_initialized = 1; out: pthread_mutex_unlock(&psmx_lib_mutex); return ret; }
/*
 * Shut down the PSM message queue, if one was created.
 * Returns 0 on success, -1 on error (after logging the PSM error string).
 */
int pspsm_finalize_mq(void)
{
    if (pspsm_mq){
        psm_error_t rc = psm_mq_finalize(pspsm_mq);
        if (rc != PSM_OK) {
            pspsm_err(psm_error_get_string(rc));
            pspsm_dprint(1, "pspsm_finalize_mq: %s", pspsm_err_str);
            return -1;
        }
        pspsm_dprint(2, "pspsm_finalize_mq: OK");
    }
    return 0;
}
/*
 * Post one non-blocking PSM send for a buffer.  The request index `nr`
 * is packed into the low bits of the context pointer so the completion
 * handler can tell the two iov parts apart.
 * Returns 0 on success, -EPIPE on error (after logging).
 */
static inline
int _pspsm_send_buf(pspsm_con_info_t *con_info, char *buf, size_t len,
                    uint64_t tag, psm_mq_req_t *req, unsigned long nr)
{
    void *ctx = (void *)((uintptr_t)con_info | nr);
    psm_error_t rc;

    assert(*req == PSM_MQ_REQINVALID);

    rc = psm_mq_isend(pspsm_mq, con_info->epaddr,
                      /* flags */ 0, tag, buf, len, ctx, req);
    if (rc == PSM_OK)
        return 0;

    pspsm_err(psm_error_get_string(rc));
    pspsm_dprint(1, "_pspsm_send_buf: %s", pspsm_err_str);
    return -EPIPE;
}
/*
 * Create the PSM message queue on the open endpoint, if not yet created.
 * Returns 0 on success, -1 on error (after logging).
 */
static int pspsm_init_mq(void)
{
    if (!pspsm_mq){
        psm_error_t rc = psm_mq_init(pspsm_ep, PSM_MQ_ORDERMASK_ALL,
                                     NULL, 0, &pspsm_mq);
        if (rc != PSM_OK) {
            pspsm_err(psm_error_get_string(rc));
            pspsm_dprint(1, "pspsm_init_mq: %s", pspsm_err_str);
            return -1;
        }
        pspsm_dprint(2, "pspsm_init_mq: OK");
    }
    return 0;
}
/*
 * Map a PSM connect error to a printable message.
 * Expected errors are described by PSM itself; PSM_EPID_UNKNOWN gets a
 * fixed explanation; anything else yields NULL ("unexpected").
 */
static const char *
ompi_mtl_psm_connect_error_msg(psm_error_t err)
{
    switch (err) {
    /* See if we expect the error */
    case PSM_EPID_UNREACHABLE:
    case PSM_EPID_INVALID_NODE:
    case PSM_EPID_INVALID_MTU:
    case PSM_EPID_INVALID_UUID_KEY:
    case PSM_EPID_INVALID_VERSION:
    case PSM_EPID_INVALID_CONNECT:
        return psm_error_get_string(err);
    case PSM_EPID_UNKNOWN:
        return "Connect status could not be determined "
               "because of other errors";
    default:
        return NULL;
    }
}
/*
 * Close the PSM endpoint gracefully and release the send staging buffer.
 * Returns 0 on success, -1 on error (after logging).
 */
static int pspsm_close_endpoint(void)
{
    if (pspsm_ep){
        psm_error_t rc = psm_ep_close(pspsm_ep, PSM_EP_CLOSE_GRACEFUL, 0);
        pspsm_ep = NULL;       /* mark closed regardless of the result */
        if (rc != PSM_OK) {
            pspsm_err(psm_error_get_string(rc));
            pspsm_dprint(1, "pspsm_close_endpoint: %s", pspsm_err_str);
            return -1;
        }
        if (sendbuf) free(sendbuf);
        pspsm_dprint(2, "pspsm_close_endpoint: OK");
    }
    return 0;
}
/* sends an iov. FIXME: returns 0 if the send is complete, -EAGAIN if it
   created one or more requests for it, and -EPIPE in case of an error. */
static int _pspsm_sendv(pspsm_con_info_t *con_info, uint64_t magic)
{
    uint64_t tag = con_info->send_id | magic;
    unsigned int i=0;
    psm_error_t ret;
    size_t len = con_info->iov[0].iov_len + con_info->iov[1].iov_len;

    /* Small messages: copy into the staging buffer and send eagerly. */
    if (len <= pscom.env.readahead){
        pscom_memcpy_from_iov(sendbuf, con_info->iov, len);
        /* we hope that doesn't block - it shouldn't, as the
         * message is sufficiently small */
        ret = psm_mq_send(pspsm_mq, con_info->epaddr,
                          /* flags*/ 0, tag, sendbuf, len);
        if (ret != PSM_OK) goto err;
        return 0;
    }

    /* Large messages: post one async send per non-empty iov part. */
    for (i=0; i<2; i++){
        if (con_info->iov[i].iov_len){
            /* pspsm_dprint(0, "Send part[%d], %p len %d to con %s\n", i,
               con_info->iov[i].iov_base, (int)con_info->iov[i].iov_len,
               con_info->con->pub.remote_con_info.name); */
            if (_pspsm_send_buf(con_info, con_info->iov[i].iov_base,
                                con_info->iov[i].iov_len,
                                tag, &con_info->sreqs[i], i)){
                return -EPIPE;
            }
            /* inc for each outstanding send request */
            poll_user_inc();
        }
    }
    return -EAGAIN;

err:
    pspsm_err(psm_error_get_string(ret));
    /* BUG FIX: the log line previously named "_pspsm_send_buf"
     * (copy-paste); this error belongs to _pspsm_sendv. */
    pspsm_dprint(1, "_pspsm_sendv: %s", pspsm_err_str);
    return -EPIPE;
}
/*
 * Post a non-blocking PSM receive for this connection, matching on its
 * recv_id.  Returns -EAGAIN after successfully posting (see FIXME below),
 * or -1 on error.
 */
static int pspsm_recvlook(pspsm_con_info_t *con_info)
{
    /* ToDo: rename me to something like "post a receive". */
    psm_error_t ret;
    uint64_t rtag = con_info->recv_id;
    /* Low bits of the context pointer mark this request as a receive. */
    void *context = (void *)((uintptr_t)con_info | 2);

    assert(con_info->rreq == PSM_MQ_REQINVALID);
    /* NOTE(review): `mask` is presumably a file-scope tag-selection mask
     * defined outside this chunk — confirm. */
    ret = psm_mq_irecv(pspsm_mq, rtag, mask, 0 /*flags*/,
                       con_info->rbuf, con_info->rbuflen,
                       context, &con_info->rreq);
    if (ret != PSM_OK) goto out_err;

    /* FIXME: Should probably not return an error code to indicate
       success. */
    return -EAGAIN;

out_err:
    pspsm_err(psm_error_get_string(ret));
    pspsm_dprint(1, "pspsm_recvlook: %s", pspsm_err_str);
    return -1;
}
/*
 * Finish PSM MTL setup: derive the job-wide UUID from the precondition
 * transports key, export local-rank information to PSM via environment
 * variables, open the endpoint, create the message queue, publish our epid
 * through the modex, and register the progress callback.
 * Returns OMPI_SUCCESS or OMPI_ERROR.
 */
int ompi_mtl_psm_module_init(int local_rank, int num_local_procs) {
    psm_error_t err;
    psm_ep_t ep; /* endpoint handle */
    psm_mq_t mq;
    psm_epid_t epid; /* unique lid+port identifier */
    psm_uuid_t unique_job_key;
    struct psm_ep_open_opts ep_opt;
    unsigned long long *uu = (unsigned long long *) unique_job_key;
    char *generated_key;
    char env_string[256];

    /* Parse the 33-char "xxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxx" key into the
     * two halves of the PSM job UUID. */
    generated_key = getenv("OMPI_MCA_orte_precondition_transports");
    memset(uu, 0, sizeof(psm_uuid_t));

    if (!generated_key || (strlen(generated_key) != 33) ||
        sscanf(generated_key, "%016llx-%016llx", &uu[0], &uu[1]) != 2)
    {
        opal_show_help("help-mtl-psm.txt", "no uuid present", true,
                       generated_key ? "could not be parsed from" :
                       "not present in", ompi_process_info.nodename);
        return OMPI_ERROR;
    }

    /* Handle our own errors for opening endpoints */
    /* NOTE(review): ompi_mtl_psm.ep is not assigned until later in this
     * function — confirm that registering on it here is intended. */
    psm_error_register_handler(ompi_mtl_psm.ep, ompi_mtl_psm_errhandler);

    /* Setup MPI_LOCALRANKID and MPI_LOCALNRANKS so PSM can allocate hardware
     * contexts correctly.
     */
    snprintf(env_string, sizeof(env_string), "%d", local_rank);
    setenv("MPI_LOCALRANKID", env_string, 0);
    snprintf(env_string, sizeof(env_string), "%d", num_local_procs);
    setenv("MPI_LOCALNRANKS", env_string, 0);

    /* Setup the endpoint options. */
    bzero((void*) &ep_opt, sizeof(ep_opt));
    ep_opt.timeout = ompi_mtl_psm.connect_timeout * 1e9;
    ep_opt.unit = ompi_mtl_psm.ib_unit;
    ep_opt.affinity = PSM_EP_OPEN_AFFINITY_SKIP; /* do not let PSM set affinity */
    ep_opt.shm_mbytes = -1; /* Choose PSM defaults */
    ep_opt.sendbufs_num = -1; /* Choose PSM defaults */

#if PSM_VERNO >= 0x0101
    ep_opt.network_pkey = ompi_mtl_psm.ib_pkey;
#endif

#if PSM_VERNO >= 0x0107
    ep_opt.port = ompi_mtl_psm.ib_port;
    ep_opt.outsl = ompi_mtl_psm.ib_service_level;
#endif

#if PSM_VERNO >= 0x010d
    ep_opt.service_id = ompi_mtl_psm.ib_service_id;
    ep_opt.path_res_type = ompi_mtl_psm.path_res_type;
#endif

    /* Open PSM endpoint */
    err = psm_ep_open(unique_job_key, &ep_opt, &ep, &epid);
    if (err) {
        opal_show_help("help-mtl-psm.txt", "unable to open endpoint", true,
                       psm_error_get_string(err));
        return OMPI_ERROR;
    }

    /* Future errors are handled by the default error handler */
    psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT);

    /* The high 16 tag bits are reserved for the context id (see the
     * mask passed here). */
    err = psm_mq_init(ep, 0xffff000000000000ULL, NULL, 0, &mq);
    if (err) {
        opal_show_help("help-mtl-psm.txt", "psm init", true,
                       psm_error_get_string(err));
        return OMPI_ERROR;
    }

    ompi_mtl_psm.ep   = ep;
    ompi_mtl_psm.epid = epid;
    ompi_mtl_psm.mq   = mq;

    /* Publish our epid so peers can connect to us. */
    if (OMPI_SUCCESS !=
        ompi_modex_send(&mca_mtl_psm_component.super.mtl_version,
                        &ompi_mtl_psm.epid, sizeof(psm_epid_t))) {
        opal_output(0, "Open MPI couldn't send PSM epid to head node process");
        return OMPI_ERROR;
    }

    /* register the psm progress function */
    opal_progress_register(ompi_mtl_psm_progress);

    return OMPI_SUCCESS;
}
/*
 * PSM MTL component init (proc-list variant): counts the processes on this
 * node and our rank among them by walking the world proc list, installs a
 * no-op error handler, optionally sets the PSM debug level, constrains
 * PSM_DEVICES for old library versions, initializes PSM, and completes the
 * module.  Returns the module pointer or NULL on failure.
 */
static mca_mtl_base_module_t*
ompi_mtl_psm_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    psm_error_t err;
    int rc;
    int verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    ompi_proc_t *my_proc, **procs;
    size_t num_total_procs, proc;
    int local_rank = -1, num_local_procs = 0;

    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can
     * allocate hardware contexts appropriately across processes.
     */
    if ((rc = ompi_proc_refresh()) != OMPI_SUCCESS) {
        return NULL;
    }

    my_proc = ompi_proc_local();
    if (NULL == (procs = ompi_proc_world(&num_total_procs))) {
        return NULL;
    }

    for (proc = 0; proc < num_total_procs; proc++) {
        if (my_proc == procs[proc]) {
            local_rank = num_local_procs++;
            continue;
        }

        if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) {
            num_local_procs++;
        }
    }

    /* We are always on our own node, so both must be positive. */
    assert(local_rank >= 0 && num_local_procs > 0);
    free(procs);

    err = psm_error_register_handler(NULL /* no ep */, PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(err));
        return NULL;
    }

#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
                     (const void*) &ompi_mtl_psm.debug_level,
                     sizeof(unsigned));
    if (err) {
        /* Non fatal error. Can continue */
        orte_show_help("help-mtl-psm.txt", "psm init", false,
                       psm_error_get_string(err));
    }
#endif

    /* Only allow for shm and ipath devices in 2.0 and earlier releases
     * (unless the user overrides the setting).
     */
    if (PSM_VERNO >= 0x0104) {
        setenv("PSM_DEVICES", "self,shm,ipath", 0);
    }
    else {
        setenv("PSM_DEVICES", "shm,ipath", 0);
    }

    err = psm_init(&verno_major, &verno_minor);
    if (err) {
        orte_show_help("help-mtl-psm.txt", "psm init", true,
                       psm_error_get_string(err));
        return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    /* Extra bytes the PSM request needs beyond the base MTL request. */
    ompi_mtl_psm.super.mtl_request_size =
        sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t);

    return &ompi_mtl_psm.super;
}
/*
 * Progress engine for the PSM MTL (registered with opal_progress).  Drains
 * all completed MQ requests: unpacks received data and fills the MPI
 * status from the PSM tag, frees send staging buffers, maps the PSM status
 * code to an MPI error, and fires each request's completion callback.
 * Returns 1 + number of completions (or 1 after reporting a poll error).
 */
int ompi_mtl_psm_progress( void ) {
    psm_error_t err;
    mca_mtl_psm_request_t* mtl_psm_request;
    psm_mq_status_t psm_status;
    psm_mq_req_t req;
    int completed = 1;

    do {
        err = psm_mq_ipeek(ompi_mtl_psm.mq, &req, NULL);

        if (err == PSM_MQ_INCOMPLETE) {
            /* Queue drained — report the progress made so far. */
            return completed;
        } else if (err != PSM_OK) {
            goto error;
        }

        completed++;

        err = psm_mq_test(&req, &psm_status);
        if (err != PSM_OK) {
            goto error;
        }

        /* The context we registered at post time is our request. */
        mtl_psm_request = (mca_mtl_psm_request_t*) psm_status.context;

        if (mtl_psm_request->type == OMPI_MTL_PSM_IRECV) {
            /* Unpack into the user buffer; source rank and user tag are
             * encoded in the PSM message tag. */
            ompi_mtl_datatype_unpack(mtl_psm_request->convertor,
                                     mtl_psm_request->buf,
                                     psm_status.msg_length);
            mtl_psm_request->super.ompi_req->req_status.MPI_SOURCE =
                PSM_GET_MQRANK(psm_status.msg_tag);
            mtl_psm_request->super.ompi_req->req_status.MPI_TAG =
                PSM_GET_MQUTAG(psm_status.msg_tag);
            mtl_psm_request->super.ompi_req->req_status._ucount =
                psm_status.nbytes;
        }

        if(mtl_psm_request->type == OMPI_MTL_PSM_ISEND) {
            /* Release the packed send buffer if one was allocated. */
            if (mtl_psm_request->free_after) {
                free(mtl_psm_request->buf);
            }
        }

        /* Translate the PSM completion status to an MPI error code. */
        switch (psm_status.error_code) {
        case PSM_OK:
            mtl_psm_request->super.ompi_req->req_status.MPI_ERROR =
                OMPI_SUCCESS;
            break;
        case PSM_MQ_TRUNCATION:
            mtl_psm_request->super.ompi_req->req_status.MPI_ERROR =
                MPI_ERR_TRUNCATE;
            break;
        default:
            mtl_psm_request->super.ompi_req->req_status.MPI_ERROR =
                MPI_ERR_INTERN;
        }

        mtl_psm_request->super.completion_callback(&mtl_psm_request->super);
    }
    while (1);

error:
    opal_show_help("help-mtl-psm.txt", "error polling network", true,
                   psm_error_get_string(err));
    return 1;
}
int ompi_mtl_psm_add_procs(struct mca_mtl_base_module_t *mtl, size_t nprocs, struct ompi_proc_t** procs) { int i,j; int rc; psm_epid_t *epids_in = NULL; psm_epid_t *epid; psm_epaddr_t *epaddrs_out = NULL; psm_error_t *errs_out = NULL, err; size_t size; int proc_errors[PSM_ERROR_LAST] = { 0 }; int timeout_in_secs; assert(mtl == &ompi_mtl_psm.super); rc = OMPI_ERR_OUT_OF_RESOURCE; errs_out = (psm_error_t *) malloc(nprocs * sizeof(psm_error_t)); if (errs_out == NULL) { goto bail; } epids_in = (psm_epid_t *) malloc(nprocs * sizeof(psm_epid_t)); if (epids_in == NULL) { goto bail; } epaddrs_out = (psm_epaddr_t *) malloc(nprocs * sizeof(psm_epaddr_t)); if (epaddrs_out == NULL) { goto bail; } rc = OMPI_SUCCESS; /* Get the epids for all the processes from modex */ for (i = 0; i < (int) nprocs; i++) { rc = ompi_modex_recv(&mca_mtl_psm_component.super.mtl_version, procs[i], (void**)&epid, &size); if (rc != OMPI_SUCCESS || size != sizeof(psm_epid_t)) { return OMPI_ERROR; } epids_in[i] = *epid; } timeout_in_secs = max(ompi_mtl_psm.connect_timeout, 0.5 * nprocs); psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_NOP); err = psm_ep_connect(ompi_mtl_psm.ep, nprocs, epids_in, NULL, /* connect all */ errs_out, epaddrs_out, timeout_in_secs * 1e9); if (err) { char *errstr = (char *) ompi_mtl_psm_connect_error_msg(err); if (errstr == NULL) { opal_output(0, "PSM returned unhandled/unknown connect error: %s\n", psm_error_get_string(err)); } for (i = 0; i < (int) nprocs; i++) { psm_error_t thiserr = errs_out[i]; errstr = (char *) ompi_mtl_psm_connect_error_msg(thiserr); if (proc_errors[thiserr] == 0) { proc_errors[thiserr] = 1; opal_output(0, "PSM EP connect error (%s):", errstr ? errstr : "unknown connect error"); for (j = 0; j < (int) nprocs; j++) { if (errs_out[j] == thiserr) { opal_output(0, " %s", (NULL == procs[j]->proc_hostname) ? 
"unknown" : procs[j]->proc_hostname); } } opal_output(0, "\n"); } } rc = OMPI_ERROR; } else { /* Default error handling is enabled, errors will not be returned to * user. PSM prints the error and the offending endpoint's hostname * and exits with -1 */ psm_error_register_handler(ompi_mtl_psm.ep, PSM_ERRHANDLER_DEFAULT); /* Fill in endpoint data */ for (i = 0; i < (int) nprocs; i++) { mca_mtl_psm_endpoint_t *endpoint = (mca_mtl_psm_endpoint_t *) OBJ_NEW(mca_mtl_psm_endpoint_t); endpoint->peer_epid = epids_in[i]; endpoint->peer_addr = epaddrs_out[i]; procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL] = endpoint; } rc = OMPI_SUCCESS; } bail: if (epids_in != NULL) { free(epids_in); } if (errs_out != NULL) { free(errs_out); } if (epaddrs_out != NULL) { free(epaddrs_out); } return rc; }
/*
 * Insert endpoint ids into the address vector and connect to them.
 * For FI_AV_TABLE the epids are first copied into the AV's own storage and
 * `addr`/`fi_addr` are re-pointed at it, so the PSM connect writes epaddrs
 * directly into the table; `result` then receives table indices.  Already-
 * connected epids are masked out of the psm_ep_connect() call (connecting
 * twice is fatal in PSM).  With FI_EVENT, per-entry failures and a final
 * summary are reported through the EQ and 0 is returned; otherwise the
 * number of successfully inserted addresses is returned.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
                          fi_addr_t *fi_addr, uint64_t flags, void *context)
{
    struct psmx_fid_av *av_priv;
    psm_error_t *errors;
    int error_count = 0;
    int *mask;
    int i, j, ret;
    fi_addr_t *result = NULL;
    struct psmx_epaddr_context *epaddr_context;

    if (count && !addr) {
        FI_INFO(&psmx_prov, FI_LOG_AV,
                "the input address array is NULL.\n");
        return -FI_EINVAL;
    }

    av_priv = container_of(av, struct psmx_fid_av, av);

    /* FI_EVENT-style completion requires a bound event queue. */
    if ((av_priv->flags & FI_EVENT) && !av_priv->eq)
        return -FI_ENOEQ;

    errors = (psm_error_t *) calloc(count, sizeof *errors);
    if (!errors)
        return -FI_ENOMEM;

    mask = (int *) calloc(count, sizeof *mask);
    if (!mask) {
        free(errors);
        return -FI_ENOMEM;
    }

    if (av_priv->type == FI_AV_TABLE) {
        if (psmx_av_check_table_size(av_priv, count)) {
            free(mask);
            free(errors);
            return -FI_ENOMEM;
        }

        for (i=0; i<count; i++)
            av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

        /* From here on, work on the AV's own arrays; `result` keeps the
         * caller's output array for the index fill-in below. */
        result = fi_addr;
        addr = (const void *)(av_priv->psm_epids + av_priv->last);
        fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
    }

    /* prevent connecting to the same ep twice, which is fatal in PSM */
    for (i=0; i<count; i++) {
        psm_epconn_t epconn;
        if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
            epaddr_context = psm_epaddr_getctxt(epconn.addr);
            if (epaddr_context &&
                epaddr_context->epid == ((psm_epid_t *) addr)[i])
                ((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
            else
                mask[i] = 1;
        } else {
            mask[i] = 1;
        }
    }

    /* 30-second timeout, in nanoseconds. */
    psm_ep_connect(av_priv->domain->psm_ep, count, (psm_epid_t *) addr,
                   mask, errors, (psm_epaddr_t *) fi_addr, 30*1e9);

    for (i=0; i<count; i++){
        if (!mask[i]) {
            /* Masked entries were already connected — count as success. */
            errors[i] = PSM_OK;
            continue;
        }

        if (errors[i] == PSM_OK ||
            errors[i] == PSM_EPID_ALREADY_CONNECTED) {
            psmx_set_epaddr_context(av_priv->domain,
                                    ((psm_epid_t *) addr)[i],
                                    ((psm_epaddr_t *) fi_addr)[i]);
            errors[i] = PSM_OK;
        } else {
            psm_epconn_t epconn;
            /* If duplicated addresses are passed to psm_ep_connect(), all
             * but one will fail with error "Endpoint could not be reached".
             * They should be treated as already connected. */
            if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i],
                                   &epconn) == PSM_OK) {
                epaddr_context = psm_epaddr_getctxt(epconn.addr);
                if (epaddr_context &&
                    epaddr_context->epid == ((psm_epid_t *) addr)[i]) {
                    ((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
                    errors[i] = PSM_OK;
                    continue;
                }
            }

            FI_INFO(&psmx_prov, FI_LOG_AV,
                    "%d: psm_ep_connect returned %s. remote epid=%lx.\n",
                    i, psm_error_get_string(errors[i]),
                    ((psm_epid_t *)addr)[i]);
            if (((psm_epid_t *)addr)[i] == 0)
                FI_INFO(&psmx_prov, FI_LOG_AV,
                        "does the application depend on the provider"
                        "to resolve IP address into endpoint id? if so"
                        "check if the name server has started correctly"
                        "at the other side.\n");
            fi_addr[i] = FI_ADDR_NOTAVAIL;
            error_count++;

            if (av_priv->flags & FI_EVENT)
                psmx_av_post_completion(av_priv, context, i, errors[i]);
        }
    }

    if (av_priv->type == FI_AV_TABLE) {
        /* NOTE: unresolved addresses are left in the AV table */
        if (result) {
            for (i=0; i<count; i++) {
                j = av_priv->last + i;
                if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
                    result[i] = FI_ADDR_NOTAVAIL;
                else
                    result[i] = j;
            }
        }
        av_priv->last += count;
    }

    if (av_priv->flags & FI_EVENT) {
        /* Asynchronous completion: summary goes to the EQ. */
        psmx_av_post_completion(av_priv, context, count - error_count, 0);
        ret = 0;
    } else {
        if (flags & FI_SYNC_ERR) {
            /* Caller supplied a per-entry error array via `context`. */
            int *fi_errors = context;
            for (i=0; i<count; i++)
                fi_errors[i] = psmx_errno(errors[i]);
        }
        ret = count - error_count;
    }

    free(mask);
    free(errors);

    return ret;
}
/*
 * fi_eq strerror implementation: the provider-specific error code is a PSM
 * error, so delegate directly to PSM's error-string lookup.  The buffer
 * arguments are unused.
 */
static const char *psmx_eq_strerror(struct fid_eq *eq, int prov_errno,
                                    const void *prov_data,
                                    void *buf, size_t len)
{
    const char *msg = psm_error_get_string(prov_errno);
    return msg;
}
/*
 * One-shot initialization of the pspsm layer: check for /dev/ipath,
 * initialize PSM, seed the job-wide UUID, open the endpoint and the
 * message queue.  The outcome is latched in `init_state`, so later calls
 * just return it.  Returns 0 = success, -1 = error.
 */
static int pspsm_init(void)
{
    static pspsm_init_state_t init_state = PSPSM_INIT_START;
    int verno_minor = PSM_VERNO_MINOR;
    int verno_major = PSM_VERNO_MAJOR;
    psm_error_t ret;

    if (init_state == PSPSM_INIT_START) {
        /* Check for an available /dev/ipath */
        ret = pspsm_check_dev_ipath();
        if (ret != 0) {
            goto err_dev_ipath;
        }

        ret = psm_init(&verno_major, &verno_minor);
        if (ret != PSM_OK) {
            goto err_init;
        }

        /*
         * All processes wanting to communicate need to use
         * the same UUID.
         *
         * It is unclear whether there are drawbacks from
         * simply using the same UUID for groups of processes
         * that will never communicate.
         *
         * On top of a constant fill pattern, we use:
         *
         * - PSP_PSM_UNIQ_ID if set and not zero, or
         * - PMI_ID, if set and not zero - that's not entirely
         *   clean, but a practical solution for MPI apps (as
         *   long as we do not implement communication between
         *   two sets of MPI processes not sharing a
         *   communicator).
         */
        memset(pspsm_uuid.as_uuid, DEFAULT_UUID_PATTERN,
               sizeof(pspsm_uuid.as_uuid));

        if (pscom.env.psm_uniq_id) {
            pspsm_dprint(2, "seeding PSM UUID with %u",
                         pscom.env.psm_uniq_id);
            pspsm_uuid.as_uint = pscom.env.psm_uniq_id;
        }

        /* Open the endpoint here in init with the hope that
           every mpi rank call indirect psm_ep_open() before
           transmitting any data from or to this endpoint.
           This is to avoid a race condition in libpsm_infinipath.
           Downside: We consume PSM Contexts even in the case of
           only local communication. You could use PSP_PSM=0 in
           this case.
        */
        if (pspsm_open_endpoint()) goto err_ep;
        if (pspsm_init_mq()) goto err_mq;

        pspsm_dprint(2, "pspsm_init: OK");
        init_state = PSPSM_INIT_DONE;
    }
    return init_state; /* 0 = success, -1 = error */

    /* --- failure paths: latch PSPSM_INIT_FAILED and return it --- */
err_dev_ipath:
    pspsm_dprint(2, "pspsm_init: No \"/dev/ipath\" found. Arch psm is disabled.");
    goto err_exit;
err_init:
    pspsm_err(psm_error_get_string(ret));
    pspsm_dprint(1, "pspsm_init: %s", pspsm_err_str);
    // Fall through
err_ep:
err_mq:
err_exit:
    init_state = PSPSM_INIT_FAILED;
    return init_state; /* 0 = success, -1 = error */
}
/*
 * PSM MTL component init (helper-based variant): gathers local/global
 * process counts via helper functions, optionally sets the PSM debug
 * level, picks a PSM_DEVICES list appropriate for the library version and
 * the job layout (omitting "ipath" for all-local jobs) unless the user has
 * already set one, initializes PSM, completes the module, and finally
 * installs a no-op error handler.  Returns the module or NULL on failure.
 */
static mca_mtl_base_module_t *
ompi_mtl_psm_component_init(bool enable_progress_threads,
                            bool enable_mpi_threads)
{
    psm_error_t err;
    int verno_major = PSM_VERNO_MAJOR;
    int verno_minor = PSM_VERNO_MINOR;
    int local_rank = -1, num_local_procs = 0;
    int num_total_procs = 0;

    /* Compute the total number of processes on this host and our local rank
     * on that node. We need to provide PSM with these values so it can
     * allocate hardware contexts appropriately across processes.
     */
    if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) {
        opal_output(0, "Cannot determine number of local processes. "
                    "Cannot continue.\n");
        return NULL;
    }

    if (OMPI_SUCCESS != get_local_rank(&local_rank)) {
        opal_output(0, "Cannot determine local rank. Cannot continue.\n");
        return NULL;
    }

    if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) {
        opal_output(0, "Cannot determine total number of processes. "
                    "Cannot continue.\n");
        return NULL;
    }

#if PSM_VERNO >= 0x010c
    /* Set infinipath debug level */
    err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG,
                     (const void*) &ompi_mtl_psm.debug_level,
                     sizeof(unsigned));
    if (err) {
        /* Non fatal error. Can continue */
        opal_show_help("help-mtl-psm.txt", "psm init", false,
                       psm_error_get_string(err));
    }
#endif

    /* Respect a user-provided device list. */
    if (getenv("PSM_DEVICES") == NULL) {
        /* Only allow for shm and ipath devices in 2.0 and earlier releases
         * (unless the user overrides the setting).
         */
        if (PSM_VERNO >= 0x0104) {
            /* All-local jobs do not need the HCA device. */
            if (num_local_procs == num_total_procs) {
                setenv("PSM_DEVICES", "self,shm", 0);
            } else {
                setenv("PSM_DEVICES", "self,shm,ipath", 0);
            }
        }
        else {
            if (num_local_procs == num_total_procs) {
                setenv("PSM_DEVICES", "shm", 0);
            } else {
                setenv("PSM_DEVICES", "shm,ipath", 0);
            }
        }
    }

    err = psm_init(&verno_major, &verno_minor);
    if (err) {
        opal_show_help("help-mtl-psm.txt", "psm init", true,
                       psm_error_get_string(err));
        return NULL;
    }

    /* Complete PSM initialization */
    ompi_mtl_psm_module_init(local_rank, num_local_procs);

    /* Extra bytes the PSM request needs beyond the base MTL request. */
    ompi_mtl_psm.super.mtl_request_size =
        sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t);

    /* don't register the err handler until we know we will be active */
    err = psm_error_register_handler(NULL /* no ep */, PSM_ERRHANDLER_NOP);
    if (err) {
        opal_output(0, "Error in psm_error_register_handler (error %s)\n",
                    psm_error_get_string(err));
        return NULL;
    }

    return &ompi_mtl_psm.super;
}
/*
 * Insert endpoint ids into the address vector and connect to them
 * (simpler variant: no FI_EVENT/FI_SYNC_ERR reporting).  For FI_AV_TABLE
 * the epids are copied into the AV's own storage and `addr`/`fi_addr` are
 * re-pointed at it; `result` then receives table indices.  Already-
 * connected epids are masked out of psm_ep_connect() since connecting
 * twice is fatal in PSM.  Returns the number of successfully inserted
 * addresses, or a negative fi_errno on allocation failure.
 */
static int psmx_av_insert(struct fid_av *av, const void *addr, size_t count,
                          fi_addr_t *fi_addr, uint64_t flags, void *context)
{
    struct psmx_fid_av *av_priv;
    psm_error_t *errors;
    int error_count = 0;
    int *mask;
    int i, j;
    fi_addr_t *result = NULL;
    struct psmx_epaddr_context *epaddr_context;

    av_priv = container_of(av, struct psmx_fid_av, av);

    errors = (psm_error_t *) calloc(count, sizeof *errors);
    if (!errors)
        return -FI_ENOMEM;

    mask = (int *) calloc(count, sizeof *mask);
    if (!mask) {
        free(errors);
        return -FI_ENOMEM;
    }

    if (av_priv->type == FI_AV_TABLE) {
        if (psmx_av_check_table_size(av_priv, count)) {
            free(mask);
            free(errors);
            return -FI_ENOMEM;
        }

        for (i=0; i<count; i++)
            av_priv->psm_epids[av_priv->last + i] = ((psm_epid_t *)addr)[i];

        /* Work on the AV's own arrays; `result` keeps the caller's output
         * array for the index fill-in below. */
        result = fi_addr;
        addr = (const void *)(av_priv->psm_epids + av_priv->last);
        fi_addr = (fi_addr_t *)(av_priv->psm_epaddrs + av_priv->last);
    }

    /* prevent connecting to the same ep twice, which is fatal in PSM */
    for (i=0; i<count; i++) {
        psm_epconn_t epconn;
        if (psm_ep_epid_lookup(((psm_epid_t *) addr)[i], &epconn) == PSM_OK) {
            epaddr_context = psm_epaddr_getctxt(epconn.addr);
            if (epaddr_context &&
                epaddr_context->epid == ((psm_epid_t *) addr)[i])
                ((psm_epaddr_t *) fi_addr)[i] = epconn.addr;
            else
                mask[i] = 1;
        } else {
            mask[i] = 1;
        }
    }

    /* 30-second timeout, in nanoseconds. */
    psm_ep_connect(av_priv->domain->psm_ep, count, (psm_epid_t *) addr,
                   mask, errors, (psm_epaddr_t *) fi_addr, 30*1e9);

    for (i=0; i<count; i++){
        if (!mask[i])
            continue;      /* already connected — nothing to record */

        if (errors[i] == PSM_OK ||
            errors[i] == PSM_EPID_ALREADY_CONNECTED) {
            psmx_set_epaddr_context(av_priv->domain,
                                    ((psm_epid_t *) addr)[i],
                                    ((psm_epaddr_t *) fi_addr)[i]);
        } else {
            FI_INFO(&psmx_prov, FI_LOG_AV,
                    "%d: psm_ep_connect returned %s. remote epid=%lx.\n",
                    i, psm_error_get_string(errors[i]),
                    ((psm_epid_t *)addr)[i]);
            if (((psm_epid_t *)addr)[i] == 0)
                FI_INFO(&psmx_prov, FI_LOG_AV,
                        "does the application depend on the provider"
                        "to resolve IP address into endpoint id? if so"
                        "check if the name server has started correctly"
                        "at the other side.\n");
            fi_addr[i] = FI_ADDR_NOTAVAIL;
            error_count++;
        }
    }

    free(mask);
    free(errors);

    if (av_priv->type == FI_AV_TABLE) {
        /* NOTE: unresolved addresses are left in the AV table */
        if (result) {
            for (i=0; i<count; i++) {
                j = av_priv->last + i;
                if ((fi_addr_t)av_priv->psm_epaddrs[j] == FI_ADDR_NOTAVAIL)
                    result[i] = FI_ADDR_NOTAVAIL;
                else
                    result[i] = j;
            }
        }
        av_priv->last += count;
    }

    return count - error_count;
}