static mca_mtl_base_module_t * ompi_mtl_psm2_component_init(bool enable_progress_threads, bool enable_mpi_threads) { psm_error_t err; int verno_major = PSM_VERNO_MAJOR; int verno_minor = PSM_VERNO_MINOR; int local_rank = -1, num_local_procs = 0; int num_total_procs = 0; /* Compute the total number of processes on this host and our local rank * on that node. We need to provide PSM with these values so it can * allocate hardware contexts appropriately across processes. */ if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) { opal_output(0, "Cannot determine number of local processes. " "Cannot continue.\n"); return NULL; } if (OMPI_SUCCESS != get_local_rank(&local_rank)) { opal_output(0, "Cannot determine local rank. Cannot continue.\n"); return NULL; } if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) { opal_output(0, "Cannot determine total number of processes. " "Cannot continue.\n"); return NULL; } err = psm_error_register_handler(NULL /* no ep */, PSM_ERRHANDLER_NOP); if (err) { opal_output(0, "Error in psm_error_register_handler (error %s)\n", psm_error_get_string(err)); return NULL; } if (num_local_procs == num_total_procs) { setenv("PSM_DEVICES", "self,shm", 0); } err = psm_init(&verno_major, &verno_minor); if (err) { opal_show_help("help-mtl-psm.txt", "psm init", true, psm_error_get_string(err)); return NULL; } /* Complete PSM initialization */ ompi_mtl_psm2_module_init(local_rank, num_local_procs); ompi_mtl_psm2.super.mtl_request_size = sizeof(mca_mtl_psm2_request_t) - sizeof(struct mca_mtl_request_t); return &ompi_mtl_psm2.super; }
static int psmx_init_lib(void) { int major, minor; int ret = 0, err; if (psmx_lib_initialized) return 0; pthread_mutex_lock(&psmx_lib_mutex); if (psmx_lib_initialized) goto out; psm_error_register_handler(NULL, PSM_ERRHANDLER_NO_HANDLER); major = PSM_VERNO_MAJOR; minor = PSM_VERNO_MINOR; err = psm_init(&major, &minor); if (err != PSM_OK) { FI_WARN(&psmx_prov, FI_LOG_CORE, "psm_init failed: %s\n", psm_error_get_string(err)); ret = err; goto out; } FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM header version = (%d, %d)\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR); FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM library version = (%d, %d)\n", major, minor); if (major != PSM_VERNO_MAJOR) { psmx_am_compat_mode = 1; FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM AM compat mode enabled: appliation %d.%d, library %d.%d.\n", PSM_VERNO_MAJOR, PSM_VERNO_MINOR, major, minor); } if (major > 1) { psmx_compat_lib = 1; FI_INFO(&psmx_prov, FI_LOG_CORE, "PSM is supported via the psm2-compat library over PSM2.\n"); } psmx_lib_initialized = 1; out: pthread_mutex_unlock(&psmx_lib_mutex); return ret; }
int initialize(void) { int verno_major = PSM_VERNO_MAJOR; int verno_minor = PSM_VERNO_MINOR; psm_ep_errhandler_t handler; int err; //err = psm_error_register_handler(NULL, // Global handler // PSM_ERRHANDLER_NO_HANDLER); // return errors if (err) { fprintf(stderr, "t register global handler: %s\n", psm_error_get_string(err)); return -1; } err = psm_init(&verno_major, &verno_minor); if (err || verno_major > PSM_VERNO_MAJOR) { if (err) fprintf(stderr, "PSM initialization failure: %s\n", psm_error_get_string(err)); else fprintf(stderr, "PSM loaded an unexpected/unsupported " "version (%d.%d)\n", verno_major, verno_minor); return -1; } // We were able to initialize PSM but will defer all further error // handling since most of the errors beyond this point will be fatal. handler = error_handler; err = psm_error_register_handler(NULL, // Global handler // PSM_ERRHANDLER_PSM_HANDLER); handler); if (err) { fprintf(stderr, "t register global errhandler: %s\n", psm_error_get_string(err)); return -1; } return 0; }
static int pspsm_init(void) { static pspsm_init_state_t init_state = PSPSM_INIT_START; int verno_minor = PSM_VERNO_MINOR; int verno_major = PSM_VERNO_MAJOR; psm_error_t ret; if (init_state == PSPSM_INIT_START) { /* Check for an available /dev/ipath */ ret = pspsm_check_dev_ipath(); if (ret != 0) { goto err_dev_ipath; } ret = psm_init(&verno_major, &verno_minor); if (ret != PSM_OK) { goto err_init; } /* * All processes wanting to communicate need to use * the same UUID. * * It is unclear whether there are drawbacks from * simply using the same UUID for groups of processes * that will never communicate. * * On top of a constant fill pattern, we use: * * - PSP_PSM_UNIQ_ID if set and not zero, or * - PMI_ID, if set and not zero - that's not entirely * clean, but a practical solution for MPI apps (as * long as we do not implement communication between * two sets of MPI processes not sharing a * communicator). */ memset(pspsm_uuid.as_uuid, DEFAULT_UUID_PATTERN, sizeof(pspsm_uuid.as_uuid)); if (pscom.env.psm_uniq_id) { pspsm_dprint(2, "seeding PSM UUID with %u", pscom.env.psm_uniq_id); pspsm_uuid.as_uint = pscom.env.psm_uniq_id; } /* Open the endpoint here in init with the hope that every mpi rank call indirect psm_ep_open() before transmitting any data from or to this endpoint. This is to avoid a race condition in libpsm_infinipath. Downside: We consume PSM Contexts even in the case of only local communication. You could use PSP_PSM=0 in this case. */ if (pspsm_open_endpoint()) goto err_ep; if (pspsm_init_mq()) goto err_mq; pspsm_dprint(2, "pspsm_init: OK"); init_state = PSPSM_INIT_DONE; } return init_state; /* 0 = success, -1 = error */ err_dev_ipath: pspsm_dprint(2, "pspsm_init: No \"/dev/ipath\" found. Arch psm is disabled."); goto err_exit; err_init: pspsm_err(psm_error_get_string(ret)); pspsm_dprint(1, "pspsm_init: %s", pspsm_err_str); // Fall through err_ep: err_mq: err_exit: init_state = PSPSM_INIT_FAILED; return init_state; /* 0 = success, -1 = error */ }
static mca_mtl_base_module_t * ompi_mtl_psm_component_init(bool enable_progress_threads, bool enable_mpi_threads) { psm_error_t err; int verno_major = PSM_VERNO_MAJOR; int verno_minor = PSM_VERNO_MINOR; int local_rank = -1, num_local_procs = 0; int num_total_procs = 0; /* Compute the total number of processes on this host and our local rank * on that node. We need to provide PSM with these values so it can * allocate hardware contexts appropriately across processes. */ if (OMPI_SUCCESS != get_num_local_procs(&num_local_procs)) { opal_output(0, "Cannot determine number of local processes. " "Cannot continue.\n"); return NULL; } if (OMPI_SUCCESS != get_local_rank(&local_rank)) { opal_output(0, "Cannot determine local rank. Cannot continue.\n"); return NULL; } if (OMPI_SUCCESS != get_num_total_procs(&num_total_procs)) { opal_output(0, "Cannot determine total number of processes. " "Cannot continue.\n"); return NULL; } #if PSM_VERNO >= 0x010c /* Set infinipath debug level */ err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG, (const void*) &ompi_mtl_psm.debug_level, sizeof(unsigned)); if (err) { /* Non fatal error. Can continue */ opal_show_help("help-mtl-psm.txt", "psm init", false, psm_error_get_string(err)); } #endif if (getenv("PSM_DEVICES") == NULL) { /* Only allow for shm and ipath devices in 2.0 and earlier releases * (unless the user overrides the setting). */ if (PSM_VERNO >= 0x0104) { if (num_local_procs == num_total_procs) { setenv("PSM_DEVICES", "self,shm", 0); } else { setenv("PSM_DEVICES", "self,shm,ipath", 0); } } else { if (num_local_procs == num_total_procs) { setenv("PSM_DEVICES", "shm", 0); } else { setenv("PSM_DEVICES", "shm,ipath", 0); } } } err = psm_init(&verno_major, &verno_minor); if (err) { opal_show_help("help-mtl-psm.txt", "psm init", true, psm_error_get_string(err)); return NULL; } /* Complete PSM initialization */ ompi_mtl_psm_module_init(local_rank, num_local_procs); ompi_mtl_psm.super.mtl_request_size = sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t); /* don't register the err handler until we know we will be active */ err = psm_error_register_handler(NULL /* no ep */, PSM_ERRHANDLER_NOP); if (err) { opal_output(0, "Error in psm_error_register_handler (error %s)\n", psm_error_get_string(err)); return NULL; } return &ompi_mtl_psm.super; }
static mca_mtl_base_module_t* ompi_mtl_psm_component_init(bool enable_progress_threads, bool enable_mpi_threads) { psm_error_t err; int rc; int verno_major = PSM_VERNO_MAJOR; int verno_minor = PSM_VERNO_MINOR; ompi_proc_t *my_proc, **procs; size_t num_total_procs, proc; int local_rank = -1, num_local_procs = 0; /* Compute the total number of processes on this host and our local rank * on that node. We need to provide PSM with these values so it can * allocate hardware contexts appropriately across processes. */ if ((rc = ompi_proc_refresh()) != OMPI_SUCCESS) { return NULL; } my_proc = ompi_proc_local(); if (NULL == (procs = ompi_proc_world(&num_total_procs))) { return NULL; } for (proc = 0; proc < num_total_procs; proc++) { if (my_proc == procs[proc]) { local_rank = num_local_procs++; continue; } if (OPAL_PROC_ON_LOCAL_NODE(procs[proc]->proc_flags)) { num_local_procs++; } } assert(local_rank >= 0 && num_local_procs > 0); free(procs); err = psm_error_register_handler(NULL /* no ep */, PSM_ERRHANDLER_NOP); if (err) { opal_output(0, "Error in psm_error_register_handler (error %s)\n", psm_error_get_string(err)); return NULL; } #if PSM_VERNO >= 0x010c /* Set infinipath debug level */ err = psm_setopt(PSM_COMPONENT_CORE, 0, PSM_CORE_OPT_DEBUG, (const void*) &ompi_mtl_psm.debug_level, sizeof(unsigned)); if (err) { /* Non fatal error. Can continue */ orte_show_help("help-mtl-psm.txt", "psm init", false, psm_error_get_string(err)); } #endif /* Only allow for shm and ipath devices in 2.0 and earlier releases * (unless the user overrides the setting). */ if (PSM_VERNO >= 0x0104) { setenv("PSM_DEVICES", "self,shm,ipath", 0); } else { setenv("PSM_DEVICES", "shm,ipath", 0); } err = psm_init(&verno_major, &verno_minor); if (err) { orte_show_help("help-mtl-psm.txt", "psm init", true, psm_error_get_string(err)); return NULL; } /* Complete PSM initialization */ ompi_mtl_psm_module_init(local_rank, num_local_procs); ompi_mtl_psm.super.mtl_request_size = sizeof(mca_mtl_psm_request_t) - sizeof(struct mca_mtl_request_t); return &ompi_mtl_psm.super; }