static int rte_init(void) { int ret; char *error = NULL; char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* Start by getting a unique name */ lsf_set_name(); /* if I am a daemon, complete my setup using the * default procedure */ if (ORTE_PROC_IS_DAEMON) { if (NULL != orte_node_regex) { /* extract the nodes */ if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) { error = "orte_regex_extract_node_names"; goto error; } } if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } opal_argv_free(hosts); return ORTE_SUCCESS; } if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; } /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; } /* otherwise, I must be an application process - use * the default procedure to finish my setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup the nidmap arrays */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int rte_init(void) { int ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; opal_value_t *kv; char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; char **peers=NULL, *mycpuset; opal_process_name_t wildcard_rank, pname; bool bool_val, *bool_ptr = &bool_val, tdir_mca_override = false; size_t i; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* get an async event base - we use the opal_async one so * we don't startup extra threads if not needed */ orte_event_base = opal_progress_thread_init(NULL); progress_thread_running = true; /* open and setup pmix */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { ORTE_ERROR_LOG(ret); /* we cannot run */ error = "pmix init"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) { /* we cannot run */ error = "pmix init"; goto error; } /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init(NULL)))) { /* we cannot run - this could be due to being direct launched * without the required PMI support being built. Try to detect * that scenario and warn the user */ if (ORTE_SCHIZO_DIRECT_LAUNCHED == orte_schizo.check_launch_environment() && NULL != (envar = getenv("ORTE_SCHIZO_DETECTION"))) { if (0 == strcmp(envar, "SLURM")) { /* yes to both - so emit a hopefully helpful * error message and abort */ orte_show_help_finalize(); orte_show_help("help-ess-base.txt", "slurm-error", true); return ORTE_ERR_SILENT; } else if (0 == strcmp(envar, "ALPS")) { /* we were direct launched by ALPS */ orte_show_help_finalize(); orte_show_help("help-ess-base.txt", "alps-error", true); return ORTE_ERR_SILENT; } } error = "pmix init"; goto error; } u32ptr = &u32; u16ptr = &u16; /**** THE FOLLOWING ARE REQUIRED VALUES ***/ /* pmix.init set our process name down in the OPAL layer, * so carry it forward here */ ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; /* setup a name for retrieving data associated with the job */ wildcard_rank.jobid = ORTE_PROC_MY_NAME->jobid; wildcard_rank.vpid = ORTE_NAME_WILDCARD->vpid; /* setup a name for retrieving proc-specific data */ pname.jobid = ORTE_PROC_MY_NAME->jobid; pname.vpid = 0; /* get our local rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting local rank"; goto error; } orte_process_info.my_local_rank = u16; /* get our node rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting node rank"; goto error; } orte_process_info.my_node_rank = u16; /* get max procs for this application */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_MAX_PROCS, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting max procs"; goto error; } orte_process_info.max_procs = u32; /* get job size */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_JOB_SIZE, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting job size"; goto error; } orte_process_info.num_procs = u32; /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.app_num = u32; } else { orte_process_info.app_num = 0; } /* get the number of local peers - required for wireup of * shared memory BTL */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.num_local_peers = u32 - 1; // want number besides ourselves } else { orte_process_info.num_local_peers = 0; } /* get number of nodes in the job */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NUM_NODES, &wildcard_rank, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.num_nodes = u32; } /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } opal_output_verbose(2, orte_ess_base_framework.framework_output, "%s transport key %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), string_key); asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } /* retrieve temp directories info */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.top_session_dir ){ orte_process_info.top_session_dir = val; } else { /* keep the MCA setting */ tdir_mca_override = true; free(val); } val = NULL; } if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.job_session_dir ){ orte_process_info.job_session_dir = val; } else { /* keep the MCA setting */ free(val); tdir_mca_override = true; } val = NULL; } } if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* We want to provide user with ability * to override RM settings at his own risk */ if( NULL == orte_process_info.proc_session_dir ){ orte_process_info.proc_session_dir = val; } else { /* keep the MCA setting */ tdir_mca_override = true; free(val); } val = NULL; } } if( !tdir_mca_override ){ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TDIR_RMCLEAN, &wildcard_rank, &bool_ptr, OPAL_BOOL); if (OPAL_SUCCESS == ret ) { orte_process_info.rm_session_dirs = bool_val; } } /* get our local peers */ if (0 < orte_process_info.num_local_peers) { /* if my local rank if too high, then that's an error */ if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) { ret = ORTE_ERR_BAD_PARAM; error = "num local peers"; goto error; } /* retrieve the local peers */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, &wildcard_rank, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); } else { peers = NULL; } } else { peers = NULL; } /* set the locality */ if (NULL != peers) { /* identify our location */ val = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { mycpuset = val; } else { mycpuset = NULL; } pname.jobid = ORTE_PROC_MY_NAME->jobid; for (i=0; NULL != peers[i]; i++) { pname.vpid = strtoul(peers[i], NULL, 10); if (pname.vpid == ORTE_PROC_MY_NAME->vpid) { /* we are fully local to ourselves */ u16 = OPAL_PROC_ALL_LOCAL; } else { val = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING, &pname, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { u16 = opal_hwloc_compute_relative_locality(mycpuset, val); free(val); } else { /* all we can say is that it shares our node */ u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } } kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCALITY); kv->type = OPAL_UINT16; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, "%s ess:pmi:locality: proc %s locality %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&pname), opal_hwloc_base_print_locality(u16))); kv->data.uint16 = u16; ret = opal_pmix.store_local(&pname, kv); if (OPAL_SUCCESS != ret) { error = "local store of locality"; opal_argv_free(peers); if (NULL != mycpuset) { free(mycpuset); } goto error; } OBJ_RELEASE(kv); } opal_argv_free(peers); if (NULL != mycpuset) { free(mycpuset); } } /* now that we have all required info, complete the setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* this needs to be set to enable debugger use when direct launched */ if (NULL == orte_process_info.my_daemon_uri) { orte_standalone_operation = true; } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /* push our hostname so others can find us, if they need to - the * native PMIx component will ignore this request as the hostname * is provided by the system */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "db store hostname"; goto error; } /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. * Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { /* need to commit the data before we fence */ opal_pmix.commit(); opal_pmix.fence(NULL, 0); } return ORTE_SUCCESS; error: if (!progress_thread_running) { /* can't send the help message, so ensure it * comes out locally */ orte_show_help_finalize(); } if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int rte_init(void) { int ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; char *rmluri; opal_value_t *kv, kvn; opal_list_t vals; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* we don't have to call pmix.init because the pmix select did it */ /**** THE FOLLOWING ARE REQUIRED VALUES ***/ /* get our jobid from PMI */ if (!opal_pmix.get_attr(PMIX_JOBID, &kv)) { error = "getting jobid"; ret = ORTE_ERR_NOT_FOUND; goto error; } ORTE_PROC_MY_NAME->jobid = kv->data.uint32; OBJ_RELEASE(kv); /* get our global rank from PMI */ if (!opal_pmix.get_attr(PMIX_RANK, &kv)) { error = "getting rank"; ret = ORTE_ERR_NOT_FOUND; goto error; } ORTE_PROC_MY_NAME->vpid = kv->data.uint32; OBJ_RELEASE(kv); /* get our local rank from PMI */ if (!opal_pmix.get_attr(PMIX_LOCAL_RANK, &kv)) { error = "getting local rank"; ret = ORTE_ERR_NOT_FOUND; goto error; } orte_process_info.my_local_rank = (orte_local_rank_t)kv->data.uint16; OBJ_RELEASE(kv); /* get our node rank from PMI */ if (!opal_pmix.get_attr(PMIX_NODE_RANK, &kv)) { error = "getting node rank"; ret = ORTE_ERR_NOT_FOUND; goto error; } orte_process_info.my_node_rank = (orte_local_rank_t)kv->data.uint16; /* get universe size */ if (!opal_pmix.get_attr(PMIX_UNIV_SIZE, &kv)) { error = "getting univ size"; ret = ORTE_ERR_NOT_FOUND; goto error; } orte_process_info.num_procs = kv->data.uint32; OBJ_RELEASE(kv); /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ if (opal_pmix.get_attr(PMIX_APPNUM, &kv)) { orte_process_info.app_num = kv->data.uint32; OBJ_RELEASE(kv); } else { orte_process_info.app_num = 0; } /* get the number of local peers - required for wireup of * shared memory BTL */ if (opal_pmix.get_attr(PMIX_LOCAL_SIZE, &kv)) { orte_process_info.num_local_peers = kv->data.uint32 - 1; // want number besides ourselves OBJ_RELEASE(kv); } else { orte_process_info.num_local_peers = 0; } /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } #if OPAL_HAVE_HWLOC /* if it wasn't passed down to us, get the topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "topology discovery"; goto error; } } #endif /* we don't need to force the routed system to pick the * "direct" component as that should happen automatically * in those cases where we are direct launched (i.e., no * HNP is defined in the environment */ /* now that we have all required info, complete the setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* this needs to be set to enable debugger use when direct launched */ if (NULL == orte_process_info.my_daemon_uri) { orte_standalone_operation = true; } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /*** PUSH DATA FOR OTHERS TO FIND ***/ /* if we are direct launched, then push our RML URI - there * is no need to do so when launched by mpirun as all apps * communicate thru their local daemon */ if (orte_standalone_operation) { OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, OPAL_DSTORE_URI, &vals)) { /* construct the RTE string */ rmluri = orte_rml.get_contact_info(); /* push it out for others to use */ OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_URI); kvn.type = OPAL_STRING; kvn.data.string = strdup(rmluri); if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store uri"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); free(rmluri); } OPAL_LIST_DESTRUCT(&vals); } /* push our hostname so others can find us, if they need to */ OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_HOSTNAME); kvn.type = OPAL_STRING; kvn.data.string = strdup(orte_process_info.nodename); if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store hostname"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); /* if our local rank was not provided by the system, then * push our local rank so others can access it */ OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, OPAL_DSTORE_LOCALRANK, &vals)) { OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_LOCALRANK); kvn.type = OPAL_UINT16; kvn.data.uint16 = orte_process_info.my_local_rank; if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store local rank"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); } OPAL_LIST_DESTRUCT(&vals); /* if our node rank was not provided by the system, then * push our node rank so others can access it */ OBJ_CONSTRUCT(&vals, opal_list_t); if (OPAL_SUCCESS != opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, OPAL_DSTORE_NODERANK, &vals)) { OBJ_CONSTRUCT(&kvn, opal_value_t); kvn.key = strdup(OPAL_DSTORE_NODERANK); kvn.type = OPAL_UINT16; kvn.data.uint16 = orte_process_info.my_node_rank; if (ORTE_SUCCESS != (ret = opal_pmix.put(PMIX_GLOBAL, &kvn))) { error = "db store node rank"; OBJ_DESTRUCT(&kvn); goto error; } OBJ_DESTRUCT(&kvn); } OPAL_LIST_DESTRUCT(&vals); /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. * Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { opal_pmix.fence(NULL, 0); } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
int orte_ess_base_app_setup(bool db_restrict_local) { int ret; char *error = NULL; opal_list_t transports; OPAL_TIMING_ENV_INIT(ess_base_setup); /* * stdout/stderr buffering * If the user requested to override the default setting then do * as they wish. */ if( orte_ess_base_std_buffering > -1 ) { if( 0 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IONBF, 0); setvbuf(stderr, NULL, _IONBF, 0); } else if( 1 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); } else if( 2 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IOFBF, 0); setvbuf(stderr, NULL, _IOFBF, 0); } } /* if I am an MPI app, we will let the MPI layer define and * control the opal_proc_t structure. Otherwise, we need to * do so here */ if (ORTE_PROC_NON_MPI) { orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME; orte_process_info.super.proc_hostname = orte_process_info.nodename; orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL; orte_process_info.super.proc_arch = opal_local_arch; opal_proc_local_set(&orte_process_info.super); } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "state_framework_open"); /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "errmgr_framework_open"); /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* register the directory for cleanup */ if (NULL != opal_pmix.register_cleanup) { if (orte_standalone_operation) { if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.top_session_dir, true, false, true))) { ORTE_ERROR_LOG(ret); error = "register cleanup"; goto error; } } else { if (OPAL_SUCCESS != (ret = opal_pmix.register_cleanup(orte_process_info.job_session_dir, true, false, false))) { ORTE_ERROR_LOG(ret); error = "register cleanup"; goto error; } } } } OPAL_TIMING_ENV_NEXT(ess_base_setup, "create_session_dirs"); /* Setup the communication infrastructure */ /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "routed_framework_open"); /* * OOB Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "oob_framework_open"); /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "rml_framework_open"); /* if we have info on the HNP and local daemon, process it */ if (NULL != orte_process_info.my_hnp_uri) { /* we have to set the HNP's name, even though we won't route messages directly * to it. This is required to ensure that we -do- send messages to the correct * HNP name */ if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri, ORTE_PROC_MY_HNP, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_rml_parse_HNP"; goto error; } } if (NULL != orte_process_info.my_daemon_uri) { opal_value_t val; /* extract the daemon's name so we can update the routing table */ if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri, ORTE_PROC_MY_DAEMON, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_rml_parse_daemon"; goto error; } /* Set the contact info in the database - this won't actually establish * the connection, but just tells us how to reach the daemon * if/when we attempt to send to it */ OBJ_CONSTRUCT(&val, opal_value_t); val.key = OPAL_PMIX_PROC_URI; val.type = OPAL_STRING; val.data.string = orte_process_info.my_daemon_uri; if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_DAEMON, &val))) { ORTE_ERROR_LOG(ret); val.key = NULL; val.data.string = NULL; OBJ_DESTRUCT(&val); error = "store DAEMON URI"; goto error; } val.key = NULL; val.data.string = NULL; OBJ_DESTRUCT(&val); } /* setup the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "errmgr_select"); /* get a conduit for our use - we never route IO over fabric */ OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); if (ORTE_RML_CONDUIT_INVALID == (orte_mgmt_conduit = orte_rml.open_conduit(&transports))) { ret = ORTE_ERR_OPEN_CONDUIT_FAIL; error = "orte_rml_open_mgmt_conduit"; goto error; } OPAL_LIST_DESTRUCT(&transports); OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING); if (ORTE_RML_CONDUIT_INVALID == (orte_coll_conduit = orte_rml.open_conduit(&transports))) { ret = ORTE_ERR_OPEN_CONDUIT_FAIL; error = "orte_rml_open_coll_conduit"; goto error; } OPAL_LIST_DESTRUCT(&transports); OPAL_TIMING_ENV_NEXT(ess_base_setup, "rml_open_conduit"); /* * Group communications */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "grpcomm_framework_open"); /* open the distributed file system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_select"; goto error; } OPAL_TIMING_ENV_NEXT(ess_base_setup, "dfs_framework_open"); return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ret; }
static int rte_init(void) { int ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; opal_value_t *kv; char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; char **peers=NULL, *mycpuset, **cpusets=NULL; opal_process_name_t name; size_t i; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* get an async event base - we use the opal_async one so * we don't startup extra threads if not needed */ orte_event_base = opal_progress_thread_init(NULL); progress_thread_running = true; /* open and setup pmix */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { ORTE_ERROR_LOG(ret); /* we cannot run */ error = "pmix init"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) { /* we cannot run */ error = "pmix init"; goto error; } /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) { /* we cannot run */ error = "pmix init"; goto error; } u32ptr = &u32; u16ptr = &u16; /**** THE FOLLOWING ARE REQUIRED VALUES ***/ /* pmix.init set our process name down in the OPAL layer, * so carry it forward here */ ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; /* get our local rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting local rank"; goto error; } orte_process_info.my_local_rank = u16; /* get our node rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting node rank"; goto error; } orte_process_info.my_node_rank = u16; /* get universe size */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_UNIV_SIZE, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting univ size"; goto error; } orte_process_info.num_procs = u32; /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_APPNUM, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.app_num = u32; } else { orte_process_info.app_num = 0; } /* get the number of local peers - required for wireup of * shared memory BTL */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_SIZE, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.num_local_peers = u32 - 1; // want number besides ourselves } else { orte_process_info.num_local_peers = 0; } /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } /* retrieve our topology */ val = NULL; OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* load the topology */ if (0 != hwloc_topology_init(&opal_hwloc_topology)) { ret = OPAL_ERROR; free(val); error = "setting topology"; goto error; } if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) { ret = OPAL_ERROR; free(val); hwloc_topology_destroy(opal_hwloc_topology); error = "setting topology"; goto error; } /* since we are loading this from an external source, we have to * explicitly set a flag so hwloc sets things up correctly */ if (0 != hwloc_topology_set_flags(opal_hwloc_topology, (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) { ret = OPAL_ERROR; hwloc_topology_destroy(opal_hwloc_topology); free(val); error = "setting topology"; goto error; } /* now load the topology */ if (0 != hwloc_topology_load(opal_hwloc_topology)) { ret = OPAL_ERROR; hwloc_topology_destroy(opal_hwloc_topology); free(val); error = "setting topology"; goto error; } free(val); /* filter the cpus thru any default cpu set */ if (OPAL_SUCCESS != (ret = opal_hwloc_base_filter_cpus(opal_hwloc_topology))) { error = "filtering topology"; goto error; } } else { /* it wasn't passed down to us, so go get it */ if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "topology discovery"; goto error; } /* push it into the PMIx database in case someone * tries to retrieve it so we avoid an attempt to * get it again */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCAL_TOPO); kv->type = OPAL_STRING; if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) { error = "topology export"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) { error = "topology store"; goto error; } OBJ_RELEASE(kv); } /* get our local peers */ if (0 < orte_process_info.num_local_peers) { /* if my local rank if too high, then that's an error */ if (orte_process_info.num_local_peers < orte_process_info.my_local_rank) { ret = ORTE_ERR_BAD_PARAM; error = "num local peers"; goto error; } /* retrieve the local peers */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_PEERS, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { peers = opal_argv_split(val, ','); free(val); /* and their cpusets, if available */ OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_CPUSETS, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { cpusets = opal_argv_split(val, ':'); free(val); } else { cpusets = NULL; } } else { peers = NULL; cpusets = NULL; } } else { peers = NULL; cpusets = NULL; } /* set the locality */ if (NULL != peers) { /* indentify our cpuset */ if (NULL != cpusets) { mycpuset = cpusets[orte_process_info.my_local_rank]; } else { mycpuset = NULL; } name.jobid = ORTE_PROC_MY_NAME->jobid; for (i=0; NULL != peers[i]; i++) { kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCALITY); kv->type = OPAL_UINT16; name.vpid = strtoul(peers[i], NULL, 10); if (name.vpid == ORTE_PROC_MY_NAME->vpid) { /* we are fully local to ourselves */ u16 = OPAL_PROC_ALL_LOCAL; } else if (NULL == mycpuset || NULL == cpusets[i] || 0 == strcmp(cpusets[i], "UNBOUND")) { /* all we can say is that it shares our node */ u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE; } else { /* we have it, so compute the locality */ u16 = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, mycpuset, cpusets[i]); } OPAL_OUTPUT_VERBOSE((1, orte_ess_base_framework.framework_output, "%s ess:pmi:locality: proc %s locality %x", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&name), u16)); kv->data.uint16 = u16; ret = opal_pmix.store_local(&name, kv); if (OPAL_SUCCESS != ret) { error = "local store of locality"; opal_argv_free(peers); opal_argv_free(cpusets); goto error; } OBJ_RELEASE(kv); } opal_argv_free(peers); opal_argv_free(cpusets); } /* now that we have all required info, complete the setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(false))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* this needs to be set to enable debugger use when direct launched */ if (NULL == orte_process_info.my_daemon_uri) { orte_standalone_operation = true; } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /*** PUSH DATA FOR OTHERS TO FIND ***/ /* push our hostname so others can find us, if they need to */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "db store hostname"; goto error; } /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. * Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { opal_pmix.fence(NULL, 0); } return ORTE_SUCCESS; error: if (!progress_thread_running) { /* can't send the help message, so ensure it * comes out locally */ orte_show_help_finalize(); } if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
/* * Initialize global variables used w/in the server. */ int pmix_server_init(void) { int rc; opal_list_t info; opal_value_t *kv; if (orte_pmix_server_globals.initialized) { return ORTE_SUCCESS; } orte_pmix_server_globals.initialized = true; /* setup the server's state variables */ OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t); if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs, orte_pmix_server_globals.num_rooms, orte_event_base, orte_pmix_server_globals.timeout*1000000, ORTE_ERROR_PRI, eviction_cbfunc))) { ORTE_ERROR_LOG(rc); return rc; } OBJ_CONSTRUCT(&orte_pmix_server_globals.notifications, opal_list_t); /* setup recv for direct modex requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX, ORTE_RML_PERSISTENT, pmix_server_dmdx_recv, NULL); /* setup recv for replies to direct modex requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX_RESP, ORTE_RML_PERSISTENT, pmix_server_dmdx_resp, NULL); /* setup recv for replies to proxy launch requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP, ORTE_RML_PERSISTENT, pmix_server_launch_resp, NULL); /* setup recv for replies from data server */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_CLIENT, ORTE_RML_PERSISTENT, pmix_server_keyval_client, NULL); /* setup recv for notifications */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFICATION, ORTE_RML_PERSISTENT, pmix_server_notify, NULL); /* ensure the PMIx server uses the proper rendezvous directory */ opal_setenv("PMIX_SERVER_TMPDIR", orte_process_info.proc_session_dir, true, &environ); /* pass the server the local topology - we do this so the procs won't read the * topology themselves as this could overwhelm the local * system on large-scale SMPs */ OBJ_CONSTRUCT(&info, opal_list_t); if (NULL != opal_hwloc_topology) { char *xmlbuffer=NULL; int len; kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCAL_TOPO); if (0 != hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len)) { OBJ_RELEASE(kv); OBJ_DESTRUCT(&info); return ORTE_ERROR; } kv->data.string = xmlbuffer; kv->type = OPAL_STRING; opal_list_append(&info, &kv->super); } /* tell the server to allow tool connections */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_SERVER_TOOL_SUPPORT); kv->type = OPAL_BOOL; kv->data.flag = true; opal_list_append(&info, &kv->super); /* tell the server our temp directory */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR); kv->type = OPAL_STRING; kv->data.string = opal_os_path(false, orte_process_info.jobfam_session_dir, NULL); opal_list_append(&info, &kv->super); /* use the same for the system temp directory - this is * where the system-level tool connections will go */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR); kv->type = OPAL_STRING; kv->data.string = strdup(orte_process_info.tmpdir_base); opal_list_append(&info, &kv->super); /* setup the local server */ if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) { /* pmix will provide a nice show_help output here */ return rc; } OPAL_LIST_DESTRUCT(&info); /* if the universal server wasn't specified, then we use * our own HNP for that purpose */ if (NULL == orte_pmix_server_globals.server_uri) { orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; } else { char *server; opal_buffer_t buf; if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || 0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { char input[1024], *filename; FILE *fp; /* it is a file - get the filename */ filename = strchr(orte_pmix_server_globals.server_uri, ':'); if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, orte_basename, orte_pmix_server_globals.server_uri); return ORTE_ERR_BAD_PARAM; } ++filename; /* space past the : */ if (0 >= strlen(filename)) { /* they forgot to give us the name! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, orte_basename, orte_pmix_server_globals.server_uri); return ORTE_ERR_BAD_PARAM; } /* open the file and extract the uri */ fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, orte_basename, orte_pmix_server_globals.server_uri); return ORTE_ERR_BAD_PARAM; } if (NULL == fgets(input, 1024, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, orte_basename, orte_pmix_server_globals.server_uri, orte_basename); return ORTE_ERR_BAD_PARAM; } fclose(fp); input[strlen(input)-1] = '\0'; /* remove newline */ server = strdup(input); } else { server = strdup(orte_pmix_server_globals.server_uri); } /* setup our route to the server */ OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.pack(&buf, &server, 1, OPAL_STRING); if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { ORTE_ERROR_LOG(rc); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return rc; } OBJ_DESTRUCT(&buf); /* parse the URI to get the server's name */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* check if we are to wait for the server to start - resolves * a race condition that can occur when the server is run * as a background job - e.g., in scripts */ if (orte_pmix_server_globals.wait_for_server) { /* ping the server */ struct timeval timeout; timeout.tv_sec = orte_pmix_server_globals.timeout; timeout.tv_usec = 0; if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) { /* try it one more time */ if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) { /* okay give up */ orte_show_help("help-orterun.txt", "orterun:server-not-found", true, orte_basename, server, (long)orte_pmix_server_globals.timeout, ORTE_ERROR_NAME(rc)); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return rc; } } } } return rc; }
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) { int ret; char *error = NULL; if (0 < orte_initialized) { /* track number of times we have been called */ orte_initialized++; return ORTE_SUCCESS; } orte_initialized++; /* initialize the opal layer */ if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) { error = "opal_init"; goto error; } /* ensure we know the type of proc for when we finalize */ orte_process_info.proc_type = flags; /* setup the locks */ if (ORTE_SUCCESS != (ret = orte_locks_init())) { error = "orte_locks_init"; goto error; } /* Register all MCA Params */ if (ORTE_SUCCESS != (ret = orte_register_params())) { error = "orte_register_params"; goto error; } /* setup the orte_show_help system */ if (ORTE_SUCCESS != (ret = orte_show_help_init())) { error = "opal_output_init"; goto error; } /* register handler for errnum -> string conversion */ opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str); /* Ensure the rest of the process info structure is initialized */ if (ORTE_SUCCESS != (ret = orte_proc_info())) { error = "orte_proc_info"; goto error; } /* open the ESS and select the correct module for this environment */ if (ORTE_SUCCESS != (ret = orte_ess_base_open())) { error = "orte_ess_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_ess_base_select())) { error = "orte_ess_base_select"; goto error; } #if ORTE_ENABLE_PROGRESS_THREADS #if OPAL_EVENT_HAVE_THREAD_SUPPORT /* get a separate orte event base */ orte_event_base = opal_event_base_create(); /* construct the thread object */ OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t); /* fork off a thread to progress it */ orte_progress_thread.t_run = orte_progress_thread_engine; if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) { error = "orte progress thread start"; goto error; } #else error = "event thread support is not configured"; ret = ORTE_ERROR; goto error; #endif #else /* set the event base to the opal one */ orte_event_base = opal_event_base; #endif /* initialize the RTE for this environment */ if (ORTE_SUCCESS != (ret = orte_ess.init())) { error = "orte_ess_init"; goto error; } /* All done */ return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int orcmd_init(void) { int ret = ORTE_ERROR; char *error = NULL; opal_buffer_t buf, *clusterbuf, *uribuf; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; opal_list_t config; orcm_scheduler_t *scheduler; orcm_node_t *mynode=NULL; int32_t n; if (initialized) { return ORCM_SUCCESS; } initialized = true; /* Initialize the ORTE data type support */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_std_prolog"; goto error; } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* create a job tracker for the daemons */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = 0; ORTE_PROC_MY_NAME->jobid = 0; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* read the site configuration */ OBJ_CONSTRUCT(&config, opal_list_t); if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) { error = "getting config"; goto error; } /* define the cluster and collect contact info for all * aggregators - we'll need to know how to talk to any * of them in case of failures */ OBJ_CONSTRUCT(&buf, opal_buffer_t); if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config, &mynode, &orte_process_info.num_procs, &buf))) { OBJ_DESTRUCT(&buf); error = "define system"; goto error; } /* if my name didn't get set, then we didn't find our node * in the config - report it and die */ if (NULL == mynode) { orte_show_help("help-ess-orcm.txt", "node-not-found", true, orcm_cfgi_base.config_file, orte_process_info.nodename); OBJ_DESTRUCT(&buf); return ORTE_ERR_SILENT; } /* define a node and proc object for ourselves as some parts * of ORTE and ORCM require it */ if (NULL == (node = OBJ_NEW(orte_node_t))) { ret = ORTE_ERR_OUT_OF_RESOURCE; error = "out of memory"; goto error; } node->name = strdup(orte_process_info.nodename); opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); if (NULL == (proc = OBJ_NEW(orte_proc_t))) { ret = ORTE_ERR_OUT_OF_RESOURCE; error = "out of memory"; goto error; } proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; OBJ_RETAIN(proc); node->daemon = proc; OBJ_RETAIN(node); proc->node = node; opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc); /* For now, we only support a single scheduler daemon in the system. * This *may* change someday in the future */ scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers); /* If we are in test mode, then we don't *require* that a scheduler * be defined in the system - otherwise, we do */ if (NULL == scheduler) { if (mca_sst_orcmd_component.scheduler_reqd) { error = "no scheduler found"; ret = ORTE_ERR_NOT_FOUND; goto error; } } else { ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid; ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid; } /* register the ORTE-level params at this time now that the * config has had a chance to push things into the environ */ if (ORTE_SUCCESS != (ret = orte_register_params())) { OBJ_DESTRUCT(&buf); error = "orte_register_params"; goto error; } /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { OBJ_DESTRUCT(&buf); error = "topology discovery"; goto error; } } /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. So * we remove that information here. */ obj = hwloc_get_root_obj(opal_hwloc_topology); for (i=0; i < obj->infos_count; i++) { if (NULL == obj->infos[i].name || NULL == obj->infos[i].value) { continue; } if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) { free(obj->infos[i].name); free(obj->infos[i].value); /* left justify the array */ for (j=i; j < obj->infos_count-1; j++) { obj->infos[j] = obj->infos[j+1]; } obj->infos[obj->infos_count-1].name = NULL; obj->infos[obj->infos_count-1].value = NULL; obj->infos_count--; break; } } if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } /* if we were asked to bind to specific core(s), do so now */ if (NULL != orte_daemon_cores) { char **cores=NULL, tmp[128]; hwloc_obj_t pu; hwloc_cpuset_t ours, pucpus, res; int core; /* could be a collection of comma-delimited ranges, so * use our handy utility to parse it */ orte_util_parse_range_options(orte_daemon_cores, &cores); if (NULL != cores) { ours = hwloc_bitmap_alloc(); hwloc_bitmap_zero(ours); pucpus = hwloc_bitmap_alloc(); res = hwloc_bitmap_alloc(); for (i=0; NULL != cores[i]; i++) { core = strtoul(cores[i], NULL, 10); if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) { orte_show_help("help-orted.txt", "orted:cannot-bind", true, orte_process_info.nodename, orte_daemon_cores); ret = ORTE_ERR_NOT_SUPPORTED; OBJ_DESTRUCT(&buf); error = "cannot bind"; goto error; } hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset); hwloc_bitmap_or(res, ours, pucpus); hwloc_bitmap_copy(ours, res); } /* if the result is all zeros, then don't bind */ if (!hwloc_bitmap_iszero(ours)) { (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0); if (opal_hwloc_report_bindings) { opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours); opal_output(0, "Daemon %s is bound to cores %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp); } } /* cleanup */ hwloc_bitmap_free(ours); hwloc_bitmap_free(pucpus); hwloc_bitmap_free(res); opal_argv_free(cores); } } } #endif /* open and select the pstat framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_state_base_select"; goto error; } /* open the notifier */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_notifier_base_open"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_errmgr_base_open"; goto error; } /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } if (!opal_list_get_size(&orte_oob_base.actives)) { ret = ORTE_ERROR; error = "orte_oob: Found 0 active transports"; goto error; } /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_select"; goto error; } /* select the notifier*/ if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_notifier_base_select"; goto error; } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orcm_db_base_open"; goto error; } /* always restrict daemons to local database components */ if (ORTE_SUCCESS != (ret = orcm_db_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orcm_db_base_select"; goto error; } /* datastore - ensure we don't pickup the pmi component, but * don't override anything set by user */ if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) { putenv(OPAL_MCA_PREFIX"dstore=^pmi"); } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "opal_dstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_dstore_base_select"; goto error; } /* create the handle */ if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) { error = "opal dstore internal"; ret = ORTE_ERR_FATAL; goto error; } /* extract the cluster description and setup the routed info - the orcm routed component * will know what to do. */ n = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "extract cluster buf"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); OBJ_RELEASE(clusterbuf); error = "orte_routed.init_routes"; goto error; } OBJ_RELEASE(clusterbuf); /* extract the uri buffer and load the hash tables */ n = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "extract uri buffer"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); OBJ_RELEASE(uribuf); error = "load hash tables"; goto error; } OBJ_DESTRUCT(&buf); OBJ_RELEASE(uribuf); /* * Group communications */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ opal_cr_set_enabled(false); if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the ANALYTICS framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orcm_analytics_base_open"; goto error; } /* setup the EVGEN framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orcm_evgen_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) { ORTE_ERROR_LOG(ret); error = "orcm_evgen_select"; goto error; } /* setup the SENSOR framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orcm_sensor_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) { ORTE_ERROR_LOG(ret); error = "orcm_sensor_select"; goto error; } /* start the local sensors */ orcm_sensor.start(ORTE_PROC_MY_NAME->jobid); /* setup the PWRMGMT framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orcm_pwrmgmt_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) { ORTE_ERROR_LOG(ret); error = "orcm_pwrmgmt_select"; goto error; } /* setup the DFS framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_select"; goto error; } /* open and setup the DIAG framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orcm_diag_base_open"; goto error; } if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) { ORTE_ERROR_LOG(ret); error = "orcm_diag_select"; goto error; } return ORTE_SUCCESS; error: orte_show_help("help-orcm-runtime.txt", "orcm_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ORTE_ERR_SILENT; }
int orte_ess_base_tool_setup(void) { int ret; char *error = NULL; opal_list_t transports; orte_jobid_t jobid; orte_vpid_t vpid; /* setup the PMIx framework - ensure it skips all non-PMIx components, * but do not override anything we were given */ opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ); if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_pmix_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_pmix_base_select"; goto error; } /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); /* we have to define our name here */ if (NULL != orte_ess_base_jobid && NULL != orte_ess_base_vpid) { opal_output_verbose(2, orte_ess_base_framework.framework_output, "ess:tool:obtaining name from environment"); if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, orte_ess_base_jobid))) { return(ret); } ORTE_PROC_MY_NAME->jobid = jobid; if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, orte_ess_base_vpid))) { return(ret); } ORTE_PROC_MY_NAME->vpid = vpid; } else { /* If we are a tool with no name, then define it here */ uint16_t jobfam; uint32_t hash32; uint32_t bias; opal_output_verbose(2, orte_ess_base_framework.framework_output, "ess:tool:computing name"); /* hash the nodename */ OPAL_HASH_STR(orte_process_info.nodename, hash32); bias = (uint32_t)orte_process_info.pid; /* fold in the bias */ hash32 = hash32 ^ bias; /* now compress to 16-bits */ jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32)); /* set the name */ ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16); ORTE_PROC_MY_NAME->vpid = 0; } /* my name is set, xfer it to the OPAL layer */ orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME; /* initialize - PMIx may set our name here if we attach to * a PMIx server */ if (NULL != opal_pmix.tool_init) { opal_list_t info; opal_value_t *kv; OBJ_CONSTRUCT(&info, opal_list_t); /* pass our name so the PMIx layer can use it */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_TOOL_NSPACE); orte_util_convert_jobid_to_string(&kv->data.string, ORTE_PROC_MY_NAME->jobid); kv->type = OPAL_STRING; opal_list_append(&info, &kv->super); /* ditto for our rank */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_TOOL_RANK); kv->data.name.vpid = ORTE_PROC_MY_NAME->vpid; kv->type = OPAL_VPID; opal_list_append(&info, &kv->super); /* ORTE tools don't need to connect to a PMIx server as * they will connect via the OOB */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_TOOL_DO_NOT_CONNECT); kv->data.flag = true; kv->type = OPAL_BOOL; opal_list_append(&info, &kv->super); if (OPAL_SUCCESS != (ret = opal_pmix.tool_init(&info))) { ORTE_ERROR_LOG(ret); error = "opal_pmix.init"; OPAL_LIST_DESTRUCT(&info); goto error; } OPAL_LIST_DESTRUCT(&info); ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; } orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename); orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL; orte_process_info.super.proc_arch = opal_local_arch; opal_proc_local_set(&orte_process_info.super); /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open and setup the error manager */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Setup the communication infrastructure */ /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* get a conduit for our use - we never route IO over fabric */ OBJ_CONSTRUCT(&transports, opal_list_t); orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE, ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING); orte_mgmt_conduit = orte_rml.open_conduit(&transports); OPAL_LIST_DESTRUCT(&transports); /* since I am a tool, then all I really want to do is communicate. * So setup communications and be done - finding the HNP * to which I want to communicate and setting up a route for * that link is my responsibility */ /* we -may- need to know the name of the head * of our session directory tree, particularly the * tmp base where any other session directories on * this node might be located */ ret = orte_session_setup_base(ORTE_PROC_MY_NAME); if (ORTE_SUCCESS != ret ) { ORTE_ERROR_LOG(ret); error = "define session dir names"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (NULL != orte_process_info.my_hnp_uri) { /* only do this if we were given an HNP */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* if we were given an HNP, then also setup the PLM in case this * tool wants to request that we spawn something for it */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } /* we don't select the plm framework as we only want the * base proxy functions */ } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_select"; goto error; } /* Tools do not need all the OPAL CR stuff */ opal_cr_set_enabled(false); #endif return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ret; }
int main(int argc, char *argv[]){ int count; int msgsize; uint8_t *msg; int i, j, rc; orte_process_name_t peer; double maxpower; /* * Init */ orte_init(ORTE_NON_TOOL); if (argc > 1) { count = atoi(argv[1]); if (count < 0) { count = INT_MAX-1; } } else { count = MAX_COUNT; } peer.jobid = ORTE_PROC_MY_NAME->jobid; for (j=1; j < count+1; j++) { peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs; /* rank0 starts ring */ if (ORTE_PROC_MY_NAME->vpid == 0) { /* setup the initiating buffer - put random sized message in it */ OBJ_CONSTRUCT(&buf, opal_buffer_t); maxpower = (double)(j%7); msgsize = (int)pow(10.0, maxpower); opal_output(0, "Ring %d message size %d bytes", j, msgsize); msg = (uint8_t*)malloc(msgsize); opal_dss.pack(&buf, msg, msgsize, OPAL_BYTE); if (0 > (rc = orte_rml.send_buffer(&peer,&buf, MY_TAG, 0))) { opal_output(0, "error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc)); exit(1); } OBJ_DESTRUCT(&buf); /* wait for it to come around */ OBJ_CONSTRUCT(&buf, opal_buffer_t); msg_recvd = false; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1); opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); } else { /* wait for msg */ OBJ_CONSTRUCT(&buf, opal_buffer_t); msg_recvd = false; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, ORTE_RML_NON_PERSISTENT, recv_ack, NULL); ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1); /* send it along */ if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) { opal_output(0, "%s error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc)); exit(1); } OBJ_DESTRUCT(&buf); } } orte_finalize(); return 0; }
int orte_ess_base_tool_setup(void) { int ret; char *error = NULL; /* my name is set, xfer it to the OPAL layer */ orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME; orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename); orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL; orte_process_info.super.proc_arch = opal_local_arch; opal_proc_local_set(&orte_process_info.super); /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open and setup the error manager */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* since I am a tool, then all I really want to do is communicate. * So setup communications and be done - finding the HNP * to which I want to communicate and setting up a route for * that link is my responsibility */ /* enable communication via the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* we -may- need to know the name of the head * of our session directory tree, particularly the * tmp base where any other session directories on * this node might be located */ if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL, &orte_process_info.tmpdir_base, &orte_process_info.top_session_dir, orte_process_info.nodename, NULL, NULL))) { ORTE_ERROR_LOG(ret); error = "define session dir names"; goto error; } /* setup the routed info - the selected routed component * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (NULL != orte_process_info.my_hnp_uri) { /* only do this if we were given an HNP */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* if we were given an HNP, then also setup the PLM in case this * tool wants to request that we spawn something for it */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } /* we don't select the plm framework as we only want the * base proxy functions */ } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_select"; goto error; } /* Tools do not need all the OPAL CR stuff */ opal_cr_set_enabled(false); #endif return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ret; }
static int rte_init(void) { int ret; char *error = NULL; char *contact_path, *jobfam_dir; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; int value; /* initialize the global list of local children and job data */ OBJ_CONSTRUCT(&orte_local_children, opal_list_t); OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* determine the topology info */ if (0 == orte_default_num_sockets_per_board) { /* we weren't given a number, so try to determine it */ if (OPAL_SUCCESS != opal_paffinity_base_get_socket_info(&value)) { /* can't get any info - default to 1 */ value = 1; } orte_default_num_sockets_per_board = (uint8_t)value; } if (0 == orte_default_num_cores_per_socket) { /* we weren't given a number, so try to determine it */ if (OPAL_SUCCESS != (ret = opal_paffinity_base_get_core_info(0, &value))) { /* don't have topo info - can we at least get #processors? */ if (OPAL_SUCCESS != opal_paffinity_base_get_processor_info(&value)) { /* can't get any info - default to 1 */ value = 1; } } orte_default_num_cores_per_socket = (uint8_t)value; } /* if we are using xml for output, put an mpirun start tag */ if (orte_xml_output) { fprintf(orte_xml_fp, "<mpirun>\n"); fflush(orte_xml_fp); } /* open and setup the opal_pstat framework so we can provide * process stats if requested */ if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_pstat_base_select"; goto error; } /* open and setup the local resource discovery framework */ if (ORTE_SUCCESS != (ret = opal_sysinfo_base_open())) { ORTE_ERROR_LOG(ret); error = "opal_sysinfo_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_sysinfo_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_sysinfo_base_select"; goto error; } /* Since we are the HNP, then responsibility for * defining the name falls to the PLM component for our * respective environment - hence, we have to open the PLM * first and select that component. */ if (ORTE_SUCCESS != (ret = orte_plm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) { ORTE_ERROR_LOG(ret); error = "orte_plm_set_hnp_name"; goto error; } /* Setup the communication infrastructure */ /* * Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* * Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* * Group communications */ if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* multicast */ #if ORTE_ENABLE_MULTICAST if (ORTE_SUCCESS != (ret = orte_rmcast_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rmcast_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rmcast_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rmcast_base_select"; goto error; } #endif /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup * as it may involve starting a non-blocking recv */ if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } /* * Setup the remaining resource * management and errmgr frameworks - application procs * and daemons do not open these frameworks as they only use * the hnp proxy support in the PLM framework. */ if (ORTE_SUCCESS != (ret = orte_ras_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_ras_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_find_available"; goto error; } if (ORTE_SUCCESS != (ret = orte_rmaps_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_find_available"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { error = "orte_errmgr_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = orte_odls_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* we are an hnp, so update the contact info field for later use */ orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); /* we are also officially a daemon, so better update that field too */ orte_process_info.my_daemon_uri = orte_rml.get_contact_info(); #if !ORTE_DISABLE_FULL_SUPPORT /* setup the orte_show_help system to recv remote output */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, ORTE_RML_NON_PERSISTENT, orte_show_help_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); error = "setup receive for orte_show_help"; goto error; } #endif /* setup my session directory */ OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output hnp file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* save my contact info in a file for others to find */ jobfam_dir = opal_dirname(orte_process_info.job_session_dir); contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL); free(jobfam_dir); OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_path)); if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file failed with error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); } else { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s wrote contact file", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } free(contact_path); /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); node->index = opal_pointer_array_add(orte_node_pool, node); /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; OBJ_RETAIN(node); /* keep accounting straight */ proc->node = node; proc->nodename = node->name; opal_pointer_array_add(jdata->procs, proc); /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. * Instead, we record it in the daemon field of the * node object */ OBJ_RETAIN(proc); /* keep accounting straight */ node->daemon = proc; node->daemon_launched = true; node->state = ORTE_NODE_STATE_UP; /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* setup the routed info - the selected routed component * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = orte_iof_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = orte_filem_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } /* For HNP, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the notifier system */ if (ORTE_SUCCESS != (ret = orte_notifier_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_notifer_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_notifer_select"; goto error; } /* if a tool has launched us and is requesting event reports, * then set its contact info into the comm system */ if (orte_report_events) { if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) { error = "could not connect to tool"; goto error; } } /* We actually do *not* want an HNP to voluntarily yield() the processor more than necessary. Orterun already blocks when it is doing nothing, so it doesn't use any more CPU cycles than it should; but when it *is* doing something, we do not want it to be unnecessarily delayed because it voluntarily yielded the processor in the middle of its work. For example: when a message arrives at orterun, we want the OS to wake us up in a timely fashion (which most OS's seem good about doing) and then we want orterun to process the message as fast as possible. If orterun yields and lets aggressive MPI applications get the processor back, it may be a long time before the OS schedules orterun to run again (particularly if there is no IO event to wake it up). Hence, routed OOB messages (for example) may be significantly delayed before being delivered to MPI processes, which can be problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } /* cleanup the global list of local children and job data */ OBJ_DESTRUCT(&orte_local_children); OBJ_DESTRUCT(&orte_local_jobdata); return ret; }
int orte_ess_base_app_setup(bool db_restrict_local) { int ret; char *error = NULL; opal_value_t kv; /* * stdout/stderr buffering * If the user requested to override the default setting then do * as they wish. */ if( orte_ess_base_std_buffering > -1 ) { if( 0 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IONBF, 0); setvbuf(stderr, NULL, _IONBF, 0); } else if( 1 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); } else if( 2 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IOFBF, 0); setvbuf(stderr, NULL, _IOFBF, 0); } } /* if I am an MPI app, we will let the MPI layer define and * control the opal_proc_t structure. Otherwise, we need to * do so here */ if (ORTE_PROC_NON_MPI) { orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME; orte_process_info.super.proc_hostname = orte_process_info.nodename; orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL; orte_process_info.super.proc_arch = opal_local_arch; opal_proc_local_set(&orte_process_info.super); } /* get an async event base - we use the opal_async one so * we don't startup extra threads if not needed */ orte_event_base = opal_progress_thread_init(NULL); progress_thread_running = true; /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* store the session directory location */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_NSDIR); kv.type = OPAL_STRING; kv.data.string = strdup(orte_process_info.job_session_dir); if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); error = "opal pmix put job sessiondir"; goto error; } OBJ_DESTRUCT(&kv); OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_PROCDIR); kv.type = OPAL_STRING; kv.data.string = strdup(orte_process_info.proc_session_dir); if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&kv); error = "opal pmix put proc sessiondir"; goto error; } OBJ_DESTRUCT(&kv); } /* setup the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_select"; goto error; } /* apps need the OPAL CR stuff */ opal_cr_set_enabled(true); #else opal_cr_set_enabled(false); #endif /* Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* open the distributed file system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_select"; goto error; } return ORTE_SUCCESS; error: if (!progress_thread_running) { /* can't send the help message, so ensure it * comes out locally */ orte_show_help_finalize(); } orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ret; }
int orcm_init(orcm_proc_type_t flags) { int ret; char *error; int i, spin; if (NULL != getenv("ORCM_MCA_spin")) { spin = 1; /* spin until a debugger can attach */ while (0 != spin) { ret = 0; while (ret < 10000) { ret++; }; } } if (!orcm_util_initialized) { orcm_init_util(); } /* set the default leader policy */ orcm_default_leader_policy.jobid = ORTE_JOBID_WILDCARD; orcm_default_leader_policy.vpid = ORTE_VPID_WILDCARD; /* get the number of max msgs */ mca_base_param_reg_int_name("orcm", "max_buffered_msgs", "Number of recvd messages to hold in storage from each source", false, false, ORCM_MAX_MSG_RING_SIZE, &orcm_max_msg_ring_size); /* independent mode or not */ mca_base_param_reg_int_name("orcm", "sched_kill_dvm", "Whether or not scheduler kills associated daemons upon termination (default: no)", false, false, (int)false, &ret); orcm_sched_kill_dvm = OPAL_INT_TO_BOOL(ret); /* setup the globals that require initialization */ orcm_triplets = OBJ_NEW(orcm_triplets_array_t); #ifdef HAVE_QSYSTEM_H #ifdef Q_SYSTEM_INTFCS_TO_PROBE_FOR_IP_ADDRESS { char *eth_ifs[] = Q_SYSTEM_INTFCS_TO_PROBE_FOR_IP_ADDRESS; char **adds=NULL, *ifs, *envar; int i, num_ifs; num_ifs = sizeof(eth_ifs) / sizeof(eth_ifs[0]); for (i=0; i < num_ifs; i++) { opal_argv_append_nosize(&adds, eth_ifs[i]); } ifs = opal_argv_join(adds, ','); opal_argv_free(adds); /* push it into the environ so that the rmcast framework can get it */ asprintf(&envar, "OMPI_MCA_rmcast_base_if_include=%s", ifs); putenv(envar); /* cannot release envar as the environ doesn't keep its own copy */ free(ifs); } #endif #endif /* initialize us */ if (ORTE_SUCCESS != (ret = orte_init(NULL, NULL, flags))) { error = "orte_init"; goto error; } if (!ORCM_PROC_IS_TOOL) { opal_set_using_threads(true); } if (!ORCM_PROC_IS_APP) { trap_signals(); } orcm_initialized = true; return ORCM_SUCCESS; error: if (ORCM_ERR_SILENT != ret) { orte_show_help("help-openrcm-runtime.txt", "orcm_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int rte_init(void) { int ret; char *error = NULL; char *contact_path, *jobfam_dir; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; orte_app_context_t *app; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /** setup callbacks for abort signals - from this point * forward, we need to abort in a manner that allows us * to cleanup. However, we cannot directly use libevent * to trap these signals as otherwise we cannot respond * to them if we are stuck in an event! So instead use * the basic POSIX trap functions to handle the signal, * and then let that signal handler do some magic to * avoid the hang * * NOTE: posix traps don't allow us to do anything major * in them, so use a pipe tied to a libevent event to * reach a "safe" place where the termination event can * be created */ pipe(term_pipe); /* setup an event to attempt normal termination on signal */ opal_event_set(orte_event_base, &term_handler, term_pipe[0], OPAL_EV_READ, clean_abort, NULL); opal_event_set_priority(&term_handler, ORTE_ERROR_PRI); opal_event_add(&term_handler, NULL); /* Set both ends of this pipe to be close-on-exec so that no children inherit it */ if (opal_fd_set_cloexec(term_pipe[0]) != OPAL_SUCCESS || opal_fd_set_cloexec(term_pipe[1]) != OPAL_SUCCESS) { error = "unable to set the pipe to CLOEXEC"; goto error; } /* point the signal trap to a function that will activate that event */ signal(SIGTERM, abort_signal_callback); signal(SIGINT, abort_signal_callback); signal(SIGHUP, abort_signal_callback); /** setup callbacks for signals we should foward */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback); setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback); setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback); signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { error = "topology discovery"; goto error; } } /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. So * we remove that information here. */ obj = hwloc_get_root_obj(opal_hwloc_topology); for (i=0; i < obj->infos_count; i++) { if (NULL == obj->infos[i].name || NULL == obj->infos[i].value) { continue; } if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) { free(obj->infos[i].name); free(obj->infos[i].value); /* left justify the array */ for (j=i; j < obj->infos_count-1; j++) { obj->infos[j] = obj->infos[j+1]; } obj->infos[obj->infos_count-1].name = NULL; obj->infos[obj->infos_count-1].value = NULL; obj->infos_count--; break; } } if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif /* if we are using xml for output, put an mpirun start tag */ if (orte_xml_output) { fprintf(orte_xml_fp, "<mpirun>\n"); fflush(orte_xml_fp); } /* setup the global nidmap/pidmap object */ orte_nidmap.bytes = NULL; orte_nidmap.size = 0; orte_pidmap.bytes = NULL; orte_pidmap.size = 0; /* open and setup the opal_pstat framework so we can provide * process stats if requested */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } /* Since we are the HNP, then responsibility for * defining the name falls to the PLM component for our * respective environment - hence, we have to open the PLM * first and select that component. */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; goto error; } /* if we were spawned by a singleton, our jobid was given to us */ if (NULL != orte_ess_base_jobid) { if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) { ORTE_ERROR_LOG(ret); error = "convert_string_to_jobid"; goto error; } ORTE_PROC_MY_NAME->vpid = 0; } else { if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) { ORTE_ERROR_LOG(ret); error = "orte_plm_set_hnp_name"; goto error; } } /* Setup the communication infrastructure */ /* * OOB Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } /* * Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* init the nidmap - just so we register that verbosity */ orte_util_nidmap_init(NULL); /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* mark that the daemons have reported as we are the * only ones in the system right now, and we definitely * are running! */ jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED; /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); node->index = opal_pointer_array_set_item(orte_node_pool, 0, node); #if OPAL_HAVE_HWLOC /* add it to the array of known topologies */ opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology); #endif /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; OBJ_RETAIN(node); /* keep accounting straight */ proc->node = node; proc->nodename = node->name; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. * Instead, we record it in the daemon field of the * node object */ OBJ_RETAIN(proc); /* keep accounting straight */ node->daemon = proc; node->daemon_launched = true; node->state = ORTE_NODE_STATE_UP; /* if we are to retain aliases, get ours */ if (orte_retain_aliases) { opal_ifgetaliases(&node->alias); /* add our own local name to it */ opal_argv_append_nosize(&node->alias, orte_process_info.nodename); } /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; /* * Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_db_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_db_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_db_base_select(true))) { ORTE_ERROR_LOG(ret); error = "orte_db_base_select"; goto error; } /* set our id */ opal_db.set_id((opal_identifier_t*)ORTE_PROC_MY_NAME); /* * Group communications */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup * as it may involve starting a non-blocking recv */ if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } /* * Setup the remaining resource * management and errmgr frameworks - application procs * and daemons do not open these frameworks as they only use * the hnp proxy support in the PLM framework. */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ras_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_ras_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_find_available"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_find_available"; goto error; } #if OPAL_HAVE_HWLOC { char *coprocessors, **sns; uint32_t h; int idx; /* if a topology file was given, then the rmaps framework open * will have reset our topology. Ensure we always get the right * one by setting our node topology afterwards */ node->topology = opal_hwloc_topology; /* init the hash table, if necessary */ if (NULL == orte_coprocessors) { orte_coprocessors = OBJ_NEW(opal_hash_table_t); opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs); } /* detect and add any coprocessors */ coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology); if (NULL != coprocessors) { /* separate the serial numbers of the coprocessors * on this host */ sns = opal_argv_split(coprocessors, ','); for (idx=0; NULL != sns[idx]; idx++) { /* compute the hash */ OPAL_HASH_STR(sns[idx], h); /* mark that this coprocessor is hosted by this node */ opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&(ORTE_PROC_MY_NAME->vpid)); } opal_argv_free(sns); free(coprocessors); orte_coprocessors_detected = true; } /* see if I am on a coprocessor */ coprocessors = opal_hwloc_base_check_on_coprocessor(); if (NULL != coprocessors) { node->serial_number = coprocessors; orte_coprocessors_detected = true; } } #endif /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* we are an hnp, so update the contact info field for later use */ orte_process_info.my_hnp_uri = orte_rml.get_contact_info(); proc->rml_uri = strdup(orte_process_info.my_hnp_uri); /* we are also officially a daemon, so better update that field too */ orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri); /* setup the orte_show_help system to recv remote output */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, ORTE_RML_PERSISTENT, orte_show_help_recv, NULL); /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); /* take a pass thru the session directory code to fillin the * tmpdir names - don't create anything yet */ if (ORTE_SUCCESS != (ret = orte_session_dir(false, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir define"; goto error; } /* clear the session directory just in case there are * stale directories laying around */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); /* now actually create the directory tree */ if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output hnp file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* save my contact info in a file for others to find */ jobfam_dir = opal_dirname(orte_process_info.job_session_dir); contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL); free(jobfam_dir); OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_path)); if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file failed with error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); } else { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s wrote contact file", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } free(contact_path); } /* setup the routed info - the selected routed component * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_select"; goto error; } /* For HNP, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the SENSOR framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sensor_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_sensor_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sensor_select"; goto error; } /* start the local sensors */ orte_sensor.start(ORTE_PROC_MY_NAME->jobid); /* setup the dfs framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_select"; goto error; } /* if a tool has launched us and is requesting event reports, * then set its contact info into the comm system */ if (orte_report_events) { if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) { error = "could not connect to tool"; goto error; } } /* We actually do *not* want an HNP to voluntarily yield() the processor more than necessary. Orterun already blocks when it is doing nothing, so it doesn't use any more CPU cycles than it should; but when it *is* doing something, we do not want it to be unnecessarily delayed because it voluntarily yielded the processor in the middle of its work. For example: when a message arrives at orterun, we want the OS to wake us up in a timely fashion (which most OS's seem good about doing) and then we want orterun to process the message as fast as possible. If orterun yields and lets aggressive MPI applications get the processor back, it may be a long time before the OS schedules orterun to run again (particularly if there is no IO event to wake it up). Hence, routed OOB messages (for example) may be significantly delayed before being delivered to MPI processes, which can be problematic in some scenarios (e.g., COMM_SPAWN, BTL's that require OOB messages for wireup, etc.). */ opal_progress_set_yield_when_idle(false); return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ORTE_ERR_SILENT; }
static void _send_notification(int status, orte_proc_state_t state, orte_process_name_t *proc, orte_process_name_t *target) { opal_buffer_t *buf; orte_grpcomm_signature_t sig; int rc; opal_value_t kv, *kvptr; orte_process_name_t daemon; buf = OBJ_NEW(opal_buffer_t); opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:hnp:sending notification %s proc %s target %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(status), ORTE_NAME_PRINT(proc), ORTE_NAME_PRINT(target)); /* pack the status */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &status, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } /* the source is the proc */ if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, proc, 1, ORTE_NAME))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } if (OPAL_ERR_PROC_ABORTED == status) { /* we will pass three opal_value_t's */ rc = 3; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } /* pass along the affected proc(s) */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC); kv.type = OPAL_NAME; kv.data.name.jobid = proc->jobid; kv.data.name.vpid = proc->vpid; kvptr = &kv; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); OBJ_RELEASE(buf); return; } OBJ_DESTRUCT(&kv); } else { /* we are going to pass two opal_value_t's */ rc = 2; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rc, 1, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); return; } } /* pass along the affected proc(s) */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_EVENT_AFFECTED_PROC); kv.type = OPAL_NAME; kv.data.name.jobid = proc->jobid; kv.data.name.vpid = proc->vpid; kvptr = &kv; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); OBJ_RELEASE(buf); return; } OBJ_DESTRUCT(&kv); /* pass along the proc(s) to be notified */ OBJ_CONSTRUCT(&kv, opal_value_t); kv.key = strdup(OPAL_PMIX_EVENT_CUSTOM_RANGE); kv.type = OPAL_NAME; kv.data.name.jobid = target->jobid; kv.data.name.vpid = target->vpid; kvptr = &kv; if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &kvptr, 1, OPAL_VALUE))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&kv); OBJ_RELEASE(buf); return; } OBJ_DESTRUCT(&kv); /* if the targets are a wildcard, then xcast it to everyone */ if (ORTE_VPID_WILDCARD == target->vpid) { OBJ_CONSTRUCT(&sig, orte_grpcomm_signature_t); sig.signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); sig.signature[0].jobid = ORTE_PROC_MY_NAME->jobid; sig.signature[0].vpid = ORTE_VPID_WILDCARD; sig.sz = 1; if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(&sig, ORTE_RML_TAG_NOTIFICATION, buf))) { ORTE_ERROR_LOG(rc); } OBJ_DESTRUCT(&sig); OBJ_RELEASE(buf); } else { /* get the daemon hosting the proc to be notified */ daemon.jobid = ORTE_PROC_MY_NAME->jobid; daemon.vpid = orte_get_proc_daemon_vpid(target); /* send the notification to that daemon */ opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:sending notification %s to proc %s at daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(status), ORTE_NAME_PRINT(target), ORTE_NAME_PRINT(&daemon)); if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(&daemon, buf, ORTE_RML_TAG_NOTIFICATION, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); } } }
static int tool_init(void) { int ret = ORTE_ERROR; char *error = NULL; opal_buffer_t buf, *clusterbuf, *uribuf; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; opal_list_t config; orcm_scheduler_t *scheduler; orcm_node_t *mynode=NULL; int32_t n; if (initialized) { return ORCM_SUCCESS; } initialized = true; /* Initialize the ORTE data type support */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_std_prolog"; goto error; } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* create a job tracker for the daemons */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = 0; ORTE_PROC_MY_NAME->jobid = 0; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* read the site configuration */ OBJ_CONSTRUCT(&config, opal_list_t); if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) { error = "getting config"; goto error; } /* define the cluster and collect contact info for all * aggregators - we'll need to know how to talk to any * of them in case of failures */ OBJ_CONSTRUCT(&buf, opal_buffer_t); if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config, &mynode, &orte_process_info.num_procs, &buf))) { OBJ_DESTRUCT(&buf); error = "define system"; goto error; } /* define a name for myself */ if (ORTE_SUCCESS != (ret = orte_plm_base_set_hnp_name())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_set_hnp_name"; goto error; } /* define a node and proc object for ourselves as some parts * of ORTE and ORCM require it */ if (NULL == (node = OBJ_NEW(orte_node_t))) { ret = ORTE_ERR_OUT_OF_RESOURCE; error = "out of memory"; goto error; } node->name = strdup(orte_process_info.nodename); opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); if (NULL == (proc = OBJ_NEW(orte_proc_t))) { ret = ORTE_ERR_OUT_OF_RESOURCE; error = "out of memory"; goto error; } proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; OBJ_RETAIN(proc); node->daemon = proc; OBJ_RETAIN(node); proc->node = node; opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc); /* For now, we only support a single scheduler daemon in the system. * This *may* change someday in the future */ scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers); ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid; ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid; /* register the ORTE-level params at this time now that the * config has had a chance to push things into the environ */ if (ORTE_SUCCESS != (ret = orte_register_params())) { OBJ_DESTRUCT(&buf); error = "orte_register_params"; goto error; } /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); signals_set = true; /* open and select the pstat framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_errmgr_base_open"; goto error; } /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_select"; goto error; } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orcm_db_base_open"; goto error; } /* always restrict daemons to local database components */ if (ORTE_SUCCESS != (ret = orcm_db_base_select())) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orcm_db_base_select"; goto error; } /* datastore - ensure we don't pickup the pmi component, but * don't override anything set by user */ if (NULL == getenv("OMPI_MCA_dstore")) { putenv("OMPI_MCA_dstore=^pmi"); } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "opal_dstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_dstore_base_select"; goto error; } /* create the handles */ if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) { error = "opal dstore global"; ret = ORTE_ERR_FATAL; goto error; } if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) { error = "opal dstore internal"; ret = ORTE_ERR_FATAL; goto error; } if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) { error = "opal dstore nonpeer"; ret = ORTE_ERR_FATAL; goto error; } /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "orte_util_nidmap_init"; goto error; } /* extract the cluster description and setup the routed info - the orcm routed component * will know what to do. */ n = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "extract cluster buf"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); OBJ_RELEASE(clusterbuf); error = "orte_routed.init_routes"; goto error; } OBJ_RELEASE(clusterbuf); /* extract the uri buffer and load the hash tables */ n = 1; if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); error = "extract uri buffer"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) { ORTE_ERROR_LOG(ret); OBJ_DESTRUCT(&buf); OBJ_RELEASE(uribuf); error = "load hash tables"; goto error; } OBJ_DESTRUCT(&buf); OBJ_RELEASE(uribuf); /* construct the thread object */ OBJ_CONSTRUCT(&progress_thread, opal_thread_t); /* fork off a thread to progress it */ progress_thread.t_run = progress_thread_engine; progress_thread_running = true; if (OPAL_SUCCESS != (ret = opal_thread_start(&progress_thread))) { error = "progress thread start"; progress_thread_running = false; goto error; } /* * Group communications */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ opal_cr_set_enabled(false); if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the DFS framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_select"; goto error; } return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ORTE_ERR_SILENT; }
/* * Initialization of data structures for running under a debugger * using the MPICH/TotalView parallel debugger interface. This stage * of initialization must occur after spawn * * NOTE: We -always- perform this step to ensure that any debugger * that attaches to us post-launch of the application can get a * completed proctable */ void orte_debugger_base_init_after_spawn(orte_job_t *jdata) { orte_proc_t *proc; orte_app_context_t *appctx; orte_vpid_t i, j; opal_buffer_t buf; orte_process_name_t rank0; int rc; if (MPIR_proctable) { /* already initialized */ opal_output_verbose(5, orte_debugger_base.output, "%s: debugger already initialized", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); return; } /* fill in the proc table for the application processes */ opal_output_verbose(5, orte_debugger_base.output, "%s: Setting up debugger process table for applications", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); MPIR_debug_state = 1; /* set the total number of processes in the job */ MPIR_proctable_size = jdata->num_procs; /* allocate MPIR_proctable */ MPIR_proctable = (struct MPIR_PROCDESC *) malloc(sizeof(struct MPIR_PROCDESC) * MPIR_proctable_size); if (MPIR_proctable == NULL) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return; } if (orte_debugger_base.dump_proctable) { opal_output(orte_clean_output, "MPIR Proctable for job %s", ORTE_JOBID_PRINT(jdata->jobid)); } /* initialize MPIR_proctable */ for (j=0; j < jdata->num_procs; j++) { if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) { continue; } /* store this data in the location whose index * corresponds to the proc's rank */ i = proc->name.vpid; if (NULL == (appctx = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, proc->app_idx))) { continue; } MPIR_proctable[i].host_name = strdup(proc->node->name); if ( 0 == strncmp(appctx->app, OPAL_PATH_SEP, 1 )) { MPIR_proctable[i].executable_name = opal_os_path( false, appctx->app, NULL ); } else { MPIR_proctable[i].executable_name = opal_os_path( false, appctx->cwd, appctx->app, NULL ); } MPIR_proctable[i].pid = proc->pid; if (orte_debugger_base.dump_proctable) { opal_output(orte_clean_output, "%s: Host %s Exe %s Pid %d", ORTE_VPID_PRINT(i), MPIR_proctable[i].host_name, MPIR_proctable[i].executable_name, MPIR_proctable[i].pid); } } if (0 < opal_output_get_verbosity(orte_debugger_base.output)) { orte_debugger_base_dump(); } /* if we are being launched under a debugger, then we must wait * for it to be ready to go and do some things to start the job */ if (MPIR_being_debugged) { /* wait for all procs to have reported their contact info - this * ensures that (a) they are all into mpi_init, and (b) the system * has the contact info to successfully send a message to rank=0 */ ORTE_PROGRESSED_WAIT(false, jdata->num_reported, jdata->num_procs); (void) MPIR_Breakpoint(); /* send a message to rank=0 to release it */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* don't need anything in this */ rank0.jobid = jdata->jobid; rank0.vpid = 0; if (0 > (rc = orte_rml.send_buffer(&rank0, &buf, ORTE_RML_TAG_DEBUGGER_RELEASE, 0))) { opal_output(0, "Error: could not send debugger release to MPI procs - error %s", ORTE_ERROR_NAME(rc)); } OBJ_DESTRUCT(&buf); } }
int orte_ess_base_proc_binding(void) { hwloc_obj_t node, obj; hwloc_cpuset_t cpus, nodeset; hwloc_obj_type_t target; unsigned int cache_level = 0; struct hwloc_topology_support *support; char *map; int ret; char *error=NULL; hwloc_cpuset_t mycpus; /* Determine if we were pre-bound or not */ if (NULL != getenv(OPAL_MCA_PREFIX"orte_bound_at_launch")) { orte_proc_is_bound = true; if (NULL != (map = getenv(OPAL_MCA_PREFIX"orte_base_applied_binding"))) { orte_proc_applied_binding = hwloc_bitmap_alloc(); if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) { error = "applied_binding parse"; goto error; } } } /* see if we were bound when launched */ if (!orte_proc_is_bound) { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Not bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* we were not bound at launch */ if (NULL == opal_hwloc_topology) { /* there is nothing we can do, so just return */ return ORTE_SUCCESS; } support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology); /* get our node object */ node = hwloc_get_root_obj(opal_hwloc_topology); nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node); /* get our bindings */ cpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) { /* we are NOT bound if get_cpubind fails, nor can we be bound - the * environment does not support it */ hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Binding not supported", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* we are bound if the two cpusets are not equal, * or if there is only ONE cpu available to us */ if (0 != hwloc_bitmap_compare(cpus, nodeset) || opal_hwloc_base_single_cpu(nodeset) || opal_hwloc_base_single_cpu(cpus)) { /* someone external set it - indicate it is set * so that we know */ orte_proc_is_bound = true; hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process was externally bound", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (support->cpubind->set_thisproc_cpubind && OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* the system is capable of doing processor affinity, but it * has not yet been set - see if a slot_list was given */ hwloc_bitmap_zero(cpus); if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list, opal_hwloc_topology, OPAL_HWLOC_LOGICAL, cpus))) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound according to slot_list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* cleanup */ hwloc_bitmap_free(cpus); /* get the node rank */ if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) { /* this is not an error - could be due to being * direct launched - so just ignore and leave * us unbound */ OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process not bound - no node rank available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* if the binding policy is hwthread, then we bind to the nrank-th * hwthread on this node */ if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting hwthread object"; goto error; } cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound to hwthread", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* if the binding policy is core, then we bind to the nrank-th * core on this node */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; } cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; ret = ORTE_ERROR; goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound to core", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* for all higher binding policies, we bind to the specified * object that the nrank-th core belongs to */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; } if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 1; } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 2; } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 3; } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_SOCKET; } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_NODE; } else { ret = ORTE_ERR_NOT_FOUND; error = "Binding policy not known"; goto error; } for (obj = obj->parent; NULL != obj; obj = obj->parent) { if (target == obj->type) { if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) { continue; } /* this is the place! */ cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, cpus); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hwloc_obj_type_string(target))); break; } } if (!orte_proc_is_bound) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } } } } } else { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_framework.framework_output, "%s Process bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } MOVEON: /* get or update our local cpuset - it will get used multiple * times, so it's more efficient to keep a global copy */ opal_hwloc_base_get_local_cpuset(); /* get the cpus we are bound to */ mycpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, mycpus, HWLOC_CPUBIND_PROCESS) < 0) { if (NULL != orte_process_info.cpuset) { free(orte_process_info.cpuset); orte_process_info.cpuset = NULL; } if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "MCW rank %d is not bound", ORTE_PROC_MY_NAME->vpid); } } else { /* store/update the string representation of our local binding */ if (NULL != orte_process_info.cpuset) { free(orte_process_info.cpuset); orte_process_info.cpuset = NULL; } hwloc_bitmap_list_asprintf(&orte_process_info.cpuset, mycpus); /* report the binding, if requested */ if (opal_hwloc_report_bindings || 4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { char tmp1[1024], tmp2[1024]; if (OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(tmp1, sizeof(tmp1), opal_hwloc_topology, mycpus)) { opal_output(0, "MCW rank %d is not bound (or bound to all available processors)", ORTE_PROC_MY_NAME->vpid); } else { opal_hwloc_base_cset2mapstr(tmp2, sizeof(tmp2), opal_hwloc_topology, mycpus); opal_output(0, "MCW rank %d bound to %s: %s", ORTE_PROC_MY_NAME->vpid, tmp1, tmp2); } } } hwloc_bitmap_free(mycpus); /* push our cpuset so others can calculate our locality */ if (NULL != orte_process_info.cpuset) { OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_CPUSET, orte_process_info.cpuset, OPAL_STRING); } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ORTE_ERR_SILENT; }
static int rte_init(void) { int rc, ret; char *error = NULL; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *string_key; opal_value_t *kv; char *val; int u32, *u32ptr; uint16_t u16, *u16ptr; /* run the prolog */ if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) { ORTE_ERROR_LOG(rc); return rc; } u32ptr = &u32; u16ptr = &u16; if (NULL != mca_ess_singleton_component.server_uri) { /* we are going to connect to a server HNP */ if (0 == strncmp(mca_ess_singleton_component.server_uri, "file", strlen("file")) || 0 == strncmp(mca_ess_singleton_component.server_uri, "FILE", strlen("FILE"))) { char input[1024], *filename; FILE *fp; /* it is a file - get the filename */ filename = strchr(mca_ess_singleton_component.server_uri, ':'); if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, "singleton", mca_ess_singleton_component.server_uri); return ORTE_ERROR; } ++filename; /* space past the : */ if (0 >= strlen(filename)) { /* they forgot to give us the name! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, "singleton", mca_ess_singleton_component.server_uri); return ORTE_ERROR; } /* open the file and extract the uri */ fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, "singleton", mca_ess_singleton_component.server_uri); return ORTE_ERROR; } memset(input, 0, 1024); // initialize the array to ensure a NULL termination if (NULL == fgets(input, 1023, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, "singleton", mca_ess_singleton_component.server_uri, "singleton"); return ORTE_ERROR; } fclose(fp); input[strlen(input)-1] = '\0'; /* remove newline */ orte_process_info.my_hnp_uri = strdup(input); } else { orte_process_info.my_hnp_uri = strdup(mca_ess_singleton_component.server_uri); } /* save the daemon uri - we will process it later */ orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri); /* construct our name - we are in their job family, so we know that * much. However, we cannot know how many other singletons and jobs * this HNP is running. Oh well - if someone really wants to use this * option, they can try to figure it out. For now, we'll just assume * we are the only ones */ ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(ORTE_PROC_MY_HNP->jobid, 1); /* obviously, we are vpid=0 for this job */ ORTE_PROC_MY_NAME->vpid = 0; /* for convenience, push the pubsub version of this param into the environ */ opal_setenv (OPAL_MCA_PREFIX"pubsub_orte_server", orte_process_info.my_hnp_uri, true, &environ); } else if (NULL != getenv("SINGULARITY_CONTAINER") || mca_ess_singleton_component.isolated) { /* ensure we use the isolated pmix component */ opal_setenv (OPAL_MCA_PREFIX"pmix", "isolated", true, &environ); } else { /* spawn our very own HNP to support us */ if (ORTE_SUCCESS != (rc = fork_hnp())) { ORTE_ERROR_LOG(rc); return rc; } /* our name was given to us by the HNP */ opal_setenv (OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ); } /* get an async event base - we use the opal_async one so * we don't startup extra threads if not needed */ orte_event_base = opal_progress_thread_init(NULL); progress_thread_running = true; /* open and setup pmix */ if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) { error = "opening pmix"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix_base_select())) { error = "select pmix"; goto error; } /* set the event base */ opal_pmix_base_set_evbase(orte_event_base); /* initialize the selected module */ if (!opal_pmix.initialized() && (OPAL_SUCCESS != (ret = opal_pmix.init()))) { /* we cannot run */ error = "pmix init"; goto error; } /* pmix.init set our process name down in the OPAL layer, * so carry it forward here */ ORTE_PROC_MY_NAME->jobid = OPAL_PROC_MY_NAME.jobid; ORTE_PROC_MY_NAME->vpid = OPAL_PROC_MY_NAME.vpid; /* get our local rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting local rank"; goto error; } orte_process_info.my_local_rank = u16; /* get our node rank from PMI */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_NODE_RANK, ORTE_PROC_MY_NAME, &u16ptr, OPAL_UINT16); if (OPAL_SUCCESS != ret) { error = "getting node rank"; goto error; } orte_process_info.my_node_rank = u16; /* get universe size */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_UNIV_SIZE, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS != ret) { error = "getting univ size"; goto error; } orte_process_info.num_procs = u32; /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_ess_num_procs")) { asprintf(&ev1, OPAL_MCA_PREFIX"orte_ess_num_procs=%d", orte_process_info.num_procs); putenv(ev1); added_num_procs = true; } if (NULL == getenv("OMPI_APP_CTX_NUM_PROCS")) { asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", orte_process_info.num_procs); putenv(ev2); added_app_ctx = true; } /* get our app number from PMI - ok if not found */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_APPNUM, ORTE_PROC_MY_NAME, &u32ptr, OPAL_UINT32); if (OPAL_SUCCESS == ret) { orte_process_info.app_num = u32; } else { orte_process_info.app_num = 0; } /* set some other standard values */ orte_process_info.num_local_peers = 0; /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) { unique_key[0] = ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid); unique_key[1] = ORTE_LOCAL_JOBID(ORTE_PROC_MY_NAME->jobid); if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } asprintf(&envar, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key); putenv(envar); added_transport_keys = true; /* cannot free the envar as that messes up our environ */ free(string_key); } /* retrieve our topology */ OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO, ORTE_PROC_MY_NAME, &val, OPAL_STRING); if (OPAL_SUCCESS == ret && NULL != val) { /* load the topology */ if (0 != hwloc_topology_init(&opal_hwloc_topology)) { ret = OPAL_ERROR; free(val); error = "setting topology"; goto error; } if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) { ret = OPAL_ERROR; free(val); hwloc_topology_destroy(opal_hwloc_topology); error = "setting topology"; goto error; } /* since we are loading this from an external source, we have to * explicitly set a flag so hwloc sets things up correctly */ if (0 != hwloc_topology_set_flags(opal_hwloc_topology, (HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) { ret = OPAL_ERROR; hwloc_topology_destroy(opal_hwloc_topology); free(val); error = "setting topology"; goto error; } /* now load the topology */ if (0 != hwloc_topology_load(opal_hwloc_topology)) { ret = OPAL_ERROR; hwloc_topology_destroy(opal_hwloc_topology); free(val); error = "setting topology"; goto error; } free(val); } else { /* it wasn't passed down to us, so go get it */ if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) { error = "topology discovery"; goto error; } /* push it into the PMIx database in case someone * tries to retrieve it so we avoid an attempt to * get it again */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCAL_TOPO); kv->type = OPAL_STRING; if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) { error = "topology export"; goto error; } if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) { error = "topology store"; goto error; } OBJ_RELEASE(kv); } /* use the std app init to complete the procedure */ if (ORTE_SUCCESS != (rc = orte_ess_base_app_setup(true))) { ORTE_ERROR_LOG(rc); return rc; } /* push our hostname so others can find us, if they need to */ OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING); if (ORTE_SUCCESS != ret) { error = "db store hostname"; goto error; } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int rte_init(char flags) { int ret; char *error = NULL; char *contact_path, *jobfam_dir; orte_job_t *jdata; orte_node_t *node; orte_proc_t *proc; int value; /* initialize the global list of local children and job data */ OBJ_CONSTRUCT(&orte_local_children, opal_list_t); OBJ_CONSTRUCT(&orte_local_jobdata, opal_list_t); /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* if we are using xml for output, put a basename start tag */ if (orte_xml_output) { fprintf(orte_xml_fp, "<%s>\n", orte_cmd_basename); fflush(orte_xml_fp); } /* determine the topology info */ if (0 == orte_default_num_sockets_per_board) { /* we weren't given a number, so try to determine it */ if (OPAL_SUCCESS != opal_paffinity_base_get_socket_info(&value)) { /* can't get any info - default to 1 */ value = 1; } orte_default_num_sockets_per_board = (uint8_t)value; } if (0 == orte_default_num_cores_per_socket) { /* we weren't given a number, so try to determine it */ if (OPAL_SUCCESS != (ret = opal_paffinity_base_get_core_info(0, &value))) { /* don't have topo info - can we at least get #processors? */ if (OPAL_SUCCESS != opal_paffinity_base_get_processor_info(&value)) { /* can't get any info - default to 1 */ value = 1; } } orte_default_num_cores_per_socket = (uint8_t)value; } /* Since we are the HNP, then responsibility for * defining the name falls to the PLM component for our * respective environment - hence, we have to open the PLM * first and select that component. Note that ONLY the * HNP ever uses a PLM component anyway */ if (ORTE_SUCCESS != (ret = orte_plm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) { ORTE_ERROR_LOG(ret); error = "orte_plm_set_hnp_name"; goto error; } /* Setup the communication infrastructure */ /* * Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* * Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* * Group communications */ if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup * as it may involve starting a non-blocking recv */ if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } /* * Setup the remaining resource * management and errmgr frameworks - application procs * and daemons do not open these frameworks as they only use * the hnp proxy support in the PLM framework. */ if (ORTE_SUCCESS != (ret = orte_ras_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_ras_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_ras_base_find_available"; goto error; } if (ORTE_SUCCESS != (ret = orte_rmaps_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rmaps_base_find_available"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { error = "orte_errmgr_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = orte_odls_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } #if !ORTE_DISABLE_FULL_SUPPORT /* setup the orte_show_help system to recv remote output */ ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP, ORTE_RML_NON_PERSISTENT, orte_show_help_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); error = "setup receive for orte_show_help"; goto error; } #endif /* setup my session directory */ OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output hnp file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* save my contact info in a file for others to find */ jobfam_dir = opal_dirname(orte_process_info.job_session_dir); contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL); free(jobfam_dir); OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), contact_path)); if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s writing contact file failed with error %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(ret))); } else { OPAL_OUTPUT_VERBOSE((2, orte_debug_output, "%s wrote contact file", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } free(contact_path); /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; opal_pointer_array_add(orte_job_data, jdata); /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); node->arch = orte_process_info.arch; node->index = opal_pointer_array_add(orte_node_pool, node); /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; OBJ_RETAIN(node); /* keep accounting straight */ proc->node = node; proc->nodename = node->name; opal_pointer_array_add(jdata->procs, proc); /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. * Instead, we record it in the daemon field of the * node object */ OBJ_RETAIN(proc); /* keep accounting straight */ node->daemon = proc; node->daemon_launched = true; node->state = ORTE_NODE_STATE_UP; /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* setup the routed info - the selected routed component * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = orte_iof_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = orte_filem_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } #if OPAL_ENABLE_FT == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(orte_process_info.hnp, !orte_process_info.daemon))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } /* For HNP, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the notifier system */ if (ORTE_SUCCESS != (ret = orte_notifier_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_notifer_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_notifer_select"; goto error; } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } /* cleanup the global list of local children and job data */ OBJ_DESTRUCT(&orte_local_children); OBJ_DESTRUCT(&orte_local_jobdata); return ret; }
int orte_ess_base_proc_binding(void) { #if OPAL_HAVE_HWLOC hwloc_obj_t node, obj; hwloc_cpuset_t cpus, nodeset; hwloc_obj_type_t target; unsigned int cache_level = 0; struct hwloc_topology_support *support; char *map; int ret; char *error; /* Determine if we were pre-bound or not */ if (NULL != getenv("OMPI_MCA_orte_bound_at_launch")) { orte_proc_is_bound = true; if (NULL != (map = getenv("OMPI_MCA_orte_base_applied_binding"))) { orte_proc_applied_binding = hwloc_bitmap_alloc(); if (0 != (ret = hwloc_bitmap_list_sscanf(orte_proc_applied_binding, map))) { error = "applied_binding parse"; goto error; } } } /* see if we were bound when launched */ if (!orte_proc_is_bound) { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Not bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* we were not bound at launch */ if (NULL != opal_hwloc_topology) { support = (struct hwloc_topology_support*)hwloc_topology_get_support(opal_hwloc_topology); /* get our node object */ node = hwloc_get_root_obj(opal_hwloc_topology); nodeset = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, node); /* get our bindings */ cpus = hwloc_bitmap_alloc(); if (hwloc_get_cpubind(opal_hwloc_topology, cpus, HWLOC_CPUBIND_PROCESS) < 0) { /* we are NOT bound if get_cpubind fails, nor can we be bound - the * environment does not support it */ hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Binding not supported", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* we are bound if the two cpusets are not equal, * or if there is only ONE cpu available to us */ if (0 != hwloc_bitmap_compare(cpus, nodeset) || opal_hwloc_base_single_cpu(nodeset) || opal_hwloc_base_single_cpu(cpus)) { /* someone external set it - indicate it is set * so that we know */ orte_proc_is_bound = true; hwloc_bitmap_free(cpus); OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process was externally bound", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (support->cpubind->set_thisproc_cpubind && OPAL_BINDING_POLICY_IS_SET(opal_hwloc_binding_policy) && OPAL_BIND_TO_NONE != OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* the system is capable of doing processor affinity, but it * has not yet been set - see if a slot_list was given */ hwloc_bitmap_zero(cpus); if (OPAL_BIND_TO_CPUSET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (OPAL_SUCCESS != (ret = opal_hwloc_base_slot_list_parse(opal_hwloc_base_slot_list, opal_hwloc_topology, cpus))) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; hwloc_bitmap_free(cpus); goto error; } /* try to find a level and index for this location */ opal_hwloc_base_get_level_and_index(cpus, &orte_process_info.bind_level, &orte_process_info.bind_idx); /* cleanup */ hwloc_bitmap_free(cpus); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound according to slot_list", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* cleanup */ hwloc_bitmap_free(cpus); /* get the node rank */ if (ORTE_NODE_RANK_INVALID == orte_process_info.my_node_rank) { /* this is not an error - could be due to being * direct launched - so just ignore and leave * us unbound */ OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process not bound - no node rank available", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); goto MOVEON; } /* if the binding policy is hwthread, then we bind to the nrank-th * hwthread on this node */ if (OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_PU, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting hwthread object"; goto error; } cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } orte_process_info.bind_level = OPAL_HWLOC_HWTHREAD_LEVEL; orte_process_info.bind_idx = orte_process_info.my_node_rank; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound to hwthread", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else if (OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { /* if the binding policy is core, then we bind to the nrank-th * core on this node */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; } cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { error = "Setting processor affinity failed"; ret = ORTE_ERROR; goto error; } orte_process_info.bind_level = OPAL_HWLOC_CORE_LEVEL; orte_process_info.bind_idx = orte_process_info.my_node_rank; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound to core", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } else { /* for all higher binding policies, we bind to the specified * object that the nrank-th core belongs to */ if (NULL == (obj = opal_hwloc_base_get_obj_by_type(opal_hwloc_topology, HWLOC_OBJ_CORE, 0, orte_process_info.my_node_rank, OPAL_HWLOC_LOGICAL))) { ret = ORTE_ERR_NOT_FOUND; error = "Getting core object"; goto error; } if (OPAL_BIND_TO_L1CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 1; orte_process_info.bind_level = OPAL_HWLOC_L1CACHE_LEVEL; } else if (OPAL_BIND_TO_L2CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 2; orte_process_info.bind_level = OPAL_HWLOC_L2CACHE_LEVEL; } else if (OPAL_BIND_TO_L3CACHE == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_CACHE; cache_level = 3; orte_process_info.bind_level = OPAL_HWLOC_L3CACHE_LEVEL; } else if (OPAL_BIND_TO_SOCKET == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_SOCKET; orte_process_info.bind_level = OPAL_HWLOC_SOCKET_LEVEL; } else if (OPAL_BIND_TO_NUMA == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) { target = HWLOC_OBJ_NODE; orte_process_info.bind_level = OPAL_HWLOC_NUMA_LEVEL; } else { ret = ORTE_ERR_NOT_FOUND; error = "Binding policy not known"; goto error; } for (obj = obj->parent; NULL != obj; obj = obj->parent) { if (target == obj->type) { if (HWLOC_OBJ_CACHE == target && cache_level != obj->attr->cache.depth) { continue; } /* this is the place! */ cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, obj); if (0 > hwloc_set_cpubind(opal_hwloc_topology, cpus, 0)) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } orte_process_info.bind_idx = opal_hwloc_base_get_obj_idx(opal_hwloc_topology, obj, OPAL_HWLOC_LOGICAL); orte_proc_is_bound = true; OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), opal_hwloc_base_print_level(orte_process_info.bind_level))); break; } } if (!orte_proc_is_bound) { ret = ORTE_ERROR; error = "Setting processor affinity failed"; goto error; } } } } } } else { OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output, "%s Process bound at launch", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); } MOVEON: /* get or update our local cpuset - it will get used multiple * times, so it's more efficient to keep a global copy */ opal_hwloc_base_get_local_cpuset(); /* report bindings, if requested */ if (opal_hwloc_report_bindings) { char bindings[64]; hwloc_obj_t root; hwloc_cpuset_t cpus; /* get the root object for this node */ root = hwloc_get_root_obj(opal_hwloc_topology); cpus = opal_hwloc_base_get_available_cpus(opal_hwloc_topology, root); /* we are not bound if this equals our cpuset */ if (0 == hwloc_bitmap_compare(cpus, opal_hwloc_my_cpuset)) { opal_output(0, "%s is not bound", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); } else { hwloc_bitmap_list_snprintf(bindings, 64, opal_hwloc_my_cpuset); opal_output(0, "%s is bound to cpus %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), bindings); } } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ORTE_ERR_SILENT; #else return ORTE_SUCCESS; #endif }
static int rte_init(void) { int ret, i, j; char *error = NULL, *localj; int32_t jobfam, stepid; char *envar, *ev1, *ev2; uint64_t unique_key[2]; char *cs_env, *string_key; char *pmi_id=NULL; int *ranks; char *tmp; orte_jobid_t jobid; orte_process_name_t proc; orte_local_rank_t local_rank; orte_node_rank_t node_rank; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } #if OPAL_HAVE_HWLOC /* get the topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { error = "topology discovery"; goto error; } } #endif if (ORTE_PROC_IS_DAEMON) { /* I am a daemon, launched by mpirun */ /* we had to be given a jobid */ mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid", true, false, NULL, &tmp); if (NULL == tmp) { error = "missing jobid"; ret = ORTE_ERR_FATAL; goto error; } if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) { ORTE_ERROR_LOG(ret); error = "convert jobid"; goto error; } free(tmp); ORTE_PROC_MY_NAME->jobid = jobid; /* get our rank from PMI */ if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) { ORTE_PMI_ERROR(ret, "PMI_Get_rank"); error = "could not get PMI rank"; goto error; } ORTE_PROC_MY_NAME->vpid = i + 1; /* compensate for orterun */ /* get the number of procs from PMI */ if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) { ORTE_PMI_ERROR(ret, "PMI_Get_universe_size"); error = "could not get PMI universe size"; goto error; } orte_process_info.num_procs = i + 1; /* compensate for orterun */ /* complete setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } } else { /* we are a direct-launched MPI process */ /* get our PMI id length */ if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) { error = "PMI_Get_id_length_max"; goto error; } pmi_id = malloc(pmi_maxlen); if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) { free(pmi_id); error = "PMI_Get_kvs_domain_id"; goto error; } /* PMI is very nice to us - the domain id is an integer followed * by a '.', followed by essentially a stepid. The first integer * defines an overall job number. The second integer is the number of * individual jobs we have run within that allocation. So we translate * this as the overall job number equating to our job family, and * the individual number equating to our local jobid */ jobfam = strtol(pmi_id, &localj, 10); if (NULL == localj) { /* hmmm - no '.', so let's just use zero */ stepid = 0; } else { localj++; /* step over the '.' */ stepid = strtol(localj, NULL, 10) + 1; /* add one to avoid looking like a daemon */ } free(pmi_id); /* now build the jobid */ ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid); /* get our rank */ if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) { ORTE_PMI_ERROR(ret, "PMI_Get_rank"); error = "could not get PMI rank"; goto error; } ORTE_PROC_MY_NAME->vpid = i; /* get the number of procs from PMI */ if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) { ORTE_PMI_ERROR(ret, "PMI_Get_universe_size"); error = "could not get PMI universe size"; goto error; } orte_process_info.num_procs = i; /* push into the environ for pickup in MPI layer for * MPI-3 required info key */ asprintf(&ev1, "OMPI_MCA_orte_ess_num_procs=%d", i); putenv(ev1); asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", i); putenv(ev2); /* setup transport keys in case the MPI layer needs them - * we can use the jobfam and stepid as unique keys * because they are unique values assigned by the RM */ unique_key[0] = (uint64_t)jobfam; unique_key[1] = (uint64_t)stepid; if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); return ORTE_ERR_OUT_OF_RESOURCE; } asprintf(&envar, "%s=%s", cs_env, string_key); putenv(envar); /* cannot free the envar as that messes up our environ */ free(cs_env); free(string_key); /* our app_context number can only be 0 as we don't support * dynamic spawns */ orte_process_info.app_num = 0; /* setup my daemon's name - arbitrary, since we don't route * messages */ ORTE_PROC_MY_DAEMON->jobid = 0; ORTE_PROC_MY_DAEMON->vpid = 0; /* ensure we pick the correct critical components */ putenv("OMPI_MCA_grpcomm=pmi"); putenv("OMPI_MCA_routed=direct"); /* now use the default procedure to finish my setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* store our info into the database */ if (ORTE_SUCCESS != (ret = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_HOSTNAME, orte_process_info.nodename, OPAL_STRING))) { error = "db store daemon vpid"; goto error; } /* get our local proc info to find our local rank */ if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) { ORTE_PMI_ERROR(ret, "PMI_Get_clique_size"); error = "could not get PMI clique size"; goto error; } /* store that info - remember, we want the number of peers that * share the node WITH ME, so we have to subtract ourselves from * that number */ orte_process_info.num_local_peers = i - 1; /* now get the specific ranks */ ranks = (int*)malloc(i * sizeof(int)); if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, i))) { ORTE_PMI_ERROR(ret, "PMI_Get_clique_ranks"); error = "could not get clique ranks"; goto error; } /* The clique ranks are returned in rank order, so * cycle thru the array and update the local/node * rank info */ proc.jobid = ORTE_PROC_MY_NAME->jobid; for (j=0; j < i; j++) { proc.vpid = ranks[j]; local_rank = j; node_rank = j; if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) { orte_process_info.my_local_rank = local_rank; orte_process_info.my_node_rank = node_rank; } if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) { error = "db store local rank"; goto error; } if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) { error = "db store node rank"; goto error; } } free(ranks); /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } } /* set max procs */ if (orte_process_info.max_procs < orte_process_info.num_procs) { orte_process_info.max_procs = orte_process_info.num_procs; } /* flag that we completed init */ app_init_complete = true; return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags) { int ret; char *error = NULL; if (0 < orte_initialized) { /* track number of times we have been called */ orte_initialized++; return ORTE_SUCCESS; } orte_initialized++; /* initialize the opal layer */ if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) { error = "opal_init"; goto error; } /* Convince OPAL to use our naming scheme */ opal_process_name_print = _process_name_print_for_opal; opal_process_name_vpid = _process_name_vpid_for_opal; opal_process_name_jobid = _process_name_jobid_for_opal; opal_compare_proc = _process_name_compare; /* ensure we know the type of proc for when we finalize */ orte_process_info.proc_type = flags; /* setup the locks */ if (ORTE_SUCCESS != (ret = orte_locks_init())) { error = "orte_locks_init"; goto error; } /* Register all MCA Params */ if (ORTE_SUCCESS != (ret = orte_register_params())) { error = "orte_register_params"; goto error; } /* setup the orte_show_help system */ if (ORTE_SUCCESS != (ret = orte_show_help_init())) { error = "opal_output_init"; goto error; } /* register handler for errnum -> string conversion */ opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str); /* Ensure the rest of the process info structure is initialized */ if (ORTE_SUCCESS != (ret = orte_proc_info())) { error = "orte_proc_info"; goto error; } /* open the ESS and select the correct module for this environment */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ess_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_ess_base_select())) { error = "orte_ess_base_select"; goto error; } if (!ORTE_PROC_IS_APP) { /* ORTE tools "block" in their own loop over the event * base, so no progress thread is required - apps will * start their progress thread in ess_base_std_app.c * at the appropriate point */ orte_event_base = opal_event_base; } /* initialize the RTE for this environment */ if (ORTE_SUCCESS != (ret = orte_ess.init())) { error = "orte_ess_init"; goto error; } /* All done */ return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret) { orte_show_help("help-orte-runtime", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int rte_init(void) { int ret; char *error = NULL; char **hosts = NULL; char *nodelist; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* Start by getting a unique name */ tm_set_name(); /* if I am a daemon, complete my setup using the * default procedure */ if (ORTE_PROC_IS_DAEMON) { /* get the list of nodes used for this job */ nodelist = getenv("OMPI_MCA_orte_nodelist"); if (NULL != nodelist) { /* split the node list into an argv array */ hosts = opal_argv_split(nodelist, ','); } if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } opal_argv_free(hosts); return ORTE_SUCCESS; } if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; } /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; } /* no other options are supported! */ error = "ess_error"; ret = ORTE_ERROR; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
int orte_ess_base_orted_setup(char **hosts) { int ret = ORTE_ERROR; int fd; char log_file[PATH_MAX]; char *jobidstring; char *error = NULL; orte_job_t *jdata; orte_proc_t *proc; orte_app_context_t *app; orte_node_t *node; char *param; plm_in_use = false; /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { error = "topology discovery"; goto error; } } /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. So * we remove that information here. */ obj = hwloc_get_root_obj(opal_hwloc_topology); for (i=0; i < obj->infos_count; i++) { if (NULL == obj->infos[i].name || NULL == obj->infos[i].value) { continue; } if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) { free(obj->infos[i].name); free(obj->infos[i].value); /* left justify the array */ for (j=i; j < obj->infos_count-1; j++) { obj->infos[j] = obj->infos[j+1]; } obj->infos[obj->infos_count-1].name = NULL; obj->infos[obj->infos_count-1].value = NULL; obj->infos_count--; break; } } if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif /* setup the global nidmap/pidmap object */ orte_nidmap.bytes = NULL; orte_nidmap.size = 0; orte_pidmap.bytes = NULL; orte_pidmap.size = 0; /* open and setup the opal_pstat framework so we can provide * process stats if requested */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } /* some environments allow remote launches - e.g., ssh - so * open and select something -only- if we are given * a specific module to use */ (void) mca_base_var_env_name("plm", ¶m); plm_in_use = !!(getenv(param)); free (param); if (plm_in_use) { if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; goto error; } } /* Setup the communication infrastructure */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_oob_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_oob_base_select"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_db_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_db_base_open"; goto error; } /* always restrict daemons to local database components */ if (ORTE_SUCCESS != (ret = opal_db_base_select(true))) { ORTE_ERROR_LOG(ret); error = "orte_db_base_select"; goto error; } /* set our id */ opal_db.set_id((opal_identifier_t*)ORTE_PROC_MY_NAME); /* * Group communications */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } #if ORTE_ENABLE_STATIC_PORTS /* if we are using static ports, then we need to setup * the daemon info so the RML can function properly * without requiring a wireup stage. This must be done * after we enable_comm as that function determines our * own port, which we need in order to construct the nidmap */ if (orte_static_ports) { /* define the routing tree so we know the pattern * if we are trying to setup common or static ports */ orte_routed.update_routing_plan(); /* extract the node info from the environment and * build a nidmap from it */ if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) { ORTE_ERROR_LOG(ret); error = "construct daemon map from static ports"; goto error; } } #endif /* be sure to update the routing tree so the initial "phone home" * to mpirun goes through the tree if static ports were enabled - still * need to do it anyway just to initialize things */ orte_routed.update_routing_plan(); /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup * as it may involve starting a non-blocking recv * Do this only if a specific PLM was given to us - the * orted has no need of the proxy PLM at all */ if (plm_in_use) { if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } } /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); /* take a pass thru the session directory code to fillin the * tmpdir names - don't create anything yet */ if (ORTE_SUCCESS != (ret = orte_session_dir(false, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir define"; goto error; } /* clear the session directory just in case there are * stale directories laying around */ orte_session_dir_cleanup(ORTE_JOBID_WILDCARD); /* now actually create the directory tree */ if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ret); error = "convert_jobid"; goto error; } /* define a log file name in the session directory */ snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", jobidstring, orte_process_info.nodename); log_path = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, log_file, NULL); fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so * just connect everything to /dev/null */ fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); } else { dup2(fd, STDOUT_FILENO); dup2(fd, STDERR_FILENO); if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { close(fd); } } } } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = ORTE_PROC_MY_NAME->jobid; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); #if OPAL_HAVE_HWLOC /* point our topology to the one detected locally */ node->topology = opal_hwloc_topology; #endif /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. * Instead, we record it in the daemon field of the * node object */ OBJ_RETAIN(proc); /* keep accounting straight */ node->daemon = proc; node->daemon_launched = true; node->state = ORTE_NODE_STATE_UP; /* now point our proc node field to the node */ OBJ_RETAIN(node); /* keep accounting straight */ proc->node = node; /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; /* setup the routed info - the selected routed component * will know what to do. */ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sstore_base_select"; goto error; } /* For daemons, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the DFS framework */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_dfs_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_dfs_select"; goto error; } return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ORTE_ERR_SILENT; }
int orcm_init_util(void) { int ret, i; char *error; char *destdir, *tmp, *mcp, *new_mcp; /* Setup OPAL */ if( ORTE_SUCCESS != (ret = opal_init(NULL, NULL)) ) { error = "opal_init_util"; goto error; } /* register handler for errnum -> string conversion */ opal_error_register("OPENRCM", ORCM_ERR_BASE, ORCM_ERR_MAX, orcm_err2str); /* register where the OPENRCM show_help files are located */ if (NULL != (destdir = getenv("ORCM_DESTDIR"))) { asprintf(&tmp, "%s%s", destdir, ORCM_PKGHELPDIR); } else { tmp = strdup(ORCM_PKGHELPDIR); } if (ORTE_SUCCESS != (ret = opal_show_help_add_dir(tmp))) { error = "register show_help_dir"; goto error; } free(tmp); /* Add ORCM's component directory into the mca_base_param_component_path */ i = mca_base_param_find("mca", NULL, "component_path"); if (i < 0) { ret = ORCM_ERR_NOT_FOUND; error = "Could not find mca_component_path"; goto error; } mca_base_param_lookup_string(i, &mcp); if (NULL == mcp) { ret = ORCM_ERR_NOT_FOUND; error = "Could not find mca_component_path"; goto error; } if (NULL != destdir) { asprintf(&new_mcp, "%s%s:%s", destdir, ORCM_PKGLIBDIR, mcp); } else { asprintf(&new_mcp, "%s:%s", ORCM_PKGLIBDIR, mcp); } mca_base_param_set_string(i, new_mcp); free(new_mcp); free(mcp); orcm_util_initialized = true; return ORCM_SUCCESS; error: if (ORCM_ERR_SILENT != ret) { orte_show_help("help-openrcm-runtime.txt", "orcm_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
int orcm_init(orcm_proc_type_t flags) { int ret; char *error, *envar; int spin; opal_output_stream_t lds; if (0 < orcm_initialized) { /* track number of times we have been called */ orcm_initialized++; return ORCM_SUCCESS; } orcm_initialized++; if (NULL != getenv("ORCM_MCA_spin")) { spin = 1; /* spin until a debugger can attach */ while (0 != spin) { ret = 0; while (ret < 10000) { ret++; }; } } /* prior to initializing the OPAL layer, check to see * if the OPAL (and friends) install location has been * moved. In order to avoid conflicts with any other * OPAL-using software, the relocation point will have * been expressed as a set of "ORCM_foo" envars. We * therefore check for the ORCM_foo values, and name-shift * any we find to OPAL_foo so that OPAL will find them. * Since all ORCM tools will have already copied their * local environment, these name-shifted vars will not * appear in the environment of any launched processes */ if (NULL != (envar = getenv("ORCM_PREFIX"))) { opal_unsetenv("ORCM_PREFIX", &environ); opal_setenv("OPAL_PREFIX", envar, true, &environ); } if (NULL != (envar = getenv("ORCM_LIBDIR"))) { opal_unsetenv("ORCM_LIBDIR", &environ); opal_setenv("OPAL_LIBDIR", envar, true, &environ); } if (NULL != (envar = getenv("ORCM_DATADIR"))) { opal_unsetenv("ORCM_DATADIR", &environ); opal_setenv("OPAL_DATADIR", envar, true, &environ); } /* initialize the opal layer */ if (ORTE_SUCCESS != (ret = opal_init(NULL, NULL))) { error = "opal_init"; goto error; } orcm_debug_verbosity = -1; (void) mca_base_var_register ("orcm", "orcm", NULL, "debug_verbose", "Verbosity level for ORCM debug messages (default: 1)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL, &orcm_debug_verbosity); if (0 <= orcm_debug_verbosity) { /* get a debug output channel */ OBJ_CONSTRUCT(&lds, opal_output_stream_t); lds.lds_want_stdout = true; orcm_debug_output = opal_output_open(&lds); OBJ_DESTRUCT(&lds); /* set the verbosity */ opal_output_set_verbosity(orcm_debug_output, orcm_debug_verbosity); } /* ensure we know the type of proc for when we finalize */ orte_process_info.proc_type = flags; /* setup the locks */ if (ORTE_SUCCESS != (ret = orte_locks_init())) { error = "orte_locks_init"; goto error; } /* register handler for errnum -> string conversion */ opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str); /* Ensure the rest of the process info structure is initialized */ if (ORTE_SUCCESS != (ret = orte_proc_info())) { error = "orte_proc_info"; goto error; } /* register handler for errnum -> string conversion */ opal_error_register("ORCM", ORCM_ERR_BASE, ORCM_ERR_MAX, orcm_err2str); /* register handler for attr key -> string conversion */ if (ORTE_SUCCESS != (ret = orte_attr_register("orcm", ORCM_ATTR_KEY_BASE, ORCM_ATTR_KEY_MAX, orcm_attr_key_print))) { error = "register attr print"; goto error; } /* we don't need a progress thread as all our tools loop inside themselves, * so define orte_event_base to be the base opal_event_base */ orte_event_base = opal_sync_event_base; /* setup the globals */ orcm_clusters = OBJ_NEW(opal_list_t); orcm_schedulers = OBJ_NEW(opal_list_t); if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_parser_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orcm_parser_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orcm_parser_base_select())) { ORTE_ERROR_LOG(ret); error = "orcm_parser_select"; goto error; } /* everyone must open the cfgi framework */ if (ORCM_SUCCESS != (ret = mca_base_framework_open(&orcm_cfgi_base_framework, 0))) { error = "orcm_cfgi_base_open"; goto error; } if (ORCM_SUCCESS != (ret = orcm_cfgi_base_select())) { error = "orcm_cfgi_select"; goto error; } envar = getenv("ORCM_MCA_logical_group_config_file"); if (ORCM_SUCCESS != (ret = orcm_logical_group_load_to_memory(envar))) { error = "orcm_logical_group_load_to_memory"; goto error; } if (ORCM_SCHED == flags) { if (NULL == (envar = getenv("ORCM_MCA_event_exec_path"))) { asprintf(&orcm_event_exec_path, "%s/bin", opal_install_dirs.prefix); } else { orcm_event_exec_path = strdup(envar); } if (NULL == orcm_event_exec_path) { error = "orcm_event_exec_path"; goto error; } } /* everyone must open the sst framework */ if (ORCM_SUCCESS != (ret = mca_base_framework_open(&orcm_sst_base_framework, 0))) { error = "orcm_sst_base_open"; goto error; } if (ORCM_SUCCESS != (ret = orcm_sst_base_select())) { error = "orcm_sst_select"; goto error; } /* open the ESS and select the correct module for this environment - the * orcm module is basically a no-op, but we need the framework defined * as other parts of ORTE will want to call it */ if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ess_base_framework, 0))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_ess_base_select())) { error = "orte_ess_base_select"; goto error; } if (ORTE_SUCCESS != (ret = orte_ess.init())) { error = "orte_ess_init"; goto error; } /* initialize us - we will register the ORTE-level MCA params in there */ if (ORTE_SUCCESS != (ret = orcm_sst.init())) { error = "orte_init"; goto error; } /* setup the orte_show_help system - don't do this until the * end as otherwise show_help messages won't appear */ if (ORTE_SUCCESS != (ret = orte_show_help_init())) { error = "opal_output_init"; goto error; } /* initialize orcm datatype support */ if (ORCM_SUCCESS != (ret = orcm_dt_init())) { error = "orcm_dt_init"; goto error; } /* flag that orte is initialized so things can work */ orte_initialized = true; orte_help_want_aggregate = false; return ORCM_SUCCESS; error: if (ORCM_ERR_SILENT != ret) { opal_show_help("help-orcm-runtime.txt", "orcm_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
static int rte_init(void) { int ret, i; char *error = NULL; char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* Start by getting a unique name */ alps_set_name(); /* if I am a daemon, complete my setup using the * default procedure */ if (ORTE_PROC_IS_DAEMON) { if (NULL != orte_node_regex) { /* extract the nodes */ if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts)) || NULL == hosts) { error = "orte_regex_extract_node_names"; goto error; } /* find our host in the list */ for (i=0; NULL != hosts[i]; i++) { if (0 == strncmp(hosts[i], orte_process_info.nodename, strlen(hosts[i]))) { /* correct our vpid */ ORTE_PROC_MY_NAME->vpid = starting_vpid + i; OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output, "ess:alps reset name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); break; } } } if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } opal_argv_free(hosts); return ORTE_SUCCESS; } if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; } /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; } /* otherwise, I must be an application process - use * the default procedure to finish my setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* setup the nidmap arrays */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ret; }