/*
 * Open every ORTE framework so that its components register their MCA
 * parameters, and record one opal_info_component_map_t per framework in
 * component_map.
 *
 * mca_types     - pointer array of framework type names.  For every
 *                 non-NULL entry, OMPI_MCA_<type> is blanked in the
 *                 environment while the frameworks are opened.
 * component_map - pointer array that receives one map entry (type name
 *                 plus component list) for each framework opened here.
 *
 * Returns the result of the last framework open that was attempted
 * (ORTE_SUCCESS, or ORTE_ERR_BAD_PARAM after a notice naming the
 * framework has been printed to stderr).  Returns ORTE_ERROR if a
 * framework open fails for any other reason or if a working string
 * cannot be allocated.
 *
 * The saved environment values are put back before returning on every
 * path, including the error path.
 */
int orte_info_register_components(opal_pointer_array_t *mca_types,
                                  opal_pointer_array_t *component_map)
{
    opal_info_component_map_t *map;
    char *env, *str;
    int i, rc;
    char *target, *save, *type;
    char **env_save = NULL;

    /* Clear out the environment.  putenv() places the string itself in
     * the environment (by reference, not by value), so the "NAME="
     * string handed to it is deliberately orphaned and never freed.
     */
    for (i = 0; i < mca_types->size; ++i) {
        if (NULL == (type = (char*)opal_pointer_array_get_item(mca_types, i))) {
            continue;
        }
        if (0 > asprintf(&env, "OMPI_MCA_%s", type)) {
            str = "asprintf";
            goto error;
        }
        if (NULL != (save = getenv(env))) {
            /* save this param so it can later be restored */
            if (0 > asprintf(&str, "%s=%s", env, save)) {
                free(env);
                str = "asprintf";
                goto error;
            }
            opal_argv_append_nosize(&env_save, str);
            free(str);
            /* can't manipulate it directly, so make a copy first */
            if (0 > asprintf(&target, "%s=", env)) {
                free(env);
                str = "asprintf";
                goto error;
            }
            /* target now belongs to the environment - do not free it */
            putenv(target);
        }
        free(env);
    }

    /* Set orte_process_info.proc_type to HNP to force all frameworks to
     * open components
     */
    orte_process_info.proc_type = ORTE_PROC_HNP;

    /* set the event base to be the opal event base as we
     * aren't attempting to do anything with progress threads here
     */
    orte_event_base = opal_event_base;

    /* Register the ORTE layer's MCA parameters.  A "bad parameter"
     * result is tolerated here; rc is overwritten by the next open.
     */
    if (ORTE_SUCCESS != (rc = orte_register_params()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "orte_register_params";
        goto error;
    }

    /* Each framework below follows the same pattern: open it, record its
     * component list in the map even when the open reported a bad
     * parameter, and in that case stop opening further frameworks.
     */
    if (ORTE_SUCCESS != (rc = orte_db_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "db_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("db");
    map->components = &orte_db_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "db";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_errmgr_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "errmgr_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("errmgr");
    map->components = &orte_errmgr_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "errmgr";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_ess_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "ess_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("ess");
    map->components = &orte_ess_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "ess";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_filem_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "filem_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("filem");
    map->components = &orte_filem_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "filem";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "grpcomm_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("grpcomm");
    map->components = &orte_grpcomm_base.components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "grpcomm";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_iof_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "iof_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("iof");
    map->components = &orte_iof_base.iof_components_opened;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "iof";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_odls_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "odls_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("odls");
    map->components = &orte_odls_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "odls";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = mca_oob_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "oob_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("oob");
    map->components = &mca_oob_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "oob";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_plm_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "plm_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("plm");
    map->components = &orte_plm_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "plm";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_ras_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "ras_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("ras");
    map->components = &orte_ras_base.ras_opened;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "ras";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "rmaps_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("rmaps");
    map->components = &orte_rmaps_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "rmaps";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_routed_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "routed_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("routed");
    map->components = &orte_routed_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "routed";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_rml_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "rml_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("rml");
    map->components = &orte_rml_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "rml";
        goto breakout;
    }

#if ORTE_ENABLE_SENSORS
    if (ORTE_SUCCESS != (rc = orte_sensor_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "sensor_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("sensor");
    map->components = &mca_sensor_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "sensor";
        goto breakout;
    }
#endif

#if OPAL_ENABLE_FT_CR == 1
    if (ORTE_SUCCESS != (rc = orte_snapc_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "snapc_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("snapc");
    map->components = &orte_snapc_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "snapc";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_sstore_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "sstore_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("sstore");
    map->components = &orte_sstore_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "sstore";
        goto breakout;
    }
#endif

    if (ORTE_SUCCESS != (rc = orte_state_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "state_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("state");
    map->components = &orte_state_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc) {
        str = "state";
        goto breakout;
    }

 breakout:
    /* Restore the environment to what it was before we started so that
     * if users setenv OMPI_MCA_<framework name> to some value, they'll
     * see that value when it is shown via --param output.  The strings
     * in env_save stay referenced by the environment, so env_save is
     * intentionally never freed.
     */
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }

    if (ORTE_ERR_BAD_PARAM == rc) {
        fprintf(stderr, "\nA \"bad parameter\" error was encountered when opening the ORTE %s framework\n", str);
        fprintf(stderr, "The output received from that framework includes the following parameters:\n\n");
    }

    return rc;

 error:
    fprintf(stderr, "orte_info_register: %s failed\n", str);
    /* put back whatever was saved before the failure so the caller's
     * environment is not left with blanked OMPI_MCA_<type> values
     */
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }
    return ORTE_ERROR;
}
int orte_ess_base_orted_setup(char **hosts) { int ret = ORTE_ERROR; int fd; char log_file[PATH_MAX]; char *jobidstring; char *error = NULL; char *plm_to_use; orte_job_t *jdata; orte_proc_t *proc; orte_app_context_t *app; orte_node_t *node; #ifndef __WINDOWS__ /* setup callback for SIGPIPE */ setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback); /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. */ setup_sighandler(SIGTERM, &term_handler, shutdown_signal); setup_sighandler(SIGINT, &int_handler, shutdown_signal); /** setup callbacks for signals we should ignore */ setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback); setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback); #endif /* __WINDOWS__ */ signals_set = true; #if OPAL_HAVE_HWLOC { hwloc_obj_t obj; unsigned i, j; /* get the local topology */ if (NULL == opal_hwloc_topology) { if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) { error = "topology discovery"; goto error; } } /* remove the hostname from the topology. Unfortunately, hwloc * decided to add the source hostname to the "topology", thus * rendering it unusable as a pure topological description. So * we remove that information here. 
*/ obj = hwloc_get_root_obj(opal_hwloc_topology); for (i=0; i < obj->infos_count; i++) { if (NULL == obj->infos[i].name || NULL == obj->infos[i].value) { continue; } if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) { free(obj->infos[i].name); free(obj->infos[i].value); /* left justify the array */ for (j=i; j < obj->infos_count-1; j++) { obj->infos[j] = obj->infos[j+1]; } obj->infos[obj->infos_count-1].name = NULL; obj->infos[obj->infos_count-1].value = NULL; obj->infos_count--; break; } } if (4 < opal_output_get_verbosity(orte_ess_base_output)) { opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO); } } #endif /* open and setup the opal_pstat framework so we can provide * process stats if requested */ if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_open"; goto error; } if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) { ORTE_ERROR_LOG(ret); error = "opal_pstat_base_select"; goto error; } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = orte_state_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } /* some environments allow remote launches - e.g., ssh - so * open and select something -only- if we are given * a specific module to use */ mca_base_param_reg_string_name("plm", NULL, "Which plm component to use (empty = none)", false, false, NULL, &plm_to_use); if (NULL == plm_to_use) { plm_in_use = false; } else { plm_in_use = true; if (ORTE_SUCCESS != (ret = orte_plm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_plm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_plm_base_select())) { 
ORTE_ERROR_LOG(ret); error = "orte_plm_base_select"; goto error; } } /* Setup the communication infrastructure */ /* Runtime Messaging Layer - this opens/selects the OOB as well */ if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* select the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = orte_db_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_db_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_db_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_db_base_select"; goto error; } /* * Group communications */ if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* Open/select the odls */ if (ORTE_SUCCESS != (ret = orte_odls_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_odls_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_odls_base_select"; goto error; } /* enable communication with the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* initialize the nidmaps */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } #if ORTE_ENABLE_STATIC_PORTS /* if we are 
using static ports, then we need to setup * the daemon info so the RML can function properly * without requiring a wireup stage. This must be done * after we enable_comm as that function determines our * own port, which we need in order to construct the nidmap */ if (orte_static_ports) { /* define the routing tree so we know the pattern * if we are trying to setup common or static ports */ orte_routed.update_routing_plan(); /* extract the node info from the environment and * build a nidmap from it */ if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) { ORTE_ERROR_LOG(ret); error = "construct daemon map from static ports"; goto error; } } #endif /* be sure to update the routing tree so the initial "phone home" * to mpirun goes through the tree if static ports were enabled - still * need to do it anyway just to initialize things */ orte_routed.update_routing_plan(); /* Now provide a chance for the PLM * to perform any module-specific init functions. This * needs to occur AFTER the communications are setup * as it may involve starting a non-blocking recv * Do this only if a specific PLM was given to us - the * orted has no need of the proxy PLM at all */ if (plm_in_use) { if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } } /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. 
*/ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); /* setup stdout/stderr */ if (orte_debug_daemons_file_flag) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring, ORTE_PROC_MY_NAME->jobid))) { ORTE_ERROR_LOG(ret); error = "convert_jobid"; goto error; } /* define a log file name in the session directory */ snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log", jobidstring, orte_process_info.nodename); log_path = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, log_file, NULL); fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so * just connect everything to /dev/null */ fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); } else { dup2(fd, STDOUT_FILENO); dup2(fd, STDERR_FILENO); if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { close(fd); } } } } /* setup the global job and node arrays */ orte_job_data = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data, 1, ORTE_GLOBAL_ARRAY_MAX_SIZE, 1))) { ORTE_ERROR_LOG(ret); error = "setup job array"; goto error; } orte_node_pool = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node array"; goto error; } orte_node_topologies = OBJ_NEW(opal_pointer_array_t); if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies, ORTE_GLOBAL_ARRAY_BLOCK_SIZE, ORTE_GLOBAL_ARRAY_MAX_SIZE, ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) { ORTE_ERROR_LOG(ret); error = "setup node topologies array"; goto error; } /* Setup the job data object for the daemons */ /* create and store the job data object */ jdata = OBJ_NEW(orte_job_t); jdata->jobid = 
ORTE_PROC_MY_NAME->jobid; opal_pointer_array_set_item(orte_job_data, 0, jdata); /* every job requires at least one app */ app = OBJ_NEW(orte_app_context_t); opal_pointer_array_set_item(jdata->apps, 0, app); jdata->num_apps++; /* create and store a node object where we are */ node = OBJ_NEW(orte_node_t); node->name = strdup(orte_process_info.nodename); node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node); #if OPAL_HAVE_HWLOC /* point our topology to the one detected locally */ node->topology = opal_hwloc_topology; #endif /* create and store a proc object for us */ proc = OBJ_NEW(orte_proc_t); proc->name.jobid = ORTE_PROC_MY_NAME->jobid; proc->name.vpid = ORTE_PROC_MY_NAME->vpid; proc->pid = orte_process_info.pid; proc->rml_uri = orte_rml.get_contact_info(); proc->state = ORTE_PROC_STATE_RUNNING; opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc); /* record that the daemon (i.e., us) is on this node * NOTE: we do not add the proc object to the node's * proc array because we are not an application proc. * Instead, we record it in the daemon field of the * node object */ OBJ_RETAIN(proc); /* keep accounting straight */ node->daemon = proc; node->daemon_launched = true; node->state = ORTE_NODE_STATE_UP; /* record that the daemon job is running */ jdata->num_procs = 1; jdata->state = ORTE_JOB_STATE_RUNNING; /* obviously, we have "reported" */ jdata->num_reported = 1; /* setup the routed info - the selected routed component * will know what to do. 
*/ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } /* setup I/O forwarding system - must come after we init routes */ if (ORTE_SUCCESS != (ret = orte_iof_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_iof_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_iof_base_select"; goto error; } /* setup the FileM */ if (ORTE_SUCCESS != (ret = orte_filem_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_filem_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_filem_base_select"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } /* For daemons, ORTE doesn't need the OPAL CR stuff */ opal_cr_set_enabled(false); #else opal_cr_set_enabled(false); #endif /* * Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* setup the SENSOR framework */ if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_sensor_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_sensor_select"; goto error; } /* start the local sensors */ orte_sensor.start(ORTE_PROC_MY_NAME->jobid); return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ORTE_ERR_SILENT; }
int orte_ess_base_app_setup(void) { int ret; char *error = NULL; /* * stdout/stderr buffering * If the user requested to override the default setting then do * as they wish. */ if( orte_ess_base_std_buffering > -1 ) { if( 0 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IONBF, 0); setvbuf(stderr, NULL, _IONBF, 0); } else if( 1 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stderr, NULL, _IOLBF, 0); } else if( 2 == orte_ess_base_std_buffering ) { setvbuf(stdout, NULL, _IOFBF, 0); setvbuf(stderr, NULL, _IOFBF, 0); } } /* open and setup the state machine */ if (ORTE_SUCCESS != (ret = orte_state_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_state_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_state_base_select"; goto error; } /* open the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_open"; goto error; } /* Setup the communication infrastructure */ /* Runtime Messaging Layer */ if (ORTE_SUCCESS != (ret = orte_rml_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_rml_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_rml_base_select"; goto error; } /* setup the errmgr */ if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_errmgr_base_select"; goto error; } /* Routed system */ if (ORTE_SUCCESS != (ret = orte_routed_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_routed_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_routed_base_select"; goto error; } /* database */ if (ORTE_SUCCESS != (ret = orte_db_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_db_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_db_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_db_base_select"; goto error; } /* * Group 
communications */ if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) { ORTE_ERROR_LOG(ret); error = "orte_grpcomm_base_select"; goto error; } /* non-daemon/HNP apps can only have the default proxy PLM * module open - provide a chance for it to initialize */ if (ORTE_SUCCESS != (ret = orte_plm.init())) { ORTE_ERROR_LOG(ret); error = "orte_plm_init"; goto error; } /* enable communication via the rml */ if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) { ORTE_ERROR_LOG(ret); error = "orte_rml.enable_comm"; goto error; } /* setup my session directory */ if (orte_create_session_dirs) { OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output, "%s setting up session dir with\n\ttmpdir: %s\n\thost %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base, orte_process_info.nodename)); if (ORTE_SUCCESS != (ret = orte_session_dir(true, orte_process_info.tmpdir_base, orte_process_info.nodename, NULL, ORTE_PROC_MY_NAME))) { ORTE_ERROR_LOG(ret); error = "orte_session_dir"; goto error; } /* Once the session directory location has been established, set the opal_output env file location to be in the proc-specific session directory. */ opal_output_set_output_file_info(orte_process_info.proc_session_dir, "output-", NULL, NULL); } /* setup the routed info - the selected routed component * will know what to do. Some may put us in a blocking * receive here so they can get ALL of the contact info * from our peers. Others may just find the local daemon's * contact info and immediately return. 
*/ if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) { ORTE_ERROR_LOG(ret); error = "orte_routed.init_routes"; goto error; } #if OPAL_ENABLE_FT_CR == 1 /* * Setup the SnapC */ if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_open"; goto error; } if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) { ORTE_ERROR_LOG(ret); error = "orte_snapc_base_select"; goto error; } /* apps need the OPAL CR stuff */ opal_cr_set_enabled(true); #else opal_cr_set_enabled(false); #endif /* Initalize the CR setup * Note: Always do this, even in non-FT builds. * If we don't some user level tools may hang. */ if (ORTE_SUCCESS != (ret = orte_cr_init())) { ORTE_ERROR_LOG(ret); error = "orte_cr_init"; goto error; } /* if we are an ORTE app - and not an MPI app - then * we need to barrier here. MPI_Init has its own barrier, * so we don't need to do two of them. However, if we * don't do a barrier at all, then one process could * finalize before another one called orte_init. This * causes ORTE to believe that the proc abnormally * terminated * * NOTE: only do this when the process originally launches. * Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { orte_grpcomm_collective_t coll; OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); coll.id = orte_process_info.peer_init_barrier; if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) { ORTE_ERROR_LOG(ret); error = "orte barrier"; goto error; } ORTE_WAIT_FOR_COMPLETION(coll.active); OBJ_DESTRUCT(&coll); } return ORTE_SUCCESS; error: orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); return ret; }
/* * Open all MCA components so that they can register their MCA * parameters. Take a shotgun approach here and indiscriminately open * all components -- don't be selective. To this end, we need to clear * out the environment of all OMPI_MCA_<type> variables to ensure * that the open algorithms don't try to only open one component. */ void orte_info_open_components(void) { int i; char *env, *str; char *target, *save, *type; char **env_save=NULL; bool need_close_components = false; orte_info_component_map_t *map; if (opened_components) { return; } /* init the map */ OBJ_CONSTRUCT(&component_map, opal_pointer_array_t); opal_pointer_array_init(&component_map, 256, INT_MAX, 128); /* Clear out the environment. Use strdup() to orphan the resulting * strings because items are placed in the environment by reference, * not by value. */ for (i = 0; i < mca_types.size; ++i) { if (NULL == (type = (char*)opal_pointer_array_get_item(&mca_types, i))) { continue; } asprintf(&env, "OMPI_MCA_%s", type); if (NULL != (save = getenv(env))) { /* save this param so it can later be restored */ asprintf(&str, "%s=%s", env, save); opal_argv_append_nosize(&env_save, str); free(str); /* can't manipulate it directly, so make a copy first */ asprintf(&target, "%s=", env); putenv(target); free(target); } } /* some components require the event library be active, so activate it */ if (OPAL_SUCCESS != opal_event_base_open()) { str = "opal_event_base_open"; goto error; } /* Open the DSS */ if (ORTE_SUCCESS != opal_dss_open()) { str = "Unable to initialize the DSS"; goto error; } /* Open up the MCA */ if (OPAL_SUCCESS != mca_base_open()) { str = "mca_base_open failed"; goto error; } /* Register the OPAL layer's MCA parameters */ if (OPAL_SUCCESS != opal_register_params()) { str = "opal_register_params failed"; goto error; } /* Register the ORTE layer's MCA parameters */ if (ORTE_SUCCESS != orte_register_params()) { str = "orte_register_params failed"; goto error; } /* Initialize the opal_output 
system */ if (!opal_output_init()) { str = "opal_output_init failed"; goto error; } /* Find / open all components */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("base"); opal_pointer_array_add(&component_map, map); /* set default error message from here forward */ str = "A component framework failed to open properly."; /* OPAL frameworks */ if (OPAL_SUCCESS != opal_backtrace_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("backtrace"); map->components = &opal_backtrace_base_components_opened; opal_pointer_array_add(&component_map, map); if (OPAL_SUCCESS != opal_memory_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("memory"); map->components = &opal_memory_base_components_opened; opal_pointer_array_add(&component_map, map); /* the event framework is already open - just get its components */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("event"); map->components = &opal_event_components; opal_pointer_array_add(&component_map, map); if (OPAL_SUCCESS != opal_memchecker_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("memchecker"); map->components = &opal_memchecker_base_components_opened; opal_pointer_array_add(&component_map, map); if (OPAL_SUCCESS != opal_shmem_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("shmem"); map->components = &opal_shmem_base_components_opened; opal_pointer_array_add(&component_map, map); #if OPAL_HAVE_HWLOC if (OPAL_SUCCESS != opal_hwloc_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("hwloc"); map->components = &opal_hwloc_base_components; opal_pointer_array_add(&component_map, map); #endif if (OPAL_SUCCESS != opal_timer_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("timer"); map->components = &opal_timer_base_components_opened; 
opal_pointer_array_add(&component_map, map); #if OPAL_ENABLE_FT_CR == 1 if (OPAL_SUCCESS != opal_crs_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("crs"); map->components = &opal_crs_base_components_available; opal_pointer_array_add(&component_map, map); #endif if (OPAL_SUCCESS != opal_if_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("if"); map->components = &opal_if_components; opal_pointer_array_add(&component_map, map); /* OPAL's installdirs base open has already been called as part of * opal_init_util() back in main(). */ map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("installdirs"); map->components = &opal_installdirs_components; opal_pointer_array_add(&component_map, map); /* ORTE frameworks * Set orte_process_info.proc_type to HNP to force all frameworks to * open components */ orte_process_info.proc_type = ORTE_PROC_HNP; if (ORTE_SUCCESS != orte_state_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("state"); map->components = &orte_state_base_components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_errmgr_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("errmgr"); map->components = &orte_errmgr_base_components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_grpcomm_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("grpcomm"); map->components = &orte_grpcomm_base.components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_db_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("db"); map->components = &orte_db_base.available_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_ess_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = 
strdup("ess"); map->components = &orte_ess_base_components_available; opal_pointer_array_add(&component_map, map); #if !ORTE_DISABLE_FULL_SUPPORT if (ORTE_SUCCESS != mca_oob_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("oob"); map->components = &mca_oob_base_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_odls_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("odls"); map->components = &orte_odls_base.available_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_iof_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("iof"); map->components = &orte_iof_base.iof_components_opened; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_ras_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("ras"); map->components = &orte_ras_base.ras_opened; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_rmaps_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("rmaps"); map->components = &orte_rmaps_base.available_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_rml_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("rml"); map->components = &orte_rml_base_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_routed_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("routed"); map->components = &orte_routed_base_components; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_plm_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("plm"); map->components = &orte_plm_base.available_components; opal_pointer_array_add(&component_map, map); #if OPAL_ENABLE_FT_CR == 1 if (ORTE_SUCCESS != 
orte_snapc_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("snapc"); map->components = &orte_snapc_base_components_available; opal_pointer_array_add(&component_map, map); #endif if (ORTE_SUCCESS != orte_sensor_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("sensor"); map->components = &mca_sensor_base_components_available; opal_pointer_array_add(&component_map, map); if (ORTE_SUCCESS != orte_filem_base_open()) { goto error; } map = OBJ_NEW(orte_info_component_map_t); map->type = strdup("filem"); map->components = &orte_filem_base_components_available; opal_pointer_array_add(&component_map, map); #endif /* flag that we need to close components */ need_close_components = true; /* Restore the environment to what it was before we started so that * if users setenv OMPI_MCA_<framework name> to some value, they'll * see that value when it is shown via --param output. */ if (NULL != env_save) { for (i = 0; i < opal_argv_count(env_save); ++i) { putenv(env_save[i]); } } /* All done */ opened_components = true; return; error: fprintf(stderr, "%s\n", str); fprintf(stderr, "orte-info will likely not display all configuration information\n"); if (need_close_components) { opened_components = true; orte_info_close_components(); } }