Beispiel #1
0
int orte_info_register_components(opal_pointer_array_t *mca_types,
                                  opal_pointer_array_t *component_map)
{
    opal_info_component_map_t *map;
    char *env, *str;
    int i, rc;
    char *target, *save, *type;
    char **env_save=NULL;

    /* Clear out the environment.  Use strdup() to orphan the resulting
     * strings because items are placed in the environment by reference,
     * not by value.
     */
    for (i = 0; i < mca_types->size; ++i) {
        if (NULL == (type = (char*)opal_pointer_array_get_item(mca_types, i))) {
            continue;
        }
        asprintf(&env, "OMPI_MCA_%s", type);
        if (NULL != (save = getenv(env))) {
            /* save this param so it can later be restored */
            asprintf(&str, "%s=%s", env, save);
            opal_argv_append_nosize(&env_save, str);
            free(str);
            /* can't manipulate it directly, so make a copy first */
            asprintf(&target, "%s=", env);
            putenv(target);
            free(target);
        }
        free(env);
    }

    /* Set orte_process_info.proc_type to HNP to force all frameworks to
     * open components
     */
    orte_process_info.proc_type = ORTE_PROC_HNP;
    /* set the event base to be the opal event base as we
     * aren't attempting to do anything with progress threads here
     */
    orte_event_base = opal_event_base;

    /* Register the ORTE layer's MCA parameters */
    
    if (ORTE_SUCCESS != (rc = orte_register_params()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "orte_register_params";
        goto error;
    }
    
    if (ORTE_SUCCESS != (rc = orte_db_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "db_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("db");
    map->components = &orte_db_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "db";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_errmgr_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "errmgr_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("errmgr");
    map->components = &orte_errmgr_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "errmgr";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_ess_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "ess_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("ess");
    map->components = &orte_ess_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "ess";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_filem_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "filem_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("filem");
    map->components = &orte_filem_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "filem";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "grpcomm_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("grpcomm");
    map->components = &orte_grpcomm_base.components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "grpcomm";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_iof_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "iof_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("iof");
    map->components = &orte_iof_base.iof_components_opened;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "iof";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_odls_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "odls_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("odls");
    map->components = &orte_odls_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "odls";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = mca_oob_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "oob_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("oob");
    map->components = &mca_oob_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "oob";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_plm_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "plm_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("plm");
    map->components = &orte_plm_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "plm";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_ras_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "ras_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("ras");
    map->components = &orte_ras_base.ras_opened;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "ras";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "rmaps_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("rmaps");
    map->components = &orte_rmaps_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "rmaps";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_routed_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "routed_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("routed");
    map->components = &orte_routed_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "routed";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_rml_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "rml_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("rml");
    map->components = &orte_rml_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "rml";
        goto breakout;
    }

#if ORTE_ENABLE_SENSORS
    if (ORTE_SUCCESS != (rc = orte_sensor_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "sensor_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("sensor");
    map->components = &mca_sensor_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "sensor";
        goto breakout;
    }
#endif
    
#if OPAL_ENABLE_FT_CR == 1
    if (ORTE_SUCCESS != (rc = orte_snapc_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "snapc_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("snapc");
    map->components = &orte_snapc_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "snapc";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_sstore_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "sstore_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("sstore");
    map->components = &orte_sstore_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "sstore";
        goto breakout;
    }
#endif
    
    if (ORTE_SUCCESS != (rc = orte_state_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "state_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("state");
    map->components = &orte_state_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "state";
        goto breakout;
    }

 breakout:
    /* Restore the environment to what it was before we started so that
     * if users setenv OMPI_MCA_<framework name> to some value, they'll
     * see that value when it is shown via --param output.
     */
    
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }

    if (ORTE_ERR_BAD_PARAM == rc) {
        fprintf(stderr, "\nA \"bad parameter\" error was encountered when opening the ORTE %s framework\n", str);
        fprintf(stderr, "The output received from that framework includes the following parameters:\n\n");
    }

    return rc;

 error:
    fprintf(stderr, "orte_info_register: %s failed\n", str);
    return ORTE_ERROR;
}
int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *jobidstring;
    char *error = NULL;
    char *plm_to_use;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;

#ifndef __WINDOWS__
    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. 
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
#endif  /* __WINDOWS__ */
    
    signals_set = true;
    
#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;
        
        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }
        
        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }
        
        if (4 < opal_output_get_verbosity(orte_ess_base_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif
    
    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use
     */
    mca_base_param_reg_string_name("plm", NULL,
                                   "Which plm component to use (empty = none)",
                                   false, false,
                                   NULL, &plm_to_use);
    
    if (NULL == plm_to_use) {
        plm_in_use = false;
    } else {
        plm_in_use = true;
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }
    
    /* Setup the communication infrastructure */
    
    /* Runtime Messaging Layer - this opens/selects the OOB as well */
    if (ORTE_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = orte_db_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = orte_odls_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
#if ORTE_ENABLE_STATIC_PORTS
    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap
     */
    if (orte_static_ports) {
        /* define the routing tree so we know the pattern
         * if we are trying to setup common or static ports
         */
        orte_routed.update_routing_plan();

        /* extract the node info from the environment and
         * build a nidmap from it
         */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }
#endif
    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things
     */
    orte_routed.update_routing_plan();
    
    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all
     */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }
    
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file
             */
            
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false,
                                    orte_process_info.tmpdir_base,
                                    orte_process_info.top_session_dir,
                                    log_file,
                                    NULL);
            
            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null
                 */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            } else {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;
    
    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
#if OPAL_HAVE_HWLOC
    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
    
    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    node->daemon_launched = true;
    node->state = ORTE_NODE_STATE_UP;
    
    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;
    
    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = orte_filem_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    
    /* For daemons, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif
    
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
    
    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
int orte_ess_base_app_setup(void)
{
    int ret;
    char *error = NULL;

    /*
     * stdout/stderr buffering
     * If the user requested to override the default setting then do
     * as they wish.
     */
    if( orte_ess_base_std_buffering > -1 ) {
        if( 0 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IONBF, 0);
            setvbuf(stderr, NULL, _IONBF, 0);
        }
        else if( 1 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IOLBF, 0);
            setvbuf(stderr, NULL, _IOLBF, 0);
        }
        else if( 2 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IOFBF, 0);
            setvbuf(stderr, NULL, _IOFBF, 0);
        }
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    
    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* setup the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = orte_db_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* non-daemon/HNP apps can only have the default proxy PLM
     * module open - provide a chance for it to initialize
     */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }
    
    /* enable communication via the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        
        /* Once the session directory location has been established, set
         the opal_output env file location to be in the
         proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
    }
    
    /* setup the routed info - the selected routed component
     * will know what to do. Some may put us in a blocking
     * receive here so they can get ALL of the contact info
     * from our peers. Others may just find the local daemon's
     * contact info and immediately return.
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    
    /* apps need the OPAL CR stuff */
    opal_cr_set_enabled(true);
#else
    opal_cr_set_enabled(false);
#endif
    
    /* Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* if we are an ORTE app - and not an MPI app - then
     * we need to barrier here. MPI_Init has its own barrier,
     * so we don't need to do two of them. However, if we
     * don't do a barrier at all, then one process could
     * finalize before another one called orte_init. This
     * causes ORTE to believe that the proc abnormally
     * terminated
     *
     * NOTE: only do this when the process originally launches.
     * Cannot do this on a restart as the rest of the processes
     * in the job won't be executing this step, so we would hang
     */
    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
        orte_grpcomm_collective_t coll;
        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
        coll.id = orte_process_info.peer_init_barrier;
        if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
            ORTE_ERROR_LOG(ret);
            error = "orte barrier";
            goto error;
        }
        ORTE_WAIT_FOR_COMPLETION(coll.active);
        OBJ_DESTRUCT(&coll);
    }
    
    return ORTE_SUCCESS;
    
error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ret;
}
Beispiel #4
0
/*
 * Open all MCA components so that they can register their MCA
 * parameters.  Take a shotgun approach here and indiscriminately open
 * all components -- don't be selective.  To this end, we need to clear
 * out the environment of all OMPI_MCA_<type> variables to ensure
 * that the open algorithms don't try to only open one component.
 */
void orte_info_open_components(void)
{
    int i;
    char *env, *str;
    char *target, *save, *type;
    char **env_save=NULL;
    bool need_close_components = false;
    orte_info_component_map_t *map;
    
    if (opened_components) {
        return;
    }
    
    /* init the map */
    OBJ_CONSTRUCT(&component_map, opal_pointer_array_t);
    opal_pointer_array_init(&component_map, 256, INT_MAX, 128);
    
    /* Clear out the environment.  Use strdup() to orphan the resulting
     * strings because items are placed in the environment by reference,
     * not by value.
     */
    
    for (i = 0; i < mca_types.size; ++i) {
        if (NULL == (type = (char*)opal_pointer_array_get_item(&mca_types, i))) {
            continue;
        }
        asprintf(&env, "OMPI_MCA_%s", type);
        if (NULL != (save = getenv(env))) {
            /* save this param so it can later be restored */
            asprintf(&str, "%s=%s", env, save);
            opal_argv_append_nosize(&env_save, str);
            free(str);
            /* can't manipulate it directly, so make a copy first */
            asprintf(&target, "%s=", env);
            putenv(target);
            free(target);
        }
    }
    
    /* some components require the event library be active, so activate it */
    if (OPAL_SUCCESS != opal_event_base_open()) {
        str = "opal_event_base_open";
        goto error;
    }
    
    /* Open the DSS */
    
    if (ORTE_SUCCESS != opal_dss_open()) {
        str = "Unable to initialize the DSS";
        goto error;
    }
    
    /* Open up the MCA */
    
    if (OPAL_SUCCESS != mca_base_open()) {
        str = "mca_base_open failed";
        goto error;
    }
    
    /* Register the OPAL layer's MCA parameters */
    
    if (OPAL_SUCCESS != opal_register_params()) {
        str = "opal_register_params failed";
        goto error;
    }
    
    /* Register the ORTE layer's MCA parameters */
    
    if (ORTE_SUCCESS != orte_register_params()) {
        str = "orte_register_params failed";
        goto error;
    }
    
    /* Initialize the opal_output system */
    if (!opal_output_init()) {
        str = "opal_output_init failed";
        goto error;
    }
    
    /* Find / open all components */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("base");
    opal_pointer_array_add(&component_map, map);
    
    /* set default error message from here forward */
    str = "A component framework failed to open properly.";
    
    /* OPAL frameworks */
    
    if (OPAL_SUCCESS != opal_backtrace_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("backtrace");
    map->components = &opal_backtrace_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_memory_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("memory");
    map->components = &opal_memory_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    /* the event framework is already open - just get its components */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("event");
    map->components = &opal_event_components;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_memchecker_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("memchecker");
    map->components = &opal_memchecker_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_shmem_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("shmem");
    map->components = &opal_shmem_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
#if OPAL_HAVE_HWLOC
    if (OPAL_SUCCESS != opal_hwloc_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("hwloc");
    map->components = &opal_hwloc_base_components;
    opal_pointer_array_add(&component_map, map);
#endif

    if (OPAL_SUCCESS != opal_timer_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("timer");
    map->components = &opal_timer_base_components_opened;
    opal_pointer_array_add(&component_map, map);

#if OPAL_ENABLE_FT_CR == 1
    if (OPAL_SUCCESS != opal_crs_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("crs");
    map->components = &opal_crs_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif

    if (OPAL_SUCCESS != opal_if_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("if");
    map->components = &opal_if_components;
    opal_pointer_array_add(&component_map, map);

    
    /* OPAL's installdirs base open has already been called as part of
     * opal_init_util() back in main().
     */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("installdirs");
    map->components = &opal_installdirs_components;
    opal_pointer_array_add(&component_map, map);
    
    /* ORTE frameworks
     * Set orte_process_info.proc_type to HNP to force all frameworks to
     * open components
     */
    orte_process_info.proc_type = ORTE_PROC_HNP;
    
    if (ORTE_SUCCESS != orte_state_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("state");
    map->components = &orte_state_base_components_available;
    opal_pointer_array_add(&component_map, map);

    if (ORTE_SUCCESS != orte_errmgr_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("errmgr");
    map->components = &orte_errmgr_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_grpcomm_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("grpcomm");
    map->components = &orte_grpcomm_base.components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_db_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("db");
    map->components = &orte_db_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_ess_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("ess");
    map->components = &orte_ess_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
#if !ORTE_DISABLE_FULL_SUPPORT
    if (ORTE_SUCCESS != mca_oob_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("oob");
    map->components = &mca_oob_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_odls_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("odls");
    map->components = &orte_odls_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_iof_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("iof");
    map->components = &orte_iof_base.iof_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_ras_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("ras");
    map->components = &orte_ras_base.ras_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_rmaps_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("rmaps");
    map->components = &orte_rmaps_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_rml_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("rml");
    map->components = &orte_rml_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_routed_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("routed");
    map->components = &orte_routed_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_plm_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("plm");
    map->components = &orte_plm_base.available_components;
    opal_pointer_array_add(&component_map, map);

#if OPAL_ENABLE_FT_CR == 1
    if (ORTE_SUCCESS != orte_snapc_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("snapc");
    map->components = &orte_snapc_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif

    if (ORTE_SUCCESS != orte_sensor_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("sensor");
    map->components = &mca_sensor_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_filem_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("filem");
    map->components = &orte_filem_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif
    
    /* flag that we need to close components */
    need_close_components = true;

    /* Restore the environment to what it was before we started so that
     * if users setenv OMPI_MCA_<framework name> to some value, they'll
     * see that value when it is shown via --param output.
     */
    
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }
    
    /* All done */
    
    opened_components = true;
    return;
    
error:
    fprintf(stderr, "%s\n", str);
    fprintf(stderr, "orte-info will likely not display all configuration information\n");
    if (need_close_components) {
        opened_components = true;
        orte_info_close_components();
    }
}