예제 #1
0
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }
    
    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }
    
    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }
    
    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        error = "opal_output_init";
        goto error;
    }
    
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
        error = "orte_ess_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

    if (ORTE_PROC_IS_APP) {
#if !ORTE_DISABLE_FULL_SUPPORT && ORTE_ENABLE_PROGRESS_THREADS
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
        /* get a separate orte event base */
        orte_event_base = opal_event_base_create();
        /* setup the finalize event - we'll need it
         * to break the thread out of the event lib
         * when we want to stop it
         */
        opal_event_set(orte_event_base, &orte_finalize_event, -1, OPAL_EV_WRITE, ignore_callback, NULL);
        opal_event_set_priority(&orte_finalize_event, ORTE_ERROR_PRI);
#if 0
        {
            /* seems strange, but wake us up once a second just so we can check for new events */
            opal_event_t *ev;
            struct timeval tv = {1,0};
            ev = opal_event_alloc();
            opal_event_evtimer_set(orte_event_base,
                               ev, ignore_callback, ev);
            opal_event_set_priority(ev, ORTE_INFO_PRI);
            opal_event_evtimer_add(ev, &tv);
        }
#endif
        /* construct the thread object */
        OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
        /* fork off a thread to progress it */
        orte_progress_thread.t_run = orte_progress_thread_engine;
        if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
            error = "orte progress thread start";
            goto error;
        }
#else
        error = "event thread support is not configured";
        ret = ORTE_ERROR;
        goto error;
#endif
#else
        /* set the event base to the opal one */
        orte_event_base = opal_event_base;
#endif
    } else {
        /* set the event base to the opal one */
        orte_event_base = opal_event_base;
    }

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }
    
    /* All done */
    return ORTE_SUCCESS;
    
 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
예제 #2
0
int main(int argc, char *argv[])
{
    int rc, i, j;
    opal_cmd_line_t cmd_line;
    char *param, *value;
    orte_job_t *jdata=NULL;
    orte_app_context_t *app;
    char *uri, *ptr;

    /* Setup and parse the command line */
    memset(&myglobals, 0, sizeof(myglobals));
    /* find our basename (the name of the executable) so that we can
       use it in pretty-print error messages */
    myglobals.basename = opal_basename(argv[0]);

    opal_cmd_line_create(&cmd_line, cmd_line_init);
    mca_base_cmd_line_setup(&cmd_line);
    if (OPAL_SUCCESS != (rc = opal_cmd_line_parse(&cmd_line, true,
                                                  argc, argv)) ) {
        if (OPAL_ERR_SILENT != rc) {
            fprintf(stderr, "%s: command line error (%s)\n", argv[0],
                    opal_strerror(rc));
        }
        return rc;
    }

    /* print version if requested.  Do this before check for help so
       that --version --help works as one might expect. */
    if (myglobals.version) {
        char *str;
        str = opal_info_make_version_str("all",
                                         OPAL_MAJOR_VERSION, OPAL_MINOR_VERSION,
                                         OPAL_RELEASE_VERSION,
                                         OPAL_GREEK_VERSION,
                                         OPAL_REPO_REV);
        if (NULL != str) {
            fprintf(stdout, "%s %s\n\nReport bugs to %s\n",
                    myglobals.basename, str, PACKAGE_BUGREPORT);
            free(str);
        }
        exit(0);
    }

    /* check if we are running as root - if we are, then only allow
     * us to proceed if the allow-run-as-root flag was given. Otherwise,
     * exit with a giant warning flag
     */
    if (0 == geteuid() && !myglobals.run_as_root) {
        fprintf(stderr, "--------------------------------------------------------------------------\n");
        if (myglobals.help) {
            fprintf(stderr, "%s cannot provide the help message when run as root\n", myglobals.basename);
        } else {
            /* show_help is not yet available, so print an error manually */
            fprintf(stderr, "%s has detected an attempt to run as root.\n", myglobals.basename);
        }
        fprintf(stderr, " This is *strongly* discouraged as any mistake (e.g., in defining TMPDIR) or bug can\n");
        fprintf(stderr, "result in catastrophic damage to the OS file system, leaving\n");
        fprintf(stderr, "your system in an unusable state.\n\n");
        fprintf(stderr, "You can override this protection by adding the --allow-run-as-root\n");
        fprintf(stderr, "option to your cmd line. However, we reiterate our strong advice\n");
        fprintf(stderr, "against doing so - please do so at your own risk.\n");
        fprintf(stderr, "--------------------------------------------------------------------------\n");
        exit(1);
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     * NOTE: It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != mca_base_cmd_line_process_args(&cmd_line, &environ, &environ)) {
        exit(1);
    }

    /* Need to initialize OPAL so that install_dirs are filled in */
    if (OPAL_SUCCESS != opal_init(&argc, &argv)) {
        exit(1);
    }

    /* Check for help request */
    if (myglobals.help) {
        char *str, *args = NULL;
        char *project_name = NULL;
        if (0 == strcmp(myglobals.basename, "mpirun")) {
            project_name = "Open MPI";
        } else {
            project_name = "OpenRTE";
        }
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        str = opal_show_help_string("help-orterun.txt", "orterun:usage", false,
                                    myglobals.basename, project_name, OPAL_VERSION,
                                    myglobals.basename, args,
                                    PACKAGE_BUGREPORT);
        if (NULL != str) {
            printf("%s", str);
            free(str);
        }
        free(args);

        /* If someone asks for help, that should be all we do */
        exit(0);
    }

    /* flag that I am the HNP */
    orte_process_info.proc_type = ORTE_PROC_HNP;

    /* Setup MCA params */
    orte_register_params();

    /* specify the DVM state machine */
    opal_setenv("OMPI_MCA_state", "dvm", true, &environ);

    /* Intialize our Open RTE environment */
    if (ORTE_SUCCESS != (rc = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
        /* cannot call ORTE_ERROR_LOG as it could be the errmgr
         * never got loaded!
         */
        return rc;
    }
    /* finalize OPAL. As it was opened again from orte_init->opal_init
     * we continue to have a reference count on it. So we have to finalize it twice...
     */
    opal_finalize();

    /* check for request to report uri */
    uri = orte_rml.get_contact_info();
    if (NULL != myglobals.report_uri) {
        FILE *fp;
        if (0 == strcmp(myglobals.report_uri, "-")) {
            /* if '-', then output to stdout */
            printf("VMURI: %s\n", uri);
        } else if (0 == strcmp(myglobals.report_uri, "+")) {
            /* if '+', output to stderr */
            fprintf(stderr, "VMURI: %s\n", uri);
        } else if (0 == strncasecmp(myglobals.report_uri, "file:", strlen("file:"))) {
            ptr = strchr(myglobals.report_uri, ':');
            ++ptr;
            fp = fopen(ptr, "w");
            if (NULL == fp) {
                orte_show_help("help-orterun.txt", "orterun:write_file", false,
                               myglobals.basename, "pid", ptr);
                exit(0);
            }
            fprintf(fp, "%s\n", uri);
            fclose(fp);
        } else {
            fp = fopen(myglobals.report_uri, "w");
            if (NULL == fp) {
                orte_show_help("help-orterun.txt", "orterun:write_file", false,
                               myglobals.basename, "pid", myglobals.report_uri);
                exit(0);
            }
            fprintf(fp, "%s\n", uri);
            fclose(fp);
        }
        free(uri);
    } else {
        printf("VMURI: %s\n", uri);
    }

    /* get the daemon job object - was created by ess/hnp component */
    if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        orte_show_help("help-orterun.txt", "bad-job-object", true,
                       myglobals.basename);
        exit(0);
    }
    /* also should have created a daemon "app" */
    if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, 0))) {
        orte_show_help("help-orterun.txt", "bad-app-object", true,
                       myglobals.basename);
        exit(0);
    }

    /* Did the user specify a prefix, or want prefix by default? */
    if (opal_cmd_line_is_taken(&cmd_line, "prefix") || want_prefix_by_default) {
        size_t param_len;
        /* if both the prefix was given and we have a prefix
         * given above, check to see if they match
         */
        if (opal_cmd_line_is_taken(&cmd_line, "prefix") &&
            NULL != myglobals.prefix) {
            /* if they don't match, then that merits a warning */
            param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
            /* ensure we strip any trailing '/' */
            if (0 == strcmp(OPAL_PATH_SEP, &(param[strlen(param)-1]))) {
                param[strlen(param)-1] = '\0';
            }
            value = strdup(myglobals.prefix);
            if (0 == strcmp(OPAL_PATH_SEP, &(value[strlen(value)-1]))) {
                value[strlen(value)-1] = '\0';
            }
            if (0 != strcmp(param, value)) {
                orte_show_help("help-orterun.txt", "orterun:app-prefix-conflict",
                               true, myglobals.basename, value, param);
                /* let the global-level prefix take precedence since we
                 * know that one is being used
                 */
                free(param);
                param = strdup(myglobals.prefix);
            }
            free(value);
        } else if (NULL != myglobals.prefix) {
            param = myglobals.prefix;
        } else if (opal_cmd_line_is_taken(&cmd_line, "prefix")){
            /* must be --prefix alone */
            param = strdup(opal_cmd_line_get_param(&cmd_line, "prefix", 0, 0));
        } else {
            /* --enable-orterun-prefix-default was given to orterun */
            param = strdup(opal_install_dirs.prefix);
        }

        if (NULL != param) {
            /* "Parse" the param, aka remove superfluous path_sep. */
            param_len = strlen(param);
            while (0 == strcmp (OPAL_PATH_SEP, &(param[param_len-1]))) {
                param[param_len-1] = '\0';
                param_len--;
                if (0 == param_len) {
                    orte_show_help("help-orterun.txt", "orterun:empty-prefix",
                                   true, myglobals.basename, myglobals.basename);
                    return ORTE_ERR_FATAL;
                }
            }
            orte_set_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, ORTE_ATTR_GLOBAL, param, OPAL_STRING);
            free(param);
        }
    }

    /* Did the user specify a hostfile. Need to check for both
     * hostfile and machine file.
     * We can only deal with one hostfile per app context, otherwise give an error.
     */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "hostfile"))) {
        if(1 < j) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "hostfile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "machinefile"))) {
        if(1 < j || orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING)) {
            orte_show_help("help-orterun.txt", "orterun:multiple-hostfiles",
                           true, myglobals.basename, NULL);
            return ORTE_ERR_FATAL;
        } else {
            value = opal_cmd_line_get_param(&cmd_line, "machinefile", 0, 0);
            orte_set_attribute(&app->attributes, ORTE_APP_HOSTFILE, ORTE_ATTR_LOCAL, value, OPAL_STRING);
        }
    }

    /* Did the user specify any hosts? */
    if (0 < (j = opal_cmd_line_get_ninsts(&cmd_line, "host"))) {
        char **targ=NULL, *tval;
        for (i = 0; i < j; ++i) {
            value = opal_cmd_line_get_param(&cmd_line, "host", i, 0);
            opal_argv_append_nosize(&targ, value);
        }
        tval = opal_argv_join(targ, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, tval, OPAL_STRING);
        opal_argv_free(targ);
        free(tval);
    }
    OBJ_DESTRUCT(&cmd_line);

    /* setup to listen for commands sent specifically to me, even though I would probably
     * be the one sending them! Unfortunately, since I am a participating daemon,
     * there are times I need to send a command to "all daemons", and that means *I* have
     * to receive it too
     */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* override the notify_completed state so we can send a message
     * back to anyone who submits a job to us telling them the job
     * completed */
    if (ORTE_SUCCESS != (rc = orte_state.set_job_state_callback(ORTE_JOB_STATE_NOTIFY_COMPLETED, notify_requestor))) {
        ORTE_ERROR_LOG(rc);
        ORTE_UPDATE_EXIT_STATUS(rc);
        exit(orte_exit_status);
    }

    /* spawn the DVM - we skip the initial steps as this
     * isn't a user-level application */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATE);

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
예제 #3
0
int orte_info_register_components(opal_pointer_array_t *mca_types,
                                  opal_pointer_array_t *component_map)
{
    opal_info_component_map_t *map;
    char *env, *str;
    int i, rc;
    char *target, *save, *type;
    char **env_save=NULL;

    /* Clear out the environment.  Use strdup() to orphan the resulting
     * strings because items are placed in the environment by reference,
     * not by value.
     */
    for (i = 0; i < mca_types->size; ++i) {
        if (NULL == (type = (char*)opal_pointer_array_get_item(mca_types, i))) {
            continue;
        }
        asprintf(&env, "OMPI_MCA_%s", type);
        if (NULL != (save = getenv(env))) {
            /* save this param so it can later be restored */
            asprintf(&str, "%s=%s", env, save);
            opal_argv_append_nosize(&env_save, str);
            free(str);
            /* can't manipulate it directly, so make a copy first */
            asprintf(&target, "%s=", env);
            putenv(target);
            free(target);
        }
        free(env);
    }

    /* Set orte_process_info.proc_type to HNP to force all frameworks to
     * open components
     */
    orte_process_info.proc_type = ORTE_PROC_HNP;
    /* set the event base to be the opal event base as we
     * aren't attempting to do anything with progress threads here
     */
    orte_event_base = opal_event_base;

    /* Register the ORTE layer's MCA parameters */
    
    if (ORTE_SUCCESS != (rc = orte_register_params()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "orte_register_params";
        goto error;
    }
    
    if (ORTE_SUCCESS != (rc = orte_db_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "db_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("db");
    map->components = &orte_db_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "db";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_errmgr_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "errmgr_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("errmgr");
    map->components = &orte_errmgr_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "errmgr";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_ess_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "ess_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("ess");
    map->components = &orte_ess_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "ess";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_filem_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "filem_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("filem");
    map->components = &orte_filem_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "filem";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "grpcomm_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("grpcomm");
    map->components = &orte_grpcomm_base.components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "grpcomm";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_iof_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "iof_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("iof");
    map->components = &orte_iof_base.iof_components_opened;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "iof";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_odls_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "odls_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("odls");
    map->components = &orte_odls_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "odls";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = mca_oob_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "oob_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("oob");
    map->components = &mca_oob_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "oob";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_plm_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "plm_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("plm");
    map->components = &orte_plm_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "plm";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_ras_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "ras_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("ras");
    map->components = &orte_ras_base.ras_opened;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "ras";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_rmaps_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "rmaps_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("rmaps");
    map->components = &orte_rmaps_base.available_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "rmaps";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_routed_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "routed_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("routed");
    map->components = &orte_routed_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "routed";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_rml_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "rml_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("rml");
    map->components = &orte_rml_base_components;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "rml";
        goto breakout;
    }

#if ORTE_ENABLE_SENSORS
    if (ORTE_SUCCESS != (rc = orte_sensor_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "sensor_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("sensor");
    map->components = &mca_sensor_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "sensor";
        goto breakout;
    }
#endif
    
#if OPAL_ENABLE_FT_CR == 1
    if (ORTE_SUCCESS != (rc = orte_snapc_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "snapc_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("snapc");
    map->components = &orte_snapc_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "snapc";
        goto breakout;
    }

    if (ORTE_SUCCESS != (rc = orte_sstore_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "sstore_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("sstore");
    map->components = &orte_sstore_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "sstore";
        goto breakout;
    }
#endif
    
    if (ORTE_SUCCESS != (rc = orte_state_base_open()) &&
        ORTE_ERR_BAD_PARAM != rc) {
        str = "state_base_open";
        goto error;
    }
    map = OBJ_NEW(opal_info_component_map_t);
    map->type = strdup("state");
    map->components = &orte_state_base_components_available;
    opal_pointer_array_add(component_map, map);
    if (ORTE_ERR_BAD_PARAM == rc)  {
        str = "state";
        goto breakout;
    }

 breakout:
    /* Restore the environment to what it was before we started so that
     * if users setenv OMPI_MCA_<framework name> to some value, they'll
     * see that value when it is shown via --param output.
     */
    
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }

    if (ORTE_ERR_BAD_PARAM == rc) {
        fprintf(stderr, "\nA \"bad parameter\" error was encountered when opening the ORTE %s framework\n", str);
        fprintf(stderr, "The output received from that framework includes the following parameters:\n\n");
    }

    return rc;

 error:
    fprintf(stderr, "orte_info_register: %s failed\n", str);
    return ORTE_ERROR;
}
예제 #4
0
파일: orte_init.c 프로젝트: bringhurst/ompi
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (orte_initialized) {
        return ORTE_SUCCESS;
    }

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }
    
    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }
    
    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }
    
    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_output_init";
        goto error;
    }
    
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_select";
        goto error;
    }
    
    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_set_name";
        goto error;
    }
    
    /* All done */
    orte_initialized = true;
    return ORTE_SUCCESS;
    
error:
    if (ORTE_ERR_SILENT != OPAL_SOS_GET_ERROR_CODE(ret)) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
예제 #5
0
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }
    
    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_vpid_print = _vpid_print_for_opal;
    opal_jobid_print = _jobid_print_for_opal;
    opal_compare_proc = _process_name_compare;
    opal_convert_string_to_process_name = _convert_string_to_process_name;
    
    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }
    
    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }
    
    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        error = "opal_output_init";
        goto error;
    }
    
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* we may have modified the local nodename according to
     * request to retain/strip the FQDN and prefix, so update
     * it here. The OPAL layer will strdup the hostname, so
     * we have to free it first to avoid a memory leak */
    if (NULL != opal_process_info.nodename) {
        free(opal_process_info.nodename);
    }
    /* opal_finalize_util will call free on this pointer so set from strdup */
    opal_process_info.nodename = strdup (orte_process_info.nodename);

    /* setup the dstore framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", "hash", NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    if (ORTE_PROC_IS_APP) {
        if (0 > (opal_dstore_modex = opal_dstore.open("MODEX", "sm,hash", NULL))) {
            error = "opal dstore modex";
            ret = ORTE_ERR_FATAL;
            goto error;
        }
    }

    if (ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) {
        /* let the pmix server register params */
        pmix_server_register();
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ess_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

    if (!ORTE_PROC_IS_APP) {
        /* ORTE tools "block" in their own loop over the event
         * base, so no progress thread is required - apps will
         * start their progress thread in ess_base_std_app.c
         * at the appropriate point
         */
        orte_event_base = opal_event_base;
    }

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }

    /* set the remaining opal_process_info fields. Note that
     * the OPAL layer will have initialized these to NULL, and
     * anyone between us would not have strdup'd the string, so
     * we cannot free it here */
    opal_process_info.job_session_dir  = orte_process_info.job_session_dir;
    opal_process_info.proc_session_dir = orte_process_info.proc_session_dir;
    opal_process_info.num_local_peers  = (int32_t)orte_process_info.num_local_peers;
    opal_process_info.my_local_rank    = (int32_t)orte_process_info.my_local_rank;
#if OPAL_HAVE_HWLOC
    opal_process_info.cpuset           = orte_process_info.cpuset;
#endif  /* OPAL_HAVE_HWLOC */

#if OPAL_ENABLE_TIMING
    opal_timing_set_jobid(ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
#endif

    /* All done */
    return ORTE_SUCCESS;
    
 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
예제 #6
0
파일: orte_init.c 프로젝트: orcmuser/orcm
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }
    
    /* Convince OPAL to use our naming scheme */
    opal_process_name_print = _process_name_print_for_opal;
    opal_process_name_vpid = _process_name_vpid_for_opal;
    opal_process_name_jobid = _process_name_jobid_for_opal;
    opal_compare_proc = _process_name_compare;

    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }
    
    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }
    
    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        error = "opal_output_init";
        goto error;
    }
    
    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ess_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

    if (!ORTE_PROC_IS_APP) {
        /* ORTE tools "block" in their own loop over the event
         * base, so no progress thread is required - apps will
         * start their progress thread in ess_base_std_app.c
         * at the appropriate point
         */
        orte_event_base = opal_event_base;
    }

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }
    
    /* All done */
    return ORTE_SUCCESS;
    
 error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
예제 #7
0
파일: sst_orcmd.c 프로젝트: htquach/gemeter
static int orcmd_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;

    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;

    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }

    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }

    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }

    /* if my name didn't get set, then we didn't find our node
     * in the config - report it and die
     */
    if (NULL == mynode) {
        orte_show_help("help-ess-orcm.txt", "node-not-found", true,
                       orcm_cfgi_base.config_file,
                       orte_process_info.nodename);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERR_SILENT;
    }

    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);

    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);

    /* If we are in test mode, then we don't *require* that a scheduler
     * be defined in the system - otherwise, we do */
    if (NULL == scheduler) {
        if (mca_sst_orcmd_component.scheduler_reqd) {
            error = "no scheduler found";
            ret = ORTE_ERR_NOT_FOUND;
            goto error;
        }
    } else {
        ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
        ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    }

    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);

    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                OBJ_DESTRUCT(&buf);
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }

        /* if we were asked to bind to specific core(s), do so now */
        if (NULL != orte_daemon_cores) {
            char **cores=NULL, tmp[128];
            hwloc_obj_t pu;
            hwloc_cpuset_t ours, pucpus, res;
            int core;

            /* could be a collection of comma-delimited ranges, so
             * use our handy utility to parse it
             */
            orte_util_parse_range_options(orte_daemon_cores, &cores);
            if (NULL != cores) {
                ours = hwloc_bitmap_alloc();
                hwloc_bitmap_zero(ours);
                pucpus = hwloc_bitmap_alloc();
                res = hwloc_bitmap_alloc();
                for (i=0; NULL != cores[i]; i++) {
                    core = strtoul(cores[i], NULL, 10);
                    if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                        orte_show_help("help-orted.txt", "orted:cannot-bind",
                                       true, orte_process_info.nodename,
                                       orte_daemon_cores);
                        ret = ORTE_ERR_NOT_SUPPORTED;
                        OBJ_DESTRUCT(&buf);
                        error = "cannot bind";
                        goto error;
                    }
                    hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                    hwloc_bitmap_or(res, ours, pucpus);
                    hwloc_bitmap_copy(ours, res);
                }
                /* if the result is all zeros, then don't bind */
                if (!hwloc_bitmap_iszero(ours)) {
                    (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                    if (opal_hwloc_report_bindings) {
                        opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                        opal_output(0, "Daemon %s is bound to cores %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                    }
                }
                /* cleanup */
                hwloc_bitmap_free(ours);
                hwloc_bitmap_free(pucpus);
                hwloc_bitmap_free(res);
                opal_argv_free(cores);
            }
        }
    }
#endif

    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the notifier */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_open";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (!opal_list_get_size(&orte_oob_base.actives)) {
        ret = ORTE_ERROR;
        error = "orte_oob: Found 0 active transports";
        goto error;
    }

    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }

    /* select the notifier*/
    if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_select";
        goto error;
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }

    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }

    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) {
        putenv(OPAL_MCA_PREFIX"dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);

    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the ANALYTICS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_analytics_base_open";
        goto error;
    }

    /* setup the EVGEN framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_select";
        goto error;
    }

    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orcm_sensor.start(ORTE_PROC_MY_NAME->jobid);

    /* setup the PWRMGMT framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_select";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* open and setup the DIAG framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_base_open";
        goto error;
    }
    if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orcm-runtime.txt",
                   "orcm_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
예제 #8
0
파일: sst_tool.c 프로젝트: orcmuser/orcm
static int tool_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;
    
    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;
    
    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }
    
    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }
    
    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }
    
    /* define a name for myself */
    if (ORTE_SUCCESS != (ret = orte_plm_base_set_hnp_name())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_set_hnp_name";
        goto error;
    }
    
    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);
    
    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);
    
    ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
    ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    
    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }
    
    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;
    
    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    
    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }
    
    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv("OMPI_MCA_dstore")) {
        putenv("OMPI_MCA_dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handles */
    if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) {
        error = "opal dstore global";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) {
        error = "opal dstore nonpeer";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_util_nidmap_init";
        goto error;
    }
    
    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);
    
    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /* construct the thread object */
    OBJ_CONSTRUCT(&progress_thread, opal_thread_t);
    /* fork off a thread to progress it */
    progress_thread.t_run = progress_thread_engine;
    progress_thread_running = true;
    if (OPAL_SUCCESS != (ret = opal_thread_start(&progress_thread))) {
        error = "progress thread start";
        progress_thread_running = false;
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }
    
    return ORTE_SUCCESS;
    
error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
예제 #9
0
/*
 * Open all MCA components so that they can register their MCA
 * parameters.  Take a shotgun approach here and indiscriminately open
 * all components -- don't be selective.  To this end, we need to clear
 * out the environment of all OMPI_MCA_<type> variables to ensure
 * that the open algorithms don't try to only open one component.
 */
void orte_info_open_components(void)
{
    int i;
    char *env, *str;
    char *target, *save, *type;
    char **env_save=NULL;
    bool need_close_components = false;
    orte_info_component_map_t *map;
    
    if (opened_components) {
        return;
    }
    
    /* init the map */
    OBJ_CONSTRUCT(&component_map, opal_pointer_array_t);
    opal_pointer_array_init(&component_map, 256, INT_MAX, 128);
    
    /* Clear out the environment.  Use strdup() to orphan the resulting
     * strings because items are placed in the environment by reference,
     * not by value.
     */
    
    for (i = 0; i < mca_types.size; ++i) {
        if (NULL == (type = (char*)opal_pointer_array_get_item(&mca_types, i))) {
            continue;
        }
        asprintf(&env, "OMPI_MCA_%s", type);
        if (NULL != (save = getenv(env))) {
            /* save this param so it can later be restored */
            asprintf(&str, "%s=%s", env, save);
            opal_argv_append_nosize(&env_save, str);
            free(str);
            /* can't manipulate it directly, so make a copy first */
            asprintf(&target, "%s=", env);
            putenv(target);
            free(target);
        }
    }
    
    /* some components require the event library be active, so activate it */
    if (OPAL_SUCCESS != opal_event_base_open()) {
        str = "opal_event_base_open";
        goto error;
    }
    
    /* Open the DSS */
    
    if (ORTE_SUCCESS != opal_dss_open()) {
        str = "Unable to initialize the DSS";
        goto error;
    }
    
    /* Open up the MCA */
    
    if (OPAL_SUCCESS != mca_base_open()) {
        str = "mca_base_open failed";
        goto error;
    }
    
    /* Register the OPAL layer's MCA parameters */
    
    if (OPAL_SUCCESS != opal_register_params()) {
        str = "opal_register_params failed";
        goto error;
    }
    
    /* Register the ORTE layer's MCA parameters */
    
    if (ORTE_SUCCESS != orte_register_params()) {
        str = "orte_register_params failed";
        goto error;
    }
    
    /* Initialize the opal_output system */
    if (!opal_output_init()) {
        str = "opal_output_init failed";
        goto error;
    }
    
    /* Find / open all components */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("base");
    opal_pointer_array_add(&component_map, map);
    
    /* set default error message from here forward */
    str = "A component framework failed to open properly.";
    
    /* OPAL frameworks */
    
    if (OPAL_SUCCESS != opal_backtrace_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("backtrace");
    map->components = &opal_backtrace_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_memory_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("memory");
    map->components = &opal_memory_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    /* the event framework is already open - just get its components */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("event");
    map->components = &opal_event_components;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_memchecker_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("memchecker");
    map->components = &opal_memchecker_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (OPAL_SUCCESS != opal_shmem_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("shmem");
    map->components = &opal_shmem_base_components_opened;
    opal_pointer_array_add(&component_map, map);
    
#if OPAL_HAVE_HWLOC
    if (OPAL_SUCCESS != opal_hwloc_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("hwloc");
    map->components = &opal_hwloc_base_components;
    opal_pointer_array_add(&component_map, map);
#endif

    if (OPAL_SUCCESS != opal_timer_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("timer");
    map->components = &opal_timer_base_components_opened;
    opal_pointer_array_add(&component_map, map);

#if OPAL_ENABLE_FT_CR == 1
    if (OPAL_SUCCESS != opal_crs_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("crs");
    map->components = &opal_crs_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif

    if (OPAL_SUCCESS != opal_if_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("if");
    map->components = &opal_if_components;
    opal_pointer_array_add(&component_map, map);

    
    /* OPAL's installdirs base open has already been called as part of
     * opal_init_util() back in main().
     */
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("installdirs");
    map->components = &opal_installdirs_components;
    opal_pointer_array_add(&component_map, map);
    
    /* ORTE frameworks
     * Set orte_process_info.proc_type to HNP to force all frameworks to
     * open components
     */
    orte_process_info.proc_type = ORTE_PROC_HNP;
    
    if (ORTE_SUCCESS != orte_state_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("state");
    map->components = &orte_state_base_components_available;
    opal_pointer_array_add(&component_map, map);

    if (ORTE_SUCCESS != orte_errmgr_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("errmgr");
    map->components = &orte_errmgr_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_grpcomm_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("grpcomm");
    map->components = &orte_grpcomm_base.components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_db_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("db");
    map->components = &orte_db_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_ess_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("ess");
    map->components = &orte_ess_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
#if !ORTE_DISABLE_FULL_SUPPORT
    if (ORTE_SUCCESS != mca_oob_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("oob");
    map->components = &mca_oob_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_odls_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("odls");
    map->components = &orte_odls_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_iof_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("iof");
    map->components = &orte_iof_base.iof_components_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_ras_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("ras");
    map->components = &orte_ras_base.ras_opened;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_rmaps_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("rmaps");
    map->components = &orte_rmaps_base.available_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_rml_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("rml");
    map->components = &orte_rml_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_routed_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("routed");
    map->components = &orte_routed_base_components;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_plm_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("plm");
    map->components = &orte_plm_base.available_components;
    opal_pointer_array_add(&component_map, map);

#if OPAL_ENABLE_FT_CR == 1
    if (ORTE_SUCCESS != orte_snapc_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("snapc");
    map->components = &orte_snapc_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif

    if (ORTE_SUCCESS != orte_sensor_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("sensor");
    map->components = &mca_sensor_base_components_available;
    opal_pointer_array_add(&component_map, map);
    
    if (ORTE_SUCCESS != orte_filem_base_open()) {
        goto error;
    }
    map = OBJ_NEW(orte_info_component_map_t);
    map->type = strdup("filem");
    map->components = &orte_filem_base_components_available;
    opal_pointer_array_add(&component_map, map);
#endif
    
    /* flag that we need to close components */
    need_close_components = true;

    /* Restore the environment to what it was before we started so that
     * if users setenv OMPI_MCA_<framework name> to some value, they'll
     * see that value when it is shown via --param output.
     */
    
    if (NULL != env_save) {
        for (i = 0; i < opal_argv_count(env_save); ++i) {
            putenv(env_save[i]);
        }
    }
    
    /* All done */
    
    opened_components = true;
    return;
    
error:
    fprintf(stderr, "%s\n", str);
    fprintf(stderr, "orte-info will likely not display all configuration information\n");
    if (need_close_components) {
        opened_components = true;
        orte_info_close_components();
    }
}
예제 #10
0
int orte_init(int* pargc, char*** pargv, orte_proc_type_t flags)
{
    int ret;
    char *error = NULL;

    if (0 < orte_initialized) {
        /* track number of times we have been called */
        orte_initialized++;
        return ORTE_SUCCESS;
    }
    orte_initialized++;

    /* initialize the opal layer */
    if (ORTE_SUCCESS != (ret = opal_init(pargc, pargv))) {
        error = "opal_init";
        goto error;
    }

    /* ensure we know the type of proc for when we finalize */
    orte_process_info.proc_type = flags;

    /* setup the locks */
    if (ORTE_SUCCESS != (ret = orte_locks_init())) {
        error = "orte_locks_init";
        goto error;
    }

    /* Register all MCA Params */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        error = "orte_register_params";
        goto error;
    }

    /* setup the orte_show_help system */
    if (ORTE_SUCCESS != (ret = orte_show_help_init())) {
        error = "opal_output_init";
        goto error;
    }

    /* register handler for errnum -> string conversion */
    opal_error_register("ORTE", ORTE_ERR_BASE, ORTE_ERR_MAX, orte_err2str);

    /* Ensure the rest of the process info structure is initialized */
    if (ORTE_SUCCESS != (ret = orte_proc_info())) {
        error = "orte_proc_info";
        goto error;
    }

    /* open the ESS and select the correct module for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess_base_open())) {
        error = "orte_ess_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_ess_base_select())) {
        error = "orte_ess_base_select";
        goto error;
    }

#if ORTE_ENABLE_PROGRESS_THREADS
#if OPAL_EVENT_HAVE_THREAD_SUPPORT
    /* get a separate orte event base */
    orte_event_base = opal_event_base_create();
    /* construct the thread object */
    OBJ_CONSTRUCT(&orte_progress_thread, opal_thread_t);
    /* fork off a thread to progress it */
    orte_progress_thread.t_run = orte_progress_thread_engine;
    if (OPAL_SUCCESS != (ret = opal_thread_start(&orte_progress_thread))) {
        error = "orte progress thread start";
        goto error;
    }
#else
    error = "event thread support is not configured";
    ret = ORTE_ERROR;
    goto error;
#endif
#else
    /* set the event base to the opal one */
    orte_event_base = opal_event_base;
#endif

    /* initialize the RTE for this environment */
    if (ORTE_SUCCESS != (ret = orte_ess.init())) {
        error = "orte_ess_init";
        goto error;
    }

    /* All done */
    return ORTE_SUCCESS;

error:
    if (ORTE_ERR_SILENT != ret) {
        orte_show_help("help-orte-runtime",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}