int mca_bml_base_open(void) 
{
    /* See if we've already been here */
    if (++mca_bml_base_already_opened > 1) {
        return OMPI_SUCCESS;
    }

    if(OMPI_SUCCESS !=
       mca_base_components_open("bml", 0, mca_bml_base_static_components, 
                                &mca_bml_base_components_available, 
                                true)) {  
        return OMPI_ERROR; 
    }

#if OPAL_ENABLE_DEBUG_RELIABILITY
    do {
        int param, value;
        
        mca_base_param_register_int("bml", NULL, "error_rate_floor", "error_rate_floor", 0);
        param = mca_base_param_find("bml", NULL, "error_rate_floor");
        mca_base_param_lookup_int(param, &value);
        mca_bml_base_error_rate_floor = value;

        mca_base_param_register_int("bml", NULL, "error_rate_ceiling", "error_rate_ceiling", 0);
        param = mca_base_param_find("bml", NULL, "error_rate_ceiling");
        mca_base_param_lookup_int(param, &value);
        mca_bml_base_error_rate_ceiling = value;


        mca_base_param_register_int("bml", NULL, "srand", "srand", 1);
        param = mca_base_param_find("bml", NULL, "srand");
        mca_base_param_lookup_int(param, &value);

        /* seed random number generator */
        if(value) {
            struct timeval tv;
            gettimeofday(&tv, NULL);
            srand(getpid() * tv.tv_usec);
        }

        /* initialize count */
        if(mca_bml_base_error_rate_ceiling > 0 
           && mca_bml_base_error_rate_floor <= mca_bml_base_error_rate_ceiling) {
            mca_bml_base_error_count = (int) ((mca_bml_base_error_rate_ceiling * rand())/(RAND_MAX+1.0));
        }
    } while (0);
#endif
    return mca_btl_base_open(); 
}
/*******************************************************************************
 * utilities
 */
static inline int mca_pml_v_param_register_int( const char* param_name,
                                                  int default_value )
{
    int id = mca_base_param_register_int("pml", "v", param_name, NULL, default_value);
    int param_value = default_value;
    mca_base_param_lookup_int(id, &param_value);
    return param_value;
}
static inline int mca_sbgp_basesmsocket_param_register_int(
        const char* param_name, int default_value)
{
    int id = mca_base_param_register_int("sbgp", "basesmsocket", param_name,NULL,default_value);
    int param_value = default_value;
    mca_base_param_lookup_int(id,&param_value);
    return param_value;
}
static inline int mca_btl_vader_param_register_int(const char *param_name,
                                                   int value)
{
    int id = mca_base_param_register_int("btl", "vader", param_name,
                                         NULL, value);
    mca_base_param_lookup_int(id, &value);
    return value;
}
Exemple #5
0
/*
 * Open the component
 */
int orte_gpr_proxy_open(void)
{
    int id, tmp;

    id = mca_base_param_register_int("gpr", "proxy", "debug", NULL, 0);
    mca_base_param_lookup_int(id, &tmp);
    if (tmp) {
        orte_gpr_proxy_globals.debug = true;
    } else {
        orte_gpr_proxy_globals.debug = false;
    }

    return ORTE_SUCCESS;
}
Exemple #6
0
int orte_proc_info(void)
{

    int id, tmp;
    
    /* all other params are set elsewhere */
    
    id = mca_base_param_register_int("seed", NULL, NULL, NULL, orte_process_info.seed);
    mca_base_param_lookup_int(id, &tmp);
    orte_process_info.seed = OPAL_INT_TO_BOOL(tmp);
    /* if we are a seed, then make sure the daemon flag is NOT set so that
     * framework components are properly selected
     */
    if (orte_process_info.seed) {
        orte_process_info.daemon = false;
    }

    id = mca_base_param_register_int("orte", "app", "num", NULL, -1);
    mca_base_param_lookup_int(id, &tmp);
    orte_process_info.app_num = tmp;

    id = mca_base_param_register_string("gpr", "replica", "uri", NULL, orte_process_info.gpr_replica_uri);
    mca_base_param_lookup_string(id, &(orte_process_info.gpr_replica_uri));
    mca_base_param_set_internal(id, true);

    id = mca_base_param_register_string("ns", "replica", "uri", NULL, orte_process_info.ns_replica_uri);
    mca_base_param_lookup_string(id, &(orte_process_info.ns_replica_uri));
    mca_base_param_set_internal(id, true);

    id = mca_base_param_register_string("tmpdir", "base", NULL, NULL, orte_process_info.tmpdir_base);
    mca_base_param_lookup_string(id, &(orte_process_info.tmpdir_base));

    /* get the process id */
    orte_process_info.pid = getpid();

    return ORTE_SUCCESS;
}
static int lookup_set(char *a, char *b, char *c, int default_val,
                      char *token, int *argc, char ***argv)
{
    int id, rc;
    
    id = mca_base_param_find(a, b, c);
    if (id < 0) {
        id = mca_base_param_register_int(a, b, c, NULL, default_val);
    }
    mca_base_param_lookup_int(id, &rc);
    if (rc) {
        opal_argv_append(argc, argv, token);
    }
    
    return ORTE_SUCCESS;
}
Exemple #8
0
int orte_restart(orte_process_name_t *name, const char* uri)
{
    int rc;
    orte_process_name_t* old_name;
    orte_process_name_t* new_name;

    if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&old_name, orte_process_info.my_name, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&new_name, name, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * Restart event library
     */

    if (ORTE_SUCCESS != (rc = opal_event_restart())) {
        ORTE_ERROR_LOG(rc);
	return rc;
    }

    /*
     * Close selected components.
     */

    orte_iof_base.iof_flush = false;
    if (ORTE_SUCCESS != (rc = orte_iof_base_close())) {
        ORTE_ERROR_LOG(rc);
	return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_smr_base_close())) {
        ORTE_ERROR_LOG(rc);
	return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_gpr_base_close())) {
        ORTE_ERROR_LOG(rc);
	return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns_base_close())) {
        ORTE_ERROR_LOG(rc);
	return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_rml_base_close())) {
        ORTE_ERROR_LOG(rc);
	return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_wait_finalize())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * setup new global state
     */
    orte_process_info.seed = false;

    /* if NULL, set ns_replica to old_name and set the corresponding uri parameter */
    if (NULL == orte_process_info.ns_replica) {
        orte_process_info.ns_replica = old_name;
        orte_process_info.ns_replica_uri = strdup(uri);
    }
    
    /* if NULL, set gpr_replica to old_name and set the corresponding uri parameter */
    if (NULL == orte_process_info.gpr_replica) {
        orte_process_info.gpr_replica = old_name;
        orte_process_info.gpr_replica_uri = strdup(uri);
    }

    /* ensure my_name is set to the new_name */
    if (NULL != orte_process_info.my_name) {
        free(orte_process_info.my_name);
    }
    orte_process_info.my_name = new_name;

#if 0
    /* close the proc_info structure so it can be reinitialized */
    if (ORTE_SUCCESS != (rc = orte_proc_info_finalize())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* set seed flag to false */
    id = mca_base_param_register_int("seed", NULL, NULL, NULL, (int)false);
    if (ORTE_SUCCESS != (rc = mca_base_param_set_int(id, (int)false))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* call proc_info to reset the structure */
    if (ORTE_SUCCESS != (rc = orte_proc_info())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* finalize the sys_info structure so it can be reinitialized */
    if (ORTE_SUCCESS != (rc = orte_sys_info_finalize())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* call the sys_info function to load structure with any new info */
    orte_system_info.init = false;
    if (ORTE_SUCCESS != (rc = orte_sys_info())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
   
    /* establish the session directory structure for this process */
    if (ORTE_SUCCESS != (rc = orte_ns.get_jobid_string(&jobid_str, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns.get_vpid_string(&procid_str, orte_process_info.my_name))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
 
    if (orte_debug_flag) {
        opal_output(0, "[%lu,%lu,%lu] setting up session dir with",
                    ORTE_NAME_ARGS(orte_process_info.my_name));
        if (NULL != orte_process_info.tmpdir_base) {
            opal_output(0, "\ttmpdir %s", orte_process_info.tmpdir_base);
        }
        opal_output(0, "\tuniverse %s", orte_universe_info.name);
        opal_output(0, "\tuser %s", orte_system_info.user);
        opal_output(0, "\thost %s", orte_system_info.nodename);
        opal_output(0, "\tjobid %s", jobid_str);
        opal_output(0, "\tprocid %s", procid_str);
    }
    if (ORTE_SUCCESS != (rc = orte_session_dir(true,
                                orte_process_info.tmpdir_base,
                                orte_system_info.user,
                                orte_system_info.nodename, NULL,
                                orte_universe_info.name,
                                jobid_str, procid_str))) {
        ORTE_ERROR_LOG(rc);
        if (jobid_str != NULL) free(jobid_str);
        if (procid_str != NULL) free(procid_str);
        return rc;
    }
    if (NULL != jobid_str) {
        free(jobid_str);
    }
    if (NULL != procid_str) {
        free(procid_str);
    }
#endif

    /*
     * Re-open components.
     */

    if (ORTE_SUCCESS != (rc = orte_wait_init())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns_base_open())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_rml_base_open())) {
        ORTE_ERROR_LOG(rc);
            return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_gpr_base_open())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_smr_base_open())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * Select new modules.
     */

    if (ORTE_SUCCESS != (rc = orte_rml_base_select())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns_base_select())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_gpr_base_select())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_smr_base_select())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }


    /*
     * Set contact info for the replicas
     */

    if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.ns_replica_uri))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_rml.set_uri(orte_process_info.gpr_replica_uri))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * Re-init selected modules.
     */
    if (ORTE_SUCCESS != (rc = orte_rml.init())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_ns.init())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_gpr.init())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /*
     * Complete restart
     */
    if (ORTE_SUCCESS != (rc = orte_iof_base_open())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    if (ORTE_SUCCESS != (rc = orte_iof_base_select())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    return ORTE_SUCCESS;
}
Exemple #9
0
int
orte_sds_slurm_set_name(void)
{
    int rc;
    int id;
    int vpid_start;
    int num_procs;
    char* name_string = NULL;
    int slurm_nodeid;

    /* start by getting our cellid, jobid, and vpid (which is the
       starting vpid for the list of daemons) */
    id = mca_base_param_register_string("ns", "nds", "name", NULL, NULL);
    mca_base_param_lookup_string(id, &name_string);

    if(name_string != NULL) {
        if (ORTE_SUCCESS != 
            (rc = orte_ns.convert_string_to_process_name(&(orte_process_info.my_name),
                                                              name_string))) {
            ORTE_ERROR_LOG(rc);
            free(name_string);
            return rc;
        }
        free(name_string);

    } else {
        orte_cellid_t cellid;
        orte_jobid_t jobid;
        orte_vpid_t vpid;
        char* cellid_string;
        char* jobid_string;
        char* vpid_string;
      
        id = mca_base_param_register_string("ns", "nds", "cellid", NULL, NULL);
        mca_base_param_lookup_string(id, &cellid_string);
        if (NULL == cellid_string) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_cellid(&cellid, cellid_string))) {
            ORTE_ERROR_LOG(rc);
            return(rc);
        }
            
        id = mca_base_param_register_string("ns", "nds", "jobid", NULL, NULL);
        mca_base_param_lookup_string(id, &jobid_string);
        if (NULL == jobid_string) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_jobid(&jobid, jobid_string))) {
            ORTE_ERROR_LOG(rc);
            return(rc);
        }
        
        id = mca_base_param_register_string("ns", "nds", "vpid", NULL, NULL);
        mca_base_param_lookup_string(id, &vpid_string);
        if (NULL == vpid_string) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            return ORTE_ERR_NOT_FOUND;
        }
        if (ORTE_SUCCESS != (rc = orte_ns.convert_string_to_vpid(&vpid, vpid_string))) {
            ORTE_ERROR_LOG(rc);
            return(rc);
        }

        if (ORTE_SUCCESS != (rc = orte_ns.create_process_name(&(orte_process_info.my_name),
                                                              cellid,
                                                              jobid,
                                                              vpid))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

    /* fix up the base name and make it the "real" name */
    slurm_nodeid = atoi(getenv("SLURM_NODEID"));
    orte_process_info.my_name->vpid += slurm_nodeid;

    /* fix up the system info nodename to match exactly what slurm returned */
    if (NULL != orte_system_info.nodename) {
        free(orte_system_info.nodename);
    }
    orte_system_info.nodename = get_slurm_nodename(slurm_nodeid);

    id = mca_base_param_register_int("ns", "nds", "vpid_start", NULL, -1);
    mca_base_param_lookup_int(id, &vpid_start);
    if (vpid_start < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }

    id = mca_base_param_register_int("ns", "nds", "num_procs", NULL, -1);
    mca_base_param_lookup_int(id, &num_procs);
    if (num_procs < 0) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    
    orte_process_info.vpid_start = (orte_vpid_t)vpid_start;
    orte_process_info.num_procs = (size_t)num_procs;
    return ORTE_SUCCESS;
}
/*
 * Traverse the entire list of found components (a list of
 * mca_base_component_t instances).  If the requested_component_names
 * array is empty, or the name of each component in the list of found
 * components is in the requested_components_array, try to open it.
 * If it opens, add it to the components_available list.
 */
static int open_components(const char *type_name, int output_id,
                           opal_list_t *src, opal_list_t *dest)
{
    int ret;
    opal_list_item_t *item;
    const mca_base_component_t *component;
    mca_base_component_list_item_t *cli;
    bool called_open;
    bool opened, registered;

    /* Announce */

    opal_output_verbose(10, output_id,
                        "mca: base: components_open: opening %s components",
                        type_name);

    /* Traverse the list of found components */

    OBJ_CONSTRUCT(dest, opal_list_t);
    for (item = opal_list_get_first(src);
            opal_list_get_end(src) != item;
            item = opal_list_get_next(item)) {
        cli = (mca_base_component_list_item_t *) item;
        component = cli->cli_component;

        registered = opened = called_open = false;
        opal_output_verbose(10, output_id,
                            "mca: base: components_open: found loaded component %s",
                            component->mca_component_name);

        /* Call the component's MCA parameter registration function */
        if (NULL == component->mca_register_component_params) {
            registered = true;
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: "
                                "component %s has no register function",
                                component->mca_component_name);
        } else {
            ret = component->mca_register_component_params();
            if (MCA_SUCCESS == ret) {
                registered = true;
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s register function successful",
                                    component->mca_component_name);
            } else if (OPAL_ERR_NOT_AVAILABLE != ret) {
                /* If the component returns OPAL_ERR_NOT_AVAILABLE,
                   it's a cue to "silently ignore me" -- it's not a
                   failure, it's just a way for the component to say
                   "nope!".

                   Otherwise, however, display an error.  We may end
                   up displaying this twice, but it may go to separate
                   streams.  So better to be redundant than to not
                   display the error in the stream where it was
                   expected. */

                if (show_errors) {
                    opal_output(0, "mca: base: components_open: "
                                "component %s / %s register function failed",
                                component->mca_type_name,
                                component->mca_component_name);
                }
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s register function failed",
                                    component->mca_component_name);
            }
        }

        if (NULL == component->mca_open_component) {
            opened = true;
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: "
                                "component %s has no open function",
                                component->mca_component_name);
        } else {
            called_open = true;
            ret = component->mca_open_component();
            if (MCA_SUCCESS == ret) {
                opened = true;
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s open function successful",
                                    component->mca_component_name);
            } else if (OPAL_ERR_NOT_AVAILABLE != ret) {
                /* If the component returns OPAL_ERR_NOT_AVAILABLE,
                   it's a cue to "silently ignore me" -- it's not a
                   failure, it's just a way for the component to say
                   "nope!".

                   Otherwise, however, display an error.  We may end
                   up displaying this twice, but it may go to separate
                   streams.  So better to be redundant than to not
                   display the error in the stream where it was
                   expected. */

                if (show_errors) {
                    opal_output(0, "mca: base: components_open: "
                                "component %s / %s open function failed",
                                component->mca_type_name,
                                component->mca_component_name);
                }
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: "
                                    "component %s open function failed",
                                    component->mca_component_name);
            }
        }

        /* If it didn't open, close it out and get rid of it */

        if (!opened) {
            char *name;
            if (called_open) {
                if (NULL != component->mca_close_component) {
                    component->mca_close_component();
                }
                opal_output_verbose(10, output_id,
                                    "mca: base: components_open: component %s closed",
                                    component->mca_component_name);
                called_open = false;
            }
            name = strdup(component->mca_component_name);
            mca_base_component_repository_release(component);
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: component %s unloaded",
                                name);
            free(name);
        }

        /* If it did open, register its "priority" MCA parameter (if
           it doesn't already have one) and save it in the
           opened_components list */

        else {
            if (OPAL_ERROR == mca_base_param_find(type_name,
                                                  component->mca_component_name,
                                                  "priority")) {
                mca_base_param_register_int(type_name,
                                            component->mca_component_name,
                                            "priority", NULL, 0);
            }

            cli = OBJ_NEW(mca_base_component_list_item_t);
            if (NULL == cli) {
                return OPAL_ERR_OUT_OF_RESOURCE;
            }
            cli->cli_component = component;
            opal_list_append(dest, (opal_list_item_t *) cli);
        }
    }

    /* All done */

    return OPAL_SUCCESS;
}