/*
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
int opal_paffinity_base_open(void)
{
    opal_paffinity_base_components_opened_valid = false;
        
    mca_base_param_reg_string_name("opal", "paffinity_base_slot_list",
                                   "Used to set list of processor IDs to bind MPI processes to (e.g., used in conjunction with rank files)",
                                   true, false, NULL, &opal_paffinity_base_slot_list);

    /* Open up all available components */

    if (OPAL_SUCCESS !=
        mca_base_components_open("paffinity", opal_paffinity_base_output,
                                 mca_paffinity_base_static_components,
                                 &opal_paffinity_base_components_opened, 
                                 true)) {
        return OPAL_ERROR;
    }
    opal_paffinity_base_components_opened_valid = true;

    /* All done */

    return OPAL_SUCCESS;
}
Пример #2
0
/*******************************************************************************
 * MCA level functions - parasite setup
 */
static int mca_pml_v_component_open(void)
{
    char *output;
    int verbose;
    int priority;
    char *vprotocol_include_list;
    
    priority = mca_pml_v_param_register_int("priority", -1);
    output = mca_pml_v_param_register_string("output", "stderr");
    verbose = mca_pml_v_param_register_int("verbose", 0);

    mca_base_param_reg_string_name("vprotocol", NULL, 
                                   "Specify a specific vprotocol to use", 
                                   false, false, "", &vprotocol_include_list);
   
    pml_v_output_open(output, verbose);

    if(-1 != priority)
        V_OUTPUT_ERR("pml_v: Overriding priority setting (%d) with -1. The PML V should NEVER be the selected component; even when enabling fault tolerance.", priority);
            
    V_OUTPUT_VERBOSE(500, "loaded");

    return mca_vprotocol_base_open(vprotocol_include_list);
}
Пример #3
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *tmp=NULL;
    orte_jobid_t jobid=ORTE_JOBID_INVALID;
    orte_vpid_t vpid=ORTE_VPID_INVALID;
    int32_t jfam;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
    /* if we were given a jobid, use it */
    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_jobid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->jobid = jobid;
    }
        
    /* if we were given a vpid, use it */
    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_vpid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->vpid = vpid;
    }
        
    /* if both were given, then we are done */
    if (ORTE_JOBID_INVALID == jobid ||
        ORTE_VPID_INVALID == vpid) {
        /* create our own name */
        if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_set_hnp_name";
            goto error;
        }
        /* close the plm since we opened it to set our
         * name, but have no further use for it
         */
        orte_plm_base_close();
    }
        
    /* do the rest of the standard tool init */
    if (ORTE_SUCCESS != (ret = local_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_tool_init";
        goto error;
    }
    
    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ret;
}
Пример #4
0
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    bool timing = false;
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    opal_paffinity_base_cpu_set_t mask;
    bool proc_bound;
#if 0
    /* see comment below about sched_yield */
    int num_processors;
#endif
    bool orte_setup = false;
    bool paffinity_enabled = false;

    /* Setup enough to check get/set MCA params */

    if (ORTE_SUCCESS != (ret = opal_init_util())) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */
    ret = mca_base_param_reg_string_name("opal", "event_include",
                                         "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent",
                                         false, false, "all", &event_val);
    if (ret >= 0) {
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintented
           side-effects with launching ORTE tools... */
        if (0 == strcmp("all", event_val)) {
            mca_base_param_set_string(ret, "all");
        }
    }

    if( NULL != event_val ) {
        free(event_val);
        event_val = NULL;
    }

    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        gettimeofday(&ompistart, NULL);
    }
    
    /* Setup ORTE - note that we are not a tool  */
    
    if (ORTE_SUCCESS != (ret = orte_init(ORTE_NON_TOOL))) {
        error = "ompi_mpi_init: orte_init failed";
        goto error;
    }
    orte_setup = true;
    
    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */

    ompi_mpi_thread_requested = requested;
    if (OMPI_HAVE_THREAD_SUPPORT == 0) {
        ompi_mpi_thread_provided = *provided = MPI_THREAD_SINGLE;
        ompi_mpi_main_thread = NULL;
    } else if (OMPI_ENABLE_MPI_THREADS == 1) {
        ompi_mpi_thread_provided = *provided = requested;
        ompi_mpi_main_thread = opal_thread_get_self();
    } else {
        if (MPI_THREAD_MULTIPLE == requested) {
            ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED;
        } else {
            ompi_mpi_thread_provided = *provided = requested;
        }
        ompi_mpi_main_thread = opal_thread_get_self();
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == 
                                MPI_THREAD_MULTIPLE);

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "mca_mpi_register_params() failed";
        goto error;
    }

    /* if it hasn't already been done, setup process affinity. 
     * First check to see if a slot list was
     * specified.  If so, use it.  If no slot list was specified,
     * that's not an error -- just fall through and try the next
     * paffinity scheme.
     */
    ret = opal_paffinity_base_get(&mask);
    if (OPAL_SUCCESS == ret) {
        /* paffinity is supported - check for binding */
        OPAL_PAFFINITY_PROCESS_IS_BOUND(mask, &proc_bound);
        if (proc_bound) {
            /* someone external set it - indicate it is set
             * so that we know
             */
            paffinity_enabled = true;
        } else {
            /* the system is capable of doing processor affinity, but it
             * has not yet been set - see if a slot_list was given
             */
            if (NULL != opal_paffinity_base_slot_list) {
                /* It's an error if multiple paffinity schemes were specified */
                if (opal_paffinity_alone) {
                    ret = OMPI_ERR_BAD_PARAM;
                    error = "Multiple processor affinity schemes specified (can only specify one)";
                    goto error;
                }
                ret = opal_paffinity_base_slot_list_set((long)ORTE_PROC_MY_NAME->vpid, opal_paffinity_base_slot_list);
                if (OPAL_ERR_NOT_FOUND != ret) {
                    error = "opal_paffinity_base_slot_list_set() returned an error";
                    goto error;
                }
                paffinity_enabled = true;
            } else if (opal_paffinity_alone) {
                /* no slot_list, but they asked for paffinity */
                int phys_cpu;
                orte_node_rank_t nrank;
                if (ORTE_NODE_RANK_INVALID == (nrank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME))) {
                    error = "Could not get node rank - cannot set processor affinity";
                    goto error;
                }
                OPAL_PAFFINITY_CPU_ZERO(mask);
                phys_cpu = opal_paffinity_base_get_physical_processor_id(nrank);
                if (0 > phys_cpu) {
                    error = "Could not get physical processor id - cannot set processor affinity";
                    goto error;
                }
                OPAL_PAFFINITY_CPU_SET(phys_cpu, mask);
                ret = opal_paffinity_base_set(mask);
                if (OPAL_SUCCESS != ret) {
                    error = "Setting processor affinity failed";
                    goto error;
                }
                paffinity_enabled = true;
            }
        }
    }
    
    /* If we were able to set processor affinity, try setting up
     memory affinity */
    if (!opal_maffinity_setup && paffinity_enabled) {
        if (OPAL_SUCCESS == opal_maffinity_base_open() &&
            OPAL_SUCCESS == opal_maffinity_base_select()) {
            opal_maffinity_setup = true;
        }
    }
    
    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_ddt_init())) {
        error = "ompi_ddt_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* initialize ops. This has to be done *after* ddt_init, but
       befor mca_coll_base_open, since come collective modules
       (e.g. the hierarchical) need them in the query function
    */
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }


    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS != 
        (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_MPI_THREADS))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_MPI_THREADS))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* select buffered send allocator component to be used */
    ret=mca_pml_base_bsend_init(OMPI_ENABLE_MPI_THREADS);
    if( OMPI_SUCCESS != ret ) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_MPI_THREADS))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_MPI_THREADS))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }
    
    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }
     
    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* exchange connection info - this function also acts as a barrier
     * as it will not return until the exchange is complete
     */
    if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(NULL))) {
        error = "orte_grpcomm_modex failed";
        goto error;
    }

    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_set_arch())) {
        error = "ompi_proc_set_arch failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OMPI_ENABLE_PROGRESS_THREADS == 1) ||
        (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* add all ompi_proc_t's to PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    if( OMPI_SUCCESS != ret ) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));


    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
       ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, 
                                nprocs, 
                                orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();
    
    /* check for timing request - get stop time and report elapsed
     time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex thru complete oob wireup %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* wait for everyone to reach this point */
    if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier())) {
        error = "orte_grpcomm_barrier failed";
        goto error;
    }
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OMPI_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on ORTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
    
    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the publish/subscribe (PUBSUB) framework */
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) {
        error = "ompi_pubsub_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
        error = "ompi_pubsub_base_select() failed";
        goto error;
    }
    
    /* Setup the dynamic process management (DPM) framework */
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) {
        error = "ompi_dpm_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
        error = "ompi_dpm_base_select() failed";
        goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select, 
       (since dpm.mark_dyncomm is not set in the communicator creation
       function else), but before dpm.dyncom_init, since this function
       might require collective for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }


    
    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
        error = "ompi_comm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during 
       opal_init, to get better latency when not using TCP.  Do 
       this *after* dyn_init, as dyn init uses lots of ORTE 
       communication and we don't want to hinder the performance of 
       that code. */ 
    opal_progress_event_users_decrement(); 

    /* see if yield_when_idle was specified - if so, use it */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);
    if (value < 0) {
        /* if no info is provided, just default to conservative */
        opal_progress_set_yield_when_idle(true);
    } else {
        /* info was provided, so set idle accordingly */
        opal_progress_set_yield_when_idle(value == 0 ? false : true);
    }
    
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);
    /* negative value means use default - just don't do anything */
    if (value >= 0) {
        opal_progress_set_event_poll_rate(value);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

 error:
    if (ret != OMPI_SUCCESS) {
        const char *err_msg = opal_strerror(ret);
        /* If ORTE was not setup yet, don't use orte_show_help */
        if (orte_setup) {
            orte_show_help("help-mpi-runtime",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        } else {
            opal_show_help("help-mpi-runtime",
                           "mpi_init:startup:internal-failure", true,
                           "MPI_INIT", "MPI_INIT", error, err_msg, ret);
        }
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);
    
    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from barrier p to complete mpi_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
    }

    return MPI_SUCCESS;
}
Пример #5
0
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    bool timing = false;
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    bool orte_setup = false;
    orte_grpcomm_collective_t *coll;
    char *cmd=NULL, *av=NULL;

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
    uint8_t threadlevel_bf; 

    /* Indicate that we have *started* MPI_INIT*.  MPI_FINALIZE has
       something sorta similar in a static local variable in
       ompi_mpi_finalize(). */
    ompi_mpi_init_started = true;

    /* Setup enough to check get/set MCA params */

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */
    ret = mca_base_param_reg_string_name("opal", "event_include",
                                         "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent",
                                         false, false, "all", &event_val);
    if (ret >= 0) {
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintented
           side-effects with launching ORTE tools... */
        if (0 == strcmp("all", event_val)) {
            mca_base_param_set_string(ret, "all");
        }
    }

    if( NULL != event_val ) {
        free(event_val);
        event_val = NULL;
    }

    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        gettimeofday(&ompistart, NULL);
    }
    
    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]);
        putenv(cmd);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        asprintf(&av, "OMPI_ARGV=%s", tmp);
        free(tmp);
        putenv(av);
    }

    /* Setup ORTE - note that we are an MPI process  */
    if (ORTE_SUCCESS != (ret = orte_init(NULL, NULL, ORTE_PROC_MPI))) {
        error = "ompi_mpi_init: orte_init failed";
        goto error;
    }
    orte_setup = true;
    
    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OPAL_HAVE_HWLOC
    /* if hwloc is available but didn't get setup for some
     * reason, do so now
     */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "Topology init";
            goto error;
        }
    }
#endif

    /* Register errhandler callback with orte errmgr */
    if (NULL != orte_errmgr.set_fault_callback) {
        orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */
    /**
     * These values are monotonic; MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED
     *                             < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE.
     * If possible, the call will return provided = required. Failing this,
     * the call will return the least supported level such that
     * provided > required. Finally, if the user requirement cannot be
     * satisfied, then the call will return in provided the highest
     * supported level.
     */
    ompi_mpi_thread_requested = requested;

    if (OMPI_ENABLE_THREAD_MULTIPLE == 1) {
        ompi_mpi_thread_provided = *provided = requested;
        ompi_mpi_main_thread = opal_thread_get_self();
    } else {
        if (MPI_THREAD_MULTIPLE == requested) {
            ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED;
        } else {
            ompi_mpi_thread_provided = *provided = requested;
        }
        ompi_mpi_main_thread = (OPAL_ENABLE_MULTI_THREADS ? opal_thread_get_self() : NULL);
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == 
                                MPI_THREAD_MULTIPLE);

    /* determine the bitflag belonging to the threadlevel_support provided */
    memset ( &threadlevel_bf, 0, sizeof(uint8_t));
    OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf );

    /* add this bitflag to the modex */
    if ( OMPI_SUCCESS != (ret = ompi_modex_send_string("MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t)))) {
        error = "ompi_mpi_init: modex send thread level";
        goto error;
    }

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "mca_mpi_register_params() failed";
        goto error;
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but befor mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = ompi_op_base_open())) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != 
        (ret = ompi_op_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS != 
        (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* exchange connection info - this function also acts as a barrier
     * as it will not return until the exchange is complete
     */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_modex;
    if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(coll))) {
        error = "orte_grpcomm_modex failed";
        goto error;
    }
    /* wait for modex to complete - this may be moved anywhere in mpi_init
     * so long as it occurs prior to calling a function that needs
     * the modex info!
     */
    while (coll->active) {
        opal_progress();  /* block in progress pending events */
    }
    OBJ_RELEASE(coll);

    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

    /* select buffered send allocator component to be used */
    ret=mca_pml_base_bsend_init(OMPI_ENABLE_THREAD_MULTIPLE);
    if( OMPI_SUCCESS != ret ) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }
    
    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }
     
    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OMPI_ENABLE_PROGRESS_THREADS == 1) ||
        (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if( OMPI_SUCCESS != ret ) {
        error = "PML control failed";
        goto error;
    }

    /* add all ompi_proc_t's to PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        orte_show_help("help-mpi-runtime",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, 
                                 nprocs, 
                                 orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* wait for everyone to reach this point */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_init_barrier;
    if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
        error = "orte_grpcomm_barrier failed";
        goto error;
    }
    /* wait for barrier to complete */
    while (coll->active) {
        opal_progress();  /* block in progress pending events */
    }
    OBJ_RELEASE(coll);

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OMPI_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on ORTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
    
    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the publish/subscribe (PUBSUB) framework */
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) {
        error = "ompi_pubsub_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
        error = "ompi_pubsub_base_select() failed";
        goto error;
    }
    
    /* Setup the dynamic process management (DPM) framework */
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) {
        error = "ompi_dpm_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
        error = "ompi_dpm_base_select() failed";
        goto error;
    }


    /* Determine the overall threadlevel support of all processes 
       in MPI_COMM_WORLD. This has to be done before calling 
       coll_base_comm_select, since some of the collective components
       e.g. hierarch, might create subcommunicators. The threadlevel
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if ( OMPI_SUCCESS != 
	 ( ret = ompi_comm_cid_init ())) {
	error = "ompi_mpi_init: ompi_comm_cid_init failed";
	goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select, 
       (since dpm.mark_dyncomm is not set in the communicator creation
       function else), but before dpm.dyncom_init, since this function
       might require collective for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }


    
    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
        error = "ompi_comm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during 
       opal_init, to get better latency when not using TCP.  Do 
       this *after* dyn_init, as dyn init uses lots of ORTE 
       communication and we don't want to hinder the performance of 
       that code. */ 
    opal_progress_event_users_decrement(); 

    /* see if yield_when_idle was specified - if so, use it */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);
    if (value < 0) {
        /* if no info is provided, just default to conservative */
        opal_progress_set_yield_when_idle(true);
    } else {
        /* info was provided, so set idle accordingly */
        opal_progress_set_yield_when_idle(value == 0 ? false : true);
    }
    
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);
    /* negative value means use default - just don't do anything */
    if (value >= 0) {
        opal_progress_set_event_poll_rate(value);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
 error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error) {
            const char *err_msg = opal_strerror(ret);
            /* If ORTE was not setup yet, don't use orte_show_help */
            if (orte_setup) {
                orte_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            } else {
                opal_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            }
        }
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from barrier to complete mpi_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
    }

    return MPI_SUCCESS;
}
Пример #6
0
int orte_filem_base_select(void)
{
    int exit_status = OPAL_SUCCESS;
    orte_filem_base_component_t *best_component = NULL;
    orte_filem_base_module_t *best_module = NULL;
    char *include_list = NULL;

    /*
     * Register the framework MCA param and look up include list
     */
    mca_base_param_reg_string_name("filem", NULL,
                                   "Which FILEM component to use (empty = auto-select)",
                                   false, false,
                                   NULL, &include_list);

    /* If we do not have any components to select this is ok. The user likely
     * decided not to build with filem components. Just use the none
     * component and move on.
     */
    if( 0 >= opal_list_get_size(&orte_filem_base_components_available) || 
        (NULL != include_list && 0 == strncmp(include_list, "none", strlen("none")) ) ) { 
        opal_output_verbose(1, orte_filem_base_output,
                            "filem:select: Warning: Using none component. Some functionality (e.g., --preload-binary) will not work in this mode.");
        best_component = &none_component;
        best_module    = &none_module;

        /* JJH: Todo: Check if none is in the list */
        /* Close all components since none will be used */
        mca_base_components_close(0, /* Pass 0 to keep this from closing the output handle */
                                  &orte_filem_base_components_available,
                                  NULL);
        goto skip_select;
    }

    /*
     * Select the best component
     */
    if( OPAL_SUCCESS != mca_base_select("filem", orte_filem_base_output,
                                        &orte_filem_base_components_available,
                                        (mca_base_module_t **) &best_module,
                                        (mca_base_component_t **) &best_component) ) {
        /* This will only happen if no component was selected */
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

 skip_select:
    /* Save the winner */
    orte_filem_base_selected_component = *best_component;
    orte_filem = *best_module;

    /* Initialize the winner */
    if (NULL != best_module) {
        if (OPAL_SUCCESS != orte_filem.filem_init()) {
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

 cleanup:
    if( NULL != include_list ) {
        free(include_list);
        include_list = NULL;
    }

    return exit_status;
}
/**
 * Function for finding and opening either all MCA components, or the
 * one that was specifically requested via a MCA parameter.
 */
int mca_base_components_open(const char *type_name, int output_id,
                             const mca_base_component_t **static_components,
                             opal_list_t *components_available,
                             bool open_dso_components)
{
    int ret, param;
    opal_list_item_t *item;
    opal_list_t components_found;
    char **requested_component_names;
    int param_verbose = -1;
    int param_type = -1;
    int verbose_level;
    char *str;
    bool include_mode;
#if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
    opal_list_item_t *next;
    uint32_t open_only_flags = MCA_BASE_METADATA_PARAM_NONE;
    const mca_base_component_t *component;
#endif

    /* Register MCA parameters */
    /* Check to see if it exists first */
    if( 0 > (param_type = mca_base_param_find(type_name, NULL, NULL) ) ) {
        asprintf(&str, "Default selection set of components for the %s framework (<none>"
                 " means use all components that can be found)", type_name);
        param_type =
            mca_base_param_reg_string_name(type_name, NULL, str,
                                           false, false, NULL, NULL);
        free(str);
    }

    param = mca_base_param_find("mca", NULL, "component_show_load_errors");
    mca_base_param_lookup_int(param, &ret);
    show_errors = OPAL_INT_TO_BOOL(ret);

    /* Setup verbosity for this MCA type */
    asprintf(&str, "Verbosity level for the %s framework (0 = no verbosity)", type_name);
    param_verbose =
        mca_base_param_reg_int_name(type_name, "base_verbose",
                                    str, false, false, 0, NULL);
    free(str);
    mca_base_param_lookup_int(param_verbose, &verbose_level);
    if (output_id != 0) {
        opal_output_set_verbosity(output_id, verbose_level);
    }
    opal_output_verbose(10, output_id,
                        "mca: base: components_open: Looking for %s components",
                        type_name);

    ret = parse_requested(param_type, &include_mode, &requested_component_names);
    if( OPAL_SUCCESS != ret ) {
        return ret;
    }

    /* Find and load requested components */
    if (OPAL_SUCCESS != (ret =
                             mca_base_component_find(NULL, type_name, static_components,
                                     requested_component_names, include_mode,
                                     &components_found, open_dso_components)) ) {
        return ret;
    }

#if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
    {
        int param_id = -1;
        int param_val = 0;
        /*
         * Extract supported mca parameters for selection contraints
         * Supported Options:
         *   - mca_base_component_distill_checkpoint_ready = Checkpoint Ready
         */
        param_id = mca_base_param_reg_int_name("mca", "base_component_distill_checkpoint_ready",
                                               "Distill only those components that are Checkpoint Ready",
                                               false, false,
                                               0, &param_val);
        if( 0 != param_val ) { /* Select Checkpoint Ready */
            open_only_flags |= MCA_BASE_METADATA_PARAM_CHECKPOINT;
        }
    }
#endif  /* (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1) */

    /*
     * Pre-process the list with parameter constraints
     * e.g., If requested to select only CR enabled components
     *       then only make available those components.
     *
     * JJH Note: Currently checkpoint/restart is the only user of this
     *           functionality. If other component constraint options are
     *           added, then this logic can be used for all contraint
     *           options.
     */
#if (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1)
    if( !(MCA_BASE_METADATA_PARAM_NONE & open_only_flags) ) {
        if( MCA_BASE_METADATA_PARAM_CHECKPOINT & open_only_flags) {
            opal_output_verbose(10, output_id,
                                "mca: base: components_open: "
                                "including only %s components that are checkpoint enabled", type_name);
        }

        /*
         * Check all the components to make sure they adhere to the user
         * expressed requirements.
         */
        for(item  = opal_list_get_first(&components_found);
                item != opal_list_get_end(&components_found);
                item  = next ) {
            mca_base_open_only_dummy_component_t *dummy;
            mca_base_component_list_item_t *cli = (mca_base_component_list_item_t *) item;
            dummy = (mca_base_open_only_dummy_component_t*) cli->cli_component;
            component = cli->cli_component;

            next = opal_list_get_next(item);

            /*
             * If the user asked for a checkpoint enabled run
             * then only load checkpoint enabled components.
             */
            if( MCA_BASE_METADATA_PARAM_CHECKPOINT & open_only_flags) {
                if( MCA_BASE_METADATA_PARAM_CHECKPOINT & dummy->data.param_field) {
                    opal_output_verbose(10, output_id,
                                        "mca: base: components_open: "
                                        "(%s) Component %s is Checkpointable",
                                        type_name,
                                        dummy->version.mca_component_name);
                }
                else {
                    opal_output_verbose(10, output_id,
                                        "mca: base: components_open: "
                                        "(%s) Component %s is *NOT* Checkpointable - Disabled",
                                        type_name,
                                        dummy->version.mca_component_name);
                    opal_list_remove_item(&components_found, item);
                    /* Make sure to release the component since we are not
                     * opening it */
                    mca_base_component_repository_release(component);
                }
            }
        }
    }
#endif  /* (OPAL_ENABLE_FT == 1) && (OPAL_ENABLE_FT_CR == 1) */

    /* Open all remaining components */
    ret = open_components(type_name, output_id,
                          &components_found, components_available);

    /* Free resources */
    for (item = opal_list_remove_first(&components_found); NULL != item;
            item = opal_list_remove_first(&components_found)) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&components_found);

    if (NULL != requested_component_names) {
        opal_argv_free(requested_component_names);
    }

    /* All done */
    return ret;
}
Пример #8
0
/**
 * Function for finding and opening either all MCA components, or the one
 * that was specifically requested via a MCA parameter.
 */
int orte_ras_base_open(void)
{
    int value, rc, param;
    orte_data_type_t tmp;
    char *requested;

    /* Debugging / verbose output */

    orte_ras_base.ras_output = opal_output_open(NULL);
    mca_base_param_reg_int_name("ras", "base_verbose", 
                                "Enable debugging for the RAS framework (nonzero = enabled)",
                                false, false, 0, &value);
    if (value != 0) {
        orte_ras_base.ras_output = opal_output_open(NULL);
    } else {
        orte_ras_base.ras_output = -1;
    }

    /* Defaults */

    orte_ras_base.ras_opened_valid = false;
    orte_ras_base.ras_using_proxy = false;
    orte_ras_base.ras_available_valid = false;

    /** register the base system types with the DSS */
    tmp = ORTE_RAS_NODE;
    if (ORTE_SUCCESS != (rc = orte_dss.register_type(orte_ras_base_pack_node,
                                                     orte_ras_base_unpack_node,
                                                     (orte_dss_copy_fn_t)orte_ras_base_copy_node,
                                                     (orte_dss_compare_fn_t)orte_ras_base_compare_node,
                                                     (orte_dss_size_fn_t)orte_ras_base_size_node,
                                                     (orte_dss_print_fn_t)orte_ras_base_print_node,
                                                     (orte_dss_release_fn_t)orte_ras_base_std_obj_release,
                                                     ORTE_DSS_STRUCTURED,
                                                     "ORTE_RAS_NODE", &tmp))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* Some systems do not want any RAS support. In those cases,
        * memory consumption is also an issue. For those systems, we
        * avoid opening the RAS components by checking for a directive
        * to use the "null" component.
        */
    param = mca_base_param_reg_string_name("ras", NULL, NULL,
                                           false, false, NULL, NULL);
    if (ORTE_ERROR == mca_base_param_lookup_string(param, &requested)) {
        return ORTE_ERROR;
    }
    if (NULL != requested && 0 == strcmp(requested, "null")) {
        /* the user has specifically requested that we use the "null"
        * component. In this case, that means we do NOT open any
        * components, and we simply use the default module we have
        * already defined above
        */
        orte_ras_base.ras_opened_valid = false;
        orte_ras = orte_ras_no_op; /* use the no_op module */
        return ORTE_SUCCESS;
    }

    /* check for timing tests */
    param = mca_base_param_reg_int_name("orte", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        orte_ras_base.timing = true;
    } else {
        orte_ras_base.timing = false;
    }
    
    /* Open up all available components */
    if (ORTE_SUCCESS != 
        mca_base_components_open("ras", orte_ras_base.ras_output,
                                 mca_ras_base_static_components, 
                                 &orte_ras_base.ras_opened, true)) {
        return ORTE_ERROR;
    }

    /* if we are not on a HNP, select the proxy 'module' */
    if (!orte_process_info.seed) {
        orte_ras = orte_ras_base_proxy_module;
        /* initialize the module */
        orte_ras_base_proxy_init(&rc);
        orte_ras_base.ras_using_proxy = true;
        return ORTE_SUCCESS;
    }

    /* All done */

    orte_ras_base.ras_opened_valid = true;
    return ORTE_SUCCESS;
}
Пример #9
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
    char *tmp_env_var = NULL;
    
    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }
    
    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
    
    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before 
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);
    
    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv("OMPI_MCA_ess", &orte_launch_environ);
    
    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, 100);
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }
    
    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID) && !defined(__WINDOWS__)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif  /* !defined(__WINDOWS__) */
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;        
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    tmp_env_var = mca_base_param_env_var("opal_cr_is_tool");
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif
    tmp_env_var = NULL; /* Silence compiler warning */

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }
    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
                
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
                
                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }
                
                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }
    
    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
    
    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }
    
    /* setup the primary daemon command receive function */
    ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                                  ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
    if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) {
        ORTE_ERROR_LOG(ret);
        goto DONE;
    }
    
    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, obtain and report a new process name and my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        
#if 0
        /* run our local allocator to read the available
         * allocation in case this singleton decides to
         * comm_spawn other procs
         */
        if (ORTE_SUCCESS != (ret = orte_ras.allocate(jdata))) {
            ORTE_ERROR_LOG(ret);
            /* don't quit as this would cause the singleton
             * to hang!
             */
        }
#endif
        
        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->alive = true;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->alive = true;
        proc->app_idx = 0;
        proc->local_proc = true;
#if OPAL_HAVE_HWLOC
        proc->bind_idx = 0;
#endif

        /* the singleton will use the first three collectives
         * for its modex/barriers
         */
        orte_grpcomm_base.coll_id += 3;

        /* need to setup a pidmap for it */
        jdata->pmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
        if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(jdata->pmap))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    
    
        /* if we don't yet have a daemon map, then we have to generate one
         * to pass back to it
         */
        if (NULL == orte_odls_globals.dmap) {
            orte_odls_globals.dmap = (opal_byte_object_t*)malloc(sizeof(opal_byte_object_t));
            /* construct a nodemap */
            if (ORTE_SUCCESS != (ret = orte_util_encode_nodemap(orte_odls_globals.dmap))) {
                ORTE_ERROR_LOG(ret);
                goto DONE;
            }
        }

        /* create a string that contains our uri + the singleton's name + sysinfo */
        orte_util_convert_process_name_to_string(&nptr, &proc->name);
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s][%s]", orte_process_info.my_daemon_uri, nptr, sysinfo);
        free(nptr);
	free(sysinfo);

        /* pass that info to the singleton */
#ifndef __WINDOWS__
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */
#else
        send(orted_globals.uri_pipe, tmp, strlen(tmp)+1, 0); /* need to add 1 to get the NULL */
#endif

        /* cleanup */
        free(tmp);
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    mca_base_param_reg_string_name("orte", "parent_uri",
                                   "URI for the parent if tree launch is enabled.",
                                   true, false, NULL,  &rml_uri);
    if (NULL != rml_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        if (ORTE_SUCCESS != (ret = orte_rml.set_contact_info(rml_uri))) {
            ORTE_ERROR_LOG(ret);
            free(rml_uri);
            goto DONE;
        }
        ret = orte_rml_base_parse_uris(rml_uri, &parent, NULL );
        if( ORTE_SUCCESS != ret ) {
            ORTE_ERROR_LOG(ret);
            free(rml_uri);
            goto DONE;
        }
        free(rml_uri);
        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);
        
#if OPAL_HAVE_HWLOC
        /* add the local topology */
        if (NULL != opal_hwloc_topology &&
            (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        if ((orte_static_ports || orte_use_common_port) && !orted_globals.tree_spawn) {
            /* use the rollup collective to send our data to the HNP
             * so we minimize the HNP bottleneck
             */
            orte_grpcomm_collective_t *coll;
            coll = OBJ_NEW(orte_grpcomm_collective_t);
            /* get the list of contributors we need from the routed module */
            orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_PEERS, coll);
            /* add the collective to our list */
            opal_list_append(&orte_grpcomm_base.active_colls, &coll->super);
            /* send the buffer to ourselves to start the collective */
            if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, buffer,
                                                   ORTE_RML_TAG_ROLLUP, 0,
                                                   rml_cbfunc, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            /* send directly to the HNP's callback */
            if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                                   ORTE_RML_TAG_ORTED_CALLBACK, 0,
                                                   rml_cbfunc, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        }
    }

    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(orte_exit_status);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
int opal_maffinity_base_select(void)
{
    int priority = 0, best_priority = 0;
    opal_list_item_t *item = NULL;
    mca_base_component_list_item_t *cli = NULL;
    opal_maffinity_base_component_1_0_0_t *component = NULL,
                                           *best_component = NULL;
    const opal_maffinity_base_module_1_0_0_t *module = NULL,
                                              *best_module = NULL;
    char *value;

    /* Register the framework MCA param and look it up */

    mca_base_param_reg_string_name("maffinity", NULL,
                                   "Which maffinity component to use (empty = auto-select)",
                                   false, false,
                                   NULL, &value);
    if (NULL == value || 0 == strlen(value)) {
        opal_output_verbose(10, opal_maffinity_base_output,
                            "maffinity:select: auto-selecting");
    } else {
        opal_output_verbose(10, opal_maffinity_base_output,
                            "maffinity:select: looking for %s component",
                            value);
    }

    /* Traverse the list of available components; call their init
       functions. */

    best_priority = -1;
    best_component = NULL;
    module = NULL;
    for (item = opal_list_get_first(&opal_maffinity_base_components_opened);
            opal_list_get_end(&opal_maffinity_base_components_opened) != item;
            item = opal_list_get_next(item) ) {
        cli = (mca_base_component_list_item_t *) item;
        component = (opal_maffinity_base_component_1_0_0_t *) cli->cli_component;

        /* if there is an include list - item must be in the list to
           be included */

        if (NULL != value && strlen(value) > 0 &&
                0 != strcmp(component->maffinityc_version.mca_component_name,
                            value)) {
            opal_output_verbose(10, opal_maffinity_base_output,
                                "maffinity:select: skipping %s component",
                                component->maffinityc_version.mca_component_name);
            continue;
        }

        if (NULL == component->maffinityc_query) {
            opal_output_verbose(10, opal_maffinity_base_output,
                                "pafinity:select: no init function; ignoring component %s",
                                component->maffinityc_version.mca_component_name );
            continue;
        }
        opal_output_verbose(10, opal_maffinity_base_output,
                            "maffinity:select: initializing component %s",
                            component->maffinityc_version.mca_component_name);
        module = component->maffinityc_query(&priority);
        if (NULL == module) {
            opal_output_verbose(10, opal_maffinity_base_output,
                                "maffinity:select: init returned failure for component %s",
                                component->maffinityc_version.mca_component_name );
            continue;
        }
        opal_output_verbose(10, opal_maffinity_base_output,
                            "maffinity:select: init returned priority %d",
                            priority );
        if (priority > best_priority) {
            best_priority = priority;
            best_component = component;
            best_module = module;
        }
    }

    /* Finished querying all components.  Check for the bozo case. */

    if (NULL == best_component ) {
        return OPAL_ERR_NOT_FOUND;
    }

    /* Now go through the opened list and close all the non-selected
       components */

    mca_base_components_close(opal_maffinity_base_output,
                              &opal_maffinity_base_components_opened,
                              (mca_base_component_t *) best_component);

    /* Save the winner */

    opal_maffinity_base_component = best_component;
    opal_maffinity_base_module = best_module;
    opal_output_verbose(10, opal_maffinity_base_output,
                        "maffinity:select: component %s selected",
                        best_component->maffinityc_version.mca_component_name);
    opal_maffinity_base_selected = true;

    /* Initialize the winner */

    if (NULL != opal_maffinity_base_module) {
        if (OPAL_SUCCESS != opal_maffinity_base_module->maff_module_init()) {
            return OPAL_ERROR;
        }
    }

    return OPAL_SUCCESS;
}
int
orte_rml_base_select(void)
{
    opal_list_item_t *item;

    int selected_priority = -1;
    orte_rml_component_t *selected_component = NULL;
    orte_rml_module_t *selected_module = NULL;

    int wrapper_priority = -1;
    orte_rml_component_t *wrapper_component = NULL;
    orte_rml_module_t *wrapper_module = NULL;
    char *rml_wrapper = NULL;

    mca_base_param_reg_string_name("rml", "wrapper",
                                   "Use a Wrapper component around the selected RML component",
                                   false, false,
                                   NULL, &rml_wrapper);
    
    for (item = opal_list_get_first(&orte_rml_base_components);
         item != opal_list_get_end(&orte_rml_base_components) ;
         item = opal_list_get_next(item)) {
        mca_base_component_list_item_t *cli;
        orte_rml_component_t* component;
 
        cli = (mca_base_component_list_item_t *) item;
        component = (orte_rml_component_t *) cli->cli_component;

        opal_output_verbose(10, orte_rml_base_output, 
                            "orte_rml_base_select: initializing %s component %s",
                            component->rml_version.mca_type_name,
                            component->rml_version.mca_component_name);

        if (NULL == component->rml_init) {
            opal_output_verbose(10, orte_rml_base_output, 
                                "orte_rml_base_select: no init function; ignoring component");
        } else {
            int priority = 0;

            orte_rml_module_t* module = component->rml_init(&priority);
            if (NULL == module) {
                opal_output_verbose(10, orte_rml_base_output,
                                    "orte_rml_base_select: init returned failure");
                continue;
            }

            if(NULL != rml_wrapper &&
               /* If this is a wrapper component then save it for later */
               RML_SELECT_WRAPPER_PRIORITY >= priority) {
                if( 0 == strncmp(component->rml_version.mca_component_name, 
                                 rml_wrapper,
                                 strlen(rml_wrapper) ) ) {
                    wrapper_priority  = priority;
                    wrapper_component = component;
                    wrapper_module    = module;
                }
            } else if (priority > selected_priority) {
                /* Otherwise this is a normal module and subject to normal selection */
                if (NULL != selected_module && NULL != selected_module->finalize) {
                    selected_module->finalize();
                }

                selected_priority = priority;
                selected_component = component;
                selected_module = module;
            }
        }
    }

    /* 
     * Unload all components that were not selected
     */
    item = opal_list_get_first(&orte_rml_base_components);
    while (item != opal_list_get_end(&orte_rml_base_components)) {
        opal_list_item_t* next = opal_list_get_next(item);
        orte_rml_component_t* component;
        mca_base_component_list_item_t *cli;

        cli = (mca_base_component_list_item_t *) item;
        component = (orte_rml_component_t *) cli->cli_component;

        /* Keep it if it is the wrapper component */
        if (NULL != wrapper_component &&
            component == wrapper_component) {
            item = next;
            continue;
        }
        /* Not the selected component */
        if (component != selected_component) {
            opal_output_verbose(10, orte_rml_base_output,
                                "orte_rml_base_select: module %s unloaded",
                                component->rml_version.mca_component_name);

            mca_base_component_repository_release((mca_base_component_t *) component);
            opal_list_remove_item(&orte_rml_base_components, item);
            OBJ_RELEASE(item);
        }
        item = next;
    }

    /* setup reference to selected module */
    if (NULL != selected_module) {
        orte_rml = *selected_module;
        orte_rml_component = selected_component;
    }

    /* If a wrapper component was requested then 
     * Make sure it can switch out the selected module
     */
    if( NULL != wrapper_component) {
        wrapper_component->rml_init(NULL);
    }

    if( NULL != rml_wrapper ) {
        free(rml_wrapper);
    }

    if (NULL == selected_component) return ORTE_ERROR;
    
    return ORTE_SUCCESS;
}
Пример #12
0
int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *jobidstring;
    char *error = NULL;
    char *plm_to_use;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;

#ifndef __WINDOWS__
    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. 
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
#endif  /* __WINDOWS__ */
    
    signals_set = true;
    
#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;
        
        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }
        
        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }
        
        if (4 < opal_output_get_verbosity(orte_ess_base_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif
    
    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use
     */
    mca_base_param_reg_string_name("plm", NULL,
                                   "Which plm component to use (empty = none)",
                                   false, false,
                                   NULL, &plm_to_use);
    
    if (NULL == plm_to_use) {
        plm_in_use = false;
    } else {
        plm_in_use = true;
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }
    
    /* Setup the communication infrastructure */
    
    /* Runtime Messaging Layer - this opens/selects the OOB as well */
    if (ORTE_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = orte_db_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = orte_odls_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
#if ORTE_ENABLE_STATIC_PORTS
    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap
     */
    if (orte_static_ports) {
        /* define the routing tree so we know the pattern
         * if we are trying to setup common or static ports
         */
        orte_routed.update_routing_plan();

        /* extract the node info from the environment and
         * build a nidmap from it
         */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }
#endif
    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things
     */
    orte_routed.update_routing_plan();
    
    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all
     */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }
    
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file
             */
            
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false,
                                    orte_process_info.tmpdir_base,
                                    orte_process_info.top_session_dir,
                                    log_file,
                                    NULL);
            
            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null
                 */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            } else {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;
    
    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
#if OPAL_HAVE_HWLOC
    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
    
    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    node->daemon_launched = true;
    node->state = ORTE_NODE_STATE_UP;
    
    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;
    
    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = orte_filem_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    
    /* For daemons, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif
    
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
    
    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
Пример #13
0
int orte_snapc_base_select(bool seed, bool app)
{
    int exit_status = OPAL_SUCCESS;
    orte_snapc_base_component_t *best_component = NULL;
    orte_snapc_base_module_t *best_module = NULL;
    char *include_list = NULL;

    /*
     * Register the framework MCA param and look up include list
     */
    mca_base_param_reg_string_name("snapc", NULL,
                                   "Which SNAPC component to use (empty = auto-select)",
                                   false, false,
                                   strdup("none"), &include_list);
    if(NULL != include_list && 0 == strncmp(include_list, "none", strlen("none")) ){ 
        opal_output_verbose(10, orte_snapc_base_output,
                            "snapc:select: Using %s component",
                            include_list);
        best_component = &none_component;
        best_module    = &none_module;
        /* Close all components since none will be used */
        mca_base_components_close(orte_snapc_base_output,
                                  &orte_snapc_base_components_available,
                                  NULL, false);
        /* JJH: Todo: Check if none is in the list */
        goto skip_select;
    }

    /*
     * Select the best component
     */
    if( OPAL_SUCCESS != mca_base_select("snapc", orte_snapc_base_output,
                                        &orte_snapc_base_components_available,
                                        (mca_base_module_t **) &best_module,
                                        (mca_base_component_t **) &best_component) ) {
        /* This will only happen if no component was selected */
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

 skip_select:
    /* Save the winner */
    orte_snapc_base_selected_component = *best_component;
    orte_snapc = *best_module;

    /* Initialize the winner */
    if (NULL != best_module) {
        if (OPAL_SUCCESS != orte_snapc.snapc_init(seed, app)) {
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
    }

 cleanup:
    if( NULL != include_list ) {
        free(include_list);
        include_list = NULL;
    }

    return exit_status;
}
Пример #14
0
int orte_proc_info(void)
{
    
    int tmp;
    char *uri, *ptr;
    char hostname[ORTE_MAX_HOSTNAME_SIZE];
    
    if (init) {
        return ORTE_SUCCESS;
    }
    init = true;
    
    mca_base_param_reg_string_name("orte", "hnp_uri",
                                   "HNP contact info",
                                   true, false, NULL,  &uri);
    if (NULL != uri) {
        /* the uri value passed to us will have quote marks around it to protect
        * the value if passed on the command line. We must remove those
        * to have a correct uri string
        */
        if ('"' == uri[0]) {
            /* if the first char is a quote, then so will the last one be */
            uri[strlen(uri)-1] = '\0';
            ptr = &uri[1];
        } else {
            ptr = &uri[0];
        }
        orte_process_info.my_hnp_uri = strdup(ptr);
        free(uri);
    }
    
    mca_base_param_reg_string_name("orte", "local_daemon_uri",
                                   "Daemon contact info",
                                   true, false, NULL,  &(uri));
    
    if (NULL != uri) {
        /* the uri value passed to us may have quote marks around it to protect
         * the value if passed on the command line. We must remove those
         * to have a correct uri string
         */
        if ('"' == uri[0]) {
            /* if the first char is a quote, then so will the last one be */
            uri[strlen(uri)-1] = '\0';
            ptr = &uri[1];
        } else {
            ptr = &uri[0];
        }
        orte_process_info.my_daemon_uri = strdup(ptr);
        free(uri);
    }
    
    mca_base_param_reg_int_name("orte", "app_num",
                                "Index of the app_context that defines this proc",
                                true, false, 0, &tmp);
    orte_process_info.app_num = tmp;
    
    /* get the process id */
    orte_process_info.pid = getpid();

    /* get the nodename */
    gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
    orte_process_info.nodename = strdup(hostname);
    
    /* get the number of nodes in the job */
    mca_base_param_reg_int_name("orte", "num_nodes",
                                "Number of nodes in the job",
                                true, false,
                                orte_process_info.num_nodes, &tmp);
    orte_process_info.num_nodes = tmp;
    
    /* get the number of times this proc has restarted */
    mca_base_param_reg_int_name("orte", "num_restarts",
                                "Number of times this proc has restarted",
                                true, false, 0, &tmp);
    orte_process_info.num_restarts = tmp;
    
    mca_base_param_reg_string_name("orte", "job_name",
                                   "Job name",
                                   true, false, NULL,  &orte_process_info.job_name);

    mca_base_param_reg_string_name("orte", "job_instance",
                                   "Job instance",
                                   true, false, NULL,  &orte_process_info.job_instance);

    mca_base_param_reg_string_name("orte", "executable",
                                   "Executable",
                                   true, false, NULL,  &orte_process_info.executable);

    mca_base_param_reg_int_name("orte", "app_rank",
                                "Rank of this proc within its app_context",
                                true, false, 0, &tmp);
    orte_process_info.app_rank = tmp;

    /* setup the sync buffer */
    orte_process_info.sync_buf = OBJ_NEW(opal_buffer_t);
    
    return ORTE_SUCCESS;
}
Пример #15
0
static int rte_init(void)
{
    int ret, i, j;
    char *error = NULL, *localj;
    int32_t jobfam, stepid;
    char *envar, *ev1, *ev2;
    uint64_t unique_key[2];
    char *cs_env, *string_key;
    char *pmi_id=NULL;
    int *ranks;
    char *tmp;
    orte_jobid_t jobid;
    orte_process_name_t proc;
    orte_local_rank_t local_rank;
    orte_node_rank_t node_rank;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
#if OPAL_HAVE_HWLOC
    /* get the topology */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
            error = "topology discovery";
            goto error;
        }
    }
#endif

    if (ORTE_PROC_IS_DAEMON) {  /* I am a daemon, launched by mpirun */
        /* we had to be given a jobid */
        mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                       true, false, NULL, &tmp);
        if (NULL == tmp) {
            error = "missing jobid";
            ret = ORTE_ERR_FATAL;
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert jobid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->jobid = jobid;
        /* get our rank from PMI */
        if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_rank");
            error = "could not get PMI rank";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = i + 1;  /* compensate for orterun */

        /* get the number of procs from PMI */
        if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
            error = "could not get PMI universe size";
            goto error;
        }
        orte_process_info.num_procs = i + 1;  /* compensate for orterun */

        /* complete setup */
        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(NULL))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_orted_setup";
            goto error;
        }
    } else {  /* we are a direct-launched MPI process */
        /* get our PMI id length */
        if (PMI_SUCCESS != (ret = PMI_Get_id_length_max(&pmi_maxlen))) {
            error = "PMI_Get_id_length_max";
            goto error;
        }
        pmi_id = malloc(pmi_maxlen);
        if (PMI_SUCCESS != (ret = PMI_Get_kvs_domain_id(pmi_id, pmi_maxlen))) {
            free(pmi_id);
            error = "PMI_Get_kvs_domain_id";
            goto error;
        }
        /* PMI is very nice to us - the domain id is an integer followed
         * by a '.', followed by essentially a stepid. The first integer
         * defines an overall job number. The second integer is the number of
         * individual jobs we have run within that allocation. So we translate
         * this as the overall job number equating to our job family, and
         * the individual number equating to our local jobid
         */
        jobfam = strtol(pmi_id, &localj, 10);
        if (NULL == localj) {
            /* hmmm - no '.', so let's just use zero */
            stepid = 0;
        } else {
            localj++; /* step over the '.' */
            stepid = strtol(localj, NULL, 10) + 1; /* add one to avoid looking like a daemon */
        }
        free(pmi_id);

        /* now build the jobid */
        ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jobfam << 16, stepid);

        /* get our rank */
        if (PMI_SUCCESS != (ret = PMI_Get_rank(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_rank");
            error = "could not get PMI rank";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = i;

        /* get the number of procs from PMI */
        if (PMI_SUCCESS != (ret = PMI_Get_universe_size(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_universe_size");
            error = "could not get PMI universe size";
            goto error;
        }
        orte_process_info.num_procs = i;
        /* push into the environ for pickup in MPI layer for
         * MPI-3 required info key
         */
        asprintf(&ev1, "OMPI_MCA_orte_ess_num_procs=%d", i);
        putenv(ev1);
        asprintf(&ev2, "OMPI_APP_CTX_NUM_PROCS=%d", i);
        putenv(ev2);

        /* setup transport keys in case the MPI layer needs them -
         * we can use the jobfam and stepid as unique keys
         * because they are unique values assigned by the RM
         */
        unique_key[0] = (uint64_t)jobfam;
        unique_key[1] = (uint64_t)stepid;
        if (NULL == (string_key = orte_pre_condition_transports_print(unique_key))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        if (NULL == (cs_env = mca_base_param_environ_variable("orte_precondition_transports",NULL,NULL))) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        asprintf(&envar, "%s=%s", cs_env, string_key);
        putenv(envar);
        /* cannot free the envar as that messes up our environ */
        free(cs_env);
        free(string_key);

        /* our app_context number can only be 0 as we don't support
         * dynamic spawns
         */
        orte_process_info.app_num = 0;

        /* setup my daemon's name - arbitrary, since we don't route
         * messages
         */
        ORTE_PROC_MY_DAEMON->jobid = 0;
        ORTE_PROC_MY_DAEMON->vpid = 0;

        /* ensure we pick the correct critical components */
        putenv("OMPI_MCA_grpcomm=pmi");
        putenv("OMPI_MCA_routed=direct");
    
        /* now use the default procedure to finish my setup */
        if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_app_setup";
            goto error;
        }

        /* store our info into the database */
        if (ORTE_SUCCESS != (ret = orte_db.store(ORTE_PROC_MY_NAME, ORTE_DB_HOSTNAME, orte_process_info.nodename, OPAL_STRING))) {
            error = "db store daemon vpid";
            goto error;
        }
        /* get our local proc info to find our local rank */
        if (PMI_SUCCESS != (ret = PMI_Get_clique_size(&i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_clique_size");
            error = "could not get PMI clique size";
            goto error;
        }
        /* store that info - remember, we want the number of peers that
         * share the node WITH ME, so we have to subtract ourselves from
         * that number
         */
        orte_process_info.num_local_peers = i - 1;
        /* now get the specific ranks */
        ranks = (int*)malloc(i * sizeof(int));
        if (PMI_SUCCESS != (ret = PMI_Get_clique_ranks(ranks, i))) {
            ORTE_PMI_ERROR(ret, "PMI_Get_clique_ranks");
            error = "could not get clique ranks";
            goto error;
        }
        /* The clique ranks are returned in rank order, so
         * cycle thru the array and update the local/node
         * rank info
         */
        proc.jobid = ORTE_PROC_MY_NAME->jobid;
        for (j=0; j < i; j++) {
            proc.vpid = ranks[j];
            local_rank = j;
            node_rank = j;
            if (ranks[j] == (int)ORTE_PROC_MY_NAME->vpid) {
                orte_process_info.my_local_rank = local_rank;
                orte_process_info.my_node_rank = node_rank;
            }
            if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
                error = "db store local rank";
                goto error;
            }
            if (ORTE_SUCCESS != (ret = orte_db.store(&proc, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
                error = "db store node rank";
                goto error;
            }
        }
        free(ranks);

        /* setup process binding */
        if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) {
            error = "proc_binding";
            goto error;
        }
    }

    /* set max procs */
    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    /* flag that we completed init */
    app_init_complete = true;
    
    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }

    return ret;
}
Пример #16
0
int opal_cr_init(void )
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_cr_coord_callback_fn_t prev_coord_func;
    int val;

    if( ++opal_cr_initalized != 1 ) {
        if( opal_cr_initalized < 1 ) {
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        exit_status = OPAL_SUCCESS;
        goto cleanup;
    }

    /*
     * Some startup MCA parameters
     */
    ret = mca_base_param_reg_int_name("opal_cr", "verbose",
                                      "Verbose output level for the runtime OPAL Checkpoint/Restart functionality",
                                      false, false,
                                      0,
                                      &val);
    if(0 != val) {
        opal_cr_output = opal_output_open(NULL);
    } else {
        opal_cr_output = -1;
    }
    opal_output_set_verbosity(opal_cr_output, val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Verbose Level: %d",
                        val);

    mca_base_param_reg_int_name("ft", "cr_enabled",
                                "Enable fault tolerance for this program",
                                false, false,
                                0, &val);
    opal_cr_set_enabled(OPAL_INT_TO_BOOL(val));

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Enabled: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer",
                                "Enable Checkpoint timer (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier",
                                "Enable Checkpoint timer Barrier (Default: Disabled)",
                                false, false,
                                0, &val);
    if( opal_cr_timing_enabled ) {
        opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val);
    } else {
        opal_cr_timing_barrier_enabled = false;
    }

    mca_base_param_reg_int_name("opal_cr", "timer_target_rank",
                                "Target Rank for the timer (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_timing_target_rank = val;

#if OPAL_ENABLE_FT_THREAD == 1
    mca_base_param_reg_int_name("opal_cr", "use_thread",
                                "Use an async thread to checkpoint this program (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Use thread: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_check",
                                "Time to sleep between checking for a checkpoint (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_thread_sleep_check = val;

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait",
                                "Time to sleep waiting for process to exit MPI library (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_thread_sleep_wait = val;

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT thread sleep: check = %d, wait = %d",
                        opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
#endif

    mca_base_param_reg_int_name("opal_cr", "is_tool",
                                "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
                                false, false,
                                0,
                                &val);
    opal_cr_is_tool = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Is a tool program: %d",
                        val);
#ifndef __WINDOWS__
    mca_base_param_reg_int_name("opal_cr", "signal",
                                "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program",
                                false, false,
                                SIGUSR1,
                                &opal_cr_entry_point_signal);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Checkpoint Signal: %d",
                        opal_cr_entry_point_signal);

    mca_base_param_reg_int_name("opal_cr", "debug_sigpipe",
                                "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Debug SIGPIPE: %d (%s)",
                        val, (opal_cr_debug_sigpipe ? "True" : "False"));

#if OPAL_ENABLE_FT_THREAD == 1
    /* If we have a thread, then attach the SIGPIPE signal handler there since
     * it is most likely to be the one that needs it.
     */
    if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#else
    if( opal_cr_debug_sigpipe ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#endif

#else
    opal_cr_is_tool = true;  /* no support for CR on Windows yet */ 
#endif  /* __WINDOWS__ */

    mca_base_param_reg_string_name("opal_cr", "tmp_dir",
                                   "Temporary directory to place rendezvous files for a checkpoint",
                                   false, false,
                                   "/tmp",
                                   &opal_cr_pipe_dir);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Temp Directory: %s",
                        opal_cr_pipe_dir);

    if( !opal_cr_is_tool ) {
        /* Register the OPAL interlevel coordination callback */
        opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func);

        opal_cr_stall_check = false;
        opal_cr_currently_stalled = false;

    } /* End opal_cr_is_tool = true */

    /* 
     * If fault tolerance was not compiled in then
     * we need to make sure that the listener thread is active to tell
     * the tools that this is not a checkpointable job.
     * We don't need the CRS framework to be initalized.
     */
#if OPAL_ENABLE_FT    == 1
    /*
     * Open the checkpoint / restart service components
     */
    if (OPAL_SUCCESS != (ret = opal_crs_base_open())) {
        opal_output(opal_cr_output,
                    "opal_cr: init: opal_crs_base_open Failed to open. (%d)\n", ret);
        exit_status = ret;
        goto cleanup;
    }
    
    if (OPAL_SUCCESS != (ret = opal_crs_base_select())) {
        opal_output(opal_cr_output,
                    "opal_cr: init: opal_crs_base_select Failed. (%d)\n", ret);
        exit_status = ret;
        goto cleanup;
    }
#endif

#if OPAL_ENABLE_FT_THREAD == 1
    if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: starting the thread\n");

        opal_set_using_threads(true);
        /*
         * Start the thread
         */
        OBJ_CONSTRUCT(&opal_cr_thread,     opal_thread_t);
        OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t);

        opal_cr_thread_is_done    = false;
        opal_cr_thread_is_active  = false;
        opal_cr_thread_in_library = false;
        opal_cr_thread_num_in_library = 0;

        opal_cr_thread.t_run = opal_cr_thread_fn;
        opal_cr_thread.t_arg = NULL;
        opal_thread_start(&opal_cr_thread);

    } /* End opal_cr_is_tool = true */
    else {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: *Not* Using C/R thread\n");
    }
#endif /* OPAL_ENABLE_FT_THREAD == 1 */

 cleanup:
    return exit_status;
}
Пример #17
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *tmp=NULL, *tailpiece;
    orte_jobid_t jobid=ORTE_JOBID_INVALID;
    orte_vpid_t vpid=ORTE_VPID_INVALID;
    int32_t jfam;

    OBJ_CONSTRUCT(&ctl, orte_thread_ctl_t);
    
    my_uid = (uint32_t)getuid();

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* if we were given a jobid, use it */
    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&jobid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_jobid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->jobid = jobid;
    } else {
        /* if we were given a job family, use it */
        mca_base_param_reg_string_name("orte", "ess_job_family", "Job family",
                                       true, false, NULL, &tmp);
        if (NULL != tmp) {
            jfam = strtoul(tmp, &tailpiece, 10);
            if (UINT16_MAX < jfam || NULL != tailpiece) {
                /* use a string hash to restructure this to fit */
                OPAL_HASH_STR(tmp, jfam);
            }
            ORTE_PROC_MY_NAME->jobid = ORTE_CONSTRUCT_LOCAL_JOBID(jfam << 16, 0);
        }
    }
    /* if we were given a vpid, use it */
    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_vpid(&vpid, tmp))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_vpid";
            goto error;
        }
        free(tmp);
        ORTE_PROC_MY_NAME->vpid = vpid;
        if (vpid < 2) {
            /* NOT ALLOWED - POTENTIAL CONFLICT WITH ORCM AND ORCM-SCHED */
            error = "disallowed_vpid";
            ret = ORTE_ERR_BAD_PARAM;
            goto error;
        }
    }
        
    /* if both were given, then we are done */
    if (ORTE_JOBID_INVALID != ORTE_PROC_MY_NAME->jobid &&
        ORTE_VPID_INVALID != ORTE_PROC_MY_NAME->vpid) {
        goto complete;
    }

#if HAVE_QINFO_H
    /* if we have qlib, then we can ask it for info by which we determine our
     * name based on provided rack location info
     */
    {
        qinfo_t *qinfo;

        if (NULL != (qinfo = get_qinfo())) {
            /* if we were given a jobid, then leave it alone */
            if (ORTE_JOBID_INVALID == ORTE_PROC_MY_NAME->jobid) {
                /* not given - assign it to 0 */
                ORTE_PROC_MY_NAME->jobid = 0;
            }
            /* must ensure that no daemon gets vpid 0 or 1 */
            ORTE_PROC_MY_NAME->vpid = (qinfo->rack * QLIB_MAX_SLOTS_PER_RACK) + qinfo->slot + 2;
            /* ensure that the HNP uri is NULL */
            if (NULL != orte_process_info.my_hnp_uri) {
                opal_output(0, "%s CONFLICTING NAME RESOLUTION - NO NAME GIVEN, BUT HNP SPECIFIED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                error = "name conflict";
                ret = ORTE_ERR_FATAL;
                goto error;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                                 "GOT NAME %s FROM QINFO rack %d slot %d ",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 qinfo->rack, qinfo->slot));
            goto complete;
        }
    }
#endif

    /* we must have been given a vpid - we can get the jobid
     * in other ways
     */
    if (ORTE_VPID_INVALID == ORTE_PROC_MY_NAME->vpid) {
        /* we have an error */
        error = "missing vpid assignment";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* if we were given an HNP, we can get the jobid from
     * the HNP's name - this is decoded in proc_info.c during
     * the prolog
     */
    if (ORTE_JOBID_INVALID != ORTE_PROC_MY_HNP->jobid) {
        ORTE_PROC_MY_NAME->jobid = orte_process_info.my_hnp.jobid;
    } else {
        /* just fake it */
        ORTE_PROC_MY_NAME->jobid = 0;
    }

 complete:
    if (ORTE_SUCCESS != (ret = local_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "local_setup";
        goto error;
    }

    OBJ_DESTRUCT(&ctl);
    
    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    OBJ_DESTRUCT(&ctl);
    
    return ret;
}
Пример #18
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char **hosts = NULL;
    char *slurm_nodelist;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
    /* Start by getting a unique name */
    slurm_set_name();
    
    /* if I am a daemon, complete my setup using the
     * default procedure
     */
    if (ORTE_PROC_IS_DAEMON) {
        /* get the list of nodes used for this job */
        mca_base_param_reg_string_name("orte", "nodelist", "List of nodes in job",
                                       true, false, NULL, &slurm_nodelist);
        
        if (NULL != slurm_nodelist) {
            /* split the node list into an argv array */
            hosts = opal_argv_split(slurm_nodelist, ',');
        }
        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_orted_setup";
            goto error;
        }
        opal_argv_free(hosts);
        return ORTE_SUCCESS;
    }
    
    if (ORTE_PROC_IS_TOOL) {
        /* otherwise, if I am a tool proc, use that procedure */
        if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_tool_setup";
            goto error;
        }
        /* as a tool, I don't need a nidmap - so just return now */
        return ORTE_SUCCESS;
        
    }
    
    /* otherwise, I must be an application process - use
     * the default procedure to finish my setup
     */
    if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ess_base_app_setup";
        goto error;
    }
    /* setup the nidmap arrays */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }

    return ORTE_SUCCESS;
    
error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ret;
}
Пример #19
0
int opal_register_params(void)
{
    int value;

    /*
     * This string is going to be used in opal/util/stacktrace.c
     */
    {
        char *string = NULL;
        int j;
        int signals[] = {
#ifdef SIGABRT
            SIGABRT,
#endif
#ifdef SIGBUS
            SIGBUS,
#endif
#ifdef SIGFPE
            SIGFPE,
#endif
#ifdef SIGSEGV
            SIGSEGV,
#endif
            -1
        };
        for (j = 0 ; signals[j] != -1 ; ++j) {
            if (j == 0) {
                asprintf(&string, "%d", signals[j]);
            } else {
                char *tmp;
                asprintf(&tmp, "%s,%d", string, signals[j]);
                free(string);
                string = tmp;
            }
        }

        mca_base_param_reg_string_name("opal", "signal", 
                                       "Comma-delimited list of integer signal numbers to Open MPI to attempt to intercept.  Upon receipt of the intercepted signal, Open MPI will display a stack trace and abort.  Open MPI will *not* replace signals if handlers are already installed by the time MPI_INIT is invoked.  Optionally append \":complain\" to any signal number in the comma-delimited list to make Open MPI complain if it detects another signal handler (and therefore does not insert its own).",
                                       false, false, string, NULL);
        free(string);
    }

#if OMPI_ENABLE_DEBUG


    mca_base_param_reg_int_name("opal", "progress_debug", 
                                "Set to non-zero to debug progress engine features",
                                false, false, 0, NULL);

    {
        mca_base_param_reg_int_name("opal", "debug_locks",
                                    "Debug mutex usage within Open MPI.  On a "
                                    "non-threaded build, this enables integer counters and "
                                    "warning messages when double-locks are detected.",
                                    false, false, 0, &value);
        if (value) opal_mutex_check_locks = true;
    }
#endif

    /*
     * Do we want the "warning: your mmap file is on NFS!" message?  Per a
     * thread on the OMPI devel list
     * (http://www.open-mpi.org/community/lists/devel/2011/12/10054.php),
     * on some systems, it doesn't seem to matter.  But per older threads,
     * it definitely does matter on some systems.  Perhaps newer kernels
     * are smarter about this kind of stuff...?  Regardless, we should
     * provide the ability to turn off this message for systems where the
     * effect doesn't matter.
     *
     * v1.4.x-specific note: the MCA param name is "shmem_mmap_...",
     * where "shmem" is not a framework that exists in the v1.4
     * series.  This parameter was added right before 1.4.5, and at a
     * similar time as 1.5.5 (where the "shmem" framework *does*
     * exist).  The idea was to have a consistent MCA param name
     * starting with v1.4.5.  Hence, we put a slightly non-sensiscal
     * name here in v1.4.x so that we'd have a correct/good name
     * moving forward.
     */
    mca_base_param_reg_int_name("shmem",
                                "mmap_enable_nfs_warning", 
                                "Enable the warning emitted when Open MPI detects that its shared memory backing file is located on a network filesystem (1 = enabled, 0 = disabled).",
                                false, false,
                                (int)true, &value);
    opal_mmap_on_nfs_warning = OPAL_INT_TO_BOOL(value);

    /* Paffinity base also has some parameters */
    return opal_paffinity_base_register_params();
}
Пример #20
0
int opal_cr_init(void )
{
    int ret, exit_status = OPAL_SUCCESS;
    opal_cr_coord_callback_fn_t prev_coord_func;
    int val, t;

    if( ++opal_cr_initalized != 1 ) {
        if( opal_cr_initalized < 1 ) {
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        exit_status = OPAL_SUCCESS;
        goto cleanup;
    }

    /*
     * Some startup MCA parameters
     */
    ret = mca_base_param_reg_int_name("opal_cr", "verbose",
                                      "Verbose output level for the runtime OPAL Checkpoint/Restart functionality",
                                      false, false,
                                      0,
                                      &val);
    if(0 != val) {
        opal_cr_output = opal_output_open(NULL);
    } else {
        opal_cr_output = -1;
    }
    opal_output_set_verbosity(opal_cr_output, val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Verbose Level: %d",
                        val);

    mca_base_param_reg_int_name("ft", "cr_enabled",
                                "Enable fault tolerance for this program",
                                false, false,
                                0, &val);
    opal_cr_set_enabled(OPAL_INT_TO_BOOL(val));

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Enabled: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer",
                                "Enable Checkpoint timer (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_timing_enabled = OPAL_INT_TO_BOOL(val);

    mca_base_param_reg_int_name("opal_cr", "enable_timer_barrier",
                                "Enable Checkpoint timer Barrier (Default: Disabled)",
                                false, false,
                                0, &val);
    if( opal_cr_timing_enabled ) {
        opal_cr_timing_barrier_enabled = OPAL_INT_TO_BOOL(val);
    } else {
        opal_cr_timing_barrier_enabled = false;
    }

    mca_base_param_reg_int_name("opal_cr", "timer_target_rank",
                                "Target Rank for the timer (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_timing_target_rank = val;

#if OPAL_ENABLE_FT_THREAD == 1
    mca_base_param_reg_int_name("opal_cr", "use_thread",
                                "Use an async thread to checkpoint this program (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_thread_use_if_avail = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT Use thread: %d",
                        val);

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_check",
                                "Time to sleep between checking for a checkpoint (Default: 0)",
                                false, false,
                                0, &val);
    opal_cr_thread_sleep_check = val;

    mca_base_param_reg_int_name("opal_cr", "thread_sleep_wait",
                                "Time to sleep waiting for process to exit MPI library (Default: 1000)",
                                false, false,
                                1000, &val);
    opal_cr_thread_sleep_wait = val;

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: FT thread sleep: check = %d, wait = %d",
                        opal_cr_thread_sleep_check, opal_cr_thread_sleep_wait);
#endif

    mca_base_param_reg_int_name("opal_cr", "is_tool",
                                "Is this a tool program, meaning does it require a fully operational OPAL or just enough to exec.",
                                false, false,
                                0,
                                &val);
    opal_cr_is_tool = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Is a tool program: %d",
                        val);
#if OPAL_ENABLE_CRDEBUG == 1
    mca_base_param_reg_int_name("opal_cr", "enable_crdebug",
                                "Enable checkpoint/restart debugging",
                                false, false,
                                0,
                                &val);
    MPIR_debug_with_checkpoint = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: C/R Debugging Enabled [%s]\n",
                        (MPIR_debug_with_checkpoint ? "True": "False"));
#endif

#ifndef __WINDOWS__
    mca_base_param_reg_int_name("opal_cr", "signal",
                                "Checkpoint/Restart signal used to initialize an OPAL Only checkpoint of a program",
                                false, false,
                                SIGUSR1,
                                &opal_cr_entry_point_signal);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Checkpoint Signal: %d",
                        opal_cr_entry_point_signal);

    mca_base_param_reg_int_name("opal_cr", "debug_sigpipe",
                                "Activate a signal handler for debugging SIGPIPE Errors that can happen on restart. (Default: Disabled)",
                                false, false,
                                0, &val);
    opal_cr_debug_sigpipe = OPAL_INT_TO_BOOL(val);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Debug SIGPIPE: %d (%s)",
                        val, (opal_cr_debug_sigpipe ? "True" : "False"));

#if OPAL_ENABLE_FT_THREAD == 1
    /* If we have a thread, then attach the SIGPIPE signal handler there since
     * it is most likely to be the one that needs it.
     */
    if( opal_cr_debug_sigpipe && !opal_cr_thread_use_if_avail ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#else
    if( opal_cr_debug_sigpipe ) {
        if( SIG_ERR == signal(SIGPIPE, opal_cr_sigpipe_debug_signal_handler) ) {
            ;
        }
    }
#endif

#else
    opal_cr_is_tool = true;  /* no support for CR on Windows yet */ 
#endif  /* __WINDOWS__ */

#if OPAL_ENABLE_CRDEBUG == 1
    opal_cr_debug_num_free_threads = 3;
    opal_cr_debug_free_threads = (opal_thread_t **)malloc(sizeof(opal_thread_t *) * opal_cr_debug_num_free_threads );
    for(t = 0; t < opal_cr_debug_num_free_threads; ++t ) {
        opal_cr_debug_free_threads[t] = NULL;
    }
 
    mca_base_param_reg_int_name("opal_cr", "crdebug_signal",
                                "Checkpoint/Restart signal used to hold threads when debugging",
                                false, false,
                                SIGTSTP,
                                &opal_cr_debug_signal);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Checkpoint Signal (Debug): %d",
                        opal_cr_debug_signal);
    if( SIG_ERR == signal(opal_cr_debug_signal, MPIR_checkpoint_debugger_signal_handler) ) {
        opal_output(opal_cr_output,
                    "opal_cr: init: Failed to register C/R debug signal (%d)",
                    opal_cr_debug_signal);
    }
#else
    /* Silence a compiler warning */
    t = 0;
#endif

    mca_base_param_reg_string_name("opal_cr", "tmp_dir",
                                   "Temporary directory to place rendezvous files for a checkpoint",
                                   false, false,
                                   opal_tmp_directory(),
                                   &opal_cr_pipe_dir);

    opal_output_verbose(10, opal_cr_output,
                        "opal_cr: init: Temp Directory: %s",
                        opal_cr_pipe_dir);

    if( !opal_cr_is_tool ) {
        /* Register the OPAL interlevel coordination callback */
        opal_cr_reg_coord_callback(opal_cr_coord, &prev_coord_func);

        opal_cr_stall_check = false;
        opal_cr_currently_stalled = false;

    } /* End opal_cr_is_tool = true */

    /* 
     * If fault tolerance was not compiled in then
     * we need to make sure that the listener thread is active to tell
     * the tools that this is not a checkpointable job.
     * We don't need the CRS framework to be initalized.
     */
#if OPAL_ENABLE_FT_CR    == 1
    /*
     * Open the checkpoint / restart service components
     */
    if (OPAL_SUCCESS != (ret = opal_crs_base_open())) {
        opal_show_help( "help-opal-runtime.txt",
                        "opal_cr_init:no-crs", true,
                        "opal_crs_base_open", ret );
        exit_status = ret;
        goto cleanup;
    }
    
    if (OPAL_SUCCESS != (ret = opal_crs_base_select())) {
        opal_show_help( "help-opal-runtime.txt",
                        "opal_cr_init:no-crs", true,
                        "opal_crs_base_select", ret );
        exit_status = ret;
        goto cleanup;
    }
#endif

#if OPAL_ENABLE_FT_THREAD == 1
    if( !opal_cr_is_tool && opal_cr_thread_use_if_avail) {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: starting the thread\n");

        /* JJH: We really do need this line below since it enables
         *      actual locks for threads. However currently the
         *      upper layers will deadlock if it is enabled.
         *      So hack around the problem for now, while working
         *      on a complete solution. See ticket #2741 for more
         *      details.
         * opal_set_using_threads(true);
         */

        /*
         * Start the thread
         */
        OBJ_CONSTRUCT(&opal_cr_thread,     opal_thread_t);
        OBJ_CONSTRUCT(&opal_cr_thread_lock, opal_mutex_t);

        opal_cr_thread_is_done    = false;
        opal_cr_thread_is_active  = false;
        opal_cr_thread_in_library = false;
        opal_cr_thread_num_in_library = 0;

        opal_cr_thread.t_run = opal_cr_thread_fn;
        opal_cr_thread.t_arg = NULL;
        opal_thread_start(&opal_cr_thread);

    } /* End opal_cr_is_tool = true */
    else {
        opal_output_verbose(10, opal_cr_output,
                            "opal_cr: init: *Not* Using C/R thread\n");
    }
#endif /* OPAL_ENABLE_FT_THREAD == 1 */

 cleanup:
    return exit_status;
}
Пример #21
0
static int slurm_set_name(void)
{
    int slurm_nodeid;
    int rc;
    orte_jobid_t jobid;
    orte_vpid_t vpid;
    char* tmp;
    
    
    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:slurm setting name"));
    
    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) {
        ORTE_ERROR_LOG(rc);
        return(rc);
    }
    free(tmp);
    
    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&vpid, tmp))) {
        ORTE_ERROR_LOG(rc);
        return(rc);
    }
    free(tmp);
    
    ORTE_PROC_MY_NAME->jobid = jobid;
    
    /* fix up the vpid and make it the "real" vpid */
    slurm_nodeid = atoi(getenv("SLURM_NODEID"));
    ORTE_PROC_MY_NAME->vpid = vpid + slurm_nodeid;

    
    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:slurm set name to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* get my node rank in case we are using static ports - this won't
     * be present for daemons, so don't error out if we don't have it
     */
    mca_base_param_reg_string_name("orte", "ess_node_rank", "Process node rank",
                                   true, false, NULL, &tmp);
    if (NULL != tmp) {
        my_node_rank = strtol(tmp, NULL, 10);
    }

    /* fix up the system info nodename to match exactly what slurm returned */
    if (NULL != orte_process_info.nodename) {
        free(orte_process_info.nodename);
    }
    orte_process_info.nodename = get_slurm_nodename(slurm_nodeid);

    
    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:slurm set nodename to %s",
                         orte_process_info.nodename));
    
    /* get the non-name common environmental variables */
    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    return ORTE_SUCCESS;
}
Пример #22
0
int ompi_mpi_register_params(void)
{
    int value;
    char *param;

    /* Whether we want MPI API function parameter checking or not */

    mca_base_param_reg_int_name("mpi", "param_check", 
                                "Whether you want MPI API parameters checked at run-time or not.  Possible values are 0 (no checking) and 1 (perform checking at run-time)",
                                false, false, MPI_PARAM_CHECK, &value);
    ompi_mpi_param_check = OPAL_INT_TO_BOOL(value);
    if (ompi_mpi_param_check) {
        value = 0;
        if (MPI_PARAM_CHECK) {
            value = 1;
        }
        if (0 == value) {
            orte_show_help("help-mpi-runtime.txt", 
                           "mpi-param-check-enabled-but-compiled-out",
                           true);
            ompi_mpi_param_check = false;
        }
    }
    
    /*
     * opal_progress: decide whether to yield and the event library
     * tick rate
     */
    /* JMS: Need ORTE data here -- set this to 0 when
       exactly/under-subscribed, or 1 when oversubscribed */
    mca_base_param_reg_int_name("mpi", "yield_when_idle", 
                                "Yield the processor when waiting for MPI communication (for MPI processes, will default to 1 when oversubscribing nodes)",
                                false, false, -1, NULL);
    mca_base_param_reg_int_name("mpi", "event_tick_rate", 
                                "How often to progress TCP communications (0 = never, otherwise specified in microseconds)",
                                false, false, -1, NULL);

    /* Whether or not to show MPI handle leaks */
    
    mca_base_param_reg_int_name("mpi", "show_handle_leaks",
                                "Whether MPI_FINALIZE shows all MPI handles that were not freed or not",
                                false, false, 
                                (int) ompi_debug_show_handle_leaks, &value);
    ompi_debug_show_handle_leaks = OPAL_INT_TO_BOOL(value);
    
    /* Whether or not to free MPI handles.  Useless without run-time
       param checking, so implicitly set that to true if we don't want
       to free the handles. */
    
    mca_base_param_reg_int_name("mpi", "no_free_handles", 
                                "Whether to actually free MPI objects when their handles are freed",
                                false, false, 
                                (int) ompi_debug_no_free_handles, &value);
    ompi_debug_no_free_handles = OPAL_INT_TO_BOOL(value);
    if (ompi_debug_no_free_handles) {
        ompi_mpi_param_check = true;
        value = 0;
        if (MPI_PARAM_CHECK) {
            value = 1;
        }
        if (0 == value) {
            opal_output(0, "WARNING: MCA parameter mpi_no_free_handles set to true, but MPI");
            opal_output(0, "WARNING: parameter checking has been compiled out of Open MPI.");
            opal_output(0, "WARNING: mpi_no_free_handles is therefore only partially effective!");
        }
    }

    /* Whether or not to show MPI_ALLOC_MEM leaks */

    mca_base_param_reg_int_name("mpi", "show_mpi_alloc_mem_leaks",
                                "If >0, MPI_FINALIZE will show up to this many instances of memory allocated by MPI_ALLOC_MEM that was not freed by MPI_FREE_MEM",
                                false, false, 
                                ompi_debug_show_mpi_alloc_mem_leaks,
                                &ompi_debug_show_mpi_alloc_mem_leaks);

    /* Whether or not to print all MCA parameters in MPI_INIT */
    mca_base_param_reg_string_name("mpi", "show_mca_params",
                                   "Whether to show all MCA parameter values during MPI_INIT or not (good for reproducability of MPI jobs "
                                   "for debug purposes). Accepted values are all, default, file, api, and enviro - or a comma "
                                   "delimited combination of them",
                                   false, false, NULL,  &param);
    if (NULL != param) {
        char **args;
        int i;
        
        ompi_mpi_show_mca_params = true;
        args = opal_argv_split(param, ',');
        if (NULL == args) {
            opal_output(0, "WARNING: could not parse mpi_show_mca_params request - defaulting to show \"all\"");
            show_default_mca_params = true;
            show_file_mca_params = true;
            show_enviro_mca_params = true;
            show_override_mca_params = true;
        } else {
            for (i=0; NULL != args[i]; i++) {
                if (0 == strcasecmp(args[i], "all")  || 0 == strcmp(args[i], "1")) {
                    show_default_mca_params = true;
                    show_file_mca_params = true;
                    show_enviro_mca_params = true;
                    show_override_mca_params = true;
                } else if (0 == strcasecmp(args[i], "default")) {
                    show_default_mca_params = true;
                } else if (0 == strcasecmp(args[i], "file")) {
                    show_file_mca_params = true;
                } else if (0 == strcasecmp(args[i], "enviro") || 
                           0 == strcasecmp(args[i], "env")) {
                    show_enviro_mca_params = true;
                } else if (0 == strcasecmp(args[i], "api")) {
                    show_override_mca_params = true;
                }
            }
            opal_argv_free(args);
        }
    }

    /* File to use when dumping the parameters */
    mca_base_param_reg_string_name("mpi", "show_mca_params_file",
                                   "If mpi_show_mca_params is true, setting this string to a valid filename tells Open MPI to dump all the MCA parameter values into a file suitable for reading via the mca_param_files parameter (good for reproducability of MPI jobs)",
                                   false, false,
                                   "", &ompi_mpi_show_mca_params_file);
    
    /* User-level process pinning controls */

    /* Do we want to save hostnames for debugging messages?  This can
       eat quite a bit of memory... */

    mca_base_param_reg_int_name("mpi", "keep_peer_hostnames",
                                "If nonzero, save the string hostnames of all MPI peer processes (mostly for error / debugging output messages).  This can add quite a bit of memory usage to each MPI process.",
                                false, false, 1, &value);
    ompi_mpi_keep_peer_hostnames = OPAL_INT_TO_BOOL(value);

    /* MPI_ABORT controls */

    mca_base_param_reg_int_name("mpi", "abort_delay",
                                "If nonzero, print out an identifying message when MPI_ABORT is invoked (hostname, PID of the process that called MPI_ABORT) and delay for that many seconds before exiting (a negative delay value means to never abort).  This allows attaching of a debugger before quitting the job.",
                                false, false, 
                                ompi_mpi_abort_delay,
                                &ompi_mpi_abort_delay);
    
    mca_base_param_reg_int_name("mpi", "abort_print_stack",
                                "If nonzero, print out a stack trace when MPI_ABORT is invoked",
                                false, 
                                /* If we do not have stack trace
                                   capability, make this a read-only
                                   MCA param */
#if OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) && defined(HAVE_BACKTRACE)
                                false, 
#else
                                true,
#endif
                                (int) ompi_mpi_abort_print_stack,
                                &value);
#if OMPI_WANT_PRETTY_PRINT_STACKTRACE && ! defined(__WINDOWS__) && defined(HAVE_BACKTRACE)
    /* Only take the value if we have stack trace capability */
    ompi_mpi_abort_print_stack = OPAL_INT_TO_BOOL(value);
#else
    /* If we do not have stack trace capability, ensure that this is
       hard-coded to false */
    ompi_mpi_abort_print_stack = false;
#endif

    value = mca_base_param_reg_int_name("mpi", "preconnect_mpi",
                                        "Whether to force MPI processes to fully "
                                        "wire-up the MPI connections between MPI "
                                        "processes during "
                                        "MPI_INIT (vs. making connections lazily -- "
                                        "upon the first MPI traffic between each "
                                        "process peer pair)",
                                        false, false, 0, NULL);
    mca_base_param_reg_syn_name(value, "mpi", "preconnect_all", true);
    
    /* Leave pinned parameter */

    mca_base_param_reg_int_name("mpi", "leave_pinned",
                                "Whether to use the \"leave pinned\" protocol or not.  Enabling this setting can help bandwidth performance when repeatedly sending and receiving large messages with the same buffers over RDMA-based networks (0 = do not use \"leave pinned\" protocol, 1 = use \"leave pinned\" protocol, -1 = allow network to choose at runtime).",
                                false, false,
                                ompi_mpi_leave_pinned, &value);
    ompi_mpi_leave_pinned = (value >= 1) ? true: false;

    mca_base_param_reg_int_name("mpi", "leave_pinned_pipeline",
                                "Whether to use the \"leave pinned pipeline\" protocol or not.",
                                false, false,
                                (int) ompi_mpi_leave_pinned_pipeline, &value);
    ompi_mpi_leave_pinned_pipeline = OPAL_INT_TO_BOOL(value);
    
    if (ompi_mpi_leave_pinned && ompi_mpi_leave_pinned_pipeline) {
        ompi_mpi_leave_pinned_pipeline = 0;
        orte_show_help("help-mpi-runtime.txt", 
                       "mpi-params:leave-pinned-and-pipeline-selected",
                       true);
    }

    mca_base_param_reg_int_name("mpi", "warn_on_fork",
                                "If nonzero, issue a warning if program forks under conditions that could cause system errors",
                                false, false, 
                                (int) true, &value);
    ompi_warn_on_fork = OPAL_INT_TO_BOOL(value);
    
    /* Sparse group storage support */

    mca_base_param_reg_int_name("mpi", "have_sparse_group_storage", 
                                "Whether this Open MPI installation supports storing of data in MPI groups in \"sparse\" formats (good for extremely large process count MPI jobs that create many communicators/groups)",
                                false, true, (int) OMPI_GROUP_SPARSE, NULL);
    mca_base_param_reg_int_name("mpi", "use_sparse_group_storage", 
                                "Whether to use \"sparse\" storage formats for MPI groups (only relevant if mpi_have_sparse_group_storage is 1)",
                                false, false, OMPI_GROUP_SPARSE, &value);
    ompi_use_sparse_group_storage = OPAL_INT_TO_BOOL(value);
    if (ompi_use_sparse_group_storage) {
        value = 0;
        if (OMPI_GROUP_SPARSE) {
            value = 1;
        }
        if (0 == value) {
            orte_show_help("help-mpi-runtime.txt", 
                           "sparse groups enabled but compiled out",
                           true);
            ompi_use_sparse_group_storage = false;
        }
    }

    /* The ddt engine has a few parameters */
    return ompi_ddt_register_params();
}
static int rte_init(char flags)
{
    int rc;
    orte_nid_t *node;
    orte_jmap_t *jmap;
    orte_pmap_t pmap;
    char *server_uri, *param;
    uint16_t jobfam;
    uint32_t hash32;
    uint32_t bias;
    
    /* run the prolog */
    if (ORTE_SUCCESS != (rc = orte_ess_base_std_prolog())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    
    /* look for the ompi-server MCA param */
    mca_base_param_reg_string_name("orte", "server",
                                   "Server to be used as HNP - [file|FILE]:<filename> or just uri",
                                   false, false, NULL, &server_uri);
    
    if (NULL != server_uri) {
        /* we are going to connect to a server HNP */
        if (0 == strncmp(server_uri, "file", strlen("file")) ||
            0 == strncmp(server_uri, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;
            
            /* it is a file - get the filename */
            filename = strchr(server_uri, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
                               "singleton", server_uri);
                return ORTE_ERROR;
            }
            ++filename; /* space past the : */
            
            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
                               "singleton", server_uri);
                return ORTE_ERROR;
            }
            
            /* open the file and extract the uri */
            fp = fopen(filename, "r");
            if (NULL == fp) { /* can't find or read file! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
                               "singleton", server_uri);
                return ORTE_ERROR;
            }
            if (NULL == fgets(input, 1024, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
                               "singleton", server_uri, "singleton");
                return ORTE_ERROR;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            orte_process_info.my_hnp_uri = strdup(input);
        } else {
            orte_process_info.my_hnp_uri = strdup(server_uri);
        }
        /* save the daemon uri - we will process it later */
        orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
        /* indicate we are a singleton so orte_init knows what to do */
        orte_process_info.singleton = true;
        /* for convenience, push the pubsub version of this param into the environ */
        asprintf(&param, "OMPI_MCA_dpm_orte_server=%s", orte_process_info.my_hnp_uri);
        putenv(param);
        free(param);
        /* now define my own name */
        /* hash the nodename */
        OPAL_HASH_STR(orte_process_info.nodename, hash32);
        
        bias = (uint32_t)orte_process_info.pid;
        
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "ess:singleton: initial bias %ld nodename hash %lu",
                             (long)bias, (unsigned long)hash32));
        
        /* fold in the bias */
        hash32 = hash32 ^ bias;
        
        /* now compress to 16-bits */
        jobfam = (uint16_t)(((0x0000ffff & (0xffff0000 & hash32) >> 16)) ^ (0x0000ffff & hash32));
        
        OPAL_OUTPUT_VERBOSE((5, orte_ess_base_output,
                             "ess:singleton:: final jobfam %lu",
                             (unsigned long)jobfam));
        
        /* set the name */
        ORTE_PROC_MY_NAME->jobid = 0xffff0000 & ((uint32_t)jobfam << 16);
        ORTE_PROC_MY_NAME->vpid = 0;
        
    } else {
        /*
         * If we are the selected module, then we must be a singleton
         * as it means that no other method for discovering a name
         * could be found. In this case, we need to start a daemon that
         * can support our operation. We must do this for two reasons:
         *
         * (1) if we try to play the role of the HNP, then any child processes
         * we might start via comm_spawn will rely on us for all ORTE-level
         * support. However, we can only progress those requests when the
         * the application calls into the OMPI/ORTE library! Thus, if this
         * singleton just does computation, the other processes will "hang"
         * in any calls into the ORTE layer that communicate with the HNP -
         * and most calls on application procs *do*.
         *
         * (2) daemons are used to communicate messages for administrative
         * purposes in a broadcast-like manner. Thus, daemons are expected
         * to be able to interpret specific commands. Our application process
         * doesn't have any idea how to handle those commands, thus causing
         * the entire ORTE administrative system to break down.
         *
         * For those reasons, we choose to fork/exec a daemon at this time
         * and then reconnect ourselves to it. We could just "fork" and declare
         * the child to be a daemon, but that would require we place *all* of the
         * daemon command processing code in the ORTE library, do some strange
         * mojo in a few places, etc. This doesn't seem worth it, so we'll just
         * do the old fork/exec here
         *
         * Note that Windows-based systems have to do their own special trick as
         * they don't support fork/exec. So we have to use a giant "if" here to
         * protect the Windows world. To make the results more readable, we put
         * the whole mess in a separate function below
         */
        if (ORTE_SUCCESS != (rc= fork_hnp())) {
Пример #24
0
int ompi_crcp_base_select(void)
{
    int ret, exit_status = OMPI_SUCCESS;
    ompi_crcp_base_component_t *best_component = NULL;
    ompi_crcp_base_module_t *best_module = NULL;
    char *include_list = NULL;

    /*
     * Register the framework MCA param and look up include list
     */
    mca_base_param_reg_string_name("crcp", NULL,
                                   "Which CRCP component to use (empty = auto-select)",
                                   false, false,
                                   strdup("none"), &include_list);

    if(NULL != include_list && 0 == strncmp(include_list, "none", strlen("none")) ){ 
        opal_output_verbose(10, ompi_crcp_base_output,
                            "crcp:select: Using %s component",
                            include_list);
        best_component = &none_component;
        best_module    = &none_module;
        /* JJH: Todo: Check if none is in the list */
        /* Close all components since none will be used */
        mca_base_components_close(0, /* Pass 0 to keep this from closing the output handle */
                                  &ompi_crcp_base_components_available,
                                  NULL);
        goto skip_select;
    }

    /*
     * Select the best component
     */
    if( OPAL_SUCCESS != mca_base_select("crcp", ompi_crcp_base_output,
                                        &ompi_crcp_base_components_available,
                                        (mca_base_module_t **) &best_module,
                                        (mca_base_component_t **) &best_component) ) {
        /* This will only happen if no component was selected */
        exit_status = OMPI_ERROR;
        goto cleanup;
    }

 skip_select:
    /* Save the winner */
    ompi_crcp_base_selected_component = *best_component;
    ompi_crcp = *best_module;

    /* Initialize the winner */
    if (NULL != best_module) {
        if (OPAL_SUCCESS != (ret = ompi_crcp.crcp_init()) ) {
            exit_status = ret;
            goto cleanup;
        }
    }

 cleanup:
    if( NULL != include_list ) {
        free(include_list);
        include_list = NULL;
    }

    return exit_status;
}
Пример #25
0
int opal_register_params(void)
{
    int ret;

    /*
     * This string is going to be used in opal/util/stacktrace.c
     */
    {
        char *string = NULL;
        int j;
        int signals[] = {
#ifdef SIGABRT
            SIGABRT,
#endif
#ifdef SIGBUS
            SIGBUS,
#endif
#ifdef SIGFPE
            SIGFPE,
#endif
#ifdef SIGSEGV
            SIGSEGV,
#endif
            -1
        };
        for (j = 0 ; signals[j] != -1 ; ++j) {
            if (j == 0) {
                asprintf(&string, "%d", signals[j]);
            } else {
                char *tmp;
                asprintf(&tmp, "%s,%d", string, signals[j]);
                free(string);
                string = tmp;
            }
        }

        mca_base_param_reg_string_name("opal", "signal", 
                                       "Comma-delimited list of integer signal numbers to Open MPI to attempt to intercept.  Upon receipt of the intercepted signal, Open MPI will display a stack trace and abort.  Open MPI will *not* replace signals if handlers are already installed by the time MPI_INIT is invoked.  Optionally append \":complain\" to any signal number in the comma-delimited list to make Open MPI complain if it detects another signal handler (and therefore does not insert its own).",
                                       false, false, string, NULL);
        free(string);
    }

    {
        int j;

        mca_base_param_reg_int_name("opal", "profile", 
                                    "Set to non-zero to profile component selections",
                                    false, false, (int)false, &j);
        opal_profile = OPAL_INT_TO_BOOL(j);

        mca_base_param_reg_string_name("opal", "profile_file", 
                                       "Name of the file containing the cluster configuration information",
                                       false, false, NULL, &opal_profile_file);
    }
    
#if OPAL_ENABLE_DEBUG


    mca_base_param_reg_int_name("opal", "progress_debug", 
                                "Set to non-zero to debug progress engine features",
                                false, false, 0, NULL);

    {
        int value;
        mca_base_param_reg_int_name("opal", "debug_locks",
                                    "Debug mutex usage within Open MPI.  On a "
                                    "non-threaded build, this enables integer counters and "
                                    "warning messages when double-locks are detected.",
                                    false, false, 0, &value);
        if (value) opal_mutex_check_locks = true;
    }
#endif
    /* The ddt engine has a few parameters */
    ret = opal_datatype_register_params();
    if (OPAL_SUCCESS != ret) {
        return ret;
    }

    /* Paffinity base also has some parameters */
    return opal_paffinity_base_register_params();
}
Пример #26
0
static int alps_set_name(void)
{
    int rc;
    orte_jobid_t jobid;
    char *tmp;
    orte_vpid_t vpid;

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:alps setting name"));

    mca_base_param_reg_string_name("orte", "ess_jobid", "Process jobid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_jobid(&jobid, tmp))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    free(tmp);

    mca_base_param_reg_string_name("orte", "ess_vpid", "Process vpid",
                                   true, false, NULL, &tmp);
    if (NULL == tmp) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_ERR_NOT_FOUND;
    }
    if (ORTE_SUCCESS != (rc = orte_util_convert_string_to_vpid(&starting_vpid,
                                                               tmp))) {
        ORTE_ERROR_LOG(rc);
        return(rc);
    }
    free(tmp);

    if (ORTE_SUCCESS != (rc = get_vpid(&vpid, starting_vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    ORTE_PROC_MY_NAME->jobid = jobid;
    ORTE_PROC_MY_NAME->vpid = vpid;
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,ORTE_EPOCH_INVALID);
    ORTE_EPOCH_SET(ORTE_PROC_MY_NAME->epoch,
                   orte_ess.proc_get_epoch(ORTE_PROC_MY_NAME));

    OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                         "ess:alps set name to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* get the num procs as provided in the cmd line param */
    if (ORTE_SUCCESS != (rc = orte_ess_env_get())) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    if (orte_process_info.max_procs < orte_process_info.num_procs) {
        orte_process_info.max_procs = orte_process_info.num_procs;
    }

    return ORTE_SUCCESS;
}