int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *log_path = NULL;
    char *jobidstring;
    char *error = NULL;
    char *plm_to_use;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;

#ifndef __WINDOWS__
    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves. 
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
#endif  /* __WINDOWS__ */
    
    signals_set = true;
    
#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;
        
        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }
        
        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }
        
        if (4 < opal_output_get_verbosity(orte_ess_base_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif
    
    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = opal_pstat_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
    
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use
     */
    mca_base_param_reg_string_name("plm", NULL,
                                   "Which plm component to use (empty = none)",
                                   false, false,
                                   NULL, &plm_to_use);
    
    if (NULL == plm_to_use) {
        plm_in_use = false;
    } else {
        plm_in_use = true;
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_open())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }
    
    /* Setup the communication infrastructure */
    
    /* Runtime Messaging Layer - this opens/selects the OOB as well */
    if (ORTE_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    
    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* Routed system */
    if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* database */
    if (ORTE_SUCCESS != (ret = orte_db_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = orte_odls_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    
    /* initialize the nidmaps */
    if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_util_nidmap_init";
        goto error;
    }
#if ORTE_ENABLE_STATIC_PORTS
    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap
     */
    if (orte_static_ports) {
        /* define the routing tree so we know the pattern
         * if we are trying to setup common or static ports
         */
        orte_routed.update_routing_plan();

        /* extract the node info from the environment and
         * build a nidmap from it
         */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }
#endif
    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things
     */
    orte_routed.update_routing_plan();
    
    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all
     */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }
    
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file
             */
            
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false,
                                    orte_process_info.tmpdir_base,
                                    orte_process_info.top_session_dir,
                                    log_file,
                                    NULL);
            
            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null
                 */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            }
            if (fd >= 0) {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if (fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create and store the job data object for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;
    
    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
#if OPAL_HAVE_HWLOC
    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
    
    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    node->daemon_launched = true;
    node->state = ORTE_NODE_STATE_UP;
    
    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;
    
    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = orte_iof_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = orte_filem_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }
    
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    
#endif
    /* regardless of build type, daemons don't need the OPAL CR machinery */
    opal_cr_set_enabled(false);
    
    /*
     * Initialize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't, some user-level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = orte_sensor_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orte_sensor.start(ORTE_PROC_MY_NAME->jobid);
    
    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
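The setup_sighandler() helper called at the top of this function is not shown in this excerpt. Below is a minimal sketch of what it plausibly looks like, assuming the OPAL event wrappers (opal_event_signal_set/opal_event_signal_add over orte_event_base); treat it as a hedged reconstruction, not the verbatim upstream code:

/* hedged sketch: register a persistent signal event on the ORTE
 * event base, so cbfunc runs in the progress loop rather than in
 * signal context */
static void setup_sighandler(int signal, opal_event_t *event,
                             opal_event_cbfunc_t cbfunc)
{
    opal_event_signal_set(orte_event_base, event, signal, cbfunc, event);
    opal_event_signal_add(event, NULL);
}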
Example #2
static int orcmd_init(void)
{
    int ret = ORTE_ERROR;
    char *error = NULL;
    opal_buffer_t buf, *clusterbuf, *uribuf;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    opal_list_t config;
    orcm_scheduler_t *scheduler;
    orcm_node_t *mynode=NULL;
    int32_t n;

    if (initialized) {
        return ORCM_SUCCESS;
    }
    initialized = true;

    /* Initialize the ORTE data type support */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_std_prolog";
        goto error;
    }

    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }

    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* create a job tracker for the daemons */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = 0;
    ORTE_PROC_MY_NAME->jobid = 0;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);

    /* read the site configuration */
    OBJ_CONSTRUCT(&config, opal_list_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.read_config(&config))) {
        error = "getting config";
        goto error;
    }

    /* define the cluster and collect contact info for all
     * aggregators - we'll need to know how to talk to any
     * of them in case of failures
     */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORCM_SUCCESS != (ret = orcm_cfgi.define_system(&config,
                                                       &mynode,
                                                       &orte_process_info.num_procs,
                                                       &buf))) {
        OBJ_DESTRUCT(&buf);
        error = "define system";
        goto error;
    }

    /* if my name didn't get set, then we didn't find our node
     * in the config - report it and die
     */
    if (NULL == mynode) {
        orte_show_help("help-ess-orcm.txt", "node-not-found", true,
                       orcm_cfgi_base.config_file,
                       orte_process_info.nodename);
        OBJ_DESTRUCT(&buf);
        return ORTE_ERR_SILENT;
    }

    /* define a node and proc object for ourselves as some parts
     * of ORTE and ORCM require it */
    if (NULL == (node = OBJ_NEW(orte_node_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    node->name = strdup(orte_process_info.nodename);
    opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    if (NULL == (proc = OBJ_NEW(orte_proc_t))) {
        ret = ORTE_ERR_OUT_OF_RESOURCE;
        error = "out of memory";
        goto error;
    }
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
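    /* retain both objects since node->daemon and proc->node
     * cross-reference each other - keeps the refcounts straight */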
    OBJ_RETAIN(proc);
    node->daemon = proc;
    OBJ_RETAIN(node);
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, ORTE_PROC_MY_NAME->vpid, proc);

    /* For now, we only support a single scheduler daemon in the system.
     * This *may* change someday in the future */
    scheduler = (orcm_scheduler_t*)opal_list_get_first(orcm_schedulers);

    /* If we are in test mode, then we don't *require* that a scheduler
     * be defined in the system - otherwise, we do */
    if (NULL == scheduler) {
        if (mca_sst_orcmd_component.scheduler_reqd) {
            error = "no scheduler found";
            ret = ORTE_ERR_NOT_FOUND;
            goto error;
        }
    } else {
        ORTE_PROC_MY_SCHEDULER->jobid = scheduler->controller.daemon.jobid;
        ORTE_PROC_MY_SCHEDULER->vpid = scheduler->controller.daemon.vpid;
    }

    /* register the ORTE-level params at this time now that the
     * config has had a chance to push things into the environ
     */
    if (ORTE_SUCCESS != (ret = orte_register_params())) {
        OBJ_DESTRUCT(&buf);
        error = "orte_register_params";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);

    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                OBJ_DESTRUCT(&buf);
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (15 < opal_output_get_verbosity(orcm_sst_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }

        /* if we were asked to bind to specific core(s), do so now */
        if (NULL != orte_daemon_cores) {
            char **cores=NULL, tmp[128];
            hwloc_obj_t pu;
            hwloc_cpuset_t ours, pucpus, res;
            int core;

            /* could be a collection of comma-delimited ranges, so
             * use our handy utility to parse it
             */
            orte_util_parse_range_options(orte_daemon_cores, &cores);
            if (NULL != cores) {
                ours = hwloc_bitmap_alloc();
                hwloc_bitmap_zero(ours);
                pucpus = hwloc_bitmap_alloc();
                res = hwloc_bitmap_alloc();
                for (i=0; NULL != cores[i]; i++) {
                    core = strtoul(cores[i], NULL, 10);
                    if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                        orte_show_help("help-orted.txt", "orted:cannot-bind",
                                       true, orte_process_info.nodename,
                                       orte_daemon_cores);
                        ret = ORTE_ERR_NOT_SUPPORTED;
                        OBJ_DESTRUCT(&buf);
                        error = "cannot bind";
                        goto error;
                    }
                    hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                    hwloc_bitmap_or(res, ours, pucpus);
                    hwloc_bitmap_copy(ours, res);
                }
                /* if the result is all zeros, then don't bind */
                if (!hwloc_bitmap_iszero(ours)) {
                    (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                    if (opal_hwloc_report_bindings) {
                        opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                        opal_output(0, "Daemon %s is bound to cores %s",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                    }
                }
                /* cleanup */
                hwloc_bitmap_free(ours);
                hwloc_bitmap_free(pucpus);
                hwloc_bitmap_free(res);
                opal_argv_free(cores);
            }
        }
    }
#endif

    /* open and select the pstat framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the notifier */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_notifier_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_open";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Setup the communication infrastructure */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_oob_base_select";
        goto error;
    }
    if (!opal_list_get_size(&orte_oob_base.actives)) {
        ret = ORTE_ERROR;
        OBJ_DESTRUCT(&buf);
        error = "orte_oob: Found 0 active transports";
        goto error;
    }

    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_select";
        goto error;
    }

    /* select the notifier */
    if (ORTE_SUCCESS != (ret = orte_notifier_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_notifier_base_select";
        goto error;
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orte_routed_base_select";
        goto error;
    }

    /* database */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_db_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_open";
        goto error;
    }
    /* always restrict daemons to local database components */
    if (ORTE_SUCCESS != (ret = orcm_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "orcm_db_base_select";
        goto error;
    }

    /* datastore - ensure we don't pick up the pmi component, but
     * don't override anything set by the user
     */
    if (NULL == getenv(OPAL_MCA_PREFIX"dstore")) {
        putenv(OPAL_MCA_PREFIX"dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handle */
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL", NULL, NULL))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /* extract the cluster description and setup the routed info - the orcm routed component
     * will know what to do. */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &clusterbuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract cluster buf";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, clusterbuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(clusterbuf);
        error = "orte_routed.init_routes";
        goto error;
    }
    OBJ_RELEASE(clusterbuf);

    /* extract the uri buffer and load the hash tables */
    n = 1;
    if (OPAL_SUCCESS != (ret = opal_dss.unpack(&buf, &uribuf, &n, OPAL_BUFFER))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        error = "extract uri buffer";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(uribuf))) {
        ORTE_ERROR_LOG(ret);
        OBJ_DESTRUCT(&buf);
        OBJ_RELEASE(uribuf);
        error = "load hash tables";
        goto error;
    }
    OBJ_DESTRUCT(&buf);
    OBJ_RELEASE(uribuf);

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }

    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

    /*
     * Initialize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't, some user-level tools may hang.
     */
    opal_cr_set_enabled(false);
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }

    /* setup the ANALYTICS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_analytics_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_analytics_base_open";
        goto error;
    }

    /* setup the EVGEN framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_evgen_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_evgen_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_evgen_select";
        goto error;
    }

    /* setup the SENSOR framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_sensor_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_sensor_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_sensor_select";
        goto error;
    }
    /* start the local sensors */
    orcm_sensor.start(ORTE_PROC_MY_NAME->jobid);

    /* setup the PWRMGMT framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_pwrmgmt_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orcm_pwrmgmt_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_pwrmgmt_select";
        goto error;
    }

    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* open and setup the DIAG framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orcm_diag_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_base_open";
        goto error;
    }
    if (ORCM_SUCCESS != (ret = orcm_diag_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orcm_diag_select";
        goto error;
    }

    return ORTE_SUCCESS;
    
 error:
    orte_show_help("help-orcm-runtime.txt",
                   "orcm_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ORTE_ERR_SILENT;
}
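The two opal_dss.unpack calls near the end of orcmd_init() expect the configuration side to have packed two nested buffers, the cluster description first and the RML contact URIs second. A rough sketch of that producer side, using hypothetical helper names (build_cluster_desc, collect_aggregator_uris), might look like:

/* hedged sketch of the matching pack side; the pack order must
 * mirror the unpack order in orcmd_init() */
opal_buffer_t buf, *clusterbuf, *uribuf;
OBJ_CONSTRUCT(&buf, opal_buffer_t);
clusterbuf = OBJ_NEW(opal_buffer_t);
uribuf = OBJ_NEW(opal_buffer_t);
build_cluster_desc(clusterbuf);      /* hypothetical helper */
collect_aggregator_uris(uribuf);     /* hypothetical helper */
opal_dss.pack(&buf, &clusterbuf, 1, OPAL_BUFFER);
opal_dss.pack(&buf, &uribuf, 1, OPAL_BUFFER);
OBJ_RELEASE(clusterbuf);
OBJ_RELEASE(uribuf);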
Example #3
/* configure using ioctl(SIOCGIFCONF) */
static int if_posix_open(void)
{
    int sd;
    int lastlen, rem;
    char *ptr;
    struct ifconf ifconf;
    int ifc_len;
    bool successful_locate = false;

    /* Create the internet socket to test with.  Must use AF_INET;
       using AF_UNSPEC or AF_INET6 will cause everything to
       fail. */
    if ((sd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
        opal_output(0, "opal_ifinit: socket() failed with errno=%d\n", 
                    errno);
        return OPAL_ERROR;
    }

    /*
     * Get Network Interface configuration 
     *
     * Some notes on the behavior of ioctl(..., SIOCGIFCONF,...)
     * when not enough space is allocated for all the entries.
     *
     * - Solaris returns -1, errno EINVAL if there is not enough
     *   space 
     * - OS X returns 0, sets .ifc_len to the space used by the
     *   entries that did fit.
     * - Linux returns 0, sets .ifc_len to the space required to
     *   hold all the entries (although it only writes what will
     *   fit in the buffer of .ifc_len passed to the function).
     * - FreeBSD returns 0, sets .ifc_len to 0.
     *
     * Everyone else seems to do one of the four.
     */
    lastlen = 0;
    ifc_len = sizeof(struct ifreq) * DEFAULT_NUMBER_INTERFACES;
    do {
        ifconf.ifc_len = ifc_len;
        ifconf.ifc_req = malloc(ifc_len);
        if (NULL == ifconf.ifc_req) {
            close(sd);
            return OPAL_ERROR;
        }
            
        /* initialize the memory so valgrind and purify won't
         * complain.  Since this isn't performance critical, just
         * always memset.
         */
        memset(ifconf.ifc_req, 0, ifconf.ifc_len);
            
        if (ioctl(sd, SIOCGIFCONF, &ifconf) < 0) {
            /* if we got an einval, we probably don't have enough
               space.  so we'll fall down and try to expand our
               space */
            if (errno != EINVAL && lastlen != 0) {
                opal_output(0, "opal_ifinit: ioctl(SIOCGIFCONF) \
                            failed with errno=%d", 
                            errno);
                free(ifconf.ifc_req);
                close(sd);
                return OPAL_ERROR;
            }
        } else {
            /* if ifc_len is 0 or different than what we set it to
               at call to ioctl, try again with a bigger buffer.
               else stop */
            if (ifconf.ifc_len == lastlen && ifconf.ifc_len > 0) {
                /* we didn't expand.  we're done */
                successful_locate = true;
                break;
            }
            lastlen = ifconf.ifc_len;
        }
            
        /* Yes, we overflowed (or had an EINVAL on the ioctl).
           Loop back around and try again with a bigger buffer */
        free(ifconf.ifc_req);
        ifc_len = (ifc_len == 0) ? 1 : ifc_len * 2;
    } while (ifc_len < MAX_IFCONF_SIZE);
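The if_posix_open() listing stops here, before the retrieved entries are consumed. Purely as an illustration of the usual follow-on pattern (not Open MPI's actual continuation), a SIOCGIFCONF result is walked like this:

/* illustrative only: generic traversal of an ifconf buffer; on BSDs
 * the per-entry stride depends on sa_len, a fixed stride is shown
 * for brevity */
for (ptr = (char*)ifconf.ifc_req;
     ptr < (char*)ifconf.ifc_req + ifconf.ifc_len;
     ptr += sizeof(struct ifreq)) {
    struct ifreq *ifr = (struct ifreq*)ptr;
    /* inspect ifr->ifr_name and ifr->ifr_addr here */
}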
Example #4
int
mca_fcoll_static_file_write_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{



    size_t max_data = 0, bytes_per_cycle=0;
    struct iovec *iov=NULL, *decoded_iov=NULL;
    uint32_t iov_count=0, iov_index=0;
    int i=0,j=0,l=0, temp_index;
    int ret=OMPI_SUCCESS, cycles, local_cycles, *bytes_per_process=NULL;
    int index, *disp_index=NULL, **blocklen_per_process=NULL;
    int *iovec_count_per_process=NULL, *displs=NULL;
    size_t total_bytes_written=0;
    MPI_Aint **displs_per_process=NULL, *memory_displacements=NULL;
    MPI_Aint bytes_to_write_in_cycle=0, global_iov_count=0, global_count=0;

    local_io_array *local_iov_array =NULL, *global_iov_array=NULL;
    local_io_array *file_offsets_for_agg=NULL;
    int *sorted=NULL, *sorted_file_offsets=NULL, temp_pindex, *temp_disp_index=NULL;
    char *send_buf=NULL, *global_buf=NULL;
    int iov_size=0, current_position=0, *current_index=NULL;
    int *bytes_remaining=NULL, entries_per_aggregator=0;
    ompi_datatype_t **recvtype = NULL;
    MPI_Request *send_req=NULL, *recv_req=NULL;
    /* For creating datatype of type io_array */
    int blocklen[3] = {1, 1, 1};
    int static_num_io_procs=1;
    OPAL_PTRDIFF_TYPE d[3], base;
    ompi_datatype_t *types[3];
    ompi_datatype_t *io_array_type=MPI_DATATYPE_NULL;
    /*----------------------------------------------*/
#if TIME_BREAKDOWN
    double write_time = 0.0, start_write_time = 0.0, end_write_time = 0.0;
    double comm_time = 0.0, start_comm_time = 0.0, end_comm_time = 0.0;
    double exch_write = 0.0, start_exch = 0.0, end_exch = 0.0;
    print_entry nentry;
#endif


#if DEBUG_ON
    MPI_Aint gc_in;
#endif

//  if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//    fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//  }

    /* In case the data is not contiguous in memory, decode it into an iovec */
    if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
        ompi_io_ompio_decode_datatype (fh,
                                       datatype,
                                       count,
                                       buf,
                                       &max_data,
                                       &decoded_iov,
                                       &iov_count);
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    mca_io_ompio_get_num_aggregators ( & static_num_io_procs );
    ompi_io_ompio_set_aggregator_props (fh,
                                        static_num_io_procs,
                                        max_data);


    /* io_array datatype for use in communication */
    types[0] = &ompi_mpi_long.dt;
    types[1] = &ompi_mpi_long.dt;
    types[2] = &ompi_mpi_int.dt;

    d[0] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0];
    d[1] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].length;
    d[2] = (OPAL_PTRDIFF_TYPE)&local_iov_array[0].process_id;
    base = d[0];
    for (i=0 ; i<3 ; i++) {
        d[i] -= base;
    }
    ompi_datatype_create_struct (3,
                                 blocklen,
                                 d,
                                 types,
                                 &io_array_type);
    ompi_datatype_commit (&io_array_type);
    /* #########################################################*/



    ret = ompi_io_ompio_generate_current_file_view(fh,
            max_data,
            &iov,
            &iov_size);
    if (ret != OMPI_SUCCESS) {
        fprintf(stderr,"Current File View Generation Error\n");
        goto exit;
    }

    if (0 == iov_size) {
        iov_size  = 1;
    }

    local_iov_array = (local_io_array *)malloc (iov_size * sizeof(local_io_array));
    if ( NULL == local_iov_array) {
        fprintf(stderr,"local_iov_array allocation error\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }


    for (j=0; j < iov_size; j++) {
        local_iov_array[j].offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)
                                    iov[j].iov_base;
        local_iov_array[j].length = (size_t)iov[j].iov_len;
        local_iov_array[j].process_id = fh->f_rank;

    }

    mca_io_ompio_get_bytes_per_agg ( (int *) &bytes_per_cycle);


    local_cycles = ceil((double)max_data/bytes_per_cycle);
    ret = fh->f_comm->c_coll.coll_allreduce (&local_cycles,
            &cycles,
            1,
            MPI_INT,
            MPI_MAX,
            fh->f_comm,
            fh->f_comm->c_coll.coll_allreduce_module);

    if (OMPI_SUCCESS != ret) {
        fprintf(stderr,"local cycles allreduce!\n");
        goto exit;
    }

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int ));
        if (NULL == bytes_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        bytes_remaining = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == bytes_remaining) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        current_index = (int *) malloc (fh->f_procs_per_group * sizeof(int));
        if (NULL == current_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)
                             malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));

        if (NULL == displs_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for(i=0; i<fh->f_procs_per_group; i++) {
            current_index[i] = 0;
            bytes_remaining[i] =0;
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }
    }

    iovec_count_per_process = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == iovec_count_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs = (int *) malloc (fh->f_procs_per_group * sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    ret = ompi_io_ompio_allgather_array (&iov_size,
                                         1,
                                         MPI_INT,
                                         iovec_count_per_process,
                                         1,
                                         MPI_INT,
                                         fh->f_aggregator_index,
                                         fh->f_procs_in_group,
                                         fh->f_procs_per_group,
                                         fh->f_comm);

    if( OMPI_SUCCESS != ret) {
        fprintf(stderr,"iov size allgatherv array!\n");
        goto exit;
    }


    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        displs[0] = 0;
        global_iov_count = iovec_count_per_process[0];
        for (i=1 ; i<fh->f_procs_per_group ; i++) {
            global_iov_count += iovec_count_per_process[i];
            displs[i] = displs[i-1] + iovec_count_per_process[i-1];
        }
    }


    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        global_iov_array = (local_io_array *) malloc (global_iov_count *
                           sizeof(local_io_array));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }

    ret = ompi_io_ompio_gatherv_array (local_iov_array,
                                       iov_size,
                                       io_array_type,
                                       global_iov_array,
                                       iovec_count_per_process,
                                       displs,
                                       io_array_type,
                                       fh->f_aggregator_index,
                                       fh->f_procs_in_group,
                                       fh->f_procs_per_group,
                                       fh->f_comm);
    if (OMPI_SUCCESS != ret) {
        fprintf(stderr,"global_iov_array gather error!\n");
        goto exit;
    }

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

        if ( 0 == global_iov_count) {
            global_iov_count =  1;
        }

        sorted = (int *)malloc (global_iov_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        local_heap_sort (global_iov_array, global_iov_count, sorted);
    }

#if DEBUG_ON

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
        for (gc_in=0; gc_in<global_iov_count; gc_in++) {
            printf("%d: Offset[%ld]: %lld, Length[%ld]: %ld\n",
                   global_iov_array[gc_in].process_id,
                   gc_in, global_iov_array[gc_in].offset,
                   gc_in, global_iov_array[gc_in].length);
        }
    }
#endif

#if TIME_BREAKDOWN
    start_exch = MPI_Wtime();
#endif


    for (index = 0; index < cycles; index++) {
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            if (NULL == recvtype) {
                recvtype = (ompi_datatype_t **)
                           malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
                if (NULL == recvtype) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            for(l=0; l<fh->f_procs_per_group; l++) {
                disp_index[l] =  1;
                if (NULL != blocklen_per_process[l]) {
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]) {
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }
            if (NULL != sorted_file_offsets) {
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg) {
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }

            if (NULL != memory_displacements) {
                free(memory_displacements);
                memory_displacements = NULL;
            }

        }
        if (local_cycles > index) {
            if ((index == local_cycles-1) && (max_data % bytes_per_cycle)) {
                bytes_to_write_in_cycle = max_data % bytes_per_cycle;
            }
            else if (max_data <= bytes_per_cycle) {
                bytes_to_write_in_cycle = max_data;
            }
            else {
                bytes_to_write_in_cycle = bytes_per_cycle;
            }
        }
        else {
            bytes_to_write_in_cycle = 0;
        }
#if DEBUG_ON
        /*    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {*/
        printf ("***%d: CYCLE %d   Bytes %ld**********\n",
                fh->f_rank,
                index,
                bytes_to_write_in_cycle);
        /* }*/
#endif
        /**********************************************************
         **Gather the Data from all the processes at the writers **
         *********************************************************/

        /* gather from each process how many bytes each will be sending */
        ompi_io_ompio_gather_array (&bytes_to_write_in_cycle,
                                    1,
                                    MPI_INT,
                                    bytes_per_process,
                                    1,
                                    MPI_INT,
                                    fh->f_aggregator_index,
                                    fh->f_procs_in_group,
                                    fh->f_procs_per_group,
                                    fh->f_comm);

        /*
         * Each aggregator needs to gather bytes_to_write_in_cycle
         * from every process in its group; these add up to
         * bytes_per_cycle.
         */
        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            for (i=0; i<fh->f_procs_per_group; i++) {
                /*	    printf("bytes_per_process[%d]: %d\n", i, bytes_per_process[i]);
                 */

#if DEBUG_ON
                printf ("%d : bytes_per_process : %d\n",
                        fh->f_procs_in_group[i],
                        bytes_per_process[i]);
#endif

                while (bytes_per_process[i] > 0) {
                    if (get_process_id(global_iov_array[sorted[current_index[i]]].process_id,
                                       fh) == i) { /* current id owns this entry!*/

                        /* add and subtract length and create
                         * blocklength and displs arrays */
                        if (bytes_remaining[i]) {
                            /* remaining bytes in the current entry of
                             * the global offset array */
                            if (bytes_remaining[i] <= bytes_per_process[i]) {
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_remaining[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);

                                blocklen_per_process[i] = (int *) realloc
                                                          ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                                        ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                bytes_per_process[i] -= bytes_remaining[i];
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                bytes_remaining[i] = 0;
                                disp_index[i] += 1;
                                /* this entry has been used up; move to the
                                 * next entry of this process and make
                                 * current_index point there */
                                current_index[i]  = find_next_index(i,
                                                                    current_index[i],
                                                                    fh,
                                                                    global_iov_array,
                                                                    global_iov_count,
                                                                    sorted);
                                if (current_index[i] == -1) {
                                    /* no more entries left, so it's all done! exit! */
                                    break;
                                }
                                continue;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] = bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset +
                                    (global_iov_array[sorted[current_index[i]]].length
                                     - bytes_remaining[i]);
                                bytes_remaining[i] -= bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                        }
                        else {
                            if (bytes_per_process[i] <
                                    global_iov_array[sorted[current_index[i]]].length) {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    bytes_per_process[i];
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;

                                bytes_remaining[i] =
                                    global_iov_array[sorted[current_index[i]]].length -
                                    bytes_per_process[i];
                                bytes_per_process[i] = 0;
                                break;
                            }
                            else {
                                blocklen_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].length;
                                displs_per_process[i][disp_index[i] - 1] =
                                    global_iov_array[sorted[current_index[i]]].offset;
                                blocklen_per_process[i] =
                                    (int *) realloc ((void *)blocklen_per_process[i], (disp_index[i]+1)*sizeof(int));
                                displs_per_process[i] = (MPI_Aint *)realloc
                                                        ((void *)displs_per_process[i], (disp_index[i]+1)*sizeof(MPI_Aint));
                                blocklen_per_process[i][disp_index[i]] = 0;
                                displs_per_process[i][disp_index[i]] = 0;
                                disp_index[i] += 1;
                                bytes_per_process[i] -=
                                    global_iov_array[sorted[current_index[i]]].length;
                                current_index[i] = find_next_index(i,
                                                                   current_index[i],
                                                                   fh,
                                                                   global_iov_array,
                                                                   global_iov_count,
                                                                   sorted);
                                if (current_index[i] == -1) {
                                    break;
                                }
                            }
                        }
                    }
                    else {
                        current_index[i] = find_next_index(i,
                                                           current_index[i],
                                                           fh,
                                                           global_iov_array,
                                                           global_iov_count,
                                                           sorted);
                        if (current_index[i] == -1) {
                            bytes_per_process[i] = 0; /* no more entries left
                                                         to service this request */
                            continue;
                        }
                    }
                }
            }
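            /* At this point blocklen_per_process[i] / displs_per_process[i]
             * hold the (length, file-offset) pairs that group member i
             * contributes to this cycle, and disp_index[i] counts how many
             * such pairs were recorded. */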
            entries_per_aggregator=0;
            for (i=0; i<fh->f_procs_per_group; i++) {
                for (j=0; j<disp_index[i]; j++) {
                    if (blocklen_per_process[i][j] > 0) {
                        entries_per_aggregator++;
#if DEBUG_ON
                        printf("%d sends blocklen[%d]: %d, disp[%d]: %ld to %d\n",
                               fh->f_procs_in_group[i],j,
                               blocklen_per_process[i][j],j,
                               displs_per_process[i][j],
                               fh->f_rank);

#endif
                    }

                }
            }

            if (entries_per_aggregator > 0) {
                file_offsets_for_agg = (local_io_array *)
                                       malloc(entries_per_aggregator*sizeof(local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                                      malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                temp_index = 0;
                for (i=0; i<fh->f_procs_per_group; i++) {
                    for(j=0; j<disp_index[i]; j++) {
                        if (blocklen_per_process[i][j] > 0) {
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else {
                continue;
            }
            local_heap_sort (file_offsets_for_agg,
                             entries_per_aggregator,
                             sorted_file_offsets);

            memory_displacements = (MPI_Aint *) malloc
                                   (entries_per_aggregator * sizeof(MPI_Aint));
            if (NULL == memory_displacements) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++) {
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }
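            /* memory_displacements assigns each entry, taken in sorted file
             * order, a contiguous region of the aggregator's receive buffer:
             * entry k starts where entry k-1 ended. For example, sorted
             * lengths of 100, 50, 200 yield displacements 0, 100, 150. */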

            temp_disp_index = (int *)calloc (fh->f_procs_per_group, sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            global_count = 0;
            for (i=0; i<entries_per_aggregator; i++) {
                temp_pindex =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_pindex][temp_disp_index[temp_pindex]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_pindex] < disp_index[temp_pindex])
                    temp_disp_index[temp_pindex] += 1;
                else {
                    opal_output(0, "temp_disp_index[%d]: %d is greater than disp_index[%d]: %d",
                                temp_pindex, temp_disp_index[temp_pindex],
                                temp_pindex, disp_index[temp_pindex]);
                }
                global_count +=
                    file_offsets_for_agg[sorted_file_offsets[i]].length;
            }
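            /* displs_per_process[i][j] has now been overwritten: it no longer
             * holds file offsets but offsets into global_buf, so the receive
             * datatypes built below scatter incoming data straight into its
             * final, file-sorted position. global_count is the total number
             * of bytes this aggregator gathers in the cycle. */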
            if (NULL != temp_disp_index) {
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            for (i=0; i<entries_per_aggregator; i++) {
                printf("%d: OFFSET: %lld   LENGTH: %ld, Mem-offset: %ld, disp : %d\n",
                       file_offsets_for_agg[sorted_file_offsets[i]].process_id,
                       file_offsets_for_agg[sorted_file_offsets[i]].offset,
                       file_offsets_for_agg[sorted_file_offsets[i]].length,
                       memory_displacements[sorted_file_offsets[i]],
                       disp_index[ file_offsets_for_agg[sorted_file_offsets[i]].process_id]);
            }
#endif

#if DEBUG_ON
            printf("%d: global_count : %ld, bytes_to_write_in_cycle : %ld, procs_per_group: %d\n",
                   fh->f_rank,
                   global_count,
                   bytes_to_write_in_cycle,
                   fh->f_procs_per_group);
#endif
#if TIME_BREAKDOWN
            start_comm_time = MPI_Wtime();
#endif
            global_buf  = (char *) malloc (global_count);
            if (NULL == global_buf) {
                opal_output(1, "OUT OF MEMORY");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            recv_req = (MPI_Request *)
                       malloc (fh->f_procs_per_group * sizeof(MPI_Request));
            if (NULL == recv_req) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0; i<fh->f_procs_per_group; i++) {
                ompi_datatype_create_hindexed(disp_index[i],
                                              blocklen_per_process[i],
                                              displs_per_process[i],
                                              MPI_BYTE,
                                              &recvtype[i]);
                ompi_datatype_commit(&recvtype[i]);
                ret = MCA_PML_CALL(irecv(global_buf,
                                         1,
                                         recvtype[i],
                                         fh->f_procs_in_group[i],
                                         123,
                                         fh->f_comm,
                                         &recv_req[i]));
                if (OMPI_SUCCESS != ret) {
                    fprintf(stderr,"irecv Error!\n");
                    goto exit;
                }
            }
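            /* One irecv per group member: recvtype[i] is an hindexed type of
             * MPI_BYTE blocks whose displacements point into global_buf, so
             * each sender's contiguous byte stream lands directly at the
             * right places without a separate unpack step. */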
        }

        if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
            send_buf = &((char*)buf)[total_bytes_written];
        }
        else if (bytes_to_write_in_cycle) {
            /* allocate a send buffer and copy the data that needs
               to be sent into it, in case the data is non-contiguous
               in memory */
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            send_buf = malloc (bytes_to_write_in_cycle);
            if (NULL == send_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            remaining = bytes_to_write_in_cycle;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                              (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                        (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                                (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                                    (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy (send_buf+temp_position,
                            (IOVBASE_TYPE *)mem_address,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }
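            /* The copy loop walks decoded_iov: if the bytes still needed for
             * this cycle cover the rest of the current iovec, copy it all and
             * advance to the next iovec; otherwise copy only what is needed
             * and remember current_position so the next cycle resumes there. */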
        }
        total_bytes_written += bytes_to_write_in_cycle;

        send_req = (MPI_Request *) malloc (sizeof(MPI_Request));
        if (NULL == send_req) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        ret = MCA_PML_CALL(isend(send_buf,
                                 bytes_to_write_in_cycle,
                                 MPI_BYTE,
                                 fh->f_procs_in_group[fh->f_aggregator_index],
                                 123,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 fh->f_comm,
                                 send_req));

        if ( OMPI_SUCCESS != ret ) {
            fprintf(stderr,"isend error!\n");
            goto exit;
        }

        ret = ompi_request_wait (send_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret) {
            goto exit;
        }

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         recv_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret) {
                goto exit;
            }

#if DEBUG_ON
            printf("************Cycle: %d,  Aggregator: %d ***************\n",
                   index+1,fh->f_rank);
            if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
                for (i=0 ; i<global_count/4 ; i++)
                    printf (" RECV %d \n",((int *)global_buf)[i]);
            }
#endif
        }
#if TIME_BREAKDOWN
        end_comm_time = MPI_Wtime();
        comm_time += end_comm_time - start_comm_time;
#endif



        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                             (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            fh->f_num_of_io_entries = 0;
            /* First entry for every aggregator */
            fh->f_io_array[fh->f_num_of_io_entries].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[fh->f_num_of_io_entries].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1; i<entries_per_aggregator; i++) {
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                        file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                        file_offsets_for_agg[sorted_file_offsets[i]].offset) {
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else {
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }
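            /* Entries that are contiguous in the file (previous offset +
             * length == next offset) are merged into a single io_array
             * element, so fbtl_pwritev can issue fewer, larger segments. */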
#if DEBUG_ON
            printf("*************************** %d\n", fh->f_num_of_io_entries);
            for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
                printf(" ADDRESS: %p  OFFSET: %ld   LENGTH: %ld\n",
                       fh->f_io_array[i].memory_address,
                       (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].offset,
                       fh->f_io_array[i].length);
            }
#endif

#if TIME_BREAKDOWN
            start_write_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_pwritev (fh)) {
                    opal_output (1, "WRITE FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if TIME_BREAKDOWN
            end_write_time = MPI_Wtime();
            write_time += end_write_time - start_write_time;
#endif

        }
        if (NULL != send_req) {
            free(send_req);
            send_req = NULL;
        }

        if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
            fh->f_num_of_io_entries = 0;
            if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            for (i = 0; i < fh->f_procs_per_group; i++)
                ompi_datatype_destroy(recvtype+i);
            if (NULL != recvtype) {
                free(recvtype);
                recvtype=NULL;
            }
            if (NULL != recv_req) {
                free(recv_req);
                recv_req = NULL;
            }
            if (NULL != global_buf) {
                free (global_buf);
                global_buf = NULL;
            }
        }
    }

#if TIME_BREAKDOWN
    end_exch = MPI_Wtime();
    exch_write += end_exch - start_exch;
    nentry.time[0] = write_time;
    nentry.time[1] = comm_time;
    nentry.time[2] = exch_write;
    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = static_num_io_procs;
    if (!ompi_io_ompio_full_print_queue(WRITE_PRINT_QUEUE)) {
        ompi_io_ompio_register_print_entry(WRITE_PRINT_QUEUE,
                                           nentry);
    }
#endif



exit:
    if (NULL != decoded_iov) {
        free(decoded_iov);
        decoded_iov = NULL;
    }

    if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

        if (NULL != disp_index) {
            free(disp_index);
            disp_index = NULL;
        }

        if (NULL != local_iov_array) {
            free(local_iov_array);
            local_iov_array = NULL;
        }
        for(l=0; l<fh->f_procs_per_group; l++) {
            if (NULL != blocklen_per_process[l]) {
                free(blocklen_per_process[l]);
                blocklen_per_process[l] = NULL;
            }
            if (NULL != displs_per_process[l]) {
                free(displs_per_process[l]);
                displs_per_process[l] = NULL;
            }
        }
        if (NULL != blocklen_per_process) {
            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }
        if (NULL != displs_per_process) {
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if(NULL != bytes_remaining) {
            free(bytes_remaining);
            bytes_remaining = NULL;
        }
        if(NULL != current_index) {
            free(current_index);
            current_index = NULL;
        }
    }
    return ret;
}
Example #5
static int rmaps_lama_merge_trees(opal_tree_t *src_tree, opal_tree_t *max_tree,
                                  opal_tree_item_t *src_parent, opal_tree_item_t *max_parent)
{
    int ret, exit_status = ORTE_SUCCESS;
    rmaps_lama_level_type_t *key_src, *key_max;
    opal_tree_item_t *child_item = NULL, *max_grandparent = NULL;
    opal_tree_item_t *max_child_item = NULL;
    int num_max, num_src;
    int i;
    char *key_src_str = NULL;
    char *key_max_str = NULL;
#if 1
    char *str = NULL;
#endif

    /*
     * Basecase
     */
    if( NULL == src_parent ) {
        return ORTE_SUCCESS;
    }

    key_src = (rmaps_lama_level_type_t*)src_tree->get_key(src_parent);
    key_max = (rmaps_lama_level_type_t*)max_tree->get_key(max_parent);

    key_src_str = lama_type_enum_to_str(*key_src);
    key_max_str = lama_type_enum_to_str(*key_max);

    if( 15 <= opal_output_get_verbosity(orte_rmaps_base_framework.framework_output) ) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:lama: CHECK: Merge Trees: Keys Src (%2d - %s) vs Max (%2d - %s)",
                            *key_src, key_src_str, *key_max, key_max_str);
    }

    /*
     * Make sure keys at this level match.
     *
     * JJH: Give up if they do not match.
     * JJH: We should pick a victim and prune from the tree
     * JJH: preferably from the 'native' tree.
     */
    if( 0 != max_tree->comp(max_parent, src_tree->get_key(src_parent)) ) {
        /*
         * If the source conflicts due to cache, iterate to children to find a match.
         * JJH: Double check this for different heterogeneous systems
         */
        if( LAMA_LEVEL_CACHE_L3 == *key_src ||
            LAMA_LEVEL_CACHE_L2 == *key_src ||
            LAMA_LEVEL_CACHE_L1 == *key_src ||
            LAMA_LEVEL_NUMA     == *key_src ) {
            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:lama: Warning: Merge Trees: "
                                "Src with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]",
                                *key_src, key_src_str, *key_max, key_max_str);

            /*
             * If we are pruning a cache level, then check to make sure it is
             * not important to the process layout.
             */
            if( !rmaps_lama_ok_to_prune_level(*key_src) ) {
                orte_show_help("help-orte-rmaps-lama.txt",
                               "orte-rmaps-lama:merge-conflict-bad-prune-src",
                               true,
                               key_src_str,
                               (NULL == rmaps_lama_cmd_map      ? "[Not Provided]" : rmaps_lama_cmd_map),
                               (NULL == rmaps_lama_cmd_bind     ? "[Not Provided]" : rmaps_lama_cmd_bind),
                               (NULL == rmaps_lama_cmd_mppr     ? "[Not Provided]" : rmaps_lama_cmd_mppr),
                               (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering));
                exit_status = ORTE_ERROR;
                goto cleanup;
            }

            /*
             * If the number of children at this pruned level was larger than
             * the max tree arity at this level, then duplicate the max_tree
             * element the appropriate number of times.
             */
            max_grandparent = opal_tree_get_parent(max_parent);
            num_max = opal_tree_num_children(max_grandparent);
            num_src = opal_tree_num_children(src_parent);

            for(i = 0; i < (num_src - num_max); ++i ) {
#if 1
                str = rmaps_lama_max_tree_pretty_print_subtree_element_get(max_tree, max_parent, 0);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:lama: Merge: Appending child %s - post prune",
                                    str);
                free(str);
#endif
                /* Duplicate max child subtree */
                opal_tree_copy_subtree(max_tree, max_parent, max_tree, max_grandparent);
            }
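            /* Illustrative case: if the pruned source level has 4 children
             * while max_parent's parent has only 2, the loop above copies the
             * max_parent subtree twice more, so the max tree can accommodate
             * all of the source's children. */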

            /*
             * Iterate to children, until we find a match
             */
            for(child_item  = opal_tree_get_first_child(src_parent);
                child_item != NULL;
                child_item  = opal_tree_get_next_sibling(child_item) ) {

                if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree,
                                                                  max_tree,
                                                                  child_item,
                                                                  max_parent)) ) {
                    exit_status =  ret;
                    goto cleanup;
                }
            }

            exit_status = ORTE_SUCCESS;
            goto cleanup;
        }
        /*
         * If the max tree conflicts due to cache, then we need to prune the
         * max tree until it matches.
         * JJH: If we are pruning a level of the hierarchy then make sure we
         * JJH: don't need it for the process layout.
         */
        else if( LAMA_LEVEL_CACHE_L3 == *key_max ||
                 LAMA_LEVEL_CACHE_L2 == *key_max ||
                 LAMA_LEVEL_CACHE_L1 == *key_max ||
                 LAMA_LEVEL_NUMA     == *key_max ) {
            opal_output_verbose(10, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:lama: Warning: Merge Trees: "
                                "Max with Conflicting Memory Hierarchy [Src (%2d - %s) vs Max (%2d - %s)]",
                                *key_src, key_src_str, *key_max, key_max_str);

            /*
             * If we are pruning a cache level, then check to make sure it is
             * not important to the process layout.
             */
            if( !rmaps_lama_ok_to_prune_level(*key_max) ) {
                orte_show_help("help-orte-rmaps-lama.txt",
                               "orte-rmaps-lama:merge-conflict-bad-prune-src",
                               true,
                               key_max_str,
                               (NULL == rmaps_lama_cmd_map      ? "[Not Provided]" : rmaps_lama_cmd_map),
                               (NULL == rmaps_lama_cmd_bind     ? "[Not Provided]" : rmaps_lama_cmd_bind),
                               (NULL == rmaps_lama_cmd_mppr     ? "[Not Provided]" : rmaps_lama_cmd_mppr),
                               (NULL == rmaps_lama_cmd_ordering ? "[Not Provided]" : rmaps_lama_cmd_ordering));
                exit_status = ORTE_ERROR;
                goto cleanup;
            }

            max_child_item = opal_tree_get_first_child(max_parent);
            /* Prune parent */
            opal_tree_remove_item(max_tree, max_parent);

            /* Try again with child */
            exit_status = rmaps_lama_merge_trees(src_tree,
                                                 max_tree,
                                                 src_parent,
                                                 max_child_item);
            goto cleanup;
        }

        /*
         * If we cannot resolve it, give up.
         */
        opal_output(0, "mca:rmaps:lama: Error: Merge Trees: "
                    "Different Keys Src (%2d - %s) vs Max (%2d - %s) - Do not know how to resolve - give up!",
                    *key_src, key_src_str, *key_max, key_max_str);

        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    num_max = opal_tree_num_children(max_parent);
    num_src = opal_tree_num_children(src_parent);

    /*
     * If the 'native' tree has more children than the 'max' tree.
     * Add the missing children to the 'max' tree.
     */
    if( num_max < num_src ) {
        i = 0;
        for(child_item  = opal_tree_get_first_child(src_parent);
            child_item != NULL;
            child_item  = opal_tree_get_next_sibling(child_item)) {
            if(i >= num_max ) {
#if 1
                str = rmaps_lama_max_tree_pretty_print_subtree_element_get(src_tree, child_item, 0);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:lama: Merge: Appending child %s",
                                    str);
                free(str);
#endif
                /* Add child's subtree to max */
                opal_tree_copy_subtree(src_tree, child_item, max_tree, max_parent);
            }
            ++i;
        }
    }

    /*
     * Recursively search all children of 'native' tree.
     *
     * Note: Only need to add the children to the 'left-most' branch of the
     * 'max' tree since that is the only branch that is searched during mapping.
     * But do the whole thing for good measure.
     */
    for( child_item  = opal_tree_get_first_child(src_parent),
         max_child_item  = opal_tree_get_first_child(max_parent);
         child_item != NULL;
         child_item  = opal_tree_get_next_sibling(child_item),
         max_child_item  = opal_tree_get_next_sibling(max_child_item) ) {

        if( ORTE_SUCCESS != (ret = rmaps_lama_merge_trees(src_tree,
                                                          max_tree,
                                                          child_item,
                                                          max_child_item)) ) {
            exit_status = ret;
            goto cleanup;
        }
    }

 cleanup:
    if( NULL != key_src_str ) {
        free(key_src_str);
        key_src_str = NULL;
    }

    if( NULL != key_max_str ) {
        free(key_max_str);
        key_max_str = NULL;
    }

    return exit_status;
}
Example #6
static int initialize(int argc, char *argv[])
{
    int ret, exit_status = OPAL_SUCCESS;
    char * tmp_env_var = NULL;

    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    if( OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv)) ) {
        return ret;
    }

    /*
     * Parse Command line arguments
     */
    if (OPAL_SUCCESS != (ret = parse_args(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Setup OPAL Output handle from the verbose argument
     */
    if( opal_restart_globals.verbose ) {
        opal_restart_globals.output = opal_output_open(NULL);
        opal_output_set_verbosity(opal_restart_globals.output, 10);
    } else {
        opal_restart_globals.output = 0; /* Default=STDOUT */
    }

    /*
     * Turn off the selection of the CRS component,
     * we need to do that later
     */
    (void) mca_base_var_env_name("crs_base_do_not_select", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1", /* turn off the selection */
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /*
     * Make sure we select the proper compress component.
     */
    if( NULL != opal_restart_globals.snapshot_compress ) {
        (void) mca_base_var_env_name("compress", &tmp_env_var);
        opal_setenv(tmp_env_var,
                    opal_restart_globals.snapshot_compress,
                    true, &environ);
        free(tmp_env_var);
        tmp_env_var = NULL;
    }

    /*
     * Initialize the OPAL layer
     */
    if (OPAL_SUCCESS != (ret = opal_init(&argc, &argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * If the checkpoint was compressed, then decompress it before continuing
     */
    if( NULL != opal_restart_globals.snapshot_compress ) {
        char * zip_dir = NULL;
        char * tmp_str = NULL;

        /* Make sure to clear the selection for the restart;
         * this way the user can switch compression mechanisms
         * across restarts
         */
        (void) mca_base_var_env_name("compress", &tmp_env_var);
        opal_unsetenv(tmp_env_var, &environ);
        free(tmp_env_var);
        tmp_env_var = NULL;

        opal_asprintf(&zip_dir, "%s/%s%s",
                 opal_restart_globals.snapshot_loc,
                 opal_restart_globals.snapshot_ref,
                 opal_restart_globals.snapshot_compress_postfix);

        if (0 >  (ret = access(zip_dir, F_OK)) ) {
            opal_output(opal_restart_globals.output,
                        "Error: Unable to access the file [%s]!",
                        zip_dir);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }

        opal_output_verbose(10, opal_restart_globals.output,
                            "Decompressing (%s)",
                            zip_dir);

        opal_compress.decompress(zip_dir, &tmp_str);
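        /* Note: tmp_str presumably receives the path produced by the
         * decompress component; it is not used further here, so it is
         * freed below along with zip_dir. */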

        if( NULL != zip_dir ) {
            free(zip_dir);
            zip_dir = NULL;
        }
        if( NULL != tmp_str ) {
            free(tmp_str);
            tmp_str = NULL;
        }
    }

    /*
     * If a cache directory has been suggested, see if it exists
     */
    if( NULL != opal_restart_globals.snapshot_cache ) {
        if(0 == (ret = access(opal_restart_globals.snapshot_cache, F_OK)) ) {
            opal_output_verbose(10, opal_restart_globals.output,
                                "Using the cached snapshot (%s) instead of (%s)",
                                opal_restart_globals.snapshot_cache,
                                opal_restart_globals.snapshot_loc);
            if( NULL != opal_restart_globals.snapshot_loc ) {
                free(opal_restart_globals.snapshot_loc);
                opal_restart_globals.snapshot_loc = NULL;
            }
            opal_restart_globals.snapshot_loc = opal_dirname(opal_restart_globals.snapshot_cache);
        } else {
            opal_show_help("help-opal-restart.txt", "cache_not_avail", true,
                           opal_restart_globals.snapshot_cache,
                           opal_restart_globals.snapshot_loc);
        }
    }

    /*
     * Mark this process as a tool
     */
    opal_cr_is_tool = true;

 cleanup:
    return exit_status;
}
Example #7
static int parse_env(char *path,
                     opal_cmd_line_t *cmd_line,
                     char **srcenv,
                     char ***dstenv)
{
    int i, j;
    char *param;
    char *value;
    char *env_set_flag;
    char **vars;
    bool takeus = false;

    opal_output_verbose(1, orte_schizo_base_framework.framework_output,
                        "%s schizo:ompi: parse_env",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    if (NULL != orte_schizo_base.personalities) {
        /* see if we are included */
        for (i=0; NULL != orte_schizo_base.personalities[i]; i++) {
            if (0 == strcmp(orte_schizo_base.personalities[i], "ompi")) {
                takeus = true;
                break;
            }
        }
        if (!takeus) {
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
    }

    for (i = 0; NULL != srcenv[i]; ++i) {
        if (0 == strncmp("OMPI_", srcenv[i], 5)) {
            /* check for duplicate in app->env - this
             * would have been placed there by the
             * cmd line processor. By convention, we
             * always let the cmd line override the
             * environment
             */
            param = strdup(srcenv[i]);
            value = strchr(param, '=');
            *value = '\0';
            value++;
            opal_setenv(param, value, false, dstenv);
            free(param);
        }
    }
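    /* For example, an inherited "OMPI_MCA_btl=tcp" is split at the '=' into
     * param "OMPI_MCA_btl" and value "tcp", then forwarded to dstenv without
     * overwriting any value the cmd line already placed there. */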

    /* set necessary env variables for external usage from the tune conf file */
    int set_from_file = 0;
    vars = NULL;
    if (OPAL_SUCCESS == mca_base_var_process_env_list_from_file(&vars) &&
            NULL != vars) {
        for (i=0; NULL != vars[i]; i++) {
            value = strchr(vars[i], '=');
            /* terminate the name of the param */
            *value = '\0';
            /* step over the equals */
            value++;
            /* overwrite any prior entry */
            opal_setenv(vars[i], value, true, dstenv);
            /* save it for any comm_spawn'd apps */
            opal_setenv(vars[i], value, true, &orte_forwarded_envars);
        }
        set_from_file = 1;
        opal_argv_free(vars);
    }
    /* Did the user request to export any environment variables on the cmd line? */
    env_set_flag = getenv("OMPI_MCA_mca_base_env_list");
    if (opal_cmd_line_is_taken(cmd_line, "x")) {
        if (NULL != env_set_flag) {
            orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
            return ORTE_ERR_FATAL;
        }
        j = opal_cmd_line_get_ninsts(cmd_line, "x");
        for (i = 0; i < j; ++i) {
            param = opal_cmd_line_get_param(cmd_line, "x", i, 0);

            if (NULL != (value = strchr(param, '='))) {
                /* terminate the name of the param */
                *value = '\0';
                /* step over the equals */
                value++;
                /* overwrite any prior entry */
                opal_setenv(param, value, true, dstenv);
                /* save it for any comm_spawn'd apps */
                opal_setenv(param, value, true, &orte_forwarded_envars);
            } else {
                value = getenv(param);
                if (NULL != value) {
                    /* overwrite any prior entry */
                    opal_setenv(param, value, true, dstenv);
                    /* save it for any comm_spawn'd apps */
                    opal_setenv(param, value, true, &orte_forwarded_envars);
                } else {
                    opal_output(0, "Warning: could not find environment variable \"%s\"\n", param);
                }
            }
        }
    } else if (NULL != env_set_flag) {
        /* if mca_base_env_list was set, check whether some of the env vars
         * were set via -x from a conf file. If this is the case, error out.
         */
        if (!set_from_file) {
            /* set necessary env variables for external usage */
            vars = NULL;
            if (OPAL_SUCCESS == mca_base_var_process_env_list(env_set_flag, &vars) &&
                    NULL != vars) {
                for (i=0; NULL != vars[i]; i++) {
                    value = strchr(vars[i], '=');
                    /* terminate the name of the param */
                    *value = '\0';
                    /* step over the equals */
                    value++;
                    /* overwrite any prior entry */
                    opal_setenv(vars[i], value, true, dstenv);
                    /* save it for any comm_spawn'd apps */
                    opal_setenv(vars[i], value, true, &orte_forwarded_envars);
                }
                opal_argv_free(vars);
            }
        } else {
            orte_show_help("help-orterun.txt", "orterun:conflict-env-set", false);
            return ORTE_ERR_FATAL;
        }
    }

    /* If the user specified --path, store it in the user's app
       environment via the OMPI_exec_path variable. */
    if (NULL != path) {
        asprintf(&value, "OMPI_exec_path=%s", path);
        opal_argv_append_nosize(dstenv, value);
        /* save it for any comm_spawn'd apps */
        opal_argv_append_nosize(&orte_forwarded_envars, value);
        free(value);
    }

    return ORTE_SUCCESS;
}
Example #8
static int hostfile_parse_line(int token, opal_list_t* updates,
                               opal_list_t* exclude, bool keep_all)
{
    int rc;
    orte_node_t* node;
    bool got_max = false;
    char* value;
    char** argv;
    char* node_name = NULL;
    char* node_alias = NULL;
    char* username = NULL;
    int cnt;
    int number_of_slots = 0;
    char buff[64];

    if (ORTE_HOSTFILE_STRING == token ||
        ORTE_HOSTFILE_HOSTNAME == token ||
        ORTE_HOSTFILE_INT == token ||
        ORTE_HOSTFILE_IPV4 == token ||
        ORTE_HOSTFILE_IPV6 == token) {

        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        argv = opal_argv_split (value, '@');
        
        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        /* if the first letter of the name is '^', then this is a node
         * to be excluded. Remove the ^ character so the nodename is
         * usable, and put it on the exclude list
         */
        if ('^' == node_name[0]) {
            int i, len;
            len = strlen(node_name);
            for (i=1; i < len; i++) {
                node_name[i-1] = node_name[i];
            }
            node_name[len-1] = '\0';  /* truncate */
            
            OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                                 "%s hostfile: node %s is being excluded",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));
            
            /* convert this into something globally unique */
            if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
                /* Nodename has been allocated, that is for sure */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(node_name, orte_process_info.nodename)) {
                    node_alias = strdup(node_name);
                }
                free (node_name);
                node_name = strdup(orte_process_info.nodename);
            }
            
            /* Do we need to make a new node object?  First check to see
               if it's already in the exclude list */
            if (NULL == (node = hostfile_lookup(exclude, node_name, keep_all))) {
                node = OBJ_NEW(orte_node_t);
                node->name = node_name;
                if (NULL != username) {
                    node->username = strdup(username);
                }
            }
            /* Note that we need to add this back to the exclude list.
               If it was found, we just removed it (in hostfile_lookup()),
               so this puts it back. If it was not found, then we have to
               add it to the exclude list anyway. */
            opal_list_append(exclude, &node->super);
            return ORTE_SUCCESS;
        }
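        /* Illustrative hostfile excerpt: a line "^nodeB" lands here and puts
         * nodeB on the exclude list, while ordinary lines such as
         * "nodeA slots=4" fall through to the include handling below. */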
        
        /* this is not a node to be excluded, so we need to process it and
         * add it to the "include" list. See if this host is actually us.
         */
        if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
            /* Nodename has been allocated, that is for sure */
            if (orte_show_resolved_nodenames &&
                0 != strcmp(node_name, orte_process_info.nodename)) {
                node_alias = strdup(node_name);
            }
            free (node_name);
            node_name = strdup(orte_process_info.nodename);
        }

        OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s is being included - keep all is %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
                             keep_all ? "TRUE" : "FALSE"));

        /* Do we need to make a new node object? */
        if (NULL == (node = hostfile_lookup(updates, node_name, keep_all))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;
            node->slots = 1;
            if (NULL != username) {
                node->username = strdup(username);
            }
        } else {
            /* this node was already found once - add a slot and mark slots as "given" */
            node->slots++;
            node->slots_given = true;
        }
        /* do we need to record an alias for this node? */
        if (NULL != node_alias) {
            /* add to list of aliases for this node - only add if unique */
            opal_argv_append_unique_nosize(&node->alias, node_alias, false);
            free(node_alias);
        }
    } else if (ORTE_HOSTFILE_RELATIVE == token) {
        /* store this for later processing */
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(orte_util_hostfile_value.sval);
        if (NULL != username) {
            node->username = strdup(username);
        }
    } else if (ORTE_HOSTFILE_RANK == token) {
        /* we can ignore the rank, but we need to extract the node name. we
         * first need to shift over to the other side of the equal sign as
         * this is where the node name will be
         */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_EQUAL != token) {
            token = orte_util_hostfile_lex();
        }
        if (orte_util_hostfile_done) {
            /* bad syntax somewhere */
            return ORTE_ERROR;
        }
        /* next position should be the node name */
        token = orte_util_hostfile_lex();
        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        
        argv = opal_argv_split (value, '@');
        
        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);
        /* Do we need to make a new node object? */
        if (NULL == (node = hostfile_lookup(updates, node_name, keep_all))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;
            node->slots = 1;
            if (NULL != username) {
                node->username = strdup(username);
            }
        } else {
            /* add a slot */
            node->slots++;
        }
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s slots %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots));
        /* mark the slots as "given" since we take them as being the
         * number specified via the rankfile
         */
        node->slots_given = true;
        /* skip to end of line */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_NEWLINE != token) {
            token = orte_util_hostfile_lex();
        }
        opal_list_append(updates, &node->super);
        return ORTE_SUCCESS;
    } else {
        hostfile_parse_error(token);
        return ORTE_ERROR;
    }
    
    while (!orte_util_hostfile_done) {
        token = orte_util_hostfile_lex();
        
        switch (token) {            
        case ORTE_HOSTFILE_DONE:
            goto done;

        case ORTE_HOSTFILE_NEWLINE:
            goto done;

        case ORTE_HOSTFILE_USERNAME:
            node->username = hostfile_parse_string();
            break;

        case ORTE_HOSTFILE_COUNT:
        case ORTE_HOSTFILE_CPU:
        case ORTE_HOSTFILE_SLOTS:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "slots",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            if (node->slots_given) {
                /* multiple definitions were given for the
                 * slot count - this is not allowed
                 */
                orte_show_help("help-hostfile.txt", "slots-given",
                               true,
                               cur_hostfile_name, node->name);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->slots = rc;
            node->slots_given = true;

            /* Ensure that slots_max >= slots */
            if (node->slots_max != 0 && node->slots_max < node->slots) {
                node->slots_max = node->slots;
            }
            break;

        case ORTE_HOSTFILE_SLOTS_MAX:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "max_slots",
                               true,
                               cur_hostfile_name, ((size_t) rc));
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* Only take this update if it puts us >= node_slots */
            if (rc >= node->slots) {
                if (node->slots_max != rc) {
                    node->slots_max = rc;
                    got_max = true;
                }
            } else {
                orte_show_help("help-hostfile.txt", "max_slots_lt",
                               true,
                               cur_hostfile_name, node->slots, rc);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            break;

        default:
            hostfile_parse_error(token);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
        if (number_of_slots > node->slots) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
    }

 done:
    if (got_max && !node->slots_given) {
        node->slots = node->slots_max;
        node->slots_given = true;
    }
    opal_list_append(updates, &node->super);

    return ORTE_SUCCESS;
}
Example #9
/* Parse the provided hostfile and filter the nodes that are
 * on the input list, removing those that
 * are not found in the hostfile
 */
int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
                                    char *hostfile,
                                    bool remove)
{
    opal_list_t newnodes, exclude;
    opal_list_item_t *item1, *item2, *next, *item3;
    orte_node_t *node_from_list, *node_from_file, *node_from_pool, *node3;
    int rc = ORTE_SUCCESS;
    char *cptr;
    int num_empty, nodeidx;
    bool want_all_empty = false;
    opal_list_t keep;
    bool found;

    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                        "%s hostfile: filtering nodes through hostfile %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));

    /* parse the hostfile and create local list of findings */
    OBJ_CONSTRUCT(&newnodes, opal_list_t);
    OBJ_CONSTRUCT(&exclude, opal_list_t);
    if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude, false))) {
        OBJ_DESTRUCT(&newnodes);
        OBJ_DESTRUCT(&exclude);
        return rc;
    }
    
    /* if the hostfile was empty, then treat it as a no-op filter */
    if (0 == opal_list_get_size(&newnodes)) {
        OBJ_DESTRUCT(&newnodes);
        OBJ_DESTRUCT(&exclude);
        /* indicate that the hostfile was empty */
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* remove from the list of newnodes those that are in the exclude list,
     * since we could have added duplicate names above
     */
    while (NULL != (item1 = opal_list_remove_first(&exclude))) {
        node_from_file = (orte_node_t*)item1;
        /* check for matches on nodes */
        for (item2 = opal_list_get_first(&newnodes);
             item2 != opal_list_get_end(&newnodes);
             item2 = opal_list_get_next(item2)) {
            orte_node_t *node = (orte_node_t*)item2;
            if (0 == strcmp(node_from_file->name, node->name)) {
                /* match - remove it */
                opal_output(0, "HOST %s ON EXCLUDE LIST - REMOVING", node->name);
                opal_list_remove_item(&newnodes, item2);
                OBJ_RELEASE(item2);
                break;
            }
        }
        OBJ_RELEASE(item1);
    }
    
    /* now check our nodes and keep or mark those that match. We can
     * destruct our hostfile list as we go since this won't be needed
     */
    OBJ_CONSTRUCT(&keep, opal_list_t);
    while (NULL != (item2 = opal_list_remove_first(&newnodes))) {
        node_from_file = (orte_node_t*)item2;
        
        next = opal_list_get_next(item2);
        
        /* see if this is a relative node syntax */
        if ('+' == node_from_file->name[0]) {
            /* see if we specified empty nodes */
            if ('e' == node_from_file->name[1] ||
                'E' == node_from_file->name[1]) {
                /* request for empty nodes - do they want
                 * all of them?
                 */
                if (NULL != (cptr = strchr(node_from_file->name, ':'))) {
                    /* the colon indicates a specific # are requested */
                    cptr++; /* step past : */
                    num_empty = strtol(cptr, NULL, 10);
                } else {
                    /* want them all - set num_empty to max */
                    num_empty = INT_MAX;
                    want_all_empty = true;
                }
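                /* e.g. "+e" requests every empty node, while "+e:2" requests
                 * exactly two empty nodes (num_empty = 2). */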
                /* search the list of nodes provided to us and find those
                 * that are empty
                 */
                item1 = opal_list_get_first(nodes);
                while (0 < num_empty && item1 != opal_list_get_end(nodes)) {
                    node_from_list = (orte_node_t*)item1;
                    next = opal_list_get_next(item1);  /* keep our place */
                    if (0 == node_from_list->slots_inuse) {
                        /* check to see if this node is explicitly called
                         * out later - if so, don't use it here
                         */
                        for (item3 = opal_list_get_first(&newnodes);
                             item3 != opal_list_get_end(&newnodes);
                             item3 = opal_list_get_next(item3)) {
                            node3 = (orte_node_t*)item3;
                            if (0 == strcmp(node3->name, node_from_list->name)) {
                                /* match - don't use it */
                                goto skipnode;
                            }
                        }
                        if (remove) {
                            /* remove item from list */
                            opal_list_remove_item(nodes, item1);
                            /* xfer to keep list */
                            opal_list_append(&keep, item1);
                        } else {
                            /* mark as included */
                            node_from_list->mapped = true;
                        }
                        --num_empty;
                    }
                skipnode:
                    item1 = next;
                }
                /* did they get everything they wanted? */
                if (!want_all_empty && 0 < num_empty) {
                    orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
                                   true, num_empty);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }            
            } else if ('n' == node_from_file->name[1] ||
                       'N' == node_from_file->name[1]) {
                /* they want a specific relative node #, so
                 * look it up on global pool
                 */
                nodeidx = strtol(&node_from_file->name[2], NULL, 10);
                if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
                    /* this is an error */
                    orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
                                   true, nodeidx, node_from_file->name);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
                /* search the list of nodes provided to us and find it */
                for (item1 = opal_list_get_first(nodes);
                     item1 != opal_list_get_end(nodes);
                     item1 = opal_list_get_next(item1)) {
                    node_from_list = (orte_node_t*)item1;
                    if (0 == strcmp(node_from_list->name, node_from_pool->name)) {
                        if (remove) {
                            /* match - remove item from list */
                            opal_list_remove_item(nodes, item1);
                            /* xfer to keep list */
                            opal_list_append(&keep, item1);
                        } else {
                            /* mark as included */
                            node_from_list->mapped = true;
                        }
                        break;
                    }
                }
            } else {
                /* invalid relative node syntax */
                orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
                               true, node_from_file->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
        } else {
            /* we are looking for a specific node on the list
             * search the provided list of nodes to see if this
             * one is found
             */
            found = false;
            for (item1 = opal_list_get_first(nodes);
                 item1 != opal_list_get_end(nodes);
                 item1 = opal_list_get_next(item1)) {
                node_from_list = (orte_node_t*)item1;
                /* since the name in the hostfile might not match
                 * our local name, and yet still be intended to match,
                 * we have to check for local interfaces
                 */
                if (0 == strcmp(node_from_file->name, node_from_list->name) ||
                    (0 == strcmp(node_from_file->name, "localhost") &&
                     0 == strcmp(node_from_list->name, orte_process_info.nodename)) ||
                    (opal_ifislocal(node_from_list->name) &&
                     opal_ifislocal(node_from_file->name))) {
                    /* if the slot count here is less than the
                     * total slots avail on this node, set it
                     * to the specified count - this allows people
                     * to subdivide an allocation
                     */
                    if (node_from_file->slots < node_from_list->slots) {
                        node_from_list->slots = node_from_file->slots;
                    }
                    if (remove) {
                        /* remove the node from the list */
                        opal_list_remove_item(nodes, item1);
                        /* xfer it to keep list */
                        opal_list_append(&keep, item1);
                    } else {
                        /* mark as included */
                        node_from_list->mapped = true;
                    }
                    found = true;
                    break;
                }
            }
            /* if the host in the newnode list wasn't found,
             * then that is an error we need to report to the
             * user and abort
             */
            if (!found) {
                orte_show_help("help-hostfile.txt", "hostfile:extra-node-not-found",
                               true, hostfile, node_from_file->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
        }
        /* cleanup the newnode list */
        OBJ_RELEASE(item2);
    }
    
    /* if we still have entries on our hostfile list, then
     * there were requested hosts that were not in our allocation.
     * This is an error - report it to the user and return an error
     */
    if (0 != opal_list_get_size(&newnodes)) {
        orte_show_help("help-hostfile.txt", "not-all-mapped-alloc",
                       true, hostfile);
        while (NULL != (item1 = opal_list_remove_first(&newnodes))) {
            OBJ_RELEASE(item1);
        }
        OBJ_DESTRUCT(&newnodes);
        return ORTE_ERR_SILENT;
    }

    if (!remove) {
        /* all done */
        OBJ_DESTRUCT(&newnodes);
        return ORTE_SUCCESS;
    }

    /* clear the rest of the nodes list */
    while (NULL != (item1 = opal_list_remove_first(nodes))) {
        OBJ_RELEASE(item1);
    }
    
    /* the nodes list has been cleared - rebuild it in order */
    while (NULL != (item1 = opal_list_remove_first(&keep))) {
        opal_list_append(nodes, item1);
    }
    
cleanup:
    OBJ_DESTRUCT(&newnodes);

    return rc;
}
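/*
 * A minimal standalone sketch (hypothetical helper, standard C only; not
 * part of the original source) of the relative-node grammar the filter
 * above accepts: "+e" or "+eN" asks for all, or N, empty nodes, while
 * "+nX" indexes entry X of the global node pool. The -1 convention for the
 * "all empty" case is this sketch's own, standing in for want_all_empty.
 */
#include <stdlib.h>

static void parse_relative_node_sketch(const char *name, int *num_empty,
                                       int *nodeidx)
{
    *num_empty = 0;
    *nodeidx = -1;
    if ('e' == name[1] || 'E' == name[1]) {
        /* "+e" alone means "every empty node"; "+e3" means three of them */
        *num_empty = ('\0' == name[2]) ? -1 : (int)strtol(&name[2], NULL, 10);
    } else if ('n' == name[1] || 'N' == name[1]) {
        /* "+n2" refers to the node stored at index 2 of the global pool */
        *nodeidx = (int)strtol(&name[2], NULL, 10);
    }
    /* anything else falls into the invalid-relative-node-syntax error above */
}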
Example #10
int opal_hwloc_base_set_binding_policy(opal_binding_policy_t *policy, char *spec)
{
    int i;
    opal_binding_policy_t tmp;
    char **tmpvals, **quals;

    /* set default */
    tmp = 0;

    /* binding specification */
    if (NULL == spec) {
        if (opal_hwloc_use_hwthreads_as_cpus) {
            /* default to bind-to hwthread */
            OPAL_SET_DEFAULT_BINDING_POLICY(tmp, OPAL_BIND_TO_HWTHREAD);
        } else {
            /* default to bind-to core */
            OPAL_SET_DEFAULT_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
        }
    } else if (0 == strncasecmp(spec, "none", strlen("none"))) {
        OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NONE);
    } else {
        tmpvals = opal_argv_split(spec, ':');
        if (1 < opal_argv_count(tmpvals) || ':' == spec[0]) {
            if (':' == spec[0]) {
                quals = opal_argv_split(&spec[1], ',');
            } else {
                quals = opal_argv_split(tmpvals[1], ',');
            }
            for (i=0; NULL != quals[i]; i++) {
                if (0 == strncasecmp(quals[i], "if-supported", strlen(quals[i]))) {
                    tmp |= OPAL_BIND_IF_SUPPORTED;
                } else if (0 == strncasecmp(quals[i], "overload-allowed", strlen(quals[i])) ||
                           0 == strncasecmp(quals[i], "oversubscribe-allowed", strlen(quals[i]))) {
                    tmp |= OPAL_BIND_ALLOW_OVERLOAD;
                } else {
                    /* unknown option */
                    opal_output(0, "Unknown qualifier to binding policy: %s", spec);
                    opal_argv_free(quals);
                    opal_argv_free(tmpvals);
                    return OPAL_ERR_BAD_PARAM;
                }
            }
            opal_argv_free(quals);
        }
        if (NULL == tmpvals[0] || ':' == spec[0]) {
            OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
            tmp &= ~OPAL_BIND_GIVEN;
        } else {
            if (0 == strcasecmp(tmpvals[0], "hwthread")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_HWTHREAD);
            } else if (0 == strcasecmp(tmpvals[0], "core")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_CORE);
            } else if (0 == strcasecmp(tmpvals[0], "l1cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L1CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "l2cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L2CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "l3cache")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_L3CACHE);
            } else if (0 == strcasecmp(tmpvals[0], "socket")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_SOCKET);
            } else if (0 == strcasecmp(tmpvals[0], "numa")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_NUMA);
            } else if (0 == strcasecmp(tmpvals[0], "board")) {
                OPAL_SET_BINDING_POLICY(tmp, OPAL_BIND_TO_BOARD);
            } else {
                opal_show_help("help-opal-hwloc-base.txt", "invalid binding_policy", true, "binding", spec);
                opal_argv_free(tmpvals);
                return OPAL_ERR_BAD_PARAM;
            }
        }
        opal_argv_free(tmpvals);
    }

    *policy = tmp;
    return OPAL_SUCCESS;
}
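/*
 * A hypothetical usage sketch (not in the original source). The spec string
 * parsed above has the shape "<target>[:<qualifier>,...]" - for example
 * "core", "socket:if-supported", or ":oversubscribe-allowed" to keep the
 * default target and add a qualifier only.
 */
static int binding_policy_usage_sketch(void)
{
    opal_binding_policy_t policy;
    int rc;

    rc = opal_hwloc_base_set_binding_policy(&policy, "core:if-supported");
    if (OPAL_SUCCESS != rc) {
        return rc;
    }
    /* policy now carries OPAL_BIND_TO_CORE plus OPAL_BIND_IF_SUPPORTED */
    return OPAL_SUCCESS;
}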
Example #11
/*
 * Function for selecting one component from all those that are
 * available.
 */
int orte_rmaps_base_map_job(orte_job_t *jdata)
{
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    
    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */
    
    if (NULL == jdata->map) {
        /* a map has not been defined yet for this job, so set one
         * up here
         */
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* load it with the system defaults */
        map->policy = orte_default_mapping_policy;
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->stride = orte_rmaps_base.stride;
        map->oversubscribe = orte_rmaps_base.oversubscribe;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->policy)) {
            jdata->map->policy |= orte_default_mapping_policy;
        }
    }

    /* if the job is the daemon job, then we are just mapping daemons and
     * not apps in preparation to launch a virtual machine
     */
    if (ORTE_PROC_MY_NAME->jobid == jdata->jobid) {
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_setup_virtual_machine(jdata))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    } else {
        /* cycle thru the available mappers until one agrees to map
         * the job
         */
        did_map = false;
        for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
             item != opal_list_get_end(&orte_rmaps_base.selected_modules);
             item = opal_list_get_next(item)) {
            mod = (orte_rmaps_base_selected_module_t*)item;
            if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
                did_map = true;
                break;
            }
            /* mappers return "next option" if they didn't attempt to
             * map the job. anything else is a true error.
             */
            if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
        }
        /* if we get here without doing the map, or with zero procs in
         * the map, then that's an error
         */
        if (!did_map || 0 == jdata->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
            return ORTE_ERR_FAILED_TO_MAP;
        }
    }
    
    /* if we wanted to display the map, now is the time to do it */
    if (jdata->map->display_map) {
        char *output;
        opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
        if (orte_xml_output) {
            fprintf(orte_xml_fp, "%s\n", output);
            fflush(orte_xml_fp);
        } else {
            opal_output(orte_clean_output, "%s", output);
        }
        free(output);
    }
    
    return ORTE_SUCCESS;
}
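/*
 * The mapper loop above follows the usual MCA "take next option" pattern:
 * each selected module either maps the job, declines with
 * ORTE_ERR_TAKE_NEXT_OPTION, or fails hard. A condensed sketch of just that
 * control flow (hypothetical function name; same types as above):
 */
static int cycle_mappers_sketch(orte_job_t *jdata, opal_list_t *modules)
{
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    int rc;

    for (item = opal_list_get_first(modules);
         item != opal_list_get_end(modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        rc = mod->module->map_job(jdata);
        if (ORTE_SUCCESS == rc) {
            return ORTE_SUCCESS;           /* a module accepted the job */
        }
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            return rc;                     /* hard error - stop the scan */
        }
    }
    return ORTE_ERR_FAILED_TO_MAP;         /* every module declined */
}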
int orte_plm_proxy_spawn(orte_job_t *jdata)
{
    opal_buffer_t buf;
    orte_plm_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_process_name_t *target;
    int rc;
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:proxy spawn child job",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* setup the buffer */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    
    /* tell the HNP we are sending a launch request */
    command = ORTE_PLM_LAUNCH_JOB_CMD;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    
    /* pack the jdata object */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jdata, 1, ORTE_JOB))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    
    /* identify who gets this command - the HNP or the local orted */
    if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SPAWN) {
        /* for now, this is unsupported */
        opal_output(0, "LOCAL DAEMON SPAWN IS CURRENTLY UNSUPPORTED");
        target = ORTE_PROC_MY_HNP;
        /* target = ORTE_PROC_MY_DAEMON; */
    } else {
        target = ORTE_PROC_MY_HNP;
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:proxy sending spawn cmd to %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(target)));
    
    /* tell the target to launch the job */
    if (0 > (rc = orte_rml.send_buffer(target, &buf, ORTE_RML_TAG_PLM, 0))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    OBJ_DESTRUCT(&buf);
    
    
    OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                         "%s plm:base:proxy waiting for response",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* wait for the target's response */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (0 > (rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_PLM_PROXY, 0))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }

    /* get the new jobid back in case the caller wants it */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&buf, &(jdata->jobid), &count, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        goto CLEANUP;
    }
    if (ORTE_JOBID_INVALID == jdata->jobid) {
        /* something went wrong on far end - go no further */
        rc = ORTE_ERR_FAILED_TO_START;
        goto CLEANUP;
    }
    
    /* good to go! */
    
CLEANUP:
    OBJ_DESTRUCT(&buf);
    
    return rc;
}
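/*
 * The proxy spawn above is the standard ORTE command round-trip: pack a
 * command constant plus payload, send it on the PLM tag, then block on the
 * reply tag and unpack the answer. A condensed sketch of that shape,
 * reusing the same dss/rml calls shown above (hypothetical function name;
 * payload and unpacking elided):
 */
static int command_roundtrip_sketch(orte_plm_cmd_flag_t command)
{
    opal_buffer_t buf;
    int rc;

    /* pack the command into a fresh buffer */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_PLM_CMD))) {
        OBJ_DESTRUCT(&buf);
        return rc;
    }
    /* send it to the HNP and release the buffer */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, &buf, ORTE_RML_TAG_PLM, 0);
    OBJ_DESTRUCT(&buf);
    if (0 > rc) {
        return rc;
    }
    /* block for the reply on the proxy tag */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    rc = orte_rml.recv_buffer(ORTE_NAME_WILDCARD, &buf, ORTE_RML_TAG_PLM_PROXY, 0);
    OBJ_DESTRUCT(&buf);
    return (0 > rc) ? rc : ORTE_SUCCESS;
}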
Example #13
int mca_io_ompio_file_preallocate (ompi_file_t *fh,
                                   OMPI_MPI_OFFSET_TYPE diskspace)
{
    int ret = OMPI_SUCCESS, cycles, i;
    OMPI_MPI_OFFSET_TYPE tmp, current_size, size, written, len;
    mca_common_ompio_data_t *data;
    char *buf = NULL;
    ompi_status_public_t *status = NULL;

    data = (mca_common_ompio_data_t *) fh->f_io_selected_data;

    OPAL_THREAD_LOCK(&fh->f_lock);
    tmp = diskspace;

    ret = data->ompio_fh.f_comm->c_coll->coll_bcast (&tmp,
                                                    1,
                                                    OMPI_OFFSET_DATATYPE,
                                                    OMPIO_ROOT,
                                                    data->ompio_fh.f_comm,
                                                    data->ompio_fh.f_comm->c_coll->coll_bcast_module);
    if ( OMPI_SUCCESS != ret ) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return OMPI_ERROR;
    }

    if (tmp != diskspace) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return OMPI_ERROR;
    }
    ret = data->ompio_fh.f_fs->fs_file_get_size (&data->ompio_fh,
                                                 &current_size);
    if ( OMPI_SUCCESS != ret ) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return OMPI_ERROR;
    }
    
    if ( current_size > diskspace ) {
        OPAL_THREAD_UNLOCK(&fh->f_lock);
        return OMPI_SUCCESS;
    }


    /* ROMIO explanation
       On file systems with no preallocation function, we have to
       explicitly write to allocate space. Since there could be holes in the file,
       we need to read up to the current file size, write it back,
       and then write beyond that depending on how much
       preallocation is needed.
    */
    if (OMPIO_ROOT == data->ompio_fh.f_rank) {
        OMPI_MPI_OFFSET_TYPE prev_offset;
        mca_common_ompio_file_get_position (&data->ompio_fh, &prev_offset );

        size = diskspace;
        if (size > current_size) {
            size = current_size;
        }

        cycles = (size + OMPIO_PREALLOC_MAX_BUF_SIZE - 1)/
            OMPIO_PREALLOC_MAX_BUF_SIZE;
        buf = (char *) malloc (OMPIO_PREALLOC_MAX_BUF_SIZE);
        if (NULL == buf) {
            opal_output(1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        written = 0;

        for (i=0; i<cycles; i++) {
            len = OMPIO_PREALLOC_MAX_BUF_SIZE;
            if (len > size-written) {
                len = size - written;
            }
            ret = mca_common_ompio_file_read (&data->ompio_fh, buf, len, MPI_BYTE, status);
            if (ret != OMPI_SUCCESS) {
                goto exit;
            }
            ret = mca_common_ompio_file_write (&data->ompio_fh, buf, len, MPI_BYTE, status);
            if (ret != OMPI_SUCCESS) {
                goto exit;
            }
            written += len;
        }

        if (diskspace > current_size) {
            memset(buf, 0, OMPIO_PREALLOC_MAX_BUF_SIZE);
            size = diskspace - current_size;
            cycles = (size + OMPIO_PREALLOC_MAX_BUF_SIZE - 1) /
                OMPIO_PREALLOC_MAX_BUF_SIZE;
            for (i=0; i<cycles; i++) {
                len = OMPIO_PREALLOC_MAX_BUF_SIZE;
                if (len > diskspace-written) {
                    len = diskspace - written;
                }
                ret = mca_common_ompio_file_write (&data->ompio_fh, buf, len, MPI_BYTE, status);
                if (ret != OMPI_SUCCESS) {
                    goto exit;
                }
                written += len;
            }
        }

        // This operation should not affect file pointer position.
        mca_common_ompio_set_explicit_offset ( &data->ompio_fh, prev_offset);
    }

exit:     
    free ( buf );
    fh->f_comm->c_coll->coll_bcast ( &ret, 1, MPI_INT, OMPIO_ROOT, fh->f_comm,
                                   fh->f_comm->c_coll->coll_bcast_module);
    
    if ( diskspace > current_size ) {
        data->ompio_fh.f_fs->fs_file_set_size (&data->ompio_fh, diskspace);
    }
    OPAL_THREAD_UNLOCK(&fh->f_lock);

    return ret;
}
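/*
 * The preallocation writer above splits a region of `size` bytes into
 * fixed-size chunks via ceiling division, shortening only the final chunk.
 * A standalone sketch of that arithmetic (hypothetical names, standard C
 * only; not part of the original source):
 */
#include <stdio.h>

#define CHUNK_SKETCH 4096   /* stand-in for OMPIO_PREALLOC_MAX_BUF_SIZE */

static void chunk_loop_sketch(long long size)
{
    long long written = 0, len;
    int i, cycles = (int)((size + CHUNK_SKETCH - 1) / CHUNK_SKETCH);

    for (i = 0; i < cycles; i++) {
        len = CHUNK_SKETCH;
        if (len > size - written) {
            len = size - written;       /* final, possibly short, chunk */
        }
        /* a real implementation reads/writes `len` bytes here */
        written += len;
    }
    printf("covered %lld of %lld bytes in %d cycles\n", written, size, cycles);
}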
Example #14
static int orte_ras_localhost_allocate(orte_jobid_t jobid, opal_list_t *attributes)
{
    bool empty;
    int ret;
    opal_list_t nodes;
    orte_ras_node_t *node;
    opal_list_item_t *item;

    /* If the node segment is not empty, do nothing */

    if (ORTE_SUCCESS != (ret = orte_ras_base_node_segment_empty(&empty))) {
        ORTE_ERROR_LOG(ret);
        return ret;
    }
    if (!empty) {
        opal_output(orte_ras_base.ras_output,
                    "orte:ras:localhost: node segment not empty; not doing anything");
        return ORTE_SUCCESS;
    }
    opal_output(orte_ras_base.ras_output,
                "orte:ras:localhost: node segment empty; adding \"localhost\"");

    /* Ok, the node segment is empty -- so add a localhost node */

    node = OBJ_NEW(orte_ras_node_t);
    if (NULL == node) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    /* use the same name we got in orte_system_info so we avoid confusion in
     * the session directories
     */
    node->node_name = strdup(orte_system_info.nodename);
    node->node_arch = NULL;
    node->node_state = ORTE_NODE_STATE_UP;
    /* JMS: this should not be hard-wired to 0, but there's no
       other value to put it to [yet]... */
    node->node_cellid = 0;
    node->node_slots_inuse = 0;
    node->node_slots_max = 0;
    node->node_slots = 1;
    OBJ_CONSTRUCT(&nodes, opal_list_t);
    opal_list_append(&nodes, &node->super);

    /* Put it on the segment and allocate it */

    if (ORTE_SUCCESS !=
        (ret = orte_ras_base_node_insert(&nodes)) ||
        ORTE_SUCCESS != 
        (ret = orte_ras_base_allocate_nodes(jobid, &nodes))) {
        goto cleanup;
    }
    
    /* now indicate that there is uncertainty about the number of slots here,
     * so the launcher should use knowledge of the local number of processors
     * to override any oversubscription flags
     */
    ret = orte_ras_base_set_oversubscribe_override(jobid);
    if (ORTE_SUCCESS != ret) {
        goto cleanup;
    }
    
cleanup:
    item = opal_list_remove_first(&nodes);
    OBJ_RELEASE(item);
    OBJ_DESTRUCT(&nodes);

    /* All done */

    return ret;
}
Example #15
static int plm_slurm_start_proc(int argc, char **argv, char **env,
                                char *prefix)
{
    int fd;
    int srun_pid;
    char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);

    if (NULL == exec_argv) {
        return ORTE_ERR_NOT_FOUND;
    }

    srun_pid = fork();
    if (-1 == srun_pid) {
        ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
        free(exec_argv);
        return ORTE_ERR_SYS_LIMITS_CHILDREN;
    }
    
    if (0 == srun_pid) {  /* child */
        char *bin_base = NULL, *lib_base = NULL;

        /* Figure out the basenames for the libdir and bindir.  There
           is a lengthy comment about this in plm_rsh_module.c
           explaining all the rationale for how / why we're doing
           this. */

        lib_base = opal_basename(opal_install_dirs.libdir);
        bin_base = opal_basename(opal_install_dirs.bindir);

        /* If we have a prefix, then modify the PATH and
           LD_LIBRARY_PATH environment variables.  */
        if (NULL != prefix) {
            char *oldenv, *newenv;

            /* Reset PATH */
            oldenv = getenv("PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, bin_base);
            }
            opal_setenv("PATH", newenv, true, &env);
            OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                 "%s plm:slurm: reset PATH: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 newenv));
            free(newenv);

            /* Reset LD_LIBRARY_PATH */
            oldenv = getenv("LD_LIBRARY_PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, lib_base);
            }
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
            OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                 "%s plm:slurm: reset LD_LIBRARY_PATH: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 newenv));
            free(newenv);
        }

        fd = open("/dev/null", O_CREAT|O_RDWR|O_TRUNC, 0666);
        if (fd >= 0) {
            dup2(fd, 0);
            /* When not in debug mode and --debug-daemons was not passed,
             * tie stdout/stderr to dev null so we don't see messages from orted
             * EXCEPT if the user has requested that we leave sessions attached
             */
            if (0 > opal_output_get_verbosity(orte_plm_base_framework.framework_output) &&
                !orte_debug_daemons_flag && !orte_leave_session_attached) {
                dup2(fd,1);
                dup2(fd,2);
            }
        }

        /* get the srun process out of orterun's process group so that
           signals sent from the shell (like those resulting from
           cntl-c) don't get sent to srun */
        setpgid(0, 0);

        execve(exec_argv, argv, env);

        opal_output(0, "plm:slurm:start_proc: exec failed");
        /* don't return - need to exit - returning would be bad -
           we're not in the calling process anymore */
        exit(1);
    } else {  /* parent */
        /* just in case, make sure that the srun process is not in our
           process group any more.  Stevens says always do this on both
           sides of the fork... */
        setpgid(srun_pid, srun_pid);
        
        /* if this is the primary launch - i.e., not a comm_spawn of a
         * child job - then save the pid
         */
        if (!primary_pid_set) {
            primary_srun_pid = srun_pid;
            primary_pid_set = true;
        }
        
        /* setup the waitpid so we can find out if srun succeeds! */
        orte_wait_cb(srun_pid, srun_wait_cb, NULL);
        free(exec_argv);
    }

    return ORTE_SUCCESS;
}
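/*
 * A self-contained sketch (standard C plus asprintf, which the code above
 * already uses; hypothetical helper name) of the prefix handling above:
 * prepend "<prefix>/<bin_base>" to the existing PATH-style value, or use it
 * alone if the variable is unset.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

static char *prepend_prefix_sketch(const char *prefix, const char *bin_base)
{
    char *newenv = NULL;
    const char *oldenv = getenv("PATH");

    if (NULL != oldenv) {
        asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
    } else {
        asprintf(&newenv, "%s/%s", prefix, bin_base);
    }
    return newenv;   /* caller passes this to setenv() and frees it */
}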
/*
 * Function for selecting one component from all those that are
 * available.
 */
void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
{
    orte_job_t *jdata;
    orte_job_map_t *map;
    int rc;
    bool did_map;
    opal_list_item_t *item;
    orte_rmaps_base_selected_module_t *mod;
    orte_job_t *parent;
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;

    /* convenience */
    jdata = caddy->jdata;

    /* NOTE: NO PROXY COMPONENT REQUIRED - REMOTE PROCS ARE NOT
     * ALLOWED TO CALL RMAPS INDEPENDENTLY. ONLY THE PLM CAN
     * DO SO, AND ALL PLM COMMANDS ARE RELAYED TO HNP
     */
    
    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* NOTE: CHECK FOR JDATA->MAP == NULL. IF IT IS, THEN USE
     * THE VALUES THAT WERE READ BY THE LOCAL MCA PARAMS. THE
     * PLM PROXY WILL SEND A JOB-OBJECT THAT WILL INCLUDE ANY
     * MAPPING DIRECTIVES - OTHERWISE, THAT OBJECT WILL HAVE A
     * NULL MAP FIELD
     * LONE EXCEPTION - WE COPY DISPLAY MAP ACROSS IF THEY
     * DIDN'T SET IT
     */        
    if (NULL == jdata->map) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: creating new map for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));
        /* create a map object where we will store the results */
        map = OBJ_NEW(orte_job_map_t);
        if (NULL == map) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
        /* load it with the system defaults */
        map->mapping = orte_rmaps_base.mapping;
        map->ranking = orte_rmaps_base.ranking;
#if OPAL_HAVE_HWLOC
        map->binding = opal_hwloc_binding_policy;
#endif
        if (NULL != orte_rmaps_base.ppr) {
            map->ppr = strdup(orte_rmaps_base.ppr);
        }
        map->cpus_per_rank = orte_rmaps_base.cpus_per_rank;
        map->display_map = orte_rmaps_base.display_map;
        /* assign the map object to this job */
        jdata->map = map;
    } else {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps: setting mapping policies for job %s",
                            ORTE_JOBID_PRINT(jdata->jobid));

        if (!jdata->map->display_map) {
            jdata->map->display_map = orte_rmaps_base.display_map;
        }
        /* set the default mapping policy IFF it wasn't provided */
        if (!ORTE_MAPPING_POLICY_IS_SET(jdata->map->mapping)) {
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, orte_rmaps_base.mapping);
        }
        if (!ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping));
        }
        /* ditto for rank and bind policies */
        if (!ORTE_RANKING_POLICY_IS_SET(jdata->map->ranking)) {
            ORTE_SET_RANKING_POLICY(jdata->map->ranking, orte_rmaps_base.ranking);
        }
#if OPAL_HAVE_HWLOC
        if (!OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            jdata->map->binding = opal_hwloc_binding_policy;
        }
#endif
    }

#if OPAL_HAVE_HWLOC
    /* if we are not going to launch, then we need to set any
     * undefined topologies to match our own so the mapper
     * can operate
     */
    if (orte_do_not_launch) {
        orte_node_t *node;
        hwloc_topology_t t0;
        int i;
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        t0 = node->topology;
        for (i=1; i < orte_node_pool->size; i++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                continue;
            }
            if (NULL == node->topology) {
                node->topology = t0;
            }
        }
    }
#endif

    /* cycle thru the available mappers until one agrees to map
     * the job
     */
    did_map = false;
    for (item = opal_list_get_first(&orte_rmaps_base.selected_modules);
         item != opal_list_get_end(&orte_rmaps_base.selected_modules);
         item = opal_list_get_next(item)) {
        mod = (orte_rmaps_base_selected_module_t*)item;
        if (ORTE_SUCCESS == (rc = mod->module->map_job(jdata))) {
            did_map = true;
            break;
        }
        /* mappers return "next option" if they didn't attempt to
         * map the job. anything else is a true error.
         */
        if (ORTE_ERR_TAKE_NEXT_OPTION != rc) {
            ORTE_ERROR_LOG(rc);
            ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
            OBJ_RELEASE(caddy);
            return;
        }
    }
    /* if we get here without doing the map, or with zero procs in
     * the map, then that's an error
     */
    if (!did_map || 0 == jdata->num_procs) {
        orte_show_help("help-orte-rmaps-base.txt", "failed-map", true);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }

    /* compute and save local ranks */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    
#if OPAL_HAVE_HWLOC
    /* compute and save bindings */
    if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
        ORTE_ERROR_LOG(rc);
        ORTE_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
#endif
    
    /* if it is a dynamic spawn, save the bookmark on the parent's job too */
    if (ORTE_JOBID_INVALID != jdata->originator.jobid) {
        if (NULL != (parent = orte_get_job_data_object(jdata->originator.jobid))) {
            parent->bookmark = jdata->bookmark;
        }
    }

    /* if we wanted to display the map, now is the time to do it - ignore
     * daemon job
     */
    if (jdata->map->display_map) {
        char *output;
        int i, j;
        orte_node_t *node;
        orte_proc_t *proc;

        if (orte_display_diffable_output) {
            /* intended solely to test mapping methods, this output
             * can become quite long when testing at scale. Rather
             * than enduring all the malloc/free's required to
             * create an arbitrary-length string, custom-generate
             * the output a line at a time here
             */
            /* display just the procs in a diffable format */
            opal_output(orte_clean_output, "<map>");
            fflush(stderr);
            /* loop through nodes */
            for (i=0; i < jdata->map->nodes->size; i++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, i))) {
                    continue;
                }
                opal_output(orte_clean_output, "\t<host name=%s>", (NULL == node->name) ? "UNKNOWN" : node->name);
                fflush(stderr);
                for (j=0; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
#if OPAL_HAVE_HWLOC
                    {
                        char locale[64];

                        /* initialize in case this proc has no locale */
                        locale[0] = '\0';
                        if (NULL != proc->locale) {
                            hwloc_bitmap_list_snprintf(locale, 64, proc->locale->cpuset);
                        }
                        opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu locale=%s binding=%s[%s:%u]>",
                                    ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                    (unsigned long)proc->local_rank,
                                    (unsigned long)proc->node_rank, locale,
                                    (NULL == proc->cpu_bitmap) ? "NULL" : proc->cpu_bitmap,
                                    opal_hwloc_base_print_level(jdata->map->bind_level), proc->bind_idx);
                    }
#else
                    opal_output(orte_clean_output, "\t\t<process rank=%s app_idx=%ld local_rank=%lu node_rank=%lu>",
                                ORTE_VPID_PRINT(proc->name.vpid),  (long)proc->app_idx,
                                (unsigned long)proc->local_rank,
                                (unsigned long)proc->node_rank);
#endif
                    fflush(stderr);
                }
                opal_output(orte_clean_output, "\t</host>");
                fflush(stderr);
            }
#if OPAL_HAVE_HWLOC
            {
                opal_hwloc_locality_t locality;
                orte_proc_t *p0;

                /* test locality - for the first node, print the locality of each proc relative to the first one */
                node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, 0);
                p0 = (orte_proc_t*)opal_pointer_array_get_item(node->procs, 0);
                opal_output(orte_clean_output, "\t<locality>");
                for (j=1; j < node->procs->size; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    locality = opal_hwloc_base_get_relative_locality(node->topology,
                                                                     jdata->map->bind_level,
                                                                     p0->bind_idx,
                                                                     jdata->map->bind_level,
                                                                     proc->bind_idx);
                    opal_output(orte_clean_output, "\t\t<bind_level=%s rank=%s bind_idx=%u rank=%s bind_idx=%u locality=%s>",
                                opal_hwloc_base_print_level(jdata->map->bind_level),
                                ORTE_VPID_PRINT(p0->name.vpid),
                                p0->bind_idx, ORTE_VPID_PRINT(proc->name.vpid),
                                proc->bind_idx, opal_hwloc_base_print_locality(locality));
                }
                opal_output(orte_clean_output, "\t</locality>\n</map>");
                fflush(stderr);
            }
#else
            opal_output(orte_clean_output, "\n</map>");
            fflush(stderr);
#endif
        } else {
            opal_dss.print(&output, NULL, jdata->map, ORTE_JOB_MAP);
            if (orte_xml_output) {
                fprintf(orte_xml_fp, "%s\n", output);
                fflush(orte_xml_fp);
            } else {
                opal_output(orte_clean_output, "%s", output);
            }
            free(output);
        }
    }
    /* set the job state to the next position */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_COMPLETE);

    /* cleanup */
    OBJ_RELEASE(caddy);
}
Example #17
int
main(int argc, char *argv[])
{
    int ret, exit_status = OPAL_SUCCESS;
    int child_pid;
    int prev_pid = 0;
    int idx;
    opal_crs_base_snapshot_t *snapshot = NULL;
    char * tmp_env_var = NULL;
    bool select = false;

    /***************
     * Initialize
     ***************/
    if (OPAL_SUCCESS != (ret = initialize(argc, argv))) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Check for existence of the file, or program in the case of self
     */
    if( OPAL_SUCCESS != (ret = check_file() )) {
        opal_show_help("help-opal-restart.txt", "invalid_filename", true,
                       opal_restart_globals.snapshot_ref);
        exit_status = ret;
        goto cleanup;
    }

    /* Re-enable the selection of the CRS component, so we can choose the right one */
    idx = mca_base_var_find(NULL, "crs", "base", "do_not_select");

    if (0 > idx) {
        opal_output(opal_restart_globals.output,
                    "MCA variable opal_crs_base_do_not_select not found\n");
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    ret = mca_base_var_set_value(idx, &select, 0, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
    if (OPAL_SUCCESS != ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Make sure we are using the correct checkpointer
     */
    if(NULL == expected_crs_comp) {
        char * full_metadata_path = NULL;
        FILE * metadata = NULL;

        opal_asprintf(&full_metadata_path, "%s/%s/%s",
                 opal_restart_globals.snapshot_loc,
                 opal_restart_globals.snapshot_ref,
                 opal_restart_globals.snapshot_metadata);
        if( NULL == (metadata = fopen(full_metadata_path, "r")) ) {
            opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                           opal_restart_globals.snapshot_metadata,
                           full_metadata_path);
            exit_status = OPAL_ERROR;
            goto cleanup;
        }
        if( OPAL_SUCCESS != (ret = opal_crs_base_extract_expected_component(metadata,
                                                                            &expected_crs_comp,
                                                                            &prev_pid)) ) {
            opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                           opal_restart_globals.snapshot_metadata,
                           full_metadata_path);
            exit_status = ret;
            goto cleanup;
        }

        free(full_metadata_path);
        full_metadata_path = NULL;

        fclose(metadata);
        metadata = NULL;
    }

    opal_output_verbose(10, opal_restart_globals.output,
                        "Restart Expects checkpointer: (%s)",
                        expected_crs_comp);

    (void) mca_base_var_env_name("crs", &tmp_env_var);
    opal_setenv(tmp_env_var,
                expected_crs_comp,
                true, &environ);
    free(tmp_env_var);
    tmp_env_var = NULL;

    /* Select this component or don't continue.
     * If the selection of this component fails, then we can't
     * restart on this node because it doesn't have the proper checkpointer
     * available.
     */
    if( OPAL_SUCCESS != (ret = opal_crs_base_open(MCA_BASE_OPEN_DEFAULT)) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
                       "crs", ret);
        exit_status = ret;
        goto cleanup;
    }

    if( OPAL_SUCCESS != (ret = opal_crs_base_select()) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_failure", true,
                       expected_crs_comp, ret);
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Make sure we have selected the proper component
     */
    if(NULL == expected_crs_comp ||
       0 != strncmp(expected_crs_comp,
                    opal_crs_base_selected_component.base_version.mca_component_name,
                    strlen(expected_crs_comp)) ) {
        opal_show_help("help-opal-restart.txt", "comp_select_mismatch",
                       true,
                       expected_crs_comp,
                       opal_crs_base_selected_component.base_version.mca_component_name,
                       ret);
        exit_status = ret;
        goto cleanup;
    }

    /******************************
     * Restart in this process
     ******************************/
    opal_output_verbose(10, opal_restart_globals.output,
                        "Restarting from file (%s)\n",
                        opal_restart_globals.snapshot_ref);

    snapshot = OBJ_NEW(opal_crs_base_snapshot_t);
    snapshot->cold_start         = true;
    opal_asprintf(&(snapshot->snapshot_directory), "%s/%s",
             opal_restart_globals.snapshot_loc,
             opal_restart_globals.snapshot_ref);
    opal_asprintf(&(snapshot->metadata_filename), "%s/%s",
             snapshot->snapshot_directory,
             opal_restart_globals.snapshot_metadata);

    /* Since some checkpoint/restart systems don't pass along env vars to the
     * restarted app, we need to take care of that.
     *
     * Included here is the creation of any files or directories that need to be
     * created before the process is restarted.
     */
    if(OPAL_SUCCESS != (ret = post_env_vars(prev_pid, snapshot) ) ) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Do the actual restart
     */
    ret = opal_crs.crs_restart(snapshot,
                               false,
                               &child_pid);

    if (OPAL_SUCCESS != ret) {
        opal_show_help("help-opal-restart.txt", "restart_cmd_failure", true,
                       opal_restart_globals.snapshot_ref,
                       ret,
                       opal_crs_base_selected_component.base_version.mca_component_name);
        exit_status = ret;
        goto cleanup;
    }
    /* Should never get here, since crs_restart calls exec */

    /***************
     * Cleanup
     ***************/
 cleanup:
    if (OPAL_SUCCESS != (ret = finalize())) {
        return ret;
    }

    if (NULL != snapshot) {
        /* snapshot came from OBJ_NEW, so release rather than destruct it */
        OBJ_RELEASE(snapshot);
    }

    return exit_status;
}
Example #18
/*
 * A file descriptor is available/ready for send. Check the state
 * of the socket and take the appropriate action.
 */
void mca_oob_usock_send_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    mca_oob_usock_send_t* msg = peer->send_msg;
    int rc;

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s usock:send_handler called to send to peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECTING:
    case MCA_OOB_USOCK_CLOSED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s usock:send_handler %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            mca_oob_usock_state_print(peer->state));
        mca_oob_usock_peer_complete_connect(peer);
        /* de-activate the send event until the connection
         * handshake completes
         */
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s usock:send_handler SENDING TO %s",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            (NULL == peer->send_msg) ? "NULL" : ORTE_NAME_PRINT(&peer->name));
        if (NULL != msg) {
            /* if the header hasn't been completely sent, send it */
            if (!msg->hdr_sent) {
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* header is completely sent */
                    msg->hdr_sent = true;
                    /* setup to send the data */
                    if (NULL == msg->msg) {
                        /* this was a zero-byte msg - nothing more to do */
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                        goto next;
                    } else if (NULL != msg->msg->buffer) {
                        /* send the buffer data as a single block */
                        msg->sdptr = msg->msg->buffer->base_ptr;
                        msg->sdbytes = msg->msg->buffer->bytes_used;
                    } else if (NULL != msg->msg->iov) {
                        /* start with the first iovec */
                        msg->sdptr = msg->msg->iov[0].iov_base;
                        msg->sdbytes = msg->msg->iov[0].iov_len;
                        msg->iovnum = 0;
                    } else {
                        msg->sdptr = msg->msg->data;
                        msg->sdbytes = msg->msg->count;
                    }
                    /* fall thru and let the send progress */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send header",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)));
                    opal_event_del(&peer->send_event);
                    peer->send_ev_active = false;
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    goto next;
                }
            }
            /* progress the data transmission */
            if (msg->hdr_sent) {
                if (ORTE_SUCCESS == (rc = send_bytes(peer))) {
                    /* this block is complete */
                    if (NULL != msg->msg->buffer) {
                        /* we are done - notify the RML */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            msg->hdr.nbytes, peer->sd);
                        msg->msg->status = ORTE_SUCCESS;
                        ORTE_RML_SEND_COMPLETE(msg->msg);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else if (NULL != msg->msg->data) {
                        /* this was a relay message - nothing more to do */
                        opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                            "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                            ORTE_NAME_PRINT(&(peer->name)),
                                            msg->hdr.nbytes, peer->sd);
                        OBJ_RELEASE(msg);
                        peer->send_msg = NULL;
                    } else {
                        /* rotate to the next iovec */
                        msg->iovnum++;
                        if (msg->iovnum < msg->msg->count) {
                            msg->sdptr = msg->msg->iov[msg->iovnum].iov_base;
                            msg->sdbytes = msg->msg->iov[msg->iovnum].iov_len;
                            /* exit this event to give the event lib
                             * a chance to progress any other pending
                             * actions
                             */
                            return;
                        } else {
                            /* this message is complete - notify the RML */
                            opal_output_verbose(2, orte_oob_base_framework.framework_output,
                                                "%s MESSAGE SEND COMPLETE TO %s OF %d BYTES ON SOCKET %d",
                                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                                ORTE_NAME_PRINT(&(peer->name)),
                                                msg->hdr.nbytes, peer->sd);
                            msg->msg->status = ORTE_SUCCESS;
                            ORTE_RML_SEND_COMPLETE(msg->msg);
                            OBJ_RELEASE(msg);
                            peer->send_msg = NULL;
                        }
                    }
                    /* fall thru to queue the next message */
                } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                           ORTE_ERR_WOULD_BLOCK == rc) {
                    /* exit this event and let the event lib progress */
                    return;
                } else {
                    // report the error
                    opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: unable to send message ON SOCKET %d",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&(peer->name)), peer->sd);
                    opal_event_del(&peer->send_event);
                    peer->send_ev_active = false;
                    msg->msg->status = rc;
                    ORTE_RML_SEND_COMPLETE(msg->msg);
                    OBJ_RELEASE(msg);
                    peer->send_msg = NULL;
                    ORTE_FORCED_TERMINATE(1);
                    return;
                }
            }

        next:
            /* if current message completed - progress any pending sends by
             * moving the next in the queue into the "on-deck" position. Note
             * that this doesn't mean we send the message right now - we will
             * wait for another send_event to fire before doing so. This gives
             * us a chance to service any pending recvs.
             */
            peer->send_msg = (mca_oob_usock_send_t*)
                opal_list_remove_first(&peer->send_queue);
        }
        /* if nothing else to do unregister for send event notifications */
        if (NULL == peer->send_msg && peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_send_handler: invalid connection state (%d) on socket %d",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state, peer->sd);
        if (peer->send_ev_active) {
            opal_event_del(&peer->send_event);
            peer->send_ev_active = false;
        }
        break;
    }
}
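/*
 * send_bytes() above reports RESOURCE_BUSY / WOULD_BLOCK so the handler can
 * yield back to the event library and retry on the next send event. A
 * minimal POSIX sketch of that partial-write discipline (hypothetical
 * wrapper; not the actual send_bytes implementation):
 */
#include <errno.h>
#include <unistd.h>

/* returns 0 when the block is fully sent, 1 to retry on the next event,
 * and -1 on a hard error */
static int partial_send_sketch(int sd, const char **ptr, size_t *remaining)
{
    while (*remaining > 0) {
        ssize_t n = write(sd, *ptr, *remaining);
        if (0 > n) {
            if (EAGAIN == errno || EWOULDBLOCK == errno || EINTR == errno) {
                return 1;    /* socket not ready - wait for the next event */
            }
            return -1;       /* unrecoverable error on this connection */
        }
        *ptr += n;
        *remaining -= (size_t)n;
    }
    return 0;
}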
Example #19
static int post_env_vars(int prev_pid, opal_crs_base_snapshot_t *snapshot)
{
    int ret, exit_status = OPAL_SUCCESS;
    char *command = NULL;
    char *proc_file = NULL;
    char **loc_touch = NULL;
    char **loc_mkdir = NULL;
    int argc, i;

    if( 0 > prev_pid ) {
        opal_output(opal_restart_globals.output,
                    "Invalid PID (%d)\n",
                    prev_pid);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }

    /*
     * This is needed so we can pass the previous environment to the restarted
     * application process.
     */
    opal_asprintf(&proc_file, "%s/%s-%d", opal_tmp_directory(), OPAL_CR_BASE_ENV_NAME, prev_pid);
    opal_asprintf(&command, "env | grep OMPI_ > %s", proc_file);

    opal_output_verbose(5, opal_restart_globals.output,
                        "post_env_vars: Execute: <%s>", command);

    ret = system(command);
    if( 0 > ret) {
        exit_status = ret;
        goto cleanup;
    }

    /*
     * Any directories that need to be created
     */
    if( NULL == (snapshot->metadata = fopen(snapshot->metadata_filename, "r")) ) {
        opal_show_help("help-opal-restart.txt", "invalid_metadata", true,
                       opal_restart_globals.snapshot_metadata,
                       snapshot->metadata_filename);
        exit_status = OPAL_ERROR;
        goto cleanup;
    }
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_MKDIR, &loc_mkdir);
    argc = opal_argv_count(loc_mkdir);
    for( i = 0; i < argc; ++i ) {
        if( NULL != command ) {
            free(command);
            command = NULL;
        }
        opal_asprintf(&command, "mkdir -p %s", loc_mkdir[i]);

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if( 0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if( 0 < argc ) {
        system("sync ; sync");
    }

    /*
     * Any files that need to exist
     */
    opal_crs_base_metadata_read_token(snapshot->metadata, CRS_METADATA_TOUCH, &loc_touch);
    argc = opal_argv_count(loc_touch);
    for( i = 0; i < argc; ++i ) {
        if( NULL != command ) {
            free(command);
            command = NULL;
        }
        opal_asprintf(&command, "touch %s", loc_touch[i]);

        opal_output_verbose(5, opal_restart_globals.output,
                            "post_env_vars: Execute: <%s>", command);

        ret = system(command);
        if( 0 > ret) {
            exit_status = ret;
            goto cleanup;
        }
    }
    if( 0 < argc ) {
        system("sync ; sync");
    }

 cleanup:
    if( NULL != command) {
        free(command);
        command = NULL;
    }
    if( NULL != proc_file) {
        free(proc_file);
        proc_file = NULL;
    }
    if( NULL != loc_mkdir ) {
        opal_argv_free(loc_mkdir);
        loc_mkdir = NULL;
    }
    if( NULL != loc_touch ) {
        opal_argv_free(loc_touch);
        loc_touch = NULL;
    }

    if( NULL != snapshot->metadata ) {
        fclose(snapshot->metadata);
        snapshot->metadata = NULL;
    }

    return exit_status;
}
Example #20
void mca_oob_usock_recv_handler(int sd, short flags, void *cbdata)
{
    mca_oob_usock_peer_t* peer = (mca_oob_usock_peer_t*)cbdata;
    int rc;
    orte_rml_send_t *snd;

    if (orte_abnormal_term_ordered) {
        return;
    }

    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                        "%s:usock:recv:handler called for peer %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(&peer->name));

    switch (peer->state) {
    case MCA_OOB_USOCK_CONNECT_ACK:
        if (ORTE_SUCCESS == (rc = mca_oob_usock_peer_recv_connect_ack(peer, peer->sd, NULL))) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler starting send/recv events",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            /* we connected! Start the send/recv events */
            if (!peer->recv_ev_active) {
                opal_event_add(&peer->recv_event, 0);
                peer->recv_ev_active = true;
            }
            if (peer->timer_ev_active) {
                opal_event_del(&peer->timer_event);
                peer->timer_ev_active = false;
            }
            /* if there is a message waiting to be sent, queue it */
            if (NULL == peer->send_msg) {
                peer->send_msg = (mca_oob_usock_send_t*)opal_list_remove_first(&peer->send_queue);
            }
            if (NULL != peer->send_msg && !peer->send_ev_active) {
                opal_event_add(&peer->send_event, 0);
                peer->send_ev_active = true;
            }
            /* update our state */
            peer->state = MCA_OOB_USOCK_CONNECTED;
        } else {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s UNABLE TO COMPLETE CONNECT ACK WITH %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                ORTE_NAME_PRINT(&peer->name));
            opal_event_del(&peer->recv_event);
            peer->recv_ev_active = false;
            ORTE_FORCED_TERMINATE(1);
            return;
        }
        break;
    case MCA_OOB_USOCK_CONNECTED:
        opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                            "%s:usock:recv:handler CONNECTED",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* allocate a new message and setup for recv */
        if (NULL == peer->recv_msg) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler allocate new recv msg",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            peer->recv_msg = OBJ_NEW(mca_oob_usock_recv_t);
            if (NULL == peer->recv_msg) {
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to allocate recv message\n",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                return;
            }
            /* start by reading the header */
            peer->recv_msg->rdptr = (char*)&peer->recv_msg->hdr;
            peer->recv_msg->rdbytes = sizeof(mca_oob_usock_hdr_t);
        }
        /* if the header hasn't been completely read, read it */
        if (!peer->recv_msg->hdr_recvd) {
            opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                "%s:usock:recv:handler read hdr",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* completed reading the header */
                peer->recv_msg->hdr_recvd = true;
                /* if this is a zero-byte message, then we are done */
                if (0 == peer->recv_msg->hdr.nbytes) {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s RECVD ZERO-BYTE MESSAGE FROM %s for tag %d",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                        ORTE_NAME_PRINT(&peer->name), peer->recv_msg->hdr.tag);
                    peer->recv_msg->data = NULL;  // make sure
                    peer->recv_msg->rdptr = NULL;
                    peer->recv_msg->rdbytes = 0;
                } else {
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                        "%s:usock:recv:handler allocate data region of size %lu",
                                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (unsigned long)peer->recv_msg->hdr.nbytes);
                    /* allocate the data region */
                    peer->recv_msg->data = (char*)malloc(peer->recv_msg->hdr.nbytes);
                    /* point to it */
                    peer->recv_msg->rdptr = peer->recv_msg->data;
                    peer->recv_msg->rdbytes = peer->recv_msg->hdr.nbytes;
                }
                /* fall thru and attempt to read the data */
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                /* close the connection */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s:usock:recv:handler error reading bytes - closing connection",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                mca_oob_usock_peer_close(peer);
                return;
            }
        }

        if (peer->recv_msg->hdr_recvd) {
            /* continue to read the data block - we start from
             * wherever we left off, which could be at the
             * beginning or somewhere in the message
             */
            if (ORTE_SUCCESS == (rc = read_bytes(peer))) {
                /* we recvd all of the message */
                opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                    "%s RECVD COMPLETE MESSAGE FROM %s OF %d BYTES FOR DEST %s TAG %d",
                                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.origin),
                                    (int)peer->recv_msg->hdr.nbytes,
                                    ORTE_NAME_PRINT(&peer->recv_msg->hdr.dst),
                                    peer->recv_msg->hdr.tag);
                /* am I the intended recipient? */
                if (peer->recv_msg->hdr.dst.jobid == ORTE_PROC_MY_NAME->jobid &&
                    peer->recv_msg->hdr.dst.vpid == ORTE_PROC_MY_NAME->vpid) {
                    /* yes - post it to the RML for delivery */
                    opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
                                       "%s DELIVERING TO RML",
                                       ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                    ORTE_RML_POST_MESSAGE(&peer->recv_msg->hdr.origin, peer->recv_msg->hdr.tag,
                                          peer->recv_msg->hdr.seq_num,
                                          peer->recv_msg->data,
                                          peer->recv_msg->hdr.nbytes);
                    OBJ_RELEASE(peer->recv_msg);
                } else {
                    /* no - we don't route things, so we promote this
                     * back to the OOB and let another transport move
                     * it along. If we are a daemon and it is intended
                     * for another of our local procs, it will just come
                     * back to us and be handled then
                     */
                    snd = OBJ_NEW(orte_rml_send_t);
                    snd->dst = peer->recv_msg->hdr.dst;
                    snd->origin = peer->recv_msg->hdr.origin;
                    snd->tag = peer->recv_msg->hdr.tag;
                    snd->data = peer->recv_msg->data;
                    snd->seq_num = peer->recv_msg->hdr.seq_num;
                    snd->count = peer->recv_msg->hdr.nbytes;
                    snd->cbfunc.iov = NULL;
                    snd->cbdata = NULL;
                    /* activate the OOB send state */
                    ORTE_OOB_SEND(snd);
                    /* protect the data */
                    peer->recv_msg->data = NULL;
                    /* cleanup */
                    OBJ_RELEASE(peer->recv_msg);
                    return;
                }
            } else if (ORTE_ERR_RESOURCE_BUSY == rc ||
                       ORTE_ERR_WOULD_BLOCK == rc) {
                /* exit this event and let the event lib progress */
                return;
            } else {
                // report the error
                opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: unable to recv message",
                            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&(peer->name)));
                /* turn off the recv event */
                opal_event_del(&peer->recv_event);
                peer->recv_ev_active = false;
                ORTE_FORCED_TERMINATE(1);
                return;
            }
        }
        break;
    default:
        opal_output(0, "%s-%s mca_oob_usock_peer_recv_handler: invalid socket state(%d)",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                    ORTE_NAME_PRINT(&(peer->name)),
                    peer->state);
        // mca_oob_usock_peer_close(peer);
        break;
    }
}
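read_bytes() is referenced throughout the handler above but defined elsewhere in the usock component. The sketch below shows one plausible shape for it, assuming the usual nonblocking-socket idiom (retry on EINTR, defer on EAGAIN) and using only ORTE return codes the handler already checks; it is an illustration, not the actual OOB helper, and would additionally need <errno.h> and <unistd.h>.

/* Hypothetical read_bytes()-style helper: drain the socket into the recv
 * cursor until rdbytes is exhausted, mapping EAGAIN/EWOULDBLOCK onto
 * ORTE_ERR_WOULD_BLOCK so the caller can return to the event loop. */
static int read_bytes_sketch(int sd, char **rdptr, size_t *rdbytes)
{
    while (0 < *rdbytes) {
        ssize_t n = read(sd, *rdptr, *rdbytes);
        if (0 > n) {
            if (EINTR == errno) {
                continue;                    /* interrupted - just retry */
            }
            if (EAGAIN == errno || EWOULDBLOCK == errno) {
                return ORTE_ERR_WOULD_BLOCK; /* let the event lib progress */
            }
            return ORTE_ERROR;               /* hard error - caller closes */
        }
        if (0 == n) {
            return ORTE_ERROR;               /* peer closed the connection */
        }
        *rdptr += n;
        *rdbytes -= (size_t)n;
    }
    return ORTE_SUCCESS;
}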
Example #21
int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                           uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                           mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                           int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
    struct iovec dst_iov = {.iov_base = local_address, .iov_len = size};
    ssize_t ret;

    /*
     * According to the man page:
     * "On success, process_vm_readv() returns the number of bytes read and
     * process_vm_writev() returns the number of bytes written.  This return
     * value may be less than the total number of requested bytes, if a
     * partial read/write occurred.  (Partial transfers apply at the
     * granularity of iovec elements.  These system calls won't perform a
     * partial transfer that splits a single iovec element.)"
     * Since we use a single iovec element, the returned size should be
     * either 0 or size, and the do loop should not be needed here.
     * However, we tried on various Linux kernels with size > 2 GB and,
     * surprisingly, the returned value is always 0x7ffff000 (fwiw, that
     * happens to be the largest number of pages that fits in a signed
     * 32-bit integer). We do not know whether this is a bug in the kernel,
     * the libc, or the man page, but for the time being we act as if
     * process_vm_readv() could return any value.
     */
    do {
        ret = process_vm_readv (endpoint->segment_data.other.seg_ds->seg_cpid, &dst_iov, 1, &src_iov, 1, 0);
        if (0 > ret) {
            opal_output(0, "Read %ld, expected %lu, errno = %d\n", (long)ret, (unsigned long)size, errno);
            return OPAL_ERROR;
        }
        src_iov.iov_base = (void *)((char *)src_iov.iov_base + ret);
        src_iov.iov_len -= ret;
        dst_iov.iov_base = (void *)((char *)dst_iov.iov_base + ret);
        dst_iov.iov_len -= ret;
    } while (0 < src_iov.iov_len);

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
#endif

#if OPAL_BTL_VADER_HAVE_KNEM
int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
                            uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
                            mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
                            int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    struct knem_cmd_param_iovec recv_iovec;
    struct knem_cmd_inline_copy icopy;

    /* Fill in the ioctl data fields.  There's no async completion, so
       we don't need to worry about getting a slot, etc. */
    recv_iovec.base = (uintptr_t) local_address;
    recv_iovec.len = size;
    icopy.local_iovec_array = (uintptr_t) &recv_iovec;
    icopy.local_iovec_nr    = 1;
    icopy.remote_cookie     = remote_handle->cookie;
    icopy.remote_offset     = remote_address - remote_handle->base_addr;
    icopy.write             = 0;
    icopy.flags             = 0;

    /* Use the DMA flag if knem supports it *and* the segment length
     * is greater than the cutoff. Note that if DMA is not supported,
     * or the user specified 0 for knem_dma_min, then knem_dma_min was
     * set to UINT_MAX in mca_btl_vader_knem_init. */
    if (mca_btl_vader_component.knem_dma_min <= size) {
        icopy.flags = KNEM_FLAG_DMA;
    }
    /* synchronous flags only, no need to specify icopy.async_status_index */

    /* When the ioctl returns, the transfer is done and we can invoke
       the btl callback and return the frag */
    if (OPAL_UNLIKELY(0 != ioctl (mca_btl_vader.knem_fd, KNEM_CMD_INLINE_COPY, &icopy))) {
        return OPAL_ERROR;
    }

    if (KNEM_STATUS_FAILED == icopy.current_status) {
        return OPAL_ERROR;
    }

    /* always call the callback function */
    cbfunc (btl, endpoint, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS);

    return OPAL_SUCCESS;
}
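The comment above refers to initialization logic that lives in mca_btl_vader_knem_init. A hedged restatement of that cutoff convention (illustrative only, not the verbatim init code; dma_supported stands in for however init detects KNEM DMA capability) is:

/* Sketch of the knem_dma_min convention: when DMA is unavailable, or the
 * user asked for 0 (meaning "never use DMA"), push the cutoff to UINT_MAX
 * so the "knem_dma_min <= size" test in get_knem can never fire. */
static void knem_dma_min_init_sketch (bool dma_supported)
{
    if (!dma_supported || 0 == mca_btl_vader_component.knem_dma_min) {
        mca_btl_vader_component.knem_dma_min = UINT_MAX;
    }
}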
#endif

static void mca_btl_vader_sc_emu_get_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
                                               mca_btl_base_descriptor_t *desc, int status)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
    mca_btl_vader_sc_emu_hdr_t *hdr;
    void *local_address = frag->rdma.local_address;
    size_t len = frag->segments[0].seg_len - sizeof (*hdr);
    void *context = frag->rdma.context;
    void *cbdata = frag->rdma.cbdata;
    mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
    void *data;

    hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
    data = (void *) (hdr + 1);

    memcpy (local_address, data, len);

    /* return the fragment before calling the callback */
    MCA_BTL_VADER_FRAG_RETURN(frag);

    cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
}
Example #22
int opal_register_params(void)
{
    int ret;

    if (opal_register_done) {
        return OPAL_SUCCESS;
    }

    opal_register_done = true;

    /*
     * This string is going to be used in opal/util/stacktrace.c
     */
    {
        char *string = NULL;
        int j;
        int signals[] = {
#ifdef SIGABRT
            SIGABRT,
#endif
#ifdef SIGBUS
            SIGBUS,
#endif
#ifdef SIGFPE
            SIGFPE,
#endif
#ifdef SIGSEGV
            SIGSEGV,
#endif
            -1
        };
        for (j = 0 ; signals[j] != -1 ; ++j) {
            if (j == 0) {
                asprintf(&string, "%d", signals[j]);
            } else {
                char *tmp;
                asprintf(&tmp, "%s,%d", string, signals[j]);
                free(string);
                string = tmp;
            }
        }

        opal_signal_string = string;
        ret = mca_base_var_register ("opal", "opal", NULL, "signal",
				     "Comma-delimited list of integer signal numbers to Open MPI to attempt to intercept.  Upon receipt of the intercepted signal, Open MPI will display a stack trace and abort.  Open MPI will *not* replace signals if handlers are already installed by the time MPI_INIT is invoked.  Optionally append \":complain\" to any signal number in the comma-delimited list to make Open MPI complain if it detects another signal handler (and therefore does not insert its own).",
				     MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				     OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
				     &opal_signal_string);
        free (string);
        if (0 > ret) {
            return ret;
        }
    }

#if defined(HAVE_SCHED_YIELD)
    opal_progress_yield_when_idle = false;
    ret = mca_base_var_register ("opal", "opal", "progress", "yield_when_idle",
                                 "Yield the processor when waiting on progress",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
                                 &opal_progress_yield_when_idle);
#endif

#if OPAL_ENABLE_DEBUG
    opal_progress_debug = false;
    ret = mca_base_var_register ("opal", "opal", "progress", "debug",
				 "Set to non-zero to debug progress engine features",
				 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
				 &opal_progress_debug);
    if (0 > ret) {
        return ret;
    }

    opal_debug_threads = false;
    ret = mca_base_var_register ("opal", "opal", "debug", "threads",
				 "Debug thread usage within OPAL. Reports out "
				 "when threads are acquired and released.",
				 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
				 &opal_debug_threads);
    if (0 > ret) {
        return ret;
    }
#endif

#if OPAL_ENABLE_FT_CR == 1
    opal_base_distill_checkpoint_ready = false;
    ret = mca_base_var_register("opal", "opal", "base", "distill_checkpoint_ready",
                                "Distill only those components that are Checkpoint Ready",
                                MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                OPAL_INFO_LVL_8, MCA_BASE_VAR_SCOPE_LOCAL,
                                &opal_base_distill_checkpoint_ready);

    if (0 > ret) {
        return ret;
    }
#endif

    /* RFC1918 defines
       - 10.0.0.0/8
       - 172.16.0.0/12
       - 192.168.0.0/16

       RFC3330 also mentions
       - 169.254.0.0/16 for DHCP onlink iff there's no DHCP server
    */
    opal_net_private_ipv4 = "10.0.0.0/8;172.16.0.0/12;192.168.0.0/16;169.254.0.0/16";
    ret = mca_base_var_register ("opal", "opal", "net", "private_ipv4",
				 "Semicolon-delimited list of CIDR notation entries specifying what networks are considered \"private\" (default value based on RFC1918 and RFC3330)",
				 MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL_EQ,
				 &opal_net_private_ipv4);
    if (0 > ret) {
        return ret;
    }

    opal_set_max_sys_limits = NULL;
    ret = mca_base_var_register ("opal", "opal", NULL, "set_max_sys_limits",
				 "Set the specified system-imposed limits to the specified value, including \"unlimited\"."
                                 "Supported params: core, filesize, maxmem, openfiles, stacksize, maxchildren",
				 MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
				 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL_EQ,
				 &opal_set_max_sys_limits);
    if (0 > ret) {
        return ret;
    }

    ret = mca_base_var_register("opal", "opal", NULL, "built_with_cuda_support",
                                "Whether CUDA GPU buffer support is built into library or not",
                                MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY,
                                OPAL_INFO_LVL_4, MCA_BASE_VAR_SCOPE_CONSTANT,
                                &opal_built_with_cuda_support);
    if (0 > ret) {
        return ret;
    }

    /* Current default is to enable CUDA support if it is built into library */
    opal_cuda_support = opal_built_with_cuda_support;
    ret = mca_base_var_register ("opal", "opal", NULL, "cuda_support",
                                 "Whether CUDA GPU buffer support is enabled or not",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                 OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_ALL_EQ,
                                 &opal_cuda_support);
    if (0 > ret) {
        return ret;
    }

    /* Leave pinned parameter */
    opal_leave_pinned = -1;
    ret = mca_base_var_register("ompi", "mpi", NULL, "leave_pinned",
                                "Whether to use the \"leave pinned\" protocol or not.  Enabling this setting can help bandwidth performance when repeatedly sending and receiving large messages with the same buffers over RDMA-based networks (0 = do not use \"leave pinned\" protocol, 1 = use \"leave pinned\" protocol, -1 = allow network to choose at runtime).",
                                MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                OPAL_INFO_LVL_9,
                                MCA_BASE_VAR_SCOPE_READONLY,
                                &opal_leave_pinned);
    mca_base_var_register_synonym(ret, "opal", "opal", NULL, "leave_pinned",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    opal_leave_pinned_pipeline = false;
    ret = mca_base_var_register("ompi", "mpi", NULL, "leave_pinned_pipeline",
                                "Whether to use the \"leave pinned pipeline\" protocol or not.",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &opal_leave_pinned_pipeline);
    mca_base_var_register_synonym(ret, "opal", "opal", NULL, "leave_pinned_pipeline",
                                  MCA_BASE_VAR_SYN_FLAG_DEPRECATED);

    if (opal_leave_pinned > 0 && opal_leave_pinned_pipeline) {
        opal_leave_pinned_pipeline = 0;
        opal_show_help("help-opal-runtime.txt",
                       "mpi-params:leave-pinned-and-pipeline-selected",
                       true);
    }

#if OPAL_ENABLE_TIMING
    opal_timing_sync_file = NULL;
    (void) mca_base_var_register ("opal", "opal", NULL, "timing_sync_file",
                                  "Clock synchronisation information generated by mpisync tool. You don't need to touch this if you use mpirun_prof tool.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
                                  &opal_timing_sync_file);
    if( opal_timing_clocksync_read(opal_timing_sync_file) ){
        opal_output(0, "Cannot read file %s containing clock synchronisation information\n", opal_timing_sync_file);
    }

    opal_timing_output = NULL;
    (void) mca_base_var_register ("opal", "opal", NULL, "timing_output",
                                  "The name of output file for timing information. If this parameter is not set then output will be directed into OPAL debug channel.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
                                  &opal_timing_output);

    opal_timing_overhead = true;
    (void) mca_base_var_register ("opal", "opal", NULL, "timing_overhead",
                                  "Timing framework introduce additional overhead (malloc's mostly)."
                                  " The time spend in such costly routines is measured and may be accounted"
                                  " (subtracted from timestamps). 'true' means consider overhead, 'false' - ignore (default: true).",
                                  MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                  OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_ALL,
                                  &opal_timing_overhead);
#endif

    opal_warn_on_fork = true;
    (void) mca_base_var_register("ompi", "mpi", NULL, "warn_on_fork",
                                 "If nonzero, issue a warning if program forks under conditions that could cause system errors",
                                 MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
                                 OPAL_INFO_LVL_9,
                                 MCA_BASE_VAR_SCOPE_READONLY,
                                 &opal_warn_on_fork);

    /* The ddt engine has a few parameters */
    ret = opal_datatype_register_params();
    if (OPAL_SUCCESS != ret) {
        return ret;
    }

    /* dss has parameters */
    ret = opal_dss_register_vars ();
    if (OPAL_SUCCESS != ret) { 
        return ret; 
    }

    return OPAL_SUCCESS;
}
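The same registration pattern extends to any new parameter. A minimal hypothetical example (the "my_feature" name and its backing variable are illustrative, not part of OPAL) mirrors the calls above; once registered, the variable reflects whatever the user set through the usual MCA mechanisms (environment variables, command line, parameter files), under the name opal_my_feature:

static bool my_feature_enabled = false;   /* hypothetical backing variable */

static int register_my_feature (void)
{
    /* Same signature as the registrations above: project, framework,
     * component, variable name, help string, type/flags/level/scope,
     * and the storage the MCA var system keeps in sync. */
    int ret = mca_base_var_register ("opal", "opal", NULL, "my_feature",
                                     "Enable the hypothetical my_feature knob",
                                     MCA_BASE_VAR_TYPE_BOOL, NULL, 0,
                                     MCA_BASE_VAR_FLAG_SETTABLE,
                                     OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_LOCAL,
                                     &my_feature_enabled);
    return (0 > ret) ? ret : OPAL_SUCCESS;
}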
static int local_heap_sort (local_io_array *io_array,
                            int num_entries,
                            int *sorted)
{
    int i = 0;
    int j = 0;
    int left = 0;
    int right = 0;
    int largest = 0;
    int heap_size = num_entries - 1;
    int temp = 0;
    unsigned char done = 0;
    int* temp_arr = NULL;

    if( 0 == num_entries) {
        num_entries = 1;
    }


    temp_arr = (int*)malloc(num_entries*sizeof(int));
    if (NULL == temp_arr) {
        opal_output (1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    temp_arr[0] = 0;
    for (i = 1; i < num_entries; ++i) {
        temp_arr[i] = i;
    }
    /* num_entries can be a large number, so NO RECURSION */
    for (i = num_entries/2-1 ; i>=0 ; i--) {
        done = 0;
        j = i;
        largest = j;

        while (!done) {
            left = j*2+1;
            right = j*2+2;
            if ((left <= heap_size) &&
                    (io_array[temp_arr[left]].offset > io_array[temp_arr[j]].offset)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                    (io_array[temp_arr[right]].offset >
                     io_array[temp_arr[largest]].offset)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
    }

    for (i = num_entries-1; i >=1; --i) {
        temp = temp_arr[0];
        temp_arr[0] = temp_arr[i];
        temp_arr[i] = temp;
        heap_size--;
        done = 0;
        j = 0;
        largest = j;

        while (!done) {
            left =  j*2+1;
            right = j*2+2;

            if ((left <= heap_size) &&
                    (io_array[temp_arr[left]].offset >
                     io_array[temp_arr[j]].offset)) {
                largest = left;
            }
            else {
                largest = j;
            }
            if ((right <= heap_size) &&
                    (io_array[temp_arr[right]].offset >
                     io_array[temp_arr[largest]].offset)) {
                largest = right;
            }
            if (largest != j) {
                temp = temp_arr[largest];
                temp_arr[largest] = temp_arr[j];
                temp_arr[j] = temp;
                j = largest;
            }
            else {
                done = 1;
            }
        }
        sorted[i] = temp_arr[i];
    }
    sorted[0] = temp_arr[0];

    if (NULL != temp_arr) {
        free(temp_arr);
        temp_arr = NULL;
    }
    return OMPI_SUCCESS;
}
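A minimal usage sketch for local_heap_sort(), touching only the offset field the comparisons above rely on (the values are illustrative): the function returns, in sorted[], the indices of io_array in ascending offset order.

/* Hypothetical driver for local_heap_sort(); only the offset field of
 * local_io_array is set, matching what the sort actually reads. */
static void local_heap_sort_demo (void)
{
    local_io_array io_array[4];
    int sorted[4];

    io_array[0].offset = 4096;
    io_array[1].offset = 0;
    io_array[2].offset = 8192;
    io_array[3].offset = 1024;

    if (OMPI_SUCCESS == local_heap_sort (io_array, 4, sorted)) {
        /* sorted[] is now {1, 3, 0, 2}: the io_array entries in
         * ascending order of their offsets. */
    }
}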
/**
 *  Discover the available resources.  Obtain directly from LoadLeveler (and
 *  therefore have no need to validate) -- ignore hostfile or any other
 *  user-specified parameters.
 */
static int orte_ras_loadleveler_discover(opal_list_t* nodelist)
{
    orte_node_t *node;
    opal_list_item_t* item;
    FILE *fp;
    char *hostname;
    char *filename;
    char input[LL_FILE_MAX_LINE_LENGTH];

    /* Ignore anything that the user already specified -- we're
       getting nodes only from LoadLeveler. */
    filename = getenv("LOADL_HOSTFILE");
    if(NULL == filename) {
        opal_output(orte_ras_base_framework.framework_output,
                "ras:loadleveler:allocate:discover: LOADL_HOSTFILE not set. "
                "Unable to discover allocated nodes.");
        return ORTE_ERROR;
    }
    fp = fopen(filename, "r");
    if (NULL == fp) {
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    /* Iterate through all the nodes and make an entry for each */
    while (0 != ll_getline(fp, input)) {
        hostname = strdup(input);
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:loadleveler:allocate:discover: got hostname %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));

        /* Remember that LoadLeveler may list the same node more than once.
           So we have to check for duplicates. */
        for (item = opal_list_get_first(nodelist);
             opal_list_get_end(nodelist) != item;
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, hostname)) {
                ++node->slots;

                OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                     "%s ras:loadleveler:allocate:discover: found -- bumped slots to %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));
                break;
            }
        }

        /* Did we find it? */
        if (opal_list_get_end(nodelist) == item) {
            /* Nope -- didn't find it, so add a new item to the list */
            OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                 "%s ras:loadleveler:allocate:discover: not found -- added to list",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            node = OBJ_NEW(orte_node_t);
            node->name = hostname;
            node->state = ORTE_NODE_STATE_UP;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = 1;
            opal_list_append(nodelist, &node->super);
        } else {
            /* Yes, so we need to free the hostname that came back */
            free(hostname);
        }
    }
    fclose(fp);

    return ORTE_SUCCESS;
}
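ll_getline() is used above but not shown here. A plausible sketch consistent with that usage - it must fill input (sized LL_FILE_MAX_LINE_LENGTH), strip the line terminator so the strdup/strcmp calls see a bare hostname, and return 0 at EOF to end the while loop - could look like the following; this is an assumption, not the component's actual helper.

/* Assumed sketch of ll_getline(): one line per call, newline stripped,
 * 0 returned on EOF or read error so the caller's loop terminates. */
static int ll_getline(FILE *fp, char *input)
{
    if (NULL == fgets(input, LL_FILE_MAX_LINE_LENGTH, fp)) {
        return 0;
    }
    input[strcspn(input, "\r\n")] = '\0';   /* drop the line terminator */
    return 1;
}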
Example #25
ssize_t mca_fbtl_plfs_preadv (mca_io_ompio_file_t *fh)
{

    Plfs_fd *pfd = NULL;
    plfs_error_t plfs_ret;
    pfd = fh->f_fs_ptr;
    ssize_t total_bytes_read=0;

    int i, block=1;
    struct iovec *iov = NULL;
    int iov_count = 0;
    OMPI_MPI_OFFSET_TYPE iov_offset = 0;

    if (NULL == fh->f_io_array) {
        return OMPI_ERROR;
    }

    iov = (struct iovec *) malloc 
        (OMPIO_IOVEC_INITIAL_SIZE * sizeof (struct iovec));
    if (NULL == iov) {
        opal_output(1, "OUT OF MEMORY\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i=0 ; i<fh->f_num_of_io_entries ; i++) {
	if (0 == iov_count) {
	    iov[iov_count].iov_base = fh->f_io_array[i].memory_address;
	    iov[iov_count].iov_len = fh->f_io_array[i].length;
	    iov_offset = (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset;
	    iov_count ++;
	}
	
	if (OMPIO_IOVEC_INITIAL_SIZE*block <= iov_count) {
	    struct iovec *new_iov;
	    block ++;
	    new_iov = (struct iovec *)realloc
		(iov, OMPIO_IOVEC_INITIAL_SIZE * block *
		 sizeof(struct iovec));
	    if (NULL == new_iov) {
		opal_output(1, "OUT OF MEMORY\n");
		free(iov);
		return OMPI_ERR_OUT_OF_RESOURCE;
	    }
	    iov = new_iov;
	}
	
	if (fh->f_num_of_io_entries != i+1) {
	    if (((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset + 
		 (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].length) == 
		(OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset) {                    
		iov[iov_count].iov_base = 
		    fh->f_io_array[i+1].memory_address;
		iov[iov_count].iov_len = fh->f_io_array[i+1].length;
		iov_count ++;
		continue;
	    }
	}
	
	// Find the total number of bytes to be read.
	size_t bytes = 0;
	for (int i = 0; i < iov_count; ++i) {
	    bytes += iov[i].iov_len;
	}
	
	// Allocate a temporary buffer to hold the data
	char *buffer, *ptr;
	buffer = (char *) malloc (bytes);
	if (buffer == NULL) {
	    free (iov);
	    return OMPI_ERROR;
	}
	
	// Read the data
	ssize_t bytes_read;
	plfs_ret = plfs_read( pfd, buffer, bytes, iov_offset, &bytes_read );
	if (PLFS_SUCCESS != plfs_ret) {
	    opal_output(0, "fbtl_plfs_preadv: Error in plfs_read:\n%s\n", strplfserr(plfs_ret));
	    free (buffer);
	    free (iov);
	    return OMPI_ERROR;
	}
	
	if (bytes_read < 0) {
	    free (buffer);
	    free (iov);
	    return OMPI_ERROR;
	}
	total_bytes_read += bytes_read;
	// Copy the data from BUFFER into the memory specified by IOV,
	// advancing a separate cursor so the original pointer can be freed
	bytes = bytes_read;
	ptr = buffer;
	for (int i = 0; i < iov_count; ++i) {
	    size_t copy = MIN (iov[i].iov_len, bytes);
	    (void) memcpy ((void *) iov[i].iov_base, (void *) ptr, copy);
	    ptr += copy;
	    bytes -= copy;
	    if (bytes == 0) {
		break;
	    }
	}
	iov_count = 0;
	free (buffer);
	buffer = NULL;
    }

    if (NULL != iov) {
	free (iov);
	iov = NULL;
    }

    return total_bytes_read;
}
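The coalescing condition buried in the loop above - entry i+1 joins the current iovec only when it begins exactly where entry i ends in the file - can be isolated as a small predicate. This is a restatement for clarity, not a helper from the original file:

/* True iff io entry i+1 is file-contiguous with entry i, i.e. both can be
 * covered by a single plfs_read() starting at the current iov_offset. */
static int io_entries_contiguous (mca_io_ompio_file_t *fh, int i)
{
    return ((OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i].offset +
            (OPAL_PTRDIFF_TYPE)fh->f_io_array[i].length) ==
           (OMPI_MPI_OFFSET_TYPE)(intptr_t)fh->f_io_array[i+1].offset;
}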
int mca_sharedfp_individual_collaborate_data(struct mca_sharedfp_base_data_t *sh)
{
    int ret = OMPI_SUCCESS;
    mca_sharedfp_individual_header_record *headnode = NULL;
    char *buff=NULL;  
    MPI_Comm comm;
    int rank, size;
    int nodesoneachprocess = 0;
    int idx = 0,i = 0;
    double *timestampbuff = NULL;
    OMPI_MPI_OFFSET_TYPE *offsetbuff = NULL;
    int *countbuff = NULL;
    int *displ = NULL;
    double *ind_ts = NULL;
    long *ind_recordlength = NULL;
    OMPI_MPI_OFFSET_TYPE *local_off = NULL;
    int totalnodes = 0;
    ompi_status_public_t status;

    comm = sh->comm;

    rank = ompi_comm_rank ( comm );
    size = ompi_comm_size ( comm );

    headnode = (mca_sharedfp_individual_header_record*)sh->selected_module_data;
    if ( NULL == headnode)  {
	opal_output(0, "sharedfp_individual_collaborate_data: headnode is NULL but file is open\n");
	return OMPI_ERROR;
    }

    /* Number of nodes on each process is the sum of records
     * on file and records in the linked list
     */
    nodesoneachprocess = headnode->numofrecordsonfile + headnode->numofrecords;

    if ( mca_sharedfp_individual_verbose ) {
	printf("Nodes of each process = %d\n",nodesoneachprocess);
    }

    countbuff = (int*)malloc(size * sizeof(int));
    if ( NULL == countbuff  ) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    displ = (int*)malloc(sizeof(int) * size);
    if ( NULL == displ ) {
        ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }

    /* Each process counts the number of nodes in its linked list
     * for which a global offset still needs to be determined */
    ret =  mca_sharedfp_individual_get_timestamps_and_reclengths ( &ind_ts, &ind_recordlength,
								   &local_off, sh );
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }

    comm->c_coll.coll_allgather ( &nodesoneachprocess, 1, MPI_INT, 
				  countbuff, 1, MPI_INT, comm, 
				  comm->c_coll.coll_allgather_module );

    if ( mca_sharedfp_individual_verbose) {
	for (i = 0; i < size ; i++) {
	    printf("sharedfp_individual_collaborate_data: Countbuff[%d] = %d\n", i, countbuff[i]);
	}
    }

    if ( nodesoneachprocess == 0)    {
        ind_ts[0] = 0;
        ind_recordlength[0] = 0;
        local_off[0] = 0;
    }

    for(i = 0; i < size; i++) {
        displ[i]    = totalnodes;
	if ( mca_sharedfp_individual_verbose ) {
	    printf("sharedfp_individual_collaborate_data: displ[%d] = %d\n",i,displ[i]);
	}
        totalnodes  = totalnodes + countbuff[i];
    }

    if (totalnodes <= 0 ) { 
	goto exit;
    }

    ret =  mca_sharedfp_individual_create_buff ( &timestampbuff, &offsetbuff, totalnodes, size);
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }

    ret = comm->c_coll.coll_allgatherv ( ind_ts, countbuff[rank], MPI_DOUBLE,
					 timestampbuff, countbuff, displ, MPI_DOUBLE,
					 comm, comm->c_coll.coll_allgatherv_module );
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }
      
    ret = comm->c_coll.coll_allgatherv ( ind_recordlength, countbuff[rank], OMPI_OFFSET_DATATYPE,
					 offsetbuff, countbuff, displ, OMPI_OFFSET_DATATYPE,
					 comm, comm->c_coll.coll_allgatherv_module );
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }
    
    ret =  mca_sharedfp_individual_sort_timestamps(&timestampbuff, &offsetbuff,totalnodes);
    if ( OMPI_SUCCESS != ret ) {
	goto exit;
    }
    
    sh->global_offset = mca_sharedfp_individual_assign_globaloffset ( &offsetbuff, totalnodes, sh);
    
    buff = (char * ) malloc( ind_recordlength[0] * 1.2 );
    if  ( NULL == buff ) {
	ret = OMPI_ERR_OUT_OF_RESOURCE;
	goto exit;
    }
    
    for (i = 0; i < nodesoneachprocess ; i++)  {
	/*Read from the local data file*/
	ompio_io_ompio_file_read_at ( headnode->datafilehandle,
				      local_off[i], buff, ind_recordlength[i],
				      MPI_BYTE, &status);
	
	idx =  mca_sharedfp_individual_getoffset(ind_ts[i],timestampbuff,totalnodes);
	
	if ( mca_sharedfp_individual_verbose ) {
	    printf("sharedfp_individual_collaborate_data: Process %d writing %ld bytes to main file \n",
		   rank,ind_recordlength[i]);
	}
	
	/*Write into main data file*/
	ompio_io_ompio_file_write_at( sh->sharedfh, offsetbuff[idx], buff,
				      ind_recordlength[i], MPI_BYTE, &status);
	
    }    

exit:
    if ( NULL != countbuff ) {
	free ( countbuff );
    }
    if ( NULL != displ ) {
	free ( displ );
    }

    if( NULL != timestampbuff ){
        free ( timestampbuff );
    }
    if ( NULL != offsetbuff ){
        free ( offsetbuff );
    }
    if ( NULL != ind_ts ) {
	free ( ind_ts );
    }
    if ( NULL != ind_recordlength ) {
	free ( ind_recordlength );
    }
    if ( NULL != local_off ) {
	free ( local_off );
    }
    if ( NULL != buff ) {
	free ( buff );
    }

    return ret;
}
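mca_sharedfp_individual_getoffset() is called above but defined elsewhere: given a record's timestamp, it yields that record's position in the globally sorted timestamp array, so offsetbuff[idx] gives the record's write offset in the main file. Assuming it is a simple lookup, a sketch could be:

/* Assumed sketch of mca_sharedfp_individual_getoffset(): locate timestamp
 * ts in the sorted global array; the caller indexes offsetbuff with the
 * result. A real implementation could binary-search instead. */
static int getoffset_sketch (double ts, double *timestampbuff, int totalnodes)
{
    int i;
    for (i = 0; i < totalnodes; i++) {
        if (ts == timestampbuff[i]) {
            return i;
        }
    }
    return -1;   /* not found - should not happen after the allgatherv */
}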
/*
 * We have received a segment, take action based on the 
 * packet type in the BTL header
 */
void ompi_btl_usnic_recv_call(ompi_btl_usnic_module_t *module,
                           ompi_btl_usnic_recv_segment_t *seg,
                           ompi_btl_usnic_channel_t *channel)
{
    ompi_btl_usnic_segment_t *bseg;
    mca_btl_active_message_callback_t* reg;
    ompi_btl_usnic_endpoint_t *endpoint;
    ompi_btl_usnic_btl_chunk_header_t *chunk_hdr;
    ompi_btl_usnic_btl_header_t *hdr;
    uint32_t window_index;
    int rc;
#if MSGDEBUG1
    char src_mac[32];
    char dest_mac[32];
#endif

    bseg = &seg->rs_base;

    ++module->stats.num_total_recvs;

    /* Valgrind help */
    opal_memchecker_base_mem_defined((void*)(seg->rs_recv_desc.sg_list[0].addr),
                                     seg->rs_recv_desc.sg_list[0].length);

#if MSGDEBUG1
    memset(src_mac, 0, sizeof(src_mac));
    memset(dest_mac, 0, sizeof(dest_mac));
    ompi_btl_usnic_sprintf_gid_mac(src_mac,
            &seg->rs_protocol_header->grh.sgid);
    ompi_btl_usnic_sprintf_gid_mac(dest_mac, 
            &seg->rs_protocol_header->grh.dgid);

    opal_output(0, "Got message from MAC %s", src_mac);
    opal_output(0, "Looking for sender: 0x%016lx",
        bseg->us_btl_header->sender);
#endif

    /* Find out who sent this segment */
    endpoint = seg->rs_endpoint;
    if (FAKE_RECV_FRAG_DROP || OPAL_UNLIKELY(NULL == endpoint)) {
        /* No idea who this was from, so drop it */
#if MSGDEBUG1
        opal_output(0, "=== Unknown sender; dropped: from MAC %s to MAC %s, seq %" UDSEQ, 
                    src_mac, 
                    dest_mac, 
                    bseg->us_btl_header->seq);
#endif
        ++module->stats.num_unk_recvs;
        goto repost_no_endpoint;
    }

    /***********************************************************************/
    /* Segment is an incoming frag */
    if (OMPI_BTL_USNIC_PAYLOAD_TYPE_FRAG == bseg->us_btl_header->payload_type) {

        /* do the receive bookkeeping */
        rc = ompi_btl_usnic_recv_frag_bookkeeping(module, seg, channel);
        if (rc != 0) {
            return;
        }

        hdr = seg->rs_base.us_btl_header;

#if MSGDEBUG1
        opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ ", len=%d\n",
                    (void*) endpoint, hdr->seq, hdr->payload_len);
#if 0

        opal_output(0, "<-- Received FRAG ep %p, seq %" UDSEQ " from %s to %s: GOOD! (rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n",
                    (void*) endpoint,
                    seg->rs_base.us_btl_header->seq, 
                    src_mac, dest_mac,
                    window_index,
                    endpoint->endpoint_next_contig_seq_to_recv,
                    endpoint->endpoint_highest_seq_rcvd,
                    endpoint->endpoint_rfstart,
                    (void*) seg, (void*) module);
        if (hdr->put_addr != NULL) {
            opal_output(0, "  put_addr = %p\n",
                    seg->rs_base.us_btl_header->put_addr);
        }
#endif
#endif

        /* If this is not a PUT, pass this segment up to the PML.
         * Be sure to get the payload length from the BTL header because
         * the L2 layer may artificially inflate (or otherwise change)
         * the frame length to meet minimum sizes, add protocol information,
         * etc.
         */
        if (hdr->put_addr == NULL) {
            reg = mca_btl_base_active_message_trigger + hdr->tag;
            seg->rs_segment.seg_len = hdr->payload_len;
#if MSGDEBUG2
                opal_output(0, "small recv complete, pass up %u bytes, tag=%d\n",
                        (unsigned)bseg->us_btl_header->payload_len,
                        (int)bseg->us_btl_header->tag);
#endif
            reg->cbfunc(&module->super, hdr->tag, &seg->rs_desc, reg->cbdata);

        /*
         * If this is a PUT, need to copy it to user buffer
         */
        } else {
#if MSGDEBUG1
            opal_output(0, "Copy %d PUT bytes to %p\n", 
                seg->rs_base.us_btl_header->payload_len,
                (void*)seg->rs_base.us_btl_header->put_addr);
#endif
            memcpy(seg->rs_base.us_btl_header->put_addr,
                    seg->rs_base.us_payload.raw,
                    seg->rs_base.us_btl_header->payload_len);
        }

        /* do not jump to repost, already done by bookkeeping */
        return;
    }

    /***********************************************************************/
    /* Segment is an incoming chunk */
    if (OMPI_BTL_USNIC_PAYLOAD_TYPE_CHUNK == bseg->us_btl_header->payload_type) {
        int frag_index;
        ompi_btl_usnic_rx_frag_info_t *fip;

        /* Is incoming sequence # ok? */
        if (OPAL_UNLIKELY(ompi_btl_usnic_check_rx_seq(endpoint, seg,
                        &window_index) != 0)) {
            goto repost;
        }

#if MSGDEBUG1
        opal_output(0, "<-- Received CHUNK fid %d ep %p, seq %" UDSEQ " from %s to %s: GOOD! (rel seq %d, lowest seq %" UDSEQ ", highest seq: %" UDSEQ ", rwstart %d) seg %p, module %p\n",
                    seg->rs_base.us_btl_chunk_header->ch_frag_id,
                    (void*) endpoint,
                    seg->rs_base.us_btl_chunk_header->ch_hdr.seq, 
                    src_mac, dest_mac,
                    window_index,
                    endpoint->endpoint_next_contig_seq_to_recv,
                    endpoint->endpoint_highest_seq_rcvd,
                    endpoint->endpoint_rfstart,
                    (void*) seg, (void*) module);
#endif

        /* start a new fragment if not one in progress
         * alloc memory, etc.  when last byte arrives, dealloc the
         * frag_id and pass data to PML
         */
        chunk_hdr = seg->rs_base.us_btl_chunk_header;
        frag_index = chunk_hdr->ch_frag_id % MAX_ACTIVE_FRAGS;
        fip = &(endpoint->endpoint_rx_frag_info[frag_index]);

        /* frag_id == 0 means this slot is empty, grab it! */
        if (0 == fip->rfi_frag_id) {
            fip->rfi_frag_id = chunk_hdr->ch_frag_id;
            fip->rfi_frag_size = chunk_hdr->ch_frag_size;
            if (chunk_hdr->ch_hdr.put_addr == NULL) {
                int pool;

                fip->rfi_data = NULL;

                /* See which data pool this should come from,
                 * or if it should be malloc()ed
                 */
                pool = usnic_fls(chunk_hdr->ch_frag_size-1);
                if (pool >= module->first_pool &&
                        pool <= module->last_pool) {
                    ompi_free_list_item_t* item;
                    ompi_btl_usnic_rx_buf_t *rx_buf;
                    OMPI_FREE_LIST_GET_MT(&module->module_recv_buffers[pool],
                                          item);
                    rx_buf = (ompi_btl_usnic_rx_buf_t *)item;
                    if (OPAL_LIKELY(NULL != rx_buf)) {
                        fip->rfi_fl_elt = item;
                        fip->rfi_data = rx_buf->buf;
                        fip->rfi_data_pool = pool;
                    }
                }
                if (fip->rfi_data == NULL) {
                    fip->rfi_data = malloc(chunk_hdr->ch_frag_size);
                    fip->rfi_data_pool = 0;
                }
                if (fip->rfi_data == NULL) {
                    abort();
                }
#if MSGDEBUG1
                opal_output(0, "Start large recv to %p, size=%"PRIu32"\n",
                    (void *)fip->rfi_data, chunk_hdr->ch_frag_size);
#endif
            } else {
#if MSGDEBUG1
                opal_output(0, "Start PUT to %p\n",
                        (void *)chunk_hdr->ch_hdr.put_addr);
#endif
                fip->rfi_data = chunk_hdr->ch_hdr.put_addr;
            }
            fip->rfi_bytes_left = chunk_hdr->ch_frag_size;
            fip->rfi_frag_id = chunk_hdr->ch_frag_id;

        /* frag_id is not 0 - it must match, drop if not */
        } else if (fip->rfi_frag_id != chunk_hdr->ch_frag_id) {
            ++module->stats.num_badfrag_recvs;
            goto repost;
        }
#if MSGDEBUG1
        opal_output(0, "put_addr=%p, copy_addr=%p, off=%d\n",
                chunk_hdr->ch_hdr.put_addr,
                fip->rfi_data+chunk_hdr->ch_frag_offset,
                chunk_hdr->ch_frag_offset);
#endif

        /* Stats */
        ++module->stats.num_chunk_recvs;

        /* validate offset and len to be within fragment */
        assert(chunk_hdr->ch_frag_offset + chunk_hdr->ch_hdr.payload_len <=
                fip->rfi_frag_size);
        assert(fip->rfi_frag_size == chunk_hdr->ch_frag_size);

        /* copy the data into place */
        memcpy(fip->rfi_data + chunk_hdr->ch_frag_offset, (char *)(chunk_hdr+1),
                chunk_hdr->ch_hdr.payload_len);

        /* update sliding window */
        ompi_btl_usnic_update_window(endpoint, window_index);

        fip->rfi_bytes_left -= chunk_hdr->ch_hdr.payload_len;
        if (0 == fip->rfi_bytes_left) {
            mca_btl_base_descriptor_t desc;
            mca_btl_base_segment_t segment;

            segment.seg_addr.pval = fip->rfi_data;
            segment.seg_len = fip->rfi_frag_size;
            desc.des_dst = &segment;
            desc.des_dst_cnt = 1;

            /* only up to PML if this was not a put */
            if (chunk_hdr->ch_hdr.put_addr == NULL) {

                /* Pass this segment up to the PML */
#if MSGDEBUG2
                opal_output(0, "large recv complete, pass up %p, %u bytes, tag=%d\n",
                        desc.des_dst->seg_addr.pval,
                        (unsigned)desc.des_dst->seg_len,
                        (int)chunk_hdr->ch_hdr.tag);
#endif
                reg = mca_btl_base_active_message_trigger +
                    chunk_hdr->ch_hdr.tag;

                /* mca_pml_ob1_recv_frag_callback_frag() */
                reg->cbfunc(&module->super, chunk_hdr->ch_hdr.tag,
                        &desc, reg->cbdata);

                /* free temp buffer for non-put */
                if (0 == fip->rfi_data_pool) {
                    free(fip->rfi_data);
                } else {
                    OMPI_FREE_LIST_RETURN_MT(
                            &module->module_recv_buffers[fip->rfi_data_pool],
                            fip->rfi_fl_elt);
                }

#if MSGDEBUG1
            } else {
                opal_output(0, "PUT recv complete, no callback\n");
#endif
            }

            /* release the fragment ID */
            fip->rfi_frag_id = 0;

            /* force immediate ACK */
            endpoint->endpoint_acktime = 0;
        }
        goto repost;
    }

    /***********************************************************************/
    /* Frag is an incoming ACK */
    else if (OPAL_LIKELY(OMPI_BTL_USNIC_PAYLOAD_TYPE_ACK == 
                         bseg->us_btl_header->payload_type)) {
        ompi_btl_usnic_seq_t ack_seq;

        /* sequence being ACKed */
        ack_seq = bseg->us_btl_header->ack_seq;

        /* Stats */
        ++module->stats.num_ack_recvs;

#if MSGDEBUG1
        opal_output(0, "    Received ACK for sequence number %" UDSEQ " from %s to %s\n",
                    bseg->us_btl_header->ack_seq, src_mac, dest_mac);
#endif
        ompi_btl_usnic_handle_ack(endpoint, ack_seq);

        goto repost;
    }

    /***********************************************************************/
    /* Have no idea what the frag is; drop it */
    else {
        ++module->stats.num_unk_recvs;
        opal_output(0, "==========================unknown 2");
        goto repost;
    }

    /***********************************************************************/
 repost:

    /* if endpoint exiting, and all ACKs received, release the endpoint */
    if (endpoint->endpoint_exiting && ENDPOINT_DRAINED(endpoint)) {
        OBJ_RELEASE(endpoint);
    }
 repost_no_endpoint:
    ++module->stats.num_recv_reposts;

    /* Add recv to linked list for reposting */
    seg->rs_recv_desc.next = channel->repost_recv_head;
    channel->repost_recv_head = &seg->rs_recv_desc;
}
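usnic_fls() drives the buffer-pool selection above but is defined elsewhere; it behaves as a "find last set" (1-based index of the highest set bit), so usnic_fls(size-1) yields the smallest n with (1 << n) >= size, i.e. the smallest power-of-two pool that can hold the fragment. A portable sketch (an assumption, not the component's implementation) is:

/* Sketch of a find-last-set: returns the 1-based index of the highest set
 * bit, 0 for n == 0. usnic_fls_sketch(4095) == 12, and 1 << 12 == 4096. */
static int usnic_fls_sketch (unsigned int n)
{
    int bit = 0;
    while (0 != n) {
        n >>= 1;
        ++bit;
    }
    return bit;
}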
Example #28
static void launch_daemons(int fd, short args, void *cbdata)
{
    orte_app_context_t *app;
    orte_node_t *node;
    orte_std_cntr_t n;
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char *nodelist_flat;
    char **nodelist_argv;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    int proc_vpid_index;
    bool failed_launch=true;
    orte_job_t *daemons;
    orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;

    OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                         "%s plm:slurm: LAUNCH DAEMONS CALLED",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are launching debugger daemons, then just go
     * do it - no new daemons will be launched
     */
    if (ORTE_JOB_CONTROL_DEBUGGER_DAEMON & state->jdata->controls) {
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* start by setting up the virtual machine */
    daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

   /* if we don't want to launch, then don't attempt to
     * launch the daemons - the user really wants to just
     * look at the proposed process map
     */
    if (orte_do_not_launch) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* Get the map for this job */
    if (NULL == (map = daemons->map)) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
        
    if (0 == map->num_new_daemons) {
        /* set the state to indicate the daemons reported - this
         * will trigger the daemons_reported event and cause the
         * job to move to the following step
         */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
        ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
        OBJ_RELEASE(state);
        return;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) daemons->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * SLURM srun OPTIONS
     */

    /* add the srun command */
    opal_argv_append(&argc, &argv, "srun");

    /* start one orted on each node */
    opal_argv_append(&argc, &argv, "--ntasks-per-node=1");

    /* alert us if any orteds die during startup */
    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

    /* ensure the orteds are not bound to a single processor,
     * just in case the TaskAffinity option is set by default.
     * This will *not* release the orteds from any cpu-set
     * constraint, but will ensure it doesn't get
     * bound to only one processor
     */
    opal_argv_append(&argc, &argv, "--cpu_bind=none");

    /* Append user defined arguments to srun */
    if ( NULL != mca_plm_slurm_component.custom_args ) {
        custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    /* create nodelist */
    nodelist_argv = NULL;

    for (n=0; n < map->nodes->size; n++ ) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (node->daemon_launched) {
            continue;
        }
        
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append_nosize(&nodelist_argv, node->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);

    /* if we are using all allocated nodes, then srun doesn't
     * require any further arguments
     */
    if (map->num_new_daemons < orte_num_allocated_nodes) {
        asprintf(&tmp, "--nodes=%lu", (unsigned long)map->num_new_daemons);
        opal_argv_append(&argc, &argv, tmp);
        free(tmp);

        asprintf(&tmp, "--nodelist=%s", nodelist_flat);
        opal_argv_append(&argc, &argv, tmp);
        free(tmp);
    }

    /* tell srun how many tasks to run */
    asprintf(&tmp, "--ntasks=%lu", (unsigned long)map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    OPAL_OUTPUT_VERBOSE((2, orte_plm_base_framework.framework_output,
                         "%s plm:slurm: launching on nodes %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
    
    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);
    
    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          NULL, &proc_vpid_index,
                                          nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_slurm: unable to get daemon vpid as string");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(name_string);
    free(name_string);

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefix's in the app context, complain (i.e., only
       allow one --prefix option for the entire slurm run -- we
       don't support different --prefix'es for different nodes in
       the SLURM plm) */
    cur_prefix = NULL;
    for (n=0; n < state->jdata->apps->size; n++) {
        char * app_prefix_dir;
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, n))) {
            continue;
        }
        app_prefix_dir = app->prefix_dir;
        /* Check for an already set cur_prefix -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                goto cleanup;
            }

            /* If not yet set, copy it; iff set, then it's the
             * same anyway
             */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                     "%s plm:slurm: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                             "%s plm:slurm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }
    
    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* indicate that the daemons for this job were launched */
    state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
    daemons->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;

    /* flag that launch was successful, so far as we currently know */
    failed_launch = false;

 cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    
    if(NULL != jobid_string) {
        free(jobid_string);
    }
    
    /* cleanup the caddy */
    OBJ_RELEASE(state);

    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }
}
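For illustration: with two new daemons needed on nodes n01 and n02 out of a larger allocation (so the --nodes/--nodelist branch is taken), the argv assembled above corresponds to a command line of roughly the following shape. The node names are placeholders and the orted arguments are abbreviated, since they come from orte_plm_base_setup_orted_cmd and orte_plm_base_orted_append_basic_args:

/*
 *   srun --ntasks-per-node=1 --kill-on-bad-exit --cpu_bind=none \
 *        --nodes=2 --nodelist=n01,n02 --ntasks=2 \
 *        orted <basic orted options> ...
 */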
Example #29
int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm,
                               char* filename,
                               int amode,
                               struct ompi_info_t *info,
                               mca_io_ompio_file_t *fh)
{
    int err = OMPI_SUCCESS;
    struct mca_sharedfp_base_data_t* sh;
    struct mca_sharedfp_sm_data * sm_data = NULL;
    mca_io_ompio_file_t * shfileHandle;
    char * filename_basename;
    char * sm_filename;
    struct sm_offset * sm_offset_ptr;
    struct sm_offset sm_offset;
    int sm_fd;
    int rank;

    /*----------------------------------------------------*/
    /*Open the same file again without shared file pointer*/
    /*----------------------------------------------------*/
    shfileHandle = (mca_io_ompio_file_t *)malloc(sizeof(mca_io_ompio_file_t));
    if (NULL == shfileHandle) {
        opal_output(0, "mca_sharedfp_sm_file_open: Error, unable to malloc file handle\n");
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    err = ompio_io_ompio_file_open(comm,filename,amode,info,shfileHandle,false);
    if ( OMPI_SUCCESS != err) {
        opal_output(0, "mca_sharedfp_sm_file_open: Error during file open\n");
        free(shfileHandle);
        return err;
    }

    /*Memory is allocated here for the sh structure*/
    if ( mca_sharedfp_sm_verbose ) {
	printf( "mca_sharedfp_sm_file_open: malloc f_sharedfp_ptr struct\n");
    }

    sh = (struct mca_sharedfp_base_data_t*)malloc(sizeof(struct mca_sharedfp_base_data_t));
    if ( NULL == sh ) {
	opal_output(0, "mca_sharedfp_sm_file_open: Error, unable to malloc f_sharedfp_ptr struct\n");
	free(shfileHandle);
	return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /*Populate the sh file structure based on the implementation*/
    sh->sharedfh      = shfileHandle;			/* Shared file pointer*/
    sh->global_offset = 0;				/* Global Offset*/
    sh->comm          = comm; 				/* Communicator*/
    sh->selected_module_data = NULL;

    rank = ompi_comm_rank ( sh->comm );

    /*Open a shared memory segment which will hold the shared file pointer*/
    if ( mca_sharedfp_sm_verbose ) {
	printf( "mca_sharedfp_sm_file_open: allocatge shared memory segment.\n");
    }


    sm_data = (struct mca_sharedfp_sm_data*) malloc ( sizeof(struct mca_sharedfp_sm_data));
    if ( NULL == sm_data ){
        opal_output(0, "mca_sharedfp_sm_file_open: Error, unable to malloc sm_data struct\n");
        free(sh);
        free(shfileHandle);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sm_data->sm_filename=NULL;


    /* the shared memory segment is identified by opening a file
    ** and then mapping it to memory.
    ** For sharedfp we also want to put the file-backed shared memory into the tmp directory.
    ** TODO: properly name the file so that different jobs can run on the same system w/o
    **      overwriting each other, e.g.  orte_process_info.proc_session_dir
    */
    filename_basename = basename(filename);
    sm_filename = (char*) malloc( sizeof(char) * (strlen(filename_basename)+64) );
    if (NULL == sm_filename) {
        free(sm_data);
        free(sh);
        free(shfileHandle);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sprintf(sm_filename,"/tmp/OMPIO_sharedfp_sm_%s%s",filename_basename,".sm");

    /* open shared memory file, initialize to 0, map into memory */
    sm_fd = open(sm_filename, O_RDWR | O_CREAT,
                 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
    if ( sm_fd == -1){
        /*error opening file*/
        printf("mca_sharedfp_sm_file_open: Error, unable to open file for mmap: %s\n",sm_filename);
        free(sm_filename);
        free(sm_data);
        free(sh);
        free(shfileHandle);
        return OMPI_ERROR;
    }

    /* hand ownership of the name to sm_data; it is freed at file close */
    sm_data->sm_filename = sm_filename;

    /*TODO: is it necessary to write to the file first?*/
    if( 0 == rank ){
	memset ( &sm_offset, 0, sizeof (struct sm_offset ));
	write ( sm_fd, &sm_offset, sizeof(struct sm_offset));
    }
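
    /* On the TODO above: the backing file does need to be extended to at
     * least sizeof(struct sm_offset) before the mmap below; touching a
     * mapping beyond the end of the backing file raises SIGBUS. A sketch
     * of a rank-independent alternative to the rank-0 write, assuming
     * POSIX ftruncate is available on the target platform:
     *
     *     if (0 != ftruncate(sm_fd, sizeof(struct sm_offset))) {
     *         // error: close(sm_fd), free resources, return OMPI_ERROR
     *     }
     */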
    comm->c_coll.coll_barrier (comm, comm->c_coll.coll_barrier_module );

    /*the file has been written to, now we can map*/
    sm_offset_ptr = mmap(NULL, sizeof(struct sm_offset), PROT_READ | PROT_WRITE,
			 MAP_SHARED, sm_fd, 0);

    close(sm_fd);

    if ( sm_offset_ptr==MAP_FAILED){
	err = OMPI_ERROR;
	printf("mca_sharedfp_sm_file_open: Error, unable to mmap file: %s\n",sm_filename);
	printf("%s\n", strerror(errno));
        free(sm_filename);
        free(sm_data);
        free(sh);
        free(shfileHandle);
        return OMPI_ERROR;
    }

    /* Initialize semaphore so that is shared between processes.           */
    /* the semaphore is shared by keeping it in the shared memory segment  */

#ifdef OMPIO_SHAREDFP_USE_UNNAMED_SEMAPHORES
    if(sem_init(&sm_offset_ptr->mutex, 1, 1) != -1){
#else
    sm_data->sem_name = (char*) malloc( sizeof(char) * (strlen(filename_basename)+32) );
    if ( NULL == sm_data->sem_name ) {
        munmap(sm_offset_ptr, sizeof(struct sm_offset));
        free(sm_filename);
        free(sm_data);
        free(sh);
        free(shfileHandle);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sprintf(sm_data->sem_name,"OMPIO_sharedfp_sem_%s",filename_basename);

    if( (sm_data->mutex = sem_open(sm_data->sem_name, O_CREAT, 0644, 1)) != SEM_FAILED ) {
#endif
	/*If opening was successful*/
	/*Store the new file handle*/
	sm_data->sm_offset_ptr = sm_offset_ptr;
	/* Assign the sm_data to sh->selected_module_data*/
	sh->selected_module_data   = sm_data;
	/*remember the shared file handle*/
	fh->f_sharedfp_data = sh;

	/*write initial zero*/
	if(rank==0){
	    MPI_Offset position=0;

#ifdef OMPIO_SHAREDFP_USE_UNNAMED_SEMAPHORES
	    sem_wait(&sm_offset_ptr->mutex);
	    sm_offset_ptr->offset=position;
	    sem_post(&sm_offset_ptr->mutex);
#else
	    sem_wait(sm_data->mutex);
	    sm_offset_ptr->offset=position;
	    sem_post(sm_data->mutex);
#endif
	}
    } else {
#ifndef OMPIO_SHAREDFP_USE_UNNAMED_SEMAPHORES
        free(sm_data->sem_name);
#endif
        munmap(sm_offset_ptr, sizeof(struct sm_offset));
        free(sm_filename);
        free(sm_data);
        free(sh);
        free(shfileHandle);
        err = OMPI_ERROR;
    }

    comm->c_coll.coll_barrier (comm, comm->c_coll.coll_barrier_module );

    return err;
}

int mca_sharedfp_sm_file_close (mca_io_ompio_file_t *fh)
{
    int err = OMPI_SUCCESS;
    /*sharedfp data structure*/
    struct mca_sharedfp_base_data_t *sh=NULL;
    /*sharedfp sm module data structure*/
    struct mca_sharedfp_sm_data * file_data=NULL;

    if( NULL == fh->f_sharedfp_data ){
	if ( mca_sharedfp_sm_verbose ) {
	    printf("sharedfp_sm_file_close: shared file pointer structure not initialized\n");
	}
        return OMPI_SUCCESS;
    }
    sh = fh->f_sharedfp_data;

    /* Use an MPI Barrier in order to make sure that
     * all processes are ready to release the
     * shared file pointer resources
     */
    sh->comm->c_coll.coll_barrier (sh->comm, sh->comm->c_coll.coll_barrier_module );

    file_data = (struct mca_sharedfp_sm_data*)(sh->selected_module_data);
    if (file_data)  {
        /*Close sm handle*/
        if (file_data->sm_offset_ptr) {
            /* destroy semaphore */
#ifdef OMPIO_SHAREDFP_USE_UNNAMED_SEMAPHORES
	    sem_destroy(&file_data->sm_offset_ptr->mutex);
#else
	    sem_close (file_data->mutex);
	    sem_unlink (file_data->sem_name);
	    free (file_data->sem_name);
#endif
            /*Release the shared memory segment.*/
            munmap(file_data->sm_offset_ptr,sizeof(struct sm_offset));
            /* remove the backing file; every rank calls this, but the
             * extra removes fail harmlessly once the file is gone */
            remove(file_data->sm_filename);
        }
        /*free our sm data structure*/
        if(file_data->sm_filename){
            free(file_data->sm_filename);
        }
        free(file_data);
    }

    /* Close the main file opened by this component*/
    err = ompio_io_ompio_file_close(sh->sharedfh);

    /*free shared file pointer data struct*/
    free(sh);

    return err;

}
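
The two functions above boil down to a common POSIX pattern: a small struct (the shared pointer plus its lock) lives in a file-backed mmap segment, and the lock is a process-shared semaphore kept inside the segment itself. A self-contained sketch of that pattern, separate from any OMPIO types and assuming Linux-style support for sem_init with pshared=1 (the path name and struct below are hypothetical; build with -pthread):

#include <fcntl.h>
#include <semaphore.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

struct shared_counter {
    sem_t mutex;     /* process-shared lock lives inside the segment */
    long  offset;    /* the shared value itself */
};

int main(void)
{
    const char *path = "/tmp/shm_sketch.sm";   /* hypothetical backing file */
    int fd = open(path, O_RDWR | O_CREAT, 0600);
    if (fd < 0) return 1;
    if (0 != ftruncate(fd, sizeof(struct shared_counter))) return 1;

    struct shared_counter *sc = mmap(NULL, sizeof(*sc),
                                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);                                  /* the mapping survives the close */
    if (MAP_FAILED == sc) return 1;

    sem_init(&sc->mutex, 1 /* pshared */, 1);   /* 1 => shared across processes */
    sc->offset = 0;

    if (0 == fork()) {                          /* child and parent both update */
        sem_wait(&sc->mutex);
        sc->offset += 1;
        sem_post(&sc->mutex);
        _exit(0);
    }
    sem_wait(&sc->mutex);
    sc->offset += 1;
    sem_post(&sc->mutex);
    wait(NULL);

    printf("offset = %ld\n", sc->offset);       /* prints 2 */
    sem_destroy(&sc->mutex);
    munmap(sc, sizeof(*sc));
    unlink(path);
    return 0;
}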
Example #30
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif
    
    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }
    
    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
    
    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before 
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the environment when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);
    
    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv("OMPI_MCA_ess", &orte_launch_environ);
    
    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, sizeof(hostname));
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }
    
    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;        
    }
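    /* (To resume, attach a debugger to this pid and clear the flag --
     * e.g. "set var orted_spin_flag = 0" in gdb.) */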

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }
    
    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }
    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
                
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
                
                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }
                
                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
    
    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }
    
    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
    
    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);
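    /* For reference, a sketch of what this corresponds to in a plain
     * libevent2 program (OPAL wraps libevent, so OPAL_EVLOOP_ONCE maps
     * onto libevent's EVLOOP_ONCE):
     *
     *     while (running) {
     *         event_base_loop(base, EVLOOP_ONCE);  // block until something fires
     *     }
     */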

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;
        orte_grpcomm_collective_t *coll;
        orte_namelist_t *nm;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        
        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->alive = true;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->alive = true;
        proc->app_idx = 0;
        proc->local_proc = true;

        /* account for the collectives in its modex/barriers */
        jdata->peer_modex = orte_grpcomm_base_get_coll_id();
        coll = orte_grpcomm_base_setup_collective(jdata->peer_modex);
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = jdata->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&coll->participants, &nm->super);

        jdata->peer_init_barrier = orte_grpcomm_base_get_coll_id();
        coll = orte_grpcomm_base_setup_collective(jdata->peer_init_barrier);
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = jdata->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&coll->participants, &nm->super);

        jdata->peer_fini_barrier = orte_grpcomm_base_get_coll_id();
        coll = orte_grpcomm_base_setup_collective(jdata->peer_fini_barrier);
        nm = OBJ_NEW(orte_namelist_t);
        nm->name.jobid = jdata->jobid;
        nm->name.vpid = ORTE_VPID_WILDCARD;
        opal_list_append(&coll->participants, &nm->super);

        /* need to setup a pidmap for it */
        if (ORTE_SUCCESS != (ret = orte_util_encode_pidmap(&orte_pidmap, false))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* if we don't yet have a daemon map, then we have to generate one
         * to pass back to it
         */
        if (NULL != orte_nidmap.bytes) {
            free(orte_nidmap.bytes);
        }
        if (ORTE_SUCCESS != (ret = orte_util_encode_nodemap(&orte_nidmap, false))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* create a string that contains our uri + sysinfo */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]", orte_process_info.my_daemon_uri, sysinfo);
	free(sysinfo);

        /* pass that info to the singleton */
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* +1 to include the terminating NUL */

        /* cleanup */
        free(tmp);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], "OMPI_MCA", 8)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign; skip malformed entries without one */
                nptr = strchr(tmp, '=');
                if (NULL == nptr) {
                    free(tmp);
                    continue;
                }
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-mca");
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
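    /* This works as a death notification: the singleton holds the write end
     * open and never writes, so when it exits the kernel closes that end and
     * the read end becomes readable (read() returns 0/EOF), firing this
     * event and invoking pipe_closed. */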
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If we have a parent, then save its contact info so
     * any messages we send can flow through it.
     */

    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        orte_rml.set_contact_info(orte_parent_uri);
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

        /* if requested, include any non-loopback aliases for this node */
        if (orte_retain_aliases) {
            char **aliases=NULL;
            uint8_t naliases, ni;
            char hostname[ORTE_MAX_HOSTNAME_SIZE];

            /* if we stripped the prefix or removed the fqdn,
             * include full hostname as an alias
             */
            gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
            if (strlen(orte_process_info.nodename) < strlen(hostname)) {
                opal_argv_append_nosize(&aliases, hostname);
            }
            opal_ifgetaliases(&aliases);
            naliases = opal_argv_count(aliases);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
            for (ni=0; ni < naliases; ni++) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            }
            opal_argv_free(aliases);
        }

#if OPAL_HAVE_HWLOC
        {
            char *coprocessors;
            /* add the local topology */
            if (NULL != opal_hwloc_topology &&
                (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            /* detect and add any coprocessors */
            coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
            /* see if I am on a coprocessor */
            coprocessors = opal_hwloc_base_check_on_coprocessor();
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        /* send to the HNP's callback - will be routed if routes are available */
        if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                               ORTE_RML_TAG_ORTED_CALLBACK,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
    }

    /* if we are tree-spawning, then we need to capture the MCA params
     * from our cmd line so we can pass them along to the daemons we spawn -
     * otherwise, only the first layer of daemons will ever see them
     */
    if (orted_globals.tree_spawn) {
        int j, k;
        bool ignore;
        char *no_keep[] = {
            "orte_hnp_uri",
            "orte_ess_jobid",
            "orte_ess_vpid",
            "orte_ess_num_procs",
            "orte_parent_uri",
            NULL
        };
        for (i=0; i < argc; i++) {
            if (0 == strcmp("-mca",  argv[i]) ||
                0 == strcmp("--mca", argv[i]) ) {
                ignore = false;
                /* see if this is something we cannot pass along */
                for (k=0; NULL != no_keep[k]; k++) {
                    if (0 == strcmp(no_keep[k], argv[i+1])) {
                        ignore = true;
                        break;
                    }
                }
                if (!ignore) {
                    /* see if this is already present so we at least can
                     * avoid growing the cmd line with duplicates
                     */
                    if (NULL != orted_cmd_line) {
                        for (j=0; NULL != orted_cmd_line[j]; j++) {
                            if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
                                /* already here - ignore it */
                                ignore = true;
                                break;
                            }
                        }
                    }
                    if (!ignore) {
                        opal_argv_append_nosize(&orted_cmd_line, argv[i]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
                    }
                }
                i += 2;
            }
        }
    }
            
    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(orte_exit_status);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
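
A final note on the URI handoff earlier in orte_daemon(): the daemon writes a NUL-terminated string of strlen(tmp)+1 bytes to an inherited pipe fd, so the reader on the other end simply reads until it sees the terminator. A minimal sketch of such a read side, using only POSIX read() (the real singleton-side code lives elsewhere in ORTE; this helper is purely illustrative):

#include <unistd.h>

/* Read a NUL-terminated string from fd into buf (at most len bytes,
 * including the terminator). Returns 0 on success, -1 on EOF, error,
 * or overflow. */
static int read_cstring(int fd, char *buf, size_t len)
{
    size_t have = 0;
    while (have < len) {
        ssize_t n = read(fd, buf + have, 1);  /* byte-at-a-time for clarity */
        if (n <= 0) {
            return -1;                        /* EOF or error before the NUL */
        }
        if ('\0' == buf[have]) {
            return 0;                         /* got the whole string */
        }
        have++;
    }
    return -1;                                /* string longer than buf */
}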