Example #1
0
static void failed_start(orte_job_t *jobdat)
{
    int i;
    orte_proc_t *child;

    /* set the state */
    jobdat->state = ORTE_JOB_STATE_FAILED_TO_START;

    for (i=0; i < orte_local_children->size; i++) {
        if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
            continue;
        }
        /* is this child part of the specified job? */
        if (child->name.jobid == jobdat->jobid) {
            if (ORTE_PROC_STATE_FAILED_TO_START == child->state) {
                /* this proc never launched - flag that the iof
                 * is complete or else we will hang waiting for
                 * pipes to close that were never opened
                 */
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
                /* ditto for waitpid */
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
            }
        }
    }
    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:hnp: job %s reported incomplete start",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jobdat->jobid)));
    return;
}
Example #2
0
static void orte_job_construct(orte_job_t* job)
{
    job->personality = NULL;
    job->jobid = ORTE_JOBID_INVALID;
    job->offset = 0;
    job->apps = OBJ_NEW(opal_pointer_array_t);
    opal_pointer_array_init(job->apps,
                            1,
                            ORTE_GLOBAL_ARRAY_MAX_SIZE,
                            2);
    job->num_apps = 0;
    job->stdin_target = 0;
    job->total_slots_alloc = 0;
    job->num_procs = 0;
    job->procs = OBJ_NEW(opal_pointer_array_t);
    opal_pointer_array_init(job->procs,
                            ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                            ORTE_GLOBAL_ARRAY_MAX_SIZE,
                            ORTE_GLOBAL_ARRAY_BLOCK_SIZE);
    job->map = NULL;
    job->bookmark = NULL;
    job->bkmark_obj = 0;
    job->state = ORTE_JOB_STATE_UNDEF;

    job->num_mapped = 0;
    job->num_launched = 0;
    job->num_reported = 0;
    job->num_terminated = 0;
    job->num_daemons_reported = 0;

    job->originator.jobid = ORTE_JOBID_INVALID;
    job->originator.vpid = ORTE_VPID_INVALID;
    job->num_local_procs = 0;

    job->flags = 0;
    ORTE_FLAG_SET(job, ORTE_JOB_FLAG_GANG_LAUNCHED);
    ORTE_FLAG_SET(job, ORTE_JOB_FLAG_FORWARD_OUTPUT);

    OBJ_CONSTRUCT(&job->attributes, opal_list_t);
}
Example #3
0
orte_proc_t* orte_rmaps_base_setup_proc(orte_job_t *jdata,
                                        orte_node_t *node,
                                        orte_app_idx_t idx)
{
    orte_proc_t *proc;
    int rc;

    proc = OBJ_NEW(orte_proc_t);
    /* set the jobid */
    proc->name.jobid = jdata->jobid;
    /* flag the proc as ready for launch */
    proc->state = ORTE_PROC_STATE_INIT;
    proc->app_idx = idx;
    /* mark the proc as UPDATED so it will be included in the launch */
    ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_UPDATED);
    if (NULL == node->daemon) {
        proc->parent = ORTE_VPID_INVALID;
    } else {
        proc->parent = node->daemon->name.vpid;
    }

    OBJ_RETAIN(node);  /* maintain accounting on object */
    proc->node = node;
    node->num_procs++;
    if (node->slots_inuse < node->slots) {
        node->slots_inuse += orte_rmaps_base.cpus_per_rank;
    }
    if (0 > (rc = opal_pointer_array_add(node->procs, (void*)proc))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(proc);
        return NULL;
    }
    /* retain the proc struct so that we correctly track its release */
    OBJ_RETAIN(proc);

    return proc;
}
Example #4
0
/*
 * Sequentially map the ranks according to the placement in the
 * specified hostfile
 */
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_app_context_t *app;
    int i, n;
    orte_std_cntr_t j;
    opal_list_item_t *item;
    orte_node_t *node, *nd;
    seq_node_t *sq, *save=NULL, *seq;;
    orte_vpid_t vpid;
    orte_std_cntr_t num_nodes;
    int rc;
    opal_list_t default_seq_list;
    opal_list_t node_list, *seq_list, sq_list;
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_seq_component.base_version;
    char *hosts = NULL, *sep, *eptr;
    FILE *fp;
    opal_hwloc_resource_type_t rtype;

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base_framework.framework_output,
                         "%s rmaps:seq called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* this mapper can only handle initial launch
     * when seq mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s is being restarted - seq cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper) {
        if (0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
            /* a mapper has been specified, and it isn't me */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: job %s not using sequential mapper",
                                ORTE_JOBID_PRINT(jdata->jobid));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
        /* we need to process it */
        goto process;
    }
    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s not using seq mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

process:
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:seq: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* convenience def */
    map = jdata->map;

    /* if there is a default hostfile, go and get its ordered list of nodes */
    OBJ_CONSTRUCT(&default_seq_list, opal_list_t);
    if (NULL != orte_default_hostfile) {
        char *hstname = NULL;
        /* open the file */
        fp = fopen(orte_default_hostfile, "r");
        if (NULL == fp) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto error;
        }
        while (NULL != (hstname = orte_getline(fp))) {
            if (0 == strlen(hstname)) {
                free(hstname);
                /* blank line - ignore */
                continue;
            }
            if( '#' == hstname[0] ) {
                free(hstname);
                /* Comment line - ignore */
                continue;
            }
            sq = OBJ_NEW(seq_node_t);
            if (NULL != (sep = strchr(hstname, ' '))) {
                *sep = '\0';
                sep++;
                /* remove any trailing space */
                eptr = sep + strlen(sep) - 1;
                while (eptr > sep && isspace(*eptr)) {
                    eptr--;
                }
                *(eptr+1) = 0;
                sq->cpuset = strdup(sep);
            }

            // Strip off the FQDN if present, ignore IP addresses
            if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) {
                char *ptr;
                if (NULL != (ptr = strchr(hstname, '.'))) {
                    *ptr = '\0';
                }
            }

            sq->hostname = hstname;
            opal_list_append(&default_seq_list, &sq->super);
        }
        fclose(fp);
    }

    /* start at the beginning... */
    vpid = 0;
    jdata->num_procs = 0;
    if (0 < opal_list_get_size(&default_seq_list)) {
        save = (seq_node_t*)opal_list_get_first(&default_seq_list);
    }

    /* default to LOGICAL processors */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, NULL, OPAL_BOOL)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using PHYSICAL processors");
        rtype = OPAL_HWLOC_PHYSICAL;
    } else {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using LOGICAL processors");
        rtype = OPAL_HWLOC_LOGICAL;
    }

    /* initialize all the nodes as not included in this job map */
    for (j=0; j < orte_node_pool->size; j++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
        }
    }

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* dash-host trumps hostfile */
        if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using dash-host nodes on app %s", app->app);
            OBJ_CONSTRUCT(&node_list, opal_list_t);
            /* dash host entries cannot specify cpusets, so used the std function to retrieve the list */
            if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(&node_list, hosts))) {
                ORTE_ERROR_LOG(rc);
                free(hosts);
                goto error;
            }
            free(hosts);
            /* transfer the list to a seq_node_t list */
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            while (NULL != (nd = (orte_node_t*)opal_list_remove_first(&node_list))) {
                sq = OBJ_NEW(seq_node_t);
                sq->hostname = strdup(nd->name);
                opal_list_append(&sq_list, &sq->super);
                OBJ_RELEASE(nd);
            }
            OBJ_DESTRUCT(&node_list);
            seq_list = &sq_list;
        } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
            char *hstname;
            if (NULL == hosts) {
                rc = ORTE_ERR_NOT_FOUND;
                goto error;
            }
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using hostfile %s nodes on app %s", hosts, app->app);
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            /* open the file */
            fp = fopen(hosts, "r");
            if (NULL == fp) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                OBJ_DESTRUCT(&sq_list);
                goto error;
            }
            while (NULL != (hstname = orte_getline(fp))) {
                if (0 == strlen(hstname)) {
                    free(hstname);
                    /* blank line - ignore */
                    continue;
                }
                if( '#' == hstname[0] ) {
                    free(hstname);
                    /* Comment line - ignore */
                    continue;
                }
                sq = OBJ_NEW(seq_node_t);
                if (NULL != (sep = strchr(hstname, ' '))) {
                    *sep = '\0';
                    sep++;
                    /* remove any trailing space */
                    eptr = sep + strlen(sep) - 1;
                    while (eptr > sep && isspace(*eptr)) {
                        eptr--;
                    }
                    *(eptr+1) = 0;
                    sq->cpuset = strdup(sep);
                }

                // Strip off the FQDN if present, ignore IP addresses
                if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hstname) ) {
                    char *ptr;
                    if (NULL != (ptr = strchr(hstname, '.'))) {
                        (*ptr) = '\0';
                    }
                }

                sq->hostname = hstname;
                opal_list_append(&sq_list, &sq->super);
            }
            fclose(fp);
            free(hosts);
            seq_list = &sq_list;
        } else if (0 < opal_list_get_size(&default_seq_list)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using default hostfile nodes on app %s", app->app);
            seq_list = &default_seq_list;
        } else {
            /* can't do anything - no nodes available! */
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            return ORTE_ERR_SILENT;
        }

        /* check for nolocal and remove the head node, if required */
        if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
            for (item  = opal_list_get_first(seq_list);
                    item != opal_list_get_end(seq_list);
                    item  = opal_list_get_next(item) ) {
                seq = (seq_node_t*)item;
                /* need to check ifislocal because the name in the
                 * hostfile may not have been FQDN, while name returned
                 * by gethostname may have been (or vice versa)
                 */
                if (orte_ifislocal(seq->hostname)) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:seq: removing head node %s", seq->hostname);
                    opal_list_remove_item(seq_list, item);
                    OBJ_RELEASE(item);  /* "un-retain" it */
                }
            }
        }

        if (NULL == seq_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(seq_list))) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            return ORTE_ERR_SILENT;
        }

        /* if num_procs wasn't specified, set it now */
        if (0 == app->num_procs) {
            app->num_procs = num_nodes;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: setting num procs to %s for app %s",
                                ORTE_VPID_PRINT(app->num_procs), app->app);
        } else if (num_nodes < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "seq:not-enough-resources", true,
                           app->num_procs, num_nodes);
            return ORTE_ERR_SILENT;
        }

        if (seq_list == &default_seq_list) {
            sq = save;
        } else {
            sq = (seq_node_t*)opal_list_get_first(seq_list);
        }
        for (n=0; n < app->num_procs; n++) {
            /* find this node on the global array - this is necessary so
             * that our mapping gets saved on that array as the objects
             * returned by the hostfile function are -not- on the array
             */
            node = NULL;
            for (j=0; j < orte_node_pool->size; j++) {
                if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
                    continue;
                }
                if (0 == strcmp(sq->hostname, node->name)) {
                    break;
                }
            }
            if (NULL == node) {
                /* wasn't found - that is an error */
                orte_show_help("help-orte-rmaps-seq.txt",
                               "orte-rmaps-seq:resource-not-found",
                               true, sq->hostname);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* ensure the node is in the map */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                OBJ_RETAIN(node);
                opal_pointer_array_add(map->nodes, node);
                jdata->map->num_nodes++;
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
            }
            proc = orte_rmaps_base_setup_proc(jdata, node, i);
            if ((node->slots < (int)node->num_procs) ||
                    (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
                /* check for permission */
                if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                    /* if we weren't given a directive either way, then we will error out
                     * as the #slots were specifically given, either by the host RM or
                     * via hostfile/dash-host */
                    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        return ORTE_ERR_SILENT;
                    } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                        /* if we were explicitly told not to oversubscribe, then don't */
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        return ORTE_ERR_SILENT;
                    }
                }
            }
            /* assign the vpid */
            proc->name.vpid = vpid++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: assign proc %s to node %s for app %s",
                                ORTE_VPID_PRINT(proc->name.vpid), sq->hostname, app->app);

            /* record the cpuset, if given */
            if (NULL != sq->cpuset) {
                hwloc_cpuset_t bitmap;
                char *cpu_bitmap;
                if (NULL == node->topology) {
                    /* not allowed - for sequential cpusets, we must have
                     * the topology info
                     */
                    orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* if we are using hwthreads as cpus and binding to hwthreads, then
                 * we can just copy the cpuset across as it already specifies things
                 * at that level */
                if (opal_hwloc_use_hwthreads_as_cpus &&
                        OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                    cpu_bitmap = strdup(sq->cpuset);
                } else {
                    /* setup the bitmap */
                    bitmap = hwloc_bitmap_alloc();
                    /* parse the slot_list to find the socket and core */
                    if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(sq->cpuset, node->topology, rtype, bitmap))) {
                        ORTE_ERROR_LOG(rc);
                        hwloc_bitmap_free(bitmap);
                        goto error;
                    }
                    /* note that we cannot set the proc locale to any specific object
                     * as the slot list may have assigned it to more than one - so
                     * leave that field NULL
                     */
                    /* set the proc to the specified map */
                    hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap);
                    hwloc_bitmap_free(bitmap);
                }
                orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                    "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s",
                                    ORTE_VPID_PRINT(proc->name.vpid), sq->cpuset, cpu_bitmap);
                /* we are going to bind to cpuset since the user is specifying the cpus */
                OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CPUSET);
                /* note that the user specified the mapping */
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYUSER);
                ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
                /* cleanup */
                free(cpu_bitmap);
            } else {
                hwloc_obj_t locale;

                /* assign the locale - okay for the topo to be null as
                 * it just means it wasn't returned
                 */
                if (NULL != node->topology) {
                    locale = hwloc_get_root_obj(node->topology);
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                       ORTE_ATTR_LOCAL, locale, OPAL_PTR);
                }
            }

            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            /* move to next node */
            sq = (seq_node_t*)opal_list_get_next(&sq->super);
        }

        /** track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list if it came from this app_context */
        if (seq_list != &default_seq_list) {
            OPAL_LIST_DESTRUCT(seq_list);
        } else {
            save = sq;
        }
    }

    return ORTE_SUCCESS;

error:
    OPAL_LIST_DESTRUCT(&default_seq_list);
    return rc;
}
Example #5
0
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata, *pptr;
    opal_buffer_t *alert;
    int rc, i;
    orte_plm_cmd_flag_t cmd;
    char *rtmod;
    orte_std_cntr_t index;
    orte_job_map_t *map;
    orte_node_t *node;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:orted:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* don't update until we are told that all are done */
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_local_procs) {
            /* once everyone registers, notify the HNP */

            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted: notifying HNP all local registered",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            alert = OBJ_NEW(opal_buffer_t);
            /* pack registered command */
            cmd = ORTE_PLM_REGISTERED_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (pptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
            }
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                  ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        /* Release the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         * Do this after we handle termination in case the IOF needs
         * to check to see if all procs from the job are actually terminated
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDALL);
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* if this proc has not already recorded as terminated, then
         * update the accounting here */
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            jdata->num_terminated++;
        }
        /* update the proc state */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED);
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        /* Clean up the session directory as if we were the process
         * itself.  This covers the case where the process died abnormally
         * and didn't cleanup its own session directory.
         */
        orte_session_dir_finalize(proc);
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        rtmod = orte_rml.get_routed(orte_mgmt_conduit);
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes(rtmod)) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                         "%s state:orted all routes gone but proc %s still alive",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         ORTE_NAME_PRINT(&pdata->name)));
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* track job status */
        if (jdata->num_terminated == jdata->num_local_procs &&
            !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) {
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            alert = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the job info */
            if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orted: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            if (0 > (rc = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                  ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we sent it so we ensure we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* cleanup the procs as these are gone */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                /* if this child is part of the job... */
                if (pptr->name.jobid == jdata->jobid) {
                    /* clear the entry in the local children */
                    opal_pointer_array_set_item(orte_local_children, i, NULL);
                    OBJ_RELEASE(pptr);  // maintain accounting
                }
            }
            /* tell the IOF that the job is complete */
            if (NULL != orte_iof.complete) {
                orte_iof.complete(jdata);
            }

            /* tell the PMIx subsystem the job is complete */
            if (NULL != opal_pmix.server_deregister_nspace) {
                opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
            }

            /* release the resources */
            if (NULL != jdata->map) {
                map = jdata->map;
                for (index = 0; index < map->nodes->size; index++) {
                    if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                        continue;
                    }
                    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                         "%s state:orted releasing procs from node %s",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                         node->name));
                    for (i = 0; i < node->procs->size; i++) {
                        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                            continue;
                        }
                        if (pptr->name.jobid != jdata->jobid) {
                            /* skip procs from another job */
                            continue;
                        }
                        node->slots_inuse--;
                        node->num_procs--;
                        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                             "%s state:orted releasing proc %s from node %s",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&pptr->name), node->name));
                        /* set the entry in the node array to NULL */
                        opal_pointer_array_set_item(node->procs, i, NULL);
                        /* release the proc once for the map entry */
                        OBJ_RELEASE(pptr);
                    }
                    /* set the node location to NULL */
                    opal_pointer_array_set_item(map->nodes, index, NULL);
                    /* maintain accounting */
                    OBJ_RELEASE(node);
                    /* flag that the node is no longer in a map */
                    ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
                }
                OBJ_RELEASE(map);
                jdata->map = NULL;
            }

            /* cleanup the job info */
            opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, NULL);
            OBJ_RELEASE(jdata);
        }
    }

  cleanup:
    OBJ_RELEASE(caddy);
}
Example #6
0
static void track_procs(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:staged_hnp:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        OBJ_RELEASE(caddy);
        return;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
        }
    }

    /* if this is a registration, check to see if it came from
     * inside MPI_Init - if it did, that is not acceptable
     */
    if (ORTE_PROC_STATE_REGISTERED == state) {
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_AS_MPI) &&
            !ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_GANG_LAUNCHED)) {
            /* we can't support this - issue an error and abort */
            orte_show_help("help-state-staged-hnp.txt", "mpi-procs-not-supported", true);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SILENT_ABORT);
        }
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        pdata->state = state;
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            goto terminated;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        pdata->state = state;
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            goto terminated;
        }
        OBJ_RELEASE(caddy);
        return;
    }
    
    /* if the proc terminated, see if any other procs are
     * waiting to run. We assume that the app_contexts are
     * in priority order, with the highest priority being
     * at position 0 in the app_context array for this job
     */
    if (ORTE_PROC_STATE_TERMINATED == state) {
    terminated:
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = ORTE_PROC_STATE_TERMINATED;
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            /* no other procs are waiting, so end this job */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        } else if (jdata->num_mapped < jdata->num_procs) {
            /* schedule the job for re-mapping so that procs
             * waiting for resources can execute
             */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
        }
	/* otherwise, do nothing until more procs terminate */
        OBJ_RELEASE(caddy);
        return;
    }
}
Example #7
0
/*
 * NODE
 */
int orte_dt_unpack_node(opal_buffer_t *buffer, void *dest,
                        int32_t *num_vals, opal_data_type_t type)
{
    int rc;
    int32_t i, n, k, count;
    orte_node_t **nodes;
    uint8_t flag;
    orte_attribute_t *kv;

    /* unpack into array of orte_node_t objects */
    nodes = (orte_node_t**) dest;
    for (i=0; i < *num_vals; i++) {

        /* create the node object */
        nodes[i] = OBJ_NEW(orte_node_t);
        if (NULL == nodes[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            return ORTE_ERR_OUT_OF_RESOURCE;
        }

        /* do not unpack the index - meaningless here */

        /* unpack the node name */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         &(nodes[i]->name), &n, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* do not unpack the daemon name or launch id */

        /* unpack the number of procs on the node */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(nodes[i]->num_procs)), &n, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* do not unpack the proc info */

        /* unpack whether we are oversubscribed */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&flag), &n, OPAL_UINT8))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        if (flag) {
            ORTE_FLAG_SET(nodes[i], ORTE_NODE_FLAG_OVERSUBSCRIBED);
        }

        /* unpack the state */
        n = 1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer,
                         (&(nodes[i]->state)), &n, ORTE_NODE_STATE))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* unpack the attributes */
        n=1;
        if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &count,
                                                         &n, ORTE_STD_CNTR))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        for (k=0; k < count; k++) {
            n=1;
            if (ORTE_SUCCESS != (rc = opal_dss_unpack_buffer(buffer, &kv,
                                                             &n, ORTE_ATTRIBUTE))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            kv->local = ORTE_ATTR_GLOBAL;  // obviously not a local value
            opal_list_append(&nodes[i]->attributes, &kv->super);
        }
    }
    return ORTE_SUCCESS;
}
Example #8
0
static int plm_slurm_start_proc(int argc, char **argv, char **env,
                                char *prefix)
{
    int fd;
    int srun_pid;
    char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);
    orte_proc_t *dummy;

    if (NULL == exec_argv) {
        return ORTE_ERR_NOT_FOUND;
    }

    srun_pid = fork();
    if (-1 == srun_pid) {
        ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
        free(exec_argv);
        return ORTE_ERR_SYS_LIMITS_CHILDREN;
    }
    
    /* setup a dummy proc object to track the srun */
    dummy = OBJ_NEW(orte_proc_t);
    dummy->pid = srun_pid;
    /* be sure to mark it as alive so we don't instantly fire */
    ORTE_FLAG_SET(dummy, ORTE_PROC_FLAG_ALIVE);
    /* setup the waitpid so we can find out if srun succeeds! */
    orte_wait_cb(dummy, srun_wait_cb, NULL);

    if (0 == srun_pid) {  /* child */
        char *bin_base = NULL, *lib_base = NULL;

        /* Figure out the basenames for the libdir and bindir.  There
           is a lengthy comment about this in plm_rsh_module.c
           explaining all the rationale for how / why we're doing
           this. */

        lib_base = opal_basename(opal_install_dirs.libdir);
        bin_base = opal_basename(opal_install_dirs.bindir);

        /* If we have a prefix, then modify the PATH and
           LD_LIBRARY_PATH environment variables.  */
        if (NULL != prefix) {
            char *oldenv, *newenv;

            /* Reset PATH */
            oldenv = getenv("PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, bin_base);
            }
            opal_setenv("PATH", newenv, true, &env);
            OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                 "%s plm:slurm: reset PATH: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 newenv));
            free(newenv);

            /* Reset LD_LIBRARY_PATH */
            oldenv = getenv("LD_LIBRARY_PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, lib_base);
            }
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
            OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
                                 "%s plm:slurm: reset LD_LIBRARY_PATH: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 newenv));
            free(newenv);
        }

        fd = open("/dev/null", O_CREAT|O_RDWR|O_TRUNC, 0666);
        if (fd >= 0) {
            dup2(fd, 0);
            /* When not in debug mode and --debug-daemons was not passed,
             * tie stdout/stderr to dev null so we don't see messages from orted
             * EXCEPT if the user has requested that we leave sessions attached
             */
            if (0 > opal_output_get_verbosity(orte_plm_base_framework.framework_output) &&
                !orte_debug_daemons_flag && !orte_leave_session_attached) {
                dup2(fd,1);
                dup2(fd,2);
            }

            /* Don't leave the extra fd to /dev/null open */
            if (fd > 2) {
                close(fd);
            }
        }

        /* get the srun process out of orterun's process group so that
           signals sent from the shell (like those resulting from
           cntl-c) don't get sent to srun */
        setpgid(0, 0);

        execve(exec_argv, argv, env);

        opal_output(0, "plm:slurm:start_proc: exec failed");
        /* don't return - need to exit - returning would be bad -
           we're not in the calling process anymore */
        exit(1);
    } else {  /* parent */
        /* just in case, make sure that the srun process is not in our
           process group any more.  Stevens says always do this on both
           sides of the fork... */
        setpgid(srun_pid, srun_pid);
        
        /* if this is the primary launch - i.e., not a comm_spawn of a
         * child job - then save the pid
         */
        if (!primary_pid_set) {
            primary_srun_pid = srun_pid;
            primary_pid_set = true;
        }
        
        free(exec_argv);
    }

    return ORTE_SUCCESS;
}
Example #9
0
static int setup_child(orte_job_t *jdata,
                       orte_proc_t *child,
                       orte_app_context_t *app)
{
    char *param, *value;
    int rc, i;
    int32_t nrestarts=0, *nrptr;
    bool takeus = false;

    /* see if we are included */
    for (i=0; NULL != jdata->personality[i]; i++) {
        if (0 == strcmp(jdata->personality[i], "ompi")) {
            takeus = true;
            break;
        }
    }
    if (!takeus) {
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* setup the jobid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_jobid_to_string(&value, child->name.jobid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_jobid", value, true, &app->env);
    free(value);

    /* setup the vpid */
    if (ORTE_SUCCESS != (rc = orte_util_convert_vpid_to_string(&value, child->name.vpid))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    opal_setenv("OMPI_MCA_ess_base_vpid", value, true, &app->env);

    /* although the vpid IS the process' rank within the job, users
     * would appreciate being given a public environmental variable
     * that also represents this value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    opal_setenv("OMPI_COMM_WORLD_RANK", value, true, &app->env);
    free(value);  /* done with this now */

    /* users would appreciate being given a public environmental variable
     * that also represents the local rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_LOCAL_RANK_INVALID == child->local_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->local_rank);
    opal_setenv("OMPI_COMM_WORLD_LOCAL_RANK", value, true, &app->env);
    free(value);

    /* users would appreciate being given a public environmental variable
     * that also represents the node rank value - something MPI specific - so
     * do that here.
     *
     * AND YES - THIS BREAKS THE ABSTRACTION BARRIER TO SOME EXTENT.
     * We know - just live with it
     */
    if (ORTE_NODE_RANK_INVALID == child->node_rank) {
        ORTE_ERROR_LOG(ORTE_ERR_VALUE_OUT_OF_BOUNDS);
        rc = ORTE_ERR_VALUE_OUT_OF_BOUNDS;
        return rc;
    }
    asprintf(&value, "%lu", (unsigned long) child->node_rank);
    opal_setenv("OMPI_COMM_WORLD_NODE_RANK", value, true, &app->env);
    /* set an mca param for it too */
    opal_setenv("OMPI_MCA_orte_ess_node_rank", value, true, &app->env);
    free(value);

    /* provide the identifier for the PMIx connection - the
     * PMIx connection is made prior to setting the process
     * name itself. Although in most cases the ID and the
     * process name are the same, it isn't necessarily
     * required */
    orte_util_convert_process_name_to_string(&value, &child->name);
    opal_setenv("PMIX_ID", value, true, &app->env);
    free(value);

    nrptr = &nrestarts;
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NRESTARTS, (void**)&nrptr, OPAL_INT32)) {
        /* pass the number of restarts for this proc - will be zero for
         * an initial start, but procs would like to know if they are being
         * restarted so they can take appropriate action
         */
        asprintf(&value, "%d", nrestarts);
        opal_setenv("OMPI_MCA_orte_num_restarts", value, true, &app->env);
        free(value);
    }

    /* if the proc should not barrier in orte_init, tell it */
    if (orte_get_attribute(&child->attributes, ORTE_PROC_NOBARRIER, NULL, OPAL_BOOL)
        || 0 < nrestarts) {
        opal_setenv("OMPI_MCA_orte_do_not_barrier", "1", true, &app->env);
    }

    /* if we are using staged execution, tell it */
    if (orte_staged_execution) {
        opal_setenv("OMPI_MCA_orte_staged_execution", "1", true, &app->env);
    }

    /* if the proc isn't going to forward IO, then we need to flag that
     * it has "completed" iof termination as otherwise it will never fire
     */
    if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
    }

    /* construct the proc's session dir name */
    if (NULL != orte_process_info.tmpdir_base) {
        value = strdup(orte_process_info.tmpdir_base);
    } else {
        value = NULL;
    }
    param = NULL;
    if (ORTE_SUCCESS != (rc = orte_session_dir_get_name(&param, &value, NULL,
                                                        orte_process_info.nodename,
                                                        NULL, &child->name))) {
        ORTE_ERROR_LOG(rc);
        if (NULL != value) {
            free(value);
        }
        return rc;
    }
    free(value);
    /* pass an envar so the proc can find any files it had prepositioned */
    opal_setenv("OMPI_FILE_LOCATION", param, true, &app->env);

    /* if the user wanted the cwd to be the proc's session dir, then
     * switch to that location now
     */
    if (orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
        /* create the session dir - may not exist */
        if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(param, S_IRWXU))) {
            ORTE_ERROR_LOG(rc);
            /* doesn't exist with correct permissions, and/or we can't
             * create it - either way, we are done
             */
            free(param);
            return rc;
        }
        /* change to it */
        if (0 != chdir(param)) {
            free(param);
            return ORTE_ERROR;
        }
        /* It seems that chdir doesn't
         * adjust the $PWD enviro variable when it changes the directory. This
         * can cause a user to get a different response when doing getcwd vs
         * looking at the enviro variable. To keep this consistent, we explicitly
         * ensure that the PWD enviro variable matches the CWD we moved to.
         *
         * NOTE: if a user's program does a chdir(), then $PWD will once
         * again not match getcwd! This is beyond our control - we are only
         * ensuring they start out matching.
         */
        opal_setenv("PWD", param, true, &app->env);
        /* update the initial wdir value too */
        opal_setenv("OMPI_MCA_initial_wdir", param, true, &app->env);
    }
    free(param);
    return ORTE_SUCCESS;
}
Example #10
0
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_proc_t *pptr, *proct;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    int i;
    int32_t i32, *i32ptr;

    OPAL_OUTPUT_VERBOSE((1, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_hnp: for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* could be a race condition */
        goto cleanup;
    }
    pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    /* we MUST handle a communication failure before doing anything else
     * as it requires some special care to avoid normal termination issues
     * for local application procs
     */
    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* is this to a daemon? */
        if (ORTE_PROC_MY_NAME->jobid != proc->jobid) {
            /* nope - ignore it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure to non-daemon proc - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* if this is my own connection, ignore it */
        if (ORTE_PROC_MY_NAME->vpid == proc->vpid) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure on my own connection - ignoring it",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* mark the daemon as gone */
        ORTE_FLAG_UNSET(pptr, ORTE_PROC_FLAG_ALIVE);
        /* if we have ordered orteds to terminate or abort
         * is in progress, record it */
        if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s Comm failure: daemons terminating - recording daemon %s as gone",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
            /* remove from dependent routes, if it is one */
            orte_routed.route_lost(proc);
            /* if all my routes and local children are gone, then terminate ourselves */
            if (0 == orte_routed.num_routes()) {
                for (i=0; i < orte_local_children->size; i++) {
                    if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                            ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_ALIVE) && proct->state < ORTE_PROC_STATE_UNTERMINATED) {
                        /* at least one is still alive */
                        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                             "%s Comm failure: at least one proc (%s) still alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             ORTE_NAME_PRINT(&proct->name)));
                        goto cleanup;
                    }
                }
                /* call our appropriate exit procedure */
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr_hnp: all routes and children gone - ordering exit",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                     "%s Comm failure: %d routes remain alive",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s Comm failure: daemon %s - aborting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
        /* record the first one to fail */
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            /* output an error message so the user knows what happened */
            orte_show_help("help-errmgr-base.txt", "node-died", true, pptr->node->name);
            /* mark the daemon job as failed */
            jdata->state = ORTE_JOB_STATE_COMM_FAILED;
            /* point to the lowest rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            /* update our exit code */
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* just in case the exit code hadn't been set, do it here - this
             * won't override any reported exit code */
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
        }
        /* abort the system */
        default_hnp_abort(jdata);
        goto cleanup;
    }

    /* update the proc state - can get multiple reports on a proc
     * depending on circumstances, so ensure we only do this once
     */
    if (pptr->state < ORTE_PROC_STATE_TERMINATED) {
        pptr->state = state;
    }

    /* if we were ordered to terminate, mark this proc as dead and see if
     * any of our routes or local  children remain alive - if not, then
     * terminate ourselves. */
    if (orte_orteds_term_ordered) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                if (ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) {
                    goto keep_going;
                }
            }
        }
        /* if all my routes and children are gone, then terminate
           ourselves nicely (i.e., this is a normal termination) */
        if (0 == orte_routed.num_routes()) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default:hnp all routes gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
        }
    }

keep_going:
    /* ensure we record the failed proc properly so we can report
     * the error once we terminate
     */
    switch (state) {
    case ORTE_PROC_STATE_KILLED_BY_CMD:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s killed by cmd",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* we ordered this proc to die, so it isn't an abnormal termination
         * and we don't flag it as such
         */
        if (jdata->num_terminated >= jdata->num_procs) {
            /* this job has terminated */
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        /* don't abort the job as this isn't an abnormal termination */
        break;

    case ORTE_PROC_STATE_ABORTED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s aborted",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    case ORTE_PROC_STATE_ABORTED_BY_SIG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s aborted by signal",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));

        ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        /* track the number of non-zero exits */
        i32 = 0;
        i32ptr = &i32;
        orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
        ++i32;
        orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
        if (orte_abort_non_zero_exit) {

            if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
                jdata->state = ORTE_JOB_STATE_ABORTED_BY_SIG;
                /* point to the first rank to cause the problem */
                orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
                /* retain the object so it doesn't get free'd */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
                ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
                /* abnormal termination - abort, but only do it once
                 * to avoid creating a lot of confusion */
                default_hnp_abort(jdata);
            }
        } else {
            /* user requested we consider this normal termination */
            if (jdata->num_terminated >= jdata->num_procs) {
                /* this job has terminated */
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            }
        }
        break;

    case ORTE_PROC_STATE_TERM_WO_SYNC:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s terminated without sync",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_ABORTED_WO_SYNC;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* now treat a special case - if the proc exit'd without a required
             * sync, it may have done so with a zero exit code. We want to ensure
             * that the user realizes there was an error, so in this -one- case,
             * we overwrite the process' exit code with the default error code
             */
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    case ORTE_PROC_STATE_FAILED_TO_START:
    case ORTE_PROC_STATE_FAILED_TO_LAUNCH:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            if (ORTE_PROC_STATE_FAILED_TO_START) {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
            } else {
                jdata->state = ORTE_JOB_STATE_FAILED_TO_LAUNCH;
            }
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        /* if this was a daemon, report it */
        if (jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
            /* output a message indicating we failed to launch a daemon */
            orte_show_help("help-errmgr-base.txt", "failed-daemon-launch", true);
        }
        break;

    case ORTE_PROC_STATE_CALLED_ABORT:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s called abort with exit code %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc), pptr->exit_code));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_CALLED_ABORT;
            /* point to the first proc to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    case ORTE_PROC_STATE_TERM_NON_ZERO:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s exited with non-zero status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             pptr->exit_code));
        ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
        /* track the number of non-zero exits */
        i32 = 0;
        i32ptr = &i32;
        orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32);
        ++i32;
        orte_set_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, ORTE_ATTR_LOCAL, i32ptr, OPAL_INT32);
        if (orte_abort_non_zero_exit) {
            if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
                jdata->state = ORTE_JOB_STATE_NON_ZERO_TERM;
                /* point to the first rank to cause the problem */
                orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
                /* retain the object so it doesn't get free'd */
                OBJ_RETAIN(pptr);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
                /* abnormal termination - abort, but only do it once
                 * to avoid creating a lot of confusion */
                default_hnp_abort(jdata);
            }
        } else {
            /* user requested we consider this normal termination */
            if (jdata->num_terminated >= jdata->num_procs) {
                /* this job has terminated */
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            }
        }
        break;

    case ORTE_PROC_STATE_HEARTBEAT_FAILED:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s heartbeat failed",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            jdata->state = ORTE_JOB_STATE_HEARTBEAT_FAILED;
            /* point to the first rank to cause the problem */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
            /* retain the object so it doesn't get free'd */
            OBJ_RETAIN(pptr);
            ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
            ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        /* remove from dependent routes, if it is one */
        orte_routed.route_lost(proc);
        break;

    case ORTE_PROC_STATE_UNABLE_TO_SEND_MSG:
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: unable to send message to proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if this proc is one of my daemons, then we are truly
         * hosed - so just exit out
         */
        if (ORTE_PROC_MY_NAME->jobid == proc->jobid) {
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            break;
        }
        if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
            /* abnormal termination - abort, but only do it once
             * to avoid creating a lot of confusion */
            default_hnp_abort(jdata);
        }
        break;

    default:
        /* shouldn't get this, but terminate job if required */
        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:hnp: proc %s default error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc),
                             orte_proc_state_to_str(state)));
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
        }
        break;
    }
    /* if the waitpid fired, be sure to let the state machine know */
    if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_WAITPID)) {
        ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
    }

cleanup:
    OBJ_RELEASE(caddy);
}
Example #11
0
static void proc_errors(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;

    orte_proc_t *child, *ptr;
    opal_buffer_t *alert;
    orte_plm_cmd_flag_t cmd;
    int rc=ORTE_SUCCESS;
    int i;

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted:proc_errors process %s error state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /*
     * if orte is trying to shutdown, just let it
     */
    if (orte_finalizing) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors finalizing - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this is a heartbeat failure, let the HNP handle it */
    if (ORTE_PROC_STATE_HEARTBEAT_FAILED == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors heartbeat failed - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    /* if this was a failed comm, then see if it was to our
     * lifeline
     */
    if (ORTE_PROC_STATE_LIFELINE_LOST == state ||
        ORTE_PROC_STATE_UNABLE_TO_SEND_MSG == state) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:orted lifeline lost - exiting",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* set our exit status */
        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
        /* kill our children */
        killprocs(ORTE_JOBID_WILDCARD, ORTE_VPID_WILDCARD);
        /* terminate - our routed children will see
         * us leave and automatically die
         */
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }

    /* get the job object */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        /* must already be complete */
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors NULL jdata - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    if (ORTE_PROC_STATE_COMM_FAILED == state) {
        /* if it is our own connection, ignore it */
        if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_PROC_MY_NAME, proc)) {
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to self - ignoring error",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            goto cleanup;
        }
        /* was it a daemon? */
        if (proc->jobid != ORTE_PROC_MY_NAME->jobid) {
            /* nope - we can't seem to trust that we will catch the waitpid
             * in this situation, so push this over to be handled as if
             * it were a waitpid trigger so we don't create a bunch of
             * duplicate code */
            OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted:proc_errors comm_failed to non-daemon - handling as waitpid",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* leave the exit code alone - process this as a waitpid */
            odls_base_default_wait_local_proc(child, NULL);
            goto cleanup;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default:orted daemon %s exited",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(proc)));
        /* if we are using static ports, then it is possible that the HNP
         * will not see this termination. So if the HNP didn't order us
         * to terminate, then we should ensure it knows */
        if (orte_static_ports && !orte_orteds_term_ordered) {
            /* send an alert to the HNP */
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* get the proc_t */
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
                goto cleanup;
            }
            /* set the exit code to reflect the problem */
            child->exit_code = ORTE_ERR_COMM_FAILURE;
            /* pack only the data for this daemon - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }

            /* now pack the daemon's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting lost connection to daemon %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(proc)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
            /* continue on */
            goto cleanup;
        }

        if (orte_orteds_term_ordered) {
            /* are any of my children still alive */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                             "%s errmgr:default:orted[%s(%d)] proc %s is alive",
                                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                             __FILE__, __LINE__,
                                             ORTE_NAME_PRINT(&child->name)));
                        goto cleanup;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            } else {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted not exiting, num_routes() == %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (int)orte_routed.num_routes()));
            }
        }
        /* if not, then we can continue */
        goto cleanup;
    }

    if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        goto cleanup;
    }
    /* if this is not a local proc for this job, we can
     * ignore this call
     */
    if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_LOCAL)) {
        OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted:proc_errors proc is not local - ignoring error",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                         "%s errmgr:default_orted got state %s for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         orte_proc_state_to_str(state),
                         ORTE_NAME_PRINT(proc)));

    if (ORTE_PROC_STATE_TERM_NON_ZERO == state) {
        /* update the state */
        child->state = state;
        /* report this as abnormal termination to the HNP, unless we already have
         * done so for this job */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }

            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s abnormally terminated with non-zero status (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(alert);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    if (ORTE_PROC_STATE_FAILED_TO_START == state ||
        ORTE_PROC_STATE_FAILED_TO_LAUNCH == state) {
        /* update the proc state */
        child->state = state;
        /* count the proc as having "terminated" */
        jdata->num_terminated++;
        /* leave the error report in this case to the
         * state machine, which will receive notice
         * when all local procs have attempted to start
         * so that we send a consolidated error report
         * back to the HNP
         */
        goto cleanup;
    }

    if (ORTE_PROC_STATE_TERMINATED < state) {
        /* if we were ordered to terminate, see if
         * any of our routes or local children remain alive - if not, then
         * terminate ourselves. */
        if (orte_orteds_term_ordered) {
            /* mark the child as no longer alive and update the counters, if necessary.
             * we have to do this here as we aren't going to send this to the state
             * machine, and we want to keep the bookkeeping accurate just in case */
            if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_RECORDED);
                jdata->num_terminated++;
            }
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                        goto keep_going;
                    }
                }
            }
            /* if all my routes and children are gone, then terminate
               ourselves nicely (i.e., this is a normal termination) */
            if (0 == orte_routed.num_routes()) {
                OPAL_OUTPUT_VERBOSE((2, orte_errmgr_base_framework.framework_output,
                                     "%s errmgr:default:orted all routes gone - exiting",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            }
            /* no need to alert the HNP - we are already on our way out */
            goto cleanup;
        }

    keep_going:
        /* if the job hasn't completed and the state is abnormally
         * terminated, then we need to alert the HNP right away - but
         * only do this once!
         */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, NULL, OPAL_BOOL)) {
            alert = OBJ_NEW(opal_buffer_t);
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            /* pack only the data for this proc - have to start with the jobid
             * so the receiver can unpack it correctly
             */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            child->state = state;
            /* now pack the child's info */
            if (ORTE_SUCCESS != (rc = pack_state_for_proc(alert, child))) {
                ORTE_ERROR_LOG(rc);
                return;
            }
            OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                                 "%s errmgr:default_orted reporting proc %s aborted to HNP (local procs = %d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name),
                                 jdata->num_local_procs));
            /* send it */
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we notified the HNP for this job so we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_FAIL_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
        /* if the proc has terminated, notify the state machine */
        if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
        goto cleanup;
    }

    /* only other state is terminated - see if anyone is left alive */
    if (!any_live_children(proc->jobid)) {
        alert = OBJ_NEW(opal_buffer_t);
        /* pack update state command */
        cmd = ORTE_PLM_UPDATE_PROC_STATE;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
            ORTE_ERROR_LOG(rc);
            return;
        }
        /* pack the data for the job */
        if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
            ORTE_ERROR_LOG(rc);
            return;
        }

        OPAL_OUTPUT_VERBOSE((5, orte_errmgr_base_framework.framework_output,
                             "%s errmgr:default_orted reporting all procs in %s terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(jdata->jobid)));

        /* remove all of this job's children from the global list */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (ptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (jdata->jobid == ptr->name.jobid) {
                opal_pointer_array_set_item(orte_local_children, i, NULL);
                OBJ_RELEASE(ptr);
            }
        }

        /* ensure the job's local session directory tree is removed */
        orte_session_dir_cleanup(jdata->jobid);

        /* remove this job from our local job data since it is complete */
        opal_pointer_array_set_item(orte_job_data, ORTE_LOCAL_JOBID(jdata->jobid), NULL);
        OBJ_RELEASE(jdata);

        /* send it */
        if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                              ORTE_RML_TAG_PLM,
                                              orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(rc);
        }
        return;
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Example #12
0
static void track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata, *pptr;
    opal_buffer_t *alert;
    int rc, i;
    orte_plm_cmd_flag_t cmd;
    int8_t flag;

    OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                         "%s state:orcm:track_procs called for proc %s state %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         orte_proc_state_to_str(state)));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);
    if (NULL == pdata) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        /* don't update until we are told that all are done */
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_local_procs) {
            /* once everyone registers, send their contact info to
             * the HNP so it is available to debuggers and anyone
             * else that needs it
             */

            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orcm: notifying HNP all local registered",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            alert = OBJ_NEW(opal_buffer_t);
            /* pack registered command */
            cmd = ORTE_PLM_REGISTERED_CMD;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the jobid */
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack all the local child vpids */
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (pptr->name.jobid == proc->jobid) {
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                    if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) {
                        flag = 1;
                    } else {
                        flag = 0;
                    }
                    if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) {
                        ORTE_ERROR_LOG(rc);
                        goto cleanup;
                    }
                }
            }
            /* send it */
            if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                              ORTE_RML_TAG_PLM,
                                                              orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            } else {
                rc = ORTE_SUCCESS;
            }
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* do NOT update the proc state as this can hit
         * while we are still trying to notify the HNP of
         * successful launch for short-lived procs
         */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) &&
            !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* if this proc has not already recorded as terminated, then
         * update the accounting here */
        if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) {
            jdata->num_terminated++;
        }
        /* update the proc state */
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED);
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
        /* Clean up the session directory as if we were the process
         * itself.  This covers the case where the process died abnormally
         * and didn't cleanup its own session directory.
         */
        orte_session_dir_finalize(proc);
        /* track job status */
        if (jdata->num_terminated == jdata->num_local_procs &&
            !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) {
            /* pack update state command */
            cmd = ORTE_PLM_UPDATE_PROC_STATE;
            alert = OBJ_NEW(opal_buffer_t);
            if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            /* pack the job info */
            if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) {
                ORTE_ERROR_LOG(rc);
            }
            /* send it */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid)));
            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert,
                                                  ORTE_RML_TAG_PLM,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
            /* mark that we sent it so we ensure we don't do it again */
            orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
        }
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Example #13
0
int orte_rmaps_rr_byslot(orte_job_t *jdata,
                         orte_app_context_t *app,
                         opal_list_t *node_list,
                         orte_std_cntr_t num_slots,
                         orte_vpid_t num_procs)
{
    int rc, i, nprocs_mapped;
    orte_node_t *node;
    orte_proc_t *proc;
    int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
    hwloc_obj_t obj=NULL;
    float balance;
    bool add_one=false;

    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu",
                        ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);

    /* check to see if we can map all the procs */
    if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) {
        if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                           true, app->num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
    }

    /* first pass: map the number of procs to each node until we
     * map all specified procs or use all allocated slots
     */
    nprocs_mapped = 0;
    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr:slot working node %s",
                            node->name);
        /* get the root object as we are not assigning
         * locale here except at the node level
         */
        if (NULL != node->topology) {
            obj = hwloc_get_root_obj(node->topology);
        }
        if (node->slots <= node->slots_inuse) {
            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rr:slot node %s is full - skipping",
                                node->name);
            continue;
        }
        /* assign a number of procs equal to the number of available
         * slots divided by the number of cpus/rank the user
         * requested
         */
        num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr:slot assigning %d procs to node %s",
                            (int)num_procs_to_assign, node->name);

        for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
            /* add this node to the map - do it only once */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                ++(jdata->map->num_nodes);
            }
            if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            nprocs_mapped++;
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
        }
    }
Example #14
0
int orte_ess_base_orted_setup(char **hosts)
{
    int ret = ORTE_ERROR;
    int fd;
    char log_file[PATH_MAX];
    char *jobidstring;
    char *error = NULL;
    orte_job_t *jdata;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_node_t *node;
    char *param;
    hwloc_obj_t obj;
    unsigned i, j;
    opal_list_t transports;

    /* my name is set, xfer it to the OPAL layer */
    orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
    orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
    orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
    orte_process_info.super.proc_arch = opal_local_arch;
    opal_proc_local_set(&orte_process_info.super);

    plm_in_use = false;
    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /* Set signal handlers to catch kill signals so we can properly clean up
     * after ourselves.
     */
    setup_sighandler(SIGTERM, &term_handler, shutdown_signal);
    setup_sighandler(SIGINT, &int_handler, shutdown_signal);
    /** setup callbacks for signals we should ignore */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_callback);
    signals_set = true;

    /* get the local topology */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "topology discovery";
            goto error;
        }
    }
    /* generate the signature */
    orte_topo_signature = opal_hwloc_base_get_topo_signature(opal_hwloc_topology);
    /* remove the hostname from the topology. Unfortunately, hwloc
     * decided to add the source hostname to the "topology", thus
     * rendering it unusable as a pure topological description. So
     * we remove that information here.
     */
    obj = hwloc_get_root_obj(opal_hwloc_topology);
    for (i=0; i < obj->infos_count; i++) {
        if (NULL == obj->infos[i].name ||
            NULL == obj->infos[i].value) {
            continue;
        }
        if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
            free(obj->infos[i].name);
            free(obj->infos[i].value);
            /* left justify the array */
            for (j=i; j < obj->infos_count-1; j++) {
                obj->infos[j] = obj->infos[j+1];
            }
            obj->infos[obj->infos_count-1].name = NULL;
            obj->infos[obj->infos_count-1].value = NULL;
            obj->infos_count--;
            break;
        }
    }
    if (15 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
        opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
    }

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }

    /* define the HNP name */
    ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_HNP->vpid = 0;

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    /* some environments allow remote launches - e.g., ssh - so
     * open and select something -only- if we are given
     * a specific module to use
     */
    (void) mca_base_var_env_name("plm", &param);

    plm_in_use = !!(getenv(param));
    free (param);

    if (plm_in_use)  {

        if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_open";
            goto error;
        }
        if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_base_select";
            goto error;
        }
    }
    /* setup my session directory here as the OOB may need it */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));

        /* take a pass thru the session directory code to fillin the
         * tmpdir names - don't create anything yet
         */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories laying around
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* set the opal_output env file location to be in the
         * proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        /* setup stdout/stderr */
        if (orte_debug_daemons_file_flag) {
            /* if we are debugging to a file, then send stdout/stderr to
             * the orted log file
             */
            /* get my jobid */
            if (ORTE_SUCCESS != (ret = orte_util_convert_jobid_to_string(&jobidstring,
                                                                         ORTE_PROC_MY_NAME->jobid))) {
                ORTE_ERROR_LOG(ret);
                error = "convert_jobid";
                goto error;
            }
            /* define a log file name in the session directory */
            snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
                     jobidstring, orte_process_info.nodename);
            log_path = opal_os_path(false, orte_process_info.top_session_dir,
                                    log_file, NULL);

            fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
            if (fd < 0) {
                /* couldn't open the file for some reason, so
                 * just connect everything to /dev/null
                 */
                fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666);
            } else {
                dup2(fd, STDOUT_FILENO);
                dup2(fd, STDERR_FILENO);
                if(fd != STDOUT_FILENO && fd != STDERR_FILENO) {
                    close(fd);
                }
            }
        }
    }
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_hash_table_t);
    if (ORTE_SUCCESS != (ret = opal_hash_table_init(orte_job_data, 128))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                               ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                               ORTE_GLOBAL_ARRAY_MAX_SIZE,
                               ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                               ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                               ORTE_GLOBAL_ARRAY_MAX_SIZE,
                               ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }
    /* Setup the job data object for the daemons */
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;
    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, ORTE_PROC_MY_NAME->vpid, node);
    /* point our topology to the one detected locally */
    node->topology = opal_hwloc_topology;

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    proc->pid = orte_process_info.pid;
    proc->state = ORTE_PROC_STATE_RUNNING;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);
    /* record that the daemon (i.e., us) is on this node
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
    node->state = ORTE_NODE_STATE_UP;
    /* now point our proc node field to the node */
    OBJ_RETAIN(node);   /* keep accounting straight */
    proc->node = node;
    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;

    /* setup the PMIx framework - ensure it skips all non-PMIx components,
     * but do not override anything we were given */
    opal_setenv("OMPI_MCA_pmix", "^s1,s2,cray,isolated", false, &environ);
    if (OPAL_SUCCESS != (ret = mca_base_framework_open(&opal_pmix_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_pmix_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pmix_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pmix_base_select";
        goto error;
    }
    /* set the event base */
    opal_pmix_base_set_evbase(orte_event_base);
    /* setup the PMIx server */
    if (ORTE_SUCCESS != (ret = pmix_server_init())) {
        /* the server code already barked, so let's be quiet */
        ret = ORTE_ERR_SILENT;
        error = "pmix_server_init";
        goto error;
    }

    /* Setup the communication infrastructure */
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }

    if (NULL != orte_process_info.my_hnp_uri) {
        /* extract the HNP's name so we can update the routing table */
        if (ORTE_SUCCESS != (ret = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                            ORTE_PROC_MY_HNP, NULL))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_rml_parse_HNP";
            goto error;
        }
        /* Set the contact info in the RML - this won't actually establish
         * the connection, but just tells the RML how to reach the HNP
         * if/when we attempt to send to it
         */
        orte_rml.set_contact_info(orte_process_info.my_hnp_uri);
    }

    /* select the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }

    /* get a conduit for our use - we never route IO over fabric */
    OBJ_CONSTRUCT(&transports, opal_list_t);
    orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                       ORTE_ATTR_LOCAL, orte_mgmt_transport, OPAL_STRING);
    orte_mgmt_conduit = orte_rml.open_conduit(&transports);
    OPAL_LIST_DESTRUCT(&transports);

    OBJ_CONSTRUCT(&transports, opal_list_t);
    orte_set_attribute(&transports, ORTE_RML_TRANSPORT_TYPE,
                       ORTE_ATTR_LOCAL, orte_coll_transport, OPAL_STRING);
    orte_coll_conduit = orte_rml.open_conduit(&transports);
    OPAL_LIST_DESTRUCT(&transports);

     /* add our contact info to our proc object */
     proc->rml_uri = orte_rml.get_contact_info();

   /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    /* Open/select the rtc */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_select";
        goto error;
    }

    /* be sure to update the routing tree so the initial "phone home"
     * to mpirun goes through the tree if static ports were enabled - still
     * need to do it anyway just to initialize things
     */
    orte_routed.update_routing_plan(NULL);

    /* if we are using static ports, then we need to setup
     * the daemon info so the RML can function properly
     * without requiring a wireup stage. This must be done
     * after we enable_comm as that function determines our
     * own port, which we need in order to construct the nidmap
     */
    if (orte_static_ports) {
        /* extract the node info from the environment and
         * build a nidmap from it - this will update the
         * routing plan as well
         */
        if (ORTE_SUCCESS != (ret = orte_util_build_daemon_nidmap(hosts))) {
            ORTE_ERROR_LOG(ret);
            error = "construct daemon map from static ports";
            goto error;
        }
    }

    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     * Do this only if a specific PLM was given to us - the
     * orted has no need of the proxy PLM at all
     */
    if (plm_in_use) {
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_init";
            goto error;
        }
    }

    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(!ORTE_PROC_IS_HNP, ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }

    /* For daemons, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif
    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    /* setup the DFS framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    return ORTE_SUCCESS;
 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    /* remove our use of the session directory tree */
    orte_session_dir_finalize(ORTE_PROC_MY_NAME);
    /* ensure we scrub the session directory tree */
    orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
    return ORTE_ERR_SILENT;
}
Example #15
0
int orte_rmaps_rr_byslot(orte_job_t *jdata,
                         orte_app_context_t *app,
                         opal_list_t *node_list,
                         orte_std_cntr_t num_slots,
                         orte_vpid_t num_procs)
{
    int rc, i, nprocs_mapped;
    orte_node_t *node;
    orte_proc_t *proc;
    int num_procs_to_assign, extra_procs_to_assign=0, nxtra_nodes=0;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj=NULL;
#endif
    float balance;
    bool add_one=false;

    opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:rr: mapping by slot for job %s slots %d num_procs %lu",
                        ORTE_JOBID_PRINT(jdata->jobid), (int)num_slots, (unsigned long)num_procs);

    /* check to see if we can map all the procs */
    if (num_slots < ((int)app->num_procs * orte_rmaps_base.cpus_per_rank)) {
        if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
            orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                           true, app->num_procs, app->app);
            return ORTE_ERR_SILENT;
        }
#if OPAL_HAVE_HWLOC
        /* if we will and are allowed to oversubscribe, and binding was given, then
         * we really should warn the user that we cannot bind
         */
        if (OPAL_BINDING_POLICY_IS_SET(jdata->map->binding)) {
            if ((OPAL_BIND_TO_CORE == OPAL_GET_BINDING_POLICY(jdata->map->binding) ||
                 OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(jdata->map->binding)) &&
                !OPAL_BIND_OVERLOAD_ALLOWED(jdata->map->binding)){
                /* RHC: don't emit this warning at this time while we try to 
                 * determine the best path forward. See
                 * https://svn.open-mpi.org/trac/ompi/ticket/4345
                 * for an explanation
                 orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:oversubscribed",
                 true, num_slots, app->num_procs * orte_rmaps_base.cpus_per_rank);
                 OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
                */
            }
        } else {
            /* don't default to bound */
            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rr: mapping by slot resetting binding policy to NONE as node is oversubscribed");
            OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_NONE);
        }
#endif
    }

    /* first pass: map the number of procs to each node until we
     * map all specified procs or use all allocated slots
     */
    nprocs_mapped = 0;
    OPAL_LIST_FOREACH(node, node_list, orte_node_t) {
        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr:slot working node %s",
                            node->name);
#if OPAL_HAVE_HWLOC
        /* get the root object as we are not assigning
         * locale here except at the node level
         */
        if (NULL != node->topology) {
            obj = hwloc_get_root_obj(node->topology);
        }
#endif
        if (node->slots <= node->slots_inuse) {
            opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:rr:slot node %s is full - skipping",
                                node->name);
            continue;
        }
        /* assign a number of procs equal to the number of available
         * slots divided by the number of cpus/rank the user
         * requested
         */
        num_procs_to_assign = (node->slots - node->slots_inuse) / orte_rmaps_base.cpus_per_rank;
        opal_output_verbose(2, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rr:slot assigning %d procs to node %s",
                            (int)num_procs_to_assign, node->name);

        for (i=0; i < num_procs_to_assign && nprocs_mapped < app->num_procs; i++) {
            /* add this node to the map - do it only once */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                ++(jdata->map->num_nodes);
            }
            if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, app->idx))) {
                return ORTE_ERR_OUT_OF_RESOURCE;
            }
            nprocs_mapped++;
#if OPAL_HAVE_HWLOC
            orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
#endif
        }
    }
Example #16
0
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    int i;
    char *rtmod;
    orte_process_name_t parent, target, *npptr;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get our "lifeline" routed module */
    rtmod = orte_rml.get_routed(orte_mgmt_conduit);

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        if (pdata->state < ORTE_PROC_STATE_TERMINATED) {
            pdata->state = state;
        }
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* tell the PMIx subsystem to cleanup this client */
            opal_pmix.server_deregister_client(proc, NULL, NULL);
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
        }
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
                0 == orte_routed.num_routes(rtmod)) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                        ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
        /* track job status */
        jdata->num_terminated++;
        if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
            /* if they requested notification upon completion, provide it */
            if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NOTIFY_COMPLETION, NULL, OPAL_BOOL)) {
                /* notify_completion => notify the parent of the termination
                 * of this child job. So get the parent jobid info */
                npptr = &parent;
                if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCH_PROXY, (void**)&npptr, OPAL_NAME)) {
                    /* notify everyone who asked for it */
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, &target, ORTE_NAME_WILDCARD);
                } else {
                    target.jobid = jdata->jobid;
                    target.vpid = ORTE_VPID_WILDCARD;
                    _send_notification(OPAL_ERR_JOB_TERMINATED, &target, &parent);
                }
            }
        } else if (ORTE_PROC_STATE_TERMINATED < pdata->state &&
                   !orte_job_term_ordered) {
            /* if this was an abnormal term, notify the other procs of the termination */
            parent.jobid = jdata->jobid;
            parent.vpid = ORTE_VPID_WILDCARD;
            _send_notification(OPAL_ERR_PROC_ABORTED, &pdata->name, &parent);
        }
    }

cleanup:
    OBJ_RELEASE(caddy);
}
Example #17
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    char *rml_uri;
    int i;
    opal_buffer_t *buffer;
    char hostname[100];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif
    
    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    /* init the failure orted vpid to an invalid value */
    orted_globals.fail = ORTE_VPID_INVALID;
    
    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }
    
    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);
    
    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before 
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);
    
    /* purge any ess flag set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    
    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, 100);
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }
    
    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;        
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* if mapreduce set, flag it */
    if (orted_globals.mapreduce) {
        orte_map_reduce = true;
    }

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }
    
    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }
    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

#if OPAL_HAVE_HWLOC
    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, pucpus, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            pucpus = hwloc_bitmap_alloc();
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    goto DONE;
                }
                hwloc_bitmap_and(pucpus, pu->online_cpuset, pu->allowed_cpuset);
                hwloc_bitmap_or(res, ours, pucpus);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(pucpus);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }
#endif

    if ((int)ORTE_VPID_INVALID != orted_globals.fail) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_globals.fail) {
            orted_globals.fail = -1*orted_globals.fail;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_globals.fail) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_globals.fail_delay) {
                ORTE_TIMER_EVENT(orted_globals.fail_delay, 0, shutdown_callback, ORTE_SYS_PRI);
                
            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
                
                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }
                
                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_process_info.my_daemon_uri = orte_rml.get_contact_info();
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;
    
    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }
    
    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);
    
    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        int32_t ljob;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        orte_plm_base_create_jobid(jdata);
        ljob = ORTE_LOCAL_JOBID(jdata->jobid);
        opal_pointer_array_set_item(orte_job_data, ljob, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        
        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* create a string that contains our uri + sysinfo + PMIx server URI */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, pmix_server_uri);
	free(sysinfo);

        /* pass that info to the singleton */
        write(orted_globals.uri_pipe, tmp, strlen(tmp)+1); /* need to add 1 to get the NULL */

        /* cleanup */
        free(tmp);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */

    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;

        /* set the contact info into the hash table */
        orte_rml.set_contact_info(orte_parent_uri);
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(&parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
        /* for now, always include our contact info, even if we are using
         * static ports. Eventually, this will be removed
         */
        rml_uri = orte_rml.get_contact_info();
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* include our node name */
        opal_dss.pack(buffer, &orte_process_info.nodename, 1, OPAL_STRING);

        /* if requested, include any non-loopback aliases for this node */
        if (orte_retain_aliases) {
            char **aliases=NULL;
            uint8_t naliases, ni;
            char hostname[ORTE_MAX_HOSTNAME_SIZE];

            /* if we stripped the prefix or removed the fqdn,
             * include full hostname as an alias
             */
            gethostname(hostname, ORTE_MAX_HOSTNAME_SIZE);
            if (strlen(orte_process_info.nodename) < strlen(hostname)) {
                opal_argv_append_nosize(&aliases, hostname);
            }
            opal_ifgetaliases(&aliases);
            naliases = opal_argv_count(aliases);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &naliases, 1, OPAL_UINT8))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
            for (ni=0; ni < naliases; ni++) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &aliases[ni], 1, OPAL_STRING))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            }
            opal_argv_free(aliases);
        }

#if OPAL_HAVE_HWLOC
        {
            char *coprocessors;
            /* add the local topology */
            if (NULL != opal_hwloc_topology &&
                (1 == ORTE_PROC_MY_NAME->vpid || orte_hetero_nodes)) {
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &opal_hwloc_topology, 1, OPAL_HWLOC_TOPO))) {
                    ORTE_ERROR_LOG(ret);
                }
            }
            /* detect and add any coprocessors */
            coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
            /* see if I am on a coprocessor */
            coprocessors = opal_hwloc_base_check_on_coprocessor();
            if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &coprocessors, 1, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
            }
        }
#endif

        /* send to the HNP's callback - will be routed if routes are available */
        if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buffer,
                                               ORTE_RML_TAG_ORTED_CALLBACK,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }
    }

    /* if we are tree-spawning, then we need to capture the MCA params
     * from our cmd line so we can pass them along to the daemons we spawn -
     * otherwise, only the first layer of daemons will ever see them
     */
    if (orted_globals.tree_spawn) {
        int j, k;
        bool ignore;
        char *no_keep[] = {
            "orte_hnp_uri",
            "orte_ess_jobid",
            "orte_ess_vpid",
            "orte_ess_num_procs",
            "orte_parent_uri",
            "mca_base_env_list",
            NULL
        };
        for (i=0; i < argc; i++) {
            if (0 == strcmp("-"OPAL_MCA_CMD_LINE_ID,  argv[i]) ||
                0 == strcmp("--"OPAL_MCA_CMD_LINE_ID, argv[i]) ) {
                ignore = false;
                /* see if this is something we cannot pass along */
                for (k=0; NULL != no_keep[k]; k++) {
                    if (0 == strcmp(no_keep[k], argv[i+1])) {
                        ignore = true;
                        break;
                    }
                }
                if (!ignore) {
                    /* see if this is already present so we at least can
                     * avoid growing the cmd line with duplicates
                     */
                    if (NULL != orted_cmd_line) {
                        for (j=0; NULL != orted_cmd_line[j]; j++) {
                            if (0 == strcmp(argv[i+1], orted_cmd_line[j])) {
                                /* already here - ignore it */
                                ignore = true;
                                break;
                            }
                        }
                    }
                    if (!ignore) {
                        opal_argv_append_nosize(&orted_cmd_line, argv[i]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+1]);
                        opal_argv_append_nosize(&orted_cmd_line, argv[i+2]);
                    }
                }
                i += 2;
            }
        }
    }
            
    if (orte_debug_daemons_flag) {
        opal_output(0, "%s orted: up and running - waiting for commands!", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
    }
    ret = ORTE_SUCCESS;

    /* loop the event lib until an exit event is detected */
    while (orte_event_base_active) {
        opal_event_loop(orte_event_base, OPAL_EVLOOP_ONCE);
    }

    /* ensure all local procs are dead */
    orte_odls.kill_local_procs(NULL);

 DONE:
    /* update the exit status, in case it wasn't done */
    ORTE_UPDATE_EXIT_STATUS(ret);

    /* cleanup and leave */
    orte_finalize();

    if (orte_debug_flag) {
        fprintf(stderr, "exiting with status %d\n", orte_exit_status);
    }
    exit(orte_exit_status);
}
Example #18
0
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    int i;
    opal_buffer_t *buffer;
    char hostname[OPAL_MAXHOSTNAMELEN];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    bucket = OBJ_NEW(opal_buffer_t);

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess/pmix flags set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, sizeof(hostname));
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    hwloc_bitmap_free(ours);
                    hwloc_bitmap_free(res);
                    goto DONE;
                }
                hwloc_bitmap_or(res, ours, pu->cpuset);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }

    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_debug_failure) {
            orted_debug_failure = -1*orted_debug_failure;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_debug_failure_delay) {
                ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);

            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }

                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
    if (NULL == orte_process_info.my_daemon_uri) {
        /* no way to communicate */
        ret = ORTE_ERROR;
        goto DONE;
    }
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        char **singenv=NULL, *string_key, *env_str;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        /* default to ompi for now */
        opal_argv_append_nosize(&jdata->personality, "ompi");
        orte_plm_base_create_jobid(jdata);
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->parent = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* add the node to the job map */
        OBJ_RETAIN(node);
        opal_pointer_array_add(jdata->map->nodes, node);
        jdata->map->num_nodes++;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* set the ORTE_JOB_TRANSPORT_KEY from the environment */
        orte_pre_condition_transports(jdata, NULL);

        /* register the singleton's nspace with our PMIx server */
        if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
          ORTE_ERROR_LOG(ret);
          goto DONE;
        }
        /* use setup fork to create the envars needed by the singleton */
        if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* append the transport key to the envars needed by the singleton */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto DONE;
        }
        asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        opal_argv_append_nosize(&singenv, env_str);
        free(env_str);

        nptr = opal_argv_join(singenv, '*');
        opal_argv_free(singenv);

        /* create a string that contains our uri + sysinfo + PMIx server URI envars */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
        free(sysinfo);
        free(nptr);

        /* pass that info to the singleton */
        if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { ; /* need to add 1 to get the NULL */
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* cleanup */
        free(tmp);
        close(orted_globals.uri_pipe);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;
        opal_value_t val;

        /* set the contact info into our local database */
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }
        OBJ_CONSTRUCT(&val, opal_value_t);
        val.key = OPAL_PMIX_PROC_URI;
        val.type = OPAL_STRING;
        val.data.string = orte_parent_uri;
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&val);
            goto DONE;
        }
        val.key = NULL;
        val.data.string = NULL;
        OBJ_DESTRUCT(&val);

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        orte_process_name_t target;
        target.jobid = ORTE_PROC_MY_NAME->jobid;

        if (orte_fwd_mpirun_port || orte_static_ports) {
            /* setup the rollup callback */
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
                                    ORTE_RML_PERSISTENT, rollup, NULL);
            target.vpid = ORTE_PROC_MY_NAME->vpid;
            /* since we will be waiting for any children to send us
             * their rollup info before sending to our parent, save
             * a little time in the launch phase by "warming up" the
             * connection to our parent while we wait for our children */
            buffer = OBJ_NEW(opal_buffer_t);  // zero-byte message
            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                   ORTE_PROC_MY_PARENT, buffer,
                                                   ORTE_RML_TAG_WARMUP_CONNECTION,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            target.vpid = 0;
        }

        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* get any connection info we may have pushed */
        {
            opal_value_t *val = NULL, *kv;
            opal_list_t *modex;
            int32_t flag;

            if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
                /* just pack a marker indicating we don't have any to share */
                flag = 0;
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            } else {
                /* the data is returned as a list of key-value pairs in the opal_value_t */
                if (OPAL_PTR == val->type) {
                    modex = (opal_list_t*)val->data.ptr;
                    flag = (int32_t)opal_list_get_size(modex);
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(buffer);
                            goto DONE;
                        }
                    }
                    OPAL_LIST_RELEASE(modex);
                } else {
                    opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key);
                    /* single value */
                    flag = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &val, 1, OPAL_VALUE))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                }
                OBJ_RELEASE(val);
            }
Example #19
0
static int do_parent(orte_app_context_t* context,
                     orte_proc_t *child,
                     char **environ_copy,
                     orte_job_t *jobdat, int read_fd,
                     orte_iof_base_io_conf_t opts)
{
    int rc;
    orte_odls_pipe_err_msg_t msg;
    char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;

    if (NULL != child && ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
        /* connect endpoints IOF */
        rc = orte_iof_base_setup_parent(&child->name, &opts);
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            close(read_fd);

            if (NULL != child) {
                child->state = ORTE_PROC_STATE_UNDEF;
            }
            return rc;
        }
    }

    /* Block reading a message from the pipe */
    while (1) {
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }
        
        /* If Something Bad happened in the read, error out */
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            close(read_fd);
            
            if (NULL != child) {
                child->state = ORTE_PROC_STATE_UNDEF;
            }
            return rc;
        }

        /* Otherwise, we got a warning or error message from the child */
        if (NULL != child) {
            if (msg.fatal) {
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            } else {
                ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
            }
        }

        /* Read in the strings; ensure to terminate them with \0 */
        if (msg.file_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.file_str_len, file);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail", 
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            file[msg.file_str_len] = '\0';
        }
        if (msg.topic_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail", 
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            topic[msg.topic_str_len] = '\0';
        }
        if (msg.msg_str_len > 0) {
            str = calloc(1, msg.msg_str_len + 1);
            if (NULL == str) {
                orte_show_help("help-orte-odls-default.txt", "syscall fail", 
                               true,
                               orte_process_info.nodename, context->app,
                               "opal_fd_read", __FILE__, __LINE__);
                if (NULL != child) {
                    child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            rc = opal_fd_read(read_fd, msg.msg_str_len, str);
        }

        /* Print out what we got.  We already have a rendered string,
           so use orte_show_help_norender(). */
        if (msg.msg_str_len > 0) {
            orte_show_help_norender(file, topic, false, str);
            free(str);
            str = NULL;
        }

        /* If msg.fatal is true, then the child exited with an error.
           Otherwise, whatever we just printed was a warning, so loop
           around and see what else is on the pipe (or if the pipe
           closed, indicating that the child launched
           successfully). */
        if (msg.fatal) {
            if (NULL != child) {
                child->state = ORTE_PROC_STATE_FAILED_TO_START;
                ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
            }
            close(read_fd);
            return ORTE_ERR_FAILED_TO_START;
        }
    }

    /* If we got here, it means that the pipe closed without
       indication of a fatal error, meaning that the child process
       launched successfully. */
    if (NULL != child) {
        child->state = ORTE_PROC_STATE_RUNNING;
        ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
    }
    close(read_fd);
    
    return ORTE_SUCCESS;
}
Example #20
0
static int hostfile_parse_line(int token, opal_list_t* updates,
                               opal_list_t* exclude, bool keep_all)
{
    int rc;
    orte_node_t* node;
    bool got_max = false;
    char* value;
    char **argv;
    char* node_name = NULL;
    char* username = NULL;
    int cnt;
    int number_of_slots = 0;
    char buff[64];

    if (ORTE_HOSTFILE_STRING == token ||
        ORTE_HOSTFILE_HOSTNAME == token ||
        ORTE_HOSTFILE_INT == token ||
        ORTE_HOSTFILE_IPV4 == token ||
        ORTE_HOSTFILE_IPV6 == token) {

        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        argv = opal_argv_split (value, '@');

        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = argv[0];
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        // Strip off the FQDN if present, ignore IP addresses
        if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(node_name) ) {
            char *ptr;
            if (NULL != (ptr = strchr(node_name, '.'))) {
                *ptr = '\0';
            }
        }

        /* if the first letter of the name is '^', then this is a node
         * to be excluded. Remove the ^ character so the nodename is
         * usable, and put it on the exclude list
         */
        if ('^' == node_name[0]) {
            int i, len;
            len = strlen(node_name);
            for (i=1; i < len; i++) {
                node_name[i-1] = node_name[i];
            }
            node_name[len-1] = '\0';  /* truncate */

            OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                                 "%s hostfile: node %s is being excluded",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));

            /* see if this is another name for us */
            if (orte_ifislocal(node_name)) {
                /* Nodename has been allocated, that is for sure */
                free (node_name);
                node_name = strdup(orte_process_info.nodename);
            }

            /* Do we need to make a new node object?  First check to see
               if it's already in the exclude list */
            if (NULL == (node = hostfile_lookup(exclude, node_name))) {
                node = OBJ_NEW(orte_node_t);
                node->name = node_name;
                if (NULL != username) {
                    orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
                }
                opal_list_append(exclude, &node->super);
            } else {
                free(node_name);
            }
            return ORTE_SUCCESS;
        }

        /* this is not a node to be excluded, so we need to process it and
         * add it to the "include" list. See if this host is actually us.
         */
        if (orte_ifislocal(node_name)) {
            /* Nodename has been allocated, that is for sure */
            free (node_name);
            node_name = strdup(orte_process_info.nodename);
        }

        OPAL_OUTPUT_VERBOSE((3, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s is being included - keep all is %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
                             keep_all ? "TRUE" : "FALSE"));

        /* Do we need to make a new node object? */
        if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;
            node->slots = 1;
            if (NULL != username) {
                orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
            }
            opal_list_append(updates, &node->super);
        } else {
            /* this node was already found once - add a slot and mark slots as "given" */
            node->slots++;
            ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
            free(node_name);
        }
    } else if (ORTE_HOSTFILE_RELATIVE == token) {
        /* store this for later processing */
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(orte_util_hostfile_value.sval);
        opal_list_append(updates, &node->super);
    } else if (ORTE_HOSTFILE_RANK == token) {
        /* we can ignore the rank, but we need to extract the node name. we
         * first need to shift over to the other side of the equal sign as
         * this is where the node name will be
         */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_EQUAL != token) {
            token = orte_util_hostfile_lex();
        }
        if (orte_util_hostfile_done) {
            /* bad syntax somewhere */
            return ORTE_ERROR;
        }
        /* next position should be the node name */
        token = orte_util_hostfile_lex();
        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }

        argv = opal_argv_split (value, '@');

        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = argv[0];
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        // Strip off the FQDN if present, ignore IP addresses
        if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(node_name) ) {
            char *ptr;
            if (NULL != (ptr = strchr(node_name, '.'))) {
                *ptr = '\0';
            }
        }

        /* Do we need to make a new node object? */
        if (NULL == (node = hostfile_lookup(updates, node_name))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;
            node->slots = 1;
            if (NULL != username) {
                orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
            }
            opal_list_append(updates, &node->super);
        } else {
            /* add a slot */
            node->slots++;
            free(node_name);
        }
        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s hostfile: node %s slots %d nodes-given %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name, node->slots,
                             ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN) ? "TRUE" : "FALSE"));
        /* mark the slots as "given" since we take them as being the
         * number specified via the rankfile
         */
        ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
        /* skip to end of line */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_NEWLINE != token) {
            token = orte_util_hostfile_lex();
        }
        return ORTE_SUCCESS;
    } else {
        hostfile_parse_error(token);
        return ORTE_ERROR;
    }
    free(username);

    while (!orte_util_hostfile_done) {
        token = orte_util_hostfile_lex();

        switch (token) {
        case ORTE_HOSTFILE_DONE:
            goto done;

        case ORTE_HOSTFILE_NEWLINE:
            goto done;

        case ORTE_HOSTFILE_USERNAME:
            username = hostfile_parse_string();
            if (NULL != username) {
                orte_set_attribute(&node->attributes, ORTE_NODE_USERNAME, ORTE_ATTR_LOCAL, username, OPAL_STRING);
                free(username);
            }
            break;

        case ORTE_HOSTFILE_PORT:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "port",
                               true,
                               cur_hostfile_name, rc);
                return ORTE_ERROR;
            }
            orte_set_attribute(&node->attributes, ORTE_NODE_PORT, ORTE_ATTR_LOCAL, &rc, OPAL_INT);
            break;

        case ORTE_HOSTFILE_COUNT:
        case ORTE_HOSTFILE_CPU:
        case ORTE_HOSTFILE_SLOTS:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "slots",
                               true,
                               cur_hostfile_name, rc);
                opal_list_remove_item(updates, &node->super);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                /* multiple definitions were given for the
                 * slot count - this is not allowed
                 */
                orte_show_help("help-hostfile.txt", "slots-given",
                               true,
                               cur_hostfile_name, node->name);
                opal_list_remove_item(updates, &node->super);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->slots = rc;
            ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);

            /* Ensure that slots_max >= slots */
            if (node->slots_max != 0 && node->slots_max < node->slots) {
                node->slots_max = node->slots;
            }
            break;

        case ORTE_HOSTFILE_SLOTS_MAX:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "max_slots",
                               true,
                               cur_hostfile_name, ((size_t) rc));
                opal_list_remove_item(updates, &node->super);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* Only take this update if it puts us >= node_slots */
            if (rc >= node->slots) {
                if (node->slots_max != rc) {
                    node->slots_max = rc;
                    got_max = true;
                }
            } else {
                orte_show_help("help-hostfile.txt", "max_slots_lt",
                               true,
                               cur_hostfile_name, node->slots, rc);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                opal_list_remove_item(updates, &node->super);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            break;

        case ORTE_HOSTFILE_STRING:
        case ORTE_HOSTFILE_INT:
            /* just ignore it */
            break;

        default:
            hostfile_parse_error(token);
            opal_list_remove_item(updates, &node->super);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
        if (number_of_slots > node->slots) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            opal_list_remove_item(updates, &node->super);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
    }

 done:
    if (got_max && !ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
        node->slots = node->slots_max;
        ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
    }

    return ORTE_SUCCESS;
}
Example #21
0
void orte_state_base_track_procs(int fd, short argc, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_process_name_t *proc = &caddy->name;
    orte_proc_state_t state = caddy->proc_state;
    orte_job_t *jdata;
    orte_proc_t *pdata;
    int i;

    opal_output_verbose(5, orte_state_base_framework.framework_output,
                        "%s state:base:track_procs called for proc %s state %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_NAME_PRINT(proc),
                        orte_proc_state_to_str(state));

    /* get the job object for this proc */
    if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        goto cleanup;
    }
    pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid);

    if (ORTE_PROC_STATE_RUNNING == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_launched++;
        if (jdata->num_launched == jdata->num_procs) {
            if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS);
            } else {
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING);
            }
        }
    } else if (ORTE_PROC_STATE_REGISTERED == state) {
        /* update the proc state */
        pdata->state = state;
        jdata->num_reported++;
        if (jdata->num_reported == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED);
        }
    } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) {
        /* update the proc state */
        pdata->state = state;
        /* Release only the stdin IOF file descriptor for this child, if one
         * was defined. File descriptors for the other IOF channels - stdout,
         * stderr, and stddiag - were released when their associated pipes
         * were cleared and closed due to termination of the process
         */
        if (NULL != orte_iof.close) {
            orte_iof.close(proc, ORTE_IOF_STDIN);
        }
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) {
        /* update the proc state */
        pdata->state = state;
        ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID);
        if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) {
            ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED);
        }
    } else if (ORTE_PROC_STATE_TERMINATED == state) {
        /* update the proc state */
        ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE);
        pdata->state = state;
	if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) {
            /* Clean up the session directory as if we were the process
             * itself.  This covers the case where the process died abnormally
             * and didn't cleanup its own session directory.
             */
            orte_session_dir_finalize(proc);
	}
        /* if we are trying to terminate and our routes are
         * gone, then terminate ourselves IF no local procs
         * remain (might be some from another job)
         */
        if (orte_orteds_term_ordered &&
            0 == orte_routed.num_routes()) {
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) &&
                    ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) {
                    /* at least one is still alive */
                    goto cleanup;
                }
            }
            /* call our appropriate exit procedure */
            OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output,
                                 "%s state:base all routes and children gone - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            goto cleanup;
        }
        /* return the allocated slot for reuse */
        cleanup_node(pdata);
	/* track job status */
	jdata->num_terminated++;
	if (jdata->num_terminated == jdata->num_procs) {
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED);
	}
    }

 cleanup:
    OBJ_RELEASE(caddy);
}
Example #22
0
/* Parse the provided hostfile and filter the nodes that are
 * on the input list, removing those that
 * are not found in the hostfile
 */
int orte_util_filter_hostfile_nodes(opal_list_t *nodes,
                                    char *hostfile,
                                    bool remove)
{
    opal_list_t newnodes, exclude;
    opal_list_item_t *item1, *item2, *next, *item3;
    orte_node_t *node_from_list, *node_from_file, *node_from_pool, *node3;
    int rc = ORTE_SUCCESS;
    char *cptr;
    int num_empty, nodeidx;
    bool want_all_empty = false;
    opal_list_t keep;
    bool found;

    OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                        "%s hostfile: filtering nodes through hostfile %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));

    /* parse the hostfile and create local list of findings */
    OBJ_CONSTRUCT(&newnodes, opal_list_t);
    OBJ_CONSTRUCT(&exclude, opal_list_t);
    if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude, false))) {
        OBJ_DESTRUCT(&newnodes);
        OBJ_DESTRUCT(&exclude);
        return rc;
    }

    /* if the hostfile was empty, then treat it as a no-op filter */
    if (0 == opal_list_get_size(&newnodes)) {
        OBJ_DESTRUCT(&newnodes);
        OBJ_DESTRUCT(&exclude);
        /* indicate that the hostfile was empty */
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    /* remove from the list of newnodes those that are in the exclude list
     * since we could have added duplicate names above due to the */
    while (NULL != (item1 = opal_list_remove_first(&exclude))) {
        node_from_file = (orte_node_t*)item1;
        /* check for matches on nodes */
        for (item2 = opal_list_get_first(&newnodes);
             item2 != opal_list_get_end(&newnodes);
             item2 = opal_list_get_next(item2)) {
            orte_node_t *node = (orte_node_t*)item2;
            if (0 == strcmp(node_from_file->name, node->name)) {
                /* match - remove it */
                opal_output(0, "HOST %s ON EXCLUDE LIST - REMOVING", node->name);
                opal_list_remove_item(&newnodes, item2);
                OBJ_RELEASE(item2);
                break;
            }
        }
        OBJ_RELEASE(item1);
    }

    /* now check our nodes and keep or mark those that match. We can
     * destruct our hostfile list as we go since this won't be needed
     */
    OBJ_CONSTRUCT(&keep, opal_list_t);
    while (NULL != (item2 = opal_list_remove_first(&newnodes))) {
        node_from_file = (orte_node_t*)item2;

        next = opal_list_get_next(item2);

        /* see if this is a relative node syntax */
        if ('+' == node_from_file->name[0]) {
            /* see if we specified empty nodes */
            if ('e' == node_from_file->name[1] ||
                'E' == node_from_file->name[1]) {
                /* request for empty nodes - do they want
                 * all of them?
                 */
                if (NULL != (cptr = strchr(node_from_file->name, ':'))) {
                    /* the colon indicates a specific # are requested */
                    cptr++; /* step past : */
                    num_empty = strtol(cptr, NULL, 10);
                } else {
                    /* want them all - set num_empty to max */
                    num_empty = INT_MAX;
                    want_all_empty = true;
                }
                /* search the list of nodes provided to us and find those
                 * that are empty
                 */
                item1 = opal_list_get_first(nodes);
                while (0 < num_empty && item1 != opal_list_get_end(nodes)) {
                    node_from_list = (orte_node_t*)item1;
                    next = opal_list_get_next(item1);  /* keep our place */
                    if (0 == node_from_list->slots_inuse) {
                        /* check to see if this node is explicitly called
                         * out later - if so, don't use it here
                         */
                        for (item3 = opal_list_get_first(&newnodes);
                             item3 != opal_list_get_end(&newnodes);
                             item3 = opal_list_get_next(item3)) {
                            node3 = (orte_node_t*)item3;
                            if (0 == strcmp(node3->name, node_from_list->name)) {
                                /* match - don't use it */
                                goto skipnode;
                            }
                        }
                        if (remove) {
                            /* remove item from list */
                            opal_list_remove_item(nodes, item1);
                            /* xfer to keep list */
                            opal_list_append(&keep, item1);
                        } else {
                            /* mark as included */
                            ORTE_FLAG_SET(node_from_list, ORTE_NODE_FLAG_MAPPED);
                        }
                        --num_empty;
                    }
                skipnode:
                    item1 = next;
                }
                /* did they get everything they wanted? */
                if (!want_all_empty && 0 < num_empty) {
                    orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
                                   true, num_empty);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
            } else if ('n' == node_from_file->name[1] ||
                       'N' == node_from_file->name[1]) {
                /* they want a specific relative node #, so
                 * look it up on global pool
                 */
                nodeidx = strtol(&node_from_file->name[2], NULL, 10);
                if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
                    /* this is an error */
                    orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
                                   true, nodeidx, node_from_file->name);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
                /* search the list of nodes provided to us and find it */
                for (item1 = opal_list_get_first(nodes);
                     item1 != opal_list_get_end(nodes);
                     item1 = opal_list_get_next(nodes)) {
                    node_from_list = (orte_node_t*)item1;
                    if (0 == strcmp(node_from_list->name, node_from_pool->name)) {
                        if (remove) {
                            /* match - remove item from list */
                            opal_list_remove_item(nodes, item1);
                            /* xfer to keep list */
                            opal_list_append(&keep, item1);
                        } else {
                            /* mark as included */
                            ORTE_FLAG_SET(node_from_list, ORTE_NODE_FLAG_MAPPED);
                        }
                        break;
                    }
                }
            } else {
                /* invalid relative node syntax */
                orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
                               true, node_from_file->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
        } else {
            /* we are looking for a specific node on the list
             * search the provided list of nodes to see if this
             * one is found
             */
            found = false;
            for (item1 = opal_list_get_first(nodes);
                 item1 != opal_list_get_end(nodes);
                 item1 = opal_list_get_next(item1)) {
                node_from_list = (orte_node_t*)item1;
                /* we have converted all aliases for ourself
                 * to our own detected nodename, so no need
                 * to check for interfaces again - a simple
                 * strcmp will suffice */
                if (0 == strcmp(node_from_file->name, node_from_list->name)) {
                    /* if the slot count here is less than the
                     * total slots avail on this node, set it
                     * to the specified count - this allows people
                     * to subdivide an allocation
                     */
                    if (node_from_file->slots < node_from_list->slots) {
                        node_from_list->slots = node_from_file->slots;
                    }
                    if (remove) {
                        /* remove the node from the list */
                        opal_list_remove_item(nodes, item1);
                        /* xfer it to keep list */
                        opal_list_append(&keep, item1);
                    } else {
                        /* mark as included */
                        ORTE_FLAG_SET(node_from_list, ORTE_NODE_FLAG_MAPPED);
                    }
                    found = true;
                    break;
                }
            }
            /* if the host in the newnode list wasn't found,
             * then that is an error we need to report to the
             * user and abort
             */
            if (!found) {
                orte_show_help("help-hostfile.txt", "hostfile:extra-node-not-found",
                               true, hostfile, node_from_file->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
        }
        /* cleanup the newnode list */
        OBJ_RELEASE(item2);
    }

    /* if we still have entries on our hostfile list, then
     * there were requested hosts that were not in our allocation.
     * This is an error - report it to the user and return an error
     */
    if (0 != opal_list_get_size(&newnodes)) {
        orte_show_help("help-hostfile.txt", "not-all-mapped-alloc",
                       true, hostfile);
        while (NULL != (item1 = opal_list_remove_first(&newnodes))) {
            OBJ_RELEASE(item1);
        }
        OBJ_DESTRUCT(&newnodes);
        return ORTE_ERR_SILENT;
    }

    if (!remove) {
        /* all done */
        OBJ_DESTRUCT(&newnodes);
        return ORTE_SUCCESS;
    }

    /* clear the rest of the nodes list */
    while (NULL != (item1 = opal_list_remove_first(nodes))) {
        OBJ_RELEASE(item1);
    }

    /* the nodes list has been cleared - rebuild it in order */
    while (NULL != (item1 = opal_list_remove_first(&keep))) {
        opal_list_append(nodes, item1);
    }

cleanup:
    OBJ_DESTRUCT(&newnodes);

    return rc;
}
Example #23
0
static int rte_init(void)
{
    int ret;
    char *error = NULL;
    char *contact_path, *jobfam_dir;
    orte_job_t *jdata;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    char **aliases, *aptr;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }

    /* setup callback for SIGPIPE */
    setup_sighandler(SIGPIPE, &epipe_handler, epipe_signal_callback);
    /** setup callbacks for abort signals - from this point
     * forward, we need to abort in a manner that allows us
     * to cleanup. However, we cannot directly use libevent
     * to trap these signals as otherwise we cannot respond
     * to them if we are stuck in an event! So instead use
     * the basic POSIX trap functions to handle the signal,
     * and then let that signal handler do some magic to
     * avoid the hang
     *
     * NOTE: posix traps don't allow us to do anything major
     * in them, so use a pipe tied to a libevent event to
     * reach a "safe" place where the termination event can
     * be created
     */
    pipe(term_pipe);
    /* setup an event to attempt normal termination on signal */
    opal_event_set(orte_event_base, &term_handler, term_pipe[0], OPAL_EV_READ, clean_abort, NULL);
    opal_event_set_priority(&term_handler, ORTE_ERROR_PRI);
    opal_event_add(&term_handler, NULL);

    /* Set both ends of this pipe to be close-on-exec so that no
       children inherit it */
    if (opal_fd_set_cloexec(term_pipe[0]) != OPAL_SUCCESS ||
        opal_fd_set_cloexec(term_pipe[1]) != OPAL_SUCCESS) {
        error = "unable to set the pipe to CLOEXEC";
        goto error;
    }

    /* point the signal trap to a function that will activate that event */
    signal(SIGTERM, abort_signal_callback);
    signal(SIGINT, abort_signal_callback);
    signal(SIGHUP, abort_signal_callback);

    /** setup callbacks for signals we should foward */
    setup_sighandler(SIGUSR1, &sigusr1_handler, signal_forward_callback);
    setup_sighandler(SIGUSR2, &sigusr2_handler, signal_forward_callback);
    setup_sighandler(SIGTSTP, &sigtstp_handler, signal_forward_callback);
    setup_sighandler(SIGCONT, &sigcont_handler, signal_forward_callback);
    signals_set = true;

#if OPAL_HAVE_HWLOC
    {
        hwloc_obj_t obj;
        unsigned i, j;

        /* get the local topology */
        if (NULL == opal_hwloc_topology) {
            if (OPAL_SUCCESS != opal_hwloc_base_get_topology()) {
                error = "topology discovery";
                goto error;
            }
        }

        /* remove the hostname from the topology. Unfortunately, hwloc
         * decided to add the source hostname to the "topology", thus
         * rendering it unusable as a pure topological description. So
         * we remove that information here.
         */
        obj = hwloc_get_root_obj(opal_hwloc_topology);
        for (i=0; i < obj->infos_count; i++) {
            if (NULL == obj->infos[i].name ||
                NULL == obj->infos[i].value) {
                continue;
            }
            if (0 == strncmp(obj->infos[i].name, "HostName", strlen("HostName"))) {
                free(obj->infos[i].name);
                free(obj->infos[i].value);
                /* left justify the array */
                for (j=i; j < obj->infos_count-1; j++) {
                    obj->infos[j] = obj->infos[j+1];
                }
                obj->infos[obj->infos_count-1].name = NULL;
                obj->infos[obj->infos_count-1].value = NULL;
                obj->infos_count--;
                break;
            }
        }

        if (4 < opal_output_get_verbosity(orte_ess_base_framework.framework_output)) {
            opal_output(0, "%s Topology Info:", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_dss.dump(0, opal_hwloc_topology, OPAL_HWLOC_TOPO);
        }
    }
#endif

    /* if we are using xml for output, put an mpirun start tag */
    if (orte_xml_output) {
        fprintf(orte_xml_fp, "<mpirun>\n");
        fflush(orte_xml_fp);
    }

    /* open and setup the opal_pstat framework so we can provide
     * process stats if requested
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_pstat_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_pstat_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_pstat_base_select";
        goto error;
    }
  
    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }

    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }

    /* Since we are the HNP, then responsibility for
     * defining the name falls to the PLM component for our
     * respective environment - hence, we have to open the PLM
     * first and select that component.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_plm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_open";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = orte_plm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_base_select";
        goto error;
    }
    /* if we were spawned by a singleton, our jobid was given to us */
    if (NULL != orte_ess_base_jobid) {
        if (ORTE_SUCCESS != (ret = orte_util_convert_string_to_jobid(&ORTE_PROC_MY_NAME->jobid, orte_ess_base_jobid))) {
            ORTE_ERROR_LOG(ret);
            error = "convert_string_to_jobid";
            goto error;
        }
        ORTE_PROC_MY_NAME->vpid = 0;
    } else {
        if (ORTE_SUCCESS != (ret = orte_plm.set_hnp_name())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_plm_set_hnp_name";
            goto error;
        }
    }
    /* Setup the communication infrastructure */
    
    /*
     * OOB Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }

    /*
     * Runtime Messaging Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }

    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    
    /* setup the global job and node arrays */
    orte_job_data = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_job_data,
                                                       1,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       1))) {
        ORTE_ERROR_LOG(ret);
        error = "setup job array";
        goto error;
    }
    
    orte_node_pool = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_pool,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node array";
        goto error;
    }
    orte_node_topologies = OBJ_NEW(opal_pointer_array_t);
    if (ORTE_SUCCESS != (ret = opal_pointer_array_init(orte_node_topologies,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE,
                                                       ORTE_GLOBAL_ARRAY_MAX_SIZE,
                                                       ORTE_GLOBAL_ARRAY_BLOCK_SIZE))) {
        ORTE_ERROR_LOG(ret);
        error = "setup node topologies array";
        goto error;
    }

    /* init the nidmap - just so we register that verbosity */
    orte_util_nidmap_init(NULL);

    /* Setup the job data object for the daemons */        
    /* create and store the job data object */
    jdata = OBJ_NEW(orte_job_t);
    jdata->jobid = ORTE_PROC_MY_NAME->jobid;
    opal_pointer_array_set_item(orte_job_data, 0, jdata);
    /* mark that the daemons have reported as we are the
     * only ones in the system right now, and we definitely
     * are running!
     */
    jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
   
    /* every job requires at least one app */
    app = OBJ_NEW(orte_app_context_t);
    opal_pointer_array_set_item(jdata->apps, 0, app);
    jdata->num_apps++;

    /* create and store a node object where we are */
    node = OBJ_NEW(orte_node_t);
    node->name = strdup(orte_process_info.nodename);
    node->index = opal_pointer_array_set_item(orte_node_pool, 0, node);
#if OPAL_HAVE_HWLOC
    /* add it to the array of known topologies */
    opal_pointer_array_add(orte_node_topologies, opal_hwloc_topology);
#endif

    /* create and store a proc object for us */
    proc = OBJ_NEW(orte_proc_t);
    proc->name.jobid = ORTE_PROC_MY_NAME->jobid;
    proc->name.vpid = ORTE_PROC_MY_NAME->vpid;
    
    proc->pid = orte_process_info.pid;
    proc->rml_uri = orte_rml.get_contact_info();
    proc->state = ORTE_PROC_STATE_RUNNING;
    OBJ_RETAIN(node);  /* keep accounting straight */
    proc->node = node;
    opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc);

    /* record that the daemon (i.e., us) is on this node 
     * NOTE: we do not add the proc object to the node's
     * proc array because we are not an application proc.
     * Instead, we record it in the daemon field of the
     * node object
     */
    OBJ_RETAIN(proc);   /* keep accounting straight */
    node->daemon = proc;
    ORTE_FLAG_SET(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED);
    node->state = ORTE_NODE_STATE_UP;
    
    /* if we are to retain aliases, get ours */
    if (orte_retain_aliases) {
        aliases = NULL;
        opal_ifgetaliases(&aliases);
        /* add our own local name to it */
        opal_argv_append_nosize(&aliases, orte_process_info.nodename);
        aptr = opal_argv_join(aliases, ',');
        opal_argv_free(aliases);
        orte_set_attribute(&node->attributes, ORTE_NODE_ALIAS, ORTE_ATTR_LOCAL, aptr, OPAL_STRING);
        free(aptr);
    }

    /* record that the daemon job is running */
    jdata->num_procs = 1;
    jdata->state = ORTE_JOB_STATE_RUNNING;
    /* obviously, we have "reported" */
    jdata->num_reported = 1;

    /*
     * Routed system
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    
    /* datastore - ensure we don't pickup the pmi component, but
     * don't override anything set by user
     */
    if (NULL == getenv("OMPI_MCA_dstore")) {
        putenv("OMPI_MCA_dstore=^pmi");
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&opal_dstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = opal_dstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "opal_dstore_base_select";
        goto error;
    }
    /* create the handles */
    if (0 > (opal_dstore_peer = opal_dstore.open("PEER"))) {
        error = "opal dstore global";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_internal = opal_dstore.open("INTERNAL"))) {
        error = "opal dstore internal";
        ret = ORTE_ERR_FATAL;
        goto error;
    }
    if (0 > (opal_dstore_nonpeer = opal_dstore.open("NONPEER"))) {
        error = "opal dstore nonpeer";
        ret = ORTE_ERR_FATAL;
        goto error;
    }

    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }

    /* Now provide a chance for the PLM
     * to perform any module-specific init functions. This
     * needs to occur AFTER the communications are setup
     * as it may involve starting a non-blocking recv
     */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }

    /*
     * Setup the remaining resource
     * management and errmgr frameworks - application procs
     * and daemons do not open these frameworks as they only use
     * the hnp proxy support in the PLM framework.
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_ras_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_open";
        goto error;
    }    
    if (ORTE_SUCCESS != (ret = orte_ras_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_ras_base_find_available";
        goto error;
    }
    
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rmaps_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_open";
        goto error;
    }    
    if (ORTE_SUCCESS != (ret = orte_rmaps_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rmaps_base_find_available";
        goto error;
    }
#if OPAL_HAVE_HWLOC
    {
        char *coprocessors, **sns;
        uint32_t h;
        int idx;

        /* if a topology file was given, then the rmaps framework open
         * will have reset our topology. Ensure we always get the right
         * one by setting our node topology afterwards
         */
        node->topology = opal_hwloc_topology;

        /* init the hash table, if necessary */
        if (NULL == orte_coprocessors) {
            orte_coprocessors = OBJ_NEW(opal_hash_table_t);
            opal_hash_table_init(orte_coprocessors, orte_process_info.num_procs);
        }
        /* detect and add any coprocessors */
        coprocessors = opal_hwloc_base_find_coprocessors(opal_hwloc_topology);
        if (NULL != coprocessors) {
            /* separate the serial numbers of the coprocessors
             * on this host
             */
            sns = opal_argv_split(coprocessors, ',');
            for (idx=0; NULL != sns[idx]; idx++) {
                /* compute the hash */
                OPAL_HASH_STR(sns[idx], h);
                /* mark that this coprocessor is hosted by this node */
                opal_hash_table_set_value_uint32(orte_coprocessors, h, (void*)&(ORTE_PROC_MY_NAME->vpid));
            }
            opal_argv_free(sns);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
        /* see if I am on a coprocessor */
        coprocessors = opal_hwloc_base_check_on_coprocessor();
        if (NULL != coprocessors) {
            orte_set_attribute(&node->attributes, ORTE_NODE_SERIAL_NUMBER, ORTE_ATTR_LOCAL, coprocessors, OPAL_STRING);
            free(coprocessors);
            orte_coprocessors_detected = true;
        }
    }
#endif

    /* Open/select the odls */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_odls_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_odls_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_odls_base_select";
        goto error;
    }
    
    /* Open/select the rtc */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rtc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rtc_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rtc_base_select";
        goto error;
    }
    
    /* enable communication with the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }

    /* we are an hnp, so update the contact info field for later use */
    orte_process_info.my_hnp_uri = orte_rml.get_contact_info();
    proc->rml_uri = strdup(orte_process_info.my_hnp_uri);

    /* we are also officially a daemon, so better update that field too */
    orte_process_info.my_daemon_uri = strdup(orte_process_info.my_hnp_uri);
    
    /* setup the orte_show_help system to recv remote output */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SHOW_HELP,
                            ORTE_RML_PERSISTENT, orte_show_help_recv, NULL);

    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        
        /* take a pass thru the session directory code to fillin the
         * tmpdir names - don't create anything yet
         */
        if (ORTE_SUCCESS != (ret = orte_session_dir(false,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir define";
            goto error;
        }
        /* clear the session directory just in case there are
         * stale directories laying around
         */
        orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

        /* now actually create the directory tree */
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        
        /* Once the session directory location has been established, set
           the opal_output hnp file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        
        /* save my contact info in a file for others to find */
        jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
        contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
        free(jobfam_dir);
        
        OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                             "%s writing contact file %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             contact_path));
        
        if (ORTE_SUCCESS != (ret = orte_write_hnp_contact_file(contact_path))) {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s writing contact file failed with error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_ERROR_NAME(ret)));
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
                                 "%s wrote contact file",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        }
        free(contact_path);
    }

    /* setup the routed info - the selected routed component
     * will know what to do. 
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
    
    /* setup I/O forwarding system - must come after we init routes */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_iof_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_iof_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_iof_base_select";
        goto error;
    }
    
    /* setup the FileM */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_filem_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_filem_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_filem_base_select";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }

    /* For HNP, ORTE doesn't need the OPAL CR stuff */
    opal_cr_set_enabled(false);
#else
    opal_cr_set_enabled(false);
#endif

    /*
     * Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    
    /* setup the dfs framework */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_select";
        goto error;
    }

    /* if a tool has launched us and is requesting event reports,
     * then set its contact info into the comm system
     */
    if (orte_report_events) {
        if (ORTE_SUCCESS != (ret = orte_util_comm_connect_tool(orte_report_events_uri))) {
            error = "could not connect to tool";
            goto error;
        }
    }

    /* We actually do *not* want an HNP to voluntarily yield() the
       processor more than necessary.  Orterun already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.
     
       For example: when a message arrives at orterun, we want the
       OS to wake us up in a timely fashion (which most OS's
       seem good about doing) and then we want orterun to process
       the message as fast as possible.  If orterun yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules orterun to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    return ORTE_SUCCESS;

 error:
    if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) {
        orte_show_help("help-orte-runtime.txt",
                       "orte_init:startup:internal-failure",
                       true, error, ORTE_ERROR_NAME(ret), ret);
    }
    
    return ORTE_ERR_SILENT;
}
Example #24
0
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
#if OPAL_HAVE_HWLOC
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
#endif
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map=true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize */
    memset(ppr, 0, OPAL_HWLOC_HWTHREAD_LEVEL * sizeof(opal_hwloc_level_t));

    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
#if OPAL_HAVE_HWLOC
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
#endif
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
#if OPAL_HAVE_HWLOC
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }
#endif

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

#if OPAL_HAVE_HWLOC
    /* convenience */
    level = start;
    lowest = opal_hwloc_levels[start];
#endif

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
#if OPAL_HAVE_HWLOC
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
#endif
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
#if OPAL_HAVE_HWLOC
                obj = hwloc_get_root_obj(node->topology);
#endif
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
#if OPAL_HAVE_HWLOC
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
#endif
                }
#if OPAL_HAVE_HWLOC
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go
                     */
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
#endif
            }

            /* set the total slots used */
            if ((int)node->num_procs <= node->slots) {
                node->slots_inuse = (int)node->num_procs;
            } else {
                node->slots_inuse = node->slots;
            }

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
            }

            /* if we haven't mapped all the procs, continue on to the
             * next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly
         */
        jdata->num_procs += app->num_procs;

        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
Example #25
0
static int plm_alps_start_proc(int argc, char **argv, char **env,
                                char *prefix)
{
    int fd;
    pid_t alps_pid;
    char *exec_argv = opal_path_findv(argv[0], 0, env, NULL);

    if (NULL == exec_argv) {
        return ORTE_ERR_NOT_FOUND;
    }

    alps_pid = fork();
    if (-1 == alps_pid) {
        ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
        return ORTE_ERR_SYS_LIMITS_CHILDREN;
    }
    
    alpsrun = OBJ_NEW(orte_proc_t);
    alpsrun->pid = alps_pid;
    /* be sure to mark it as alive so we don't instantly fire */
    ORTE_FLAG_SET(alpsrun, ORTE_PROC_FLAG_ALIVE);
    /* setup the waitpid so we can find out if alps succeeds! */
    orte_wait_cb(alpsrun, alps_wait_cb, NULL);

    if (0 == alps_pid) {  /* child */
        char *bin_base = NULL, *lib_base = NULL;

        /* Figure out the basenames for the libdir and bindir.  There
           is a lengthy comment about this in plm_rsh_module.c
           explaining all the rationale for how / why we're doing
           this. */

        lib_base = opal_basename(opal_install_dirs.libdir);
        bin_base = opal_basename(opal_install_dirs.bindir);

        /* If we have a prefix, then modify the PATH and
           LD_LIBRARY_PATH environment variables.  */
        if (NULL != prefix) {
            char *oldenv, *newenv;

            /* Reset PATH */
            oldenv = getenv("PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, bin_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, bin_base);
            }
            opal_setenv("PATH", newenv, true, &env);
            if (mca_plm_alps_component.debug) {
                opal_output(0, "plm:alps: reset PATH: %s", newenv);
            }
            free(newenv);

            /* Reset LD_LIBRARY_PATH */
            oldenv = getenv("LD_LIBRARY_PATH");
            if (NULL != oldenv) {
                asprintf(&newenv, "%s/%s:%s", prefix, lib_base, oldenv);
            } else {
                asprintf(&newenv, "%s/%s", prefix, lib_base);
            }
            opal_setenv("LD_LIBRARY_PATH", newenv, true, &env);
            if (mca_plm_alps_component.debug) {
                opal_output(0, "plm:alps: reset LD_LIBRARY_PATH: %s",
                            newenv);
            }
            free(newenv);
        }

        fd = open("/dev/null", O_CREAT|O_WRONLY|O_TRUNC, 0666);
        if(fd > 0) {
            dup2(fd, 0);
        }

        /* When not in debug mode and --debug-daemons was not passed,
         * tie stdout/stderr to dev null so we don't see messages from orted */
        if (0 == mca_plm_alps_component.debug && !orte_debug_daemons_flag) {
            if (fd >= 0) {
                if (fd != 1) {
                    dup2(fd,1);
                }
                if (fd != 2) {
                    dup2(fd,2);
                }
            }
        }

        if (fd > 2) {
            close(fd);
        }

        /* get the alps process out of orterun's process group so that
           signals sent from the shell (like those resulting from
           cntl-c) don't get sent to alps */
        setpgid(0, 0);
         
        
        execve(exec_argv, argv, env);

        opal_output(0, "plm:alps:start_proc: exec failed");
        /* don't return - need to exit - returning would be bad -
           we're not in the calling process anymore */
        exit(1);
    } else {  /* parent */
        /* just in case, make sure that the alps process is not in our
        process group any more.  Stevens says always do this on both
        sides of the fork... */
        setpgid(alps_pid, alps_pid);
        
        free(exec_argv);
    }

    return ORTE_SUCCESS;
}
Example #26
0
static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
{
    int rc;
    orte_odls_pipe_err_msg_t msg;
    char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;

    if (cd->opts.connect_stdin) {
        close(cd->opts.p_stdin[0]);
    }
    close(cd->opts.p_stdout[1]);
    if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
        close(cd->opts.p_stderr[1]);
    }
#if OPAL_PMIX_V1
    close(cd->opts.p_internal[1]);
#endif

    /* Block reading a message from the pipe */
    while (1) {
        rc = opal_fd_read(read_fd, sizeof(msg), &msg);

        /* If the pipe closed, then the child successfully launched */
        if (OPAL_ERR_TIMEOUT == rc) {
            break;
        }

        /* If Something Bad happened in the read, error out */
        if (OPAL_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            close(read_fd);

            if (NULL != cd->child) {
                cd->child->state = ORTE_PROC_STATE_UNDEF;
            }
            return rc;
        }

        /* Otherwise, we got a warning or error message from the child */
        if (NULL != cd->child) {
            if (msg.fatal) {
                ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            } else {
                ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
            }
        }

        /* Read in the strings; ensure to terminate them with \0 */
        if (msg.file_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.file_str_len, file);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            file[msg.file_str_len] = '\0';
        }
        if (msg.topic_str_len > 0) {
            rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
            if (OPAL_SUCCESS != rc) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            topic[msg.topic_str_len] = '\0';
        }
        if (msg.msg_str_len > 0) {
            str = calloc(1, msg.msg_str_len + 1);
            if (NULL == str) {
                orte_show_help("help-orte-odls-alps.txt", "syscall fail",
                               true,
                               orte_process_info.nodename, cd->app,
                               "opal_fd_read", __FILE__, __LINE__);
                if (NULL != cd->child) {
                    cd->child->state = ORTE_PROC_STATE_UNDEF;
                }
                return rc;
            }
            rc = opal_fd_read(read_fd, msg.msg_str_len, str);
        }

        /* Print out what we got.  We already have a rendered string,
           so use orte_show_help_norender(). */
        if (msg.msg_str_len > 0) {
            orte_show_help_norender(file, topic, false, str);
            free(str);
            str = NULL;
        }

        /* If msg.fatal is true, then the child exited with an error.
           Otherwise, whatever we just printed was a warning, so loop
           around and see what else is on the pipe (or if the pipe
           closed, indicating that the child launched
           successfully). */
        if (msg.fatal) {
            if (NULL != cd->child) {
                cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
                ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
            }
            close(read_fd);
            return ORTE_ERR_FAILED_TO_START;
        }
    }

    /* If we got here, it means that the pipe closed without
       indication of a fatal error, meaning that the child process
       launched successfully. */
    if (NULL != cd->child) {
        cd->child->state = ORTE_PROC_STATE_RUNNING;
        ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
    }
    close(read_fd);

    return ORTE_SUCCESS;
}