Ejemplo n.º 1
0
static void unbind_procs(orte_job_t *jdata)
{
    int j;
    orte_proc_t *proc;

    for (j=0; j < jdata->procs->size; j++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, j))) {
            continue;
        }
        orte_remove_attribute(&proc->attributes, ORTE_PROC_HWLOC_BOUND);
        orte_remove_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP);
    }
}
Ejemplo n.º 2
0
static void orte_job_destruct(orte_job_t* job)
{
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_job_t *jdata;
    int n;
    orte_timer_t *evtimer;

    if (NULL == job) {
        /* probably just a race condition - just return */
        return;
    }
    
    if (orte_debug_flag) {
        opal_output(0, "%s Releasing job data for %s",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(job->jobid));
    }
    
    for (n=0; n < job->apps->size; n++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(job->apps, n))) {
            continue;
        }
        OBJ_RELEASE(app);
    }
    OBJ_RELEASE(job->apps);
    
    /* release any pointers in the attributes */
    evtimer = NULL;
    if (orte_get_attribute(&job->attributes, ORTE_JOB_FAILURE_TIMER_EVENT,
                           (void**)&evtimer, OPAL_PTR)) {
        orte_remove_attribute(&job->attributes, ORTE_JOB_FAILURE_TIMER_EVENT);
        /* the timer is a pointer to orte_timer_t */
        OBJ_RELEASE(evtimer);
    }
    proc = NULL;
    if (orte_get_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC,
                           (void**)&proc, OPAL_PTR)) {
        orte_remove_attribute(&job->attributes, ORTE_JOB_ABORTED_PROC);
        /* points to an orte_proc_t */
        OBJ_RELEASE(proc);
    }

    if (NULL != job->map) {
        OBJ_RELEASE(job->map);
        job->map = NULL;
    }
    
    for (n=0; n < job->procs->size; n++) {
        if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(job->procs, n))) {
            continue;
        }
        OBJ_RELEASE(proc);
    }
    OBJ_RELEASE(job->procs);
    
    /* release the attributes */
    OPAL_LIST_DESTRUCT(&job->attributes);

    /* find the job in the global array */
    if (NULL != orte_job_data && ORTE_JOBID_INVALID != job->jobid) {
        for (n=0; n < orte_job_data->size; n++) {
            if (NULL == (jdata = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, n))) {
                continue;
            }
            if (jdata->jobid == job->jobid) {
                /* set the entry to NULL */
                opal_pointer_array_set_item(orte_job_data, n, NULL);
                break;
            }
        }
    }
}
Ejemplo n.º 3
0
/* stuff proc attributes for sending back to a proc */
int orte_pmix_server_register_nspace(orte_job_t *jdata)
{
    int rc;
    orte_proc_t *pptr;
    int i, k, n;
    opal_list_t *info, *pmap;
    opal_value_t *kv;
    orte_node_t *node, *mynode;
    opal_vpid_t vpid;
    char **list, **procs, **micro, *tmp, *regex, *cpulist, *peerlist;
    orte_job_t *dmns;
    orte_job_map_t *map;
    orte_app_context_t *app;
    uid_t uid;
    gid_t gid;
    opal_list_t *cache;

    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s register nspace for %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* setup the info list */
    info = OBJ_NEW(opal_list_t);
    uid = geteuid();
    gid = getegid();

    /* jobid */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOBID);
    kv->data.string = strdup(ORTE_JOBID_PRINT(jdata->jobid));
    kv->type = OPAL_STRING;
    opal_list_append(info, &kv->super);

    /* offset */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NPROC_OFFSET);
    kv->data.uint32 = jdata->offset;
    kv->type = OPAL_UINT32;
    opal_list_append(info, &kv->super);

    /* check for cached values to add to the job info */
    cache = NULL;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) &&
        NULL != cache) {
        while (NULL != (kv = (opal_value_t*)opal_list_remove_first(cache))) {
            opal_list_append(info, &kv->super);
        }
        orte_remove_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE);
        OBJ_RELEASE(cache);
    }

    /* assemble the node and proc map info */
    list = NULL;
    procs = NULL;
    map = jdata->map;
    for (i=0; i < map->nodes->size; i++) {
        micro = NULL;
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            opal_argv_append_nosize(&list, node->name);
            /* assemble all the ranks for this job that are on this node */
            for (k=0; k < node->procs->size; k++) {
                if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                    if (jdata->jobid == pptr->name.jobid) {
                        opal_argv_append_nosize(&micro, ORTE_VPID_PRINT(pptr->name.vpid));
                    }
                }
            }
            /* assemble the rank/node map */
            if (NULL != micro) {
                tmp = opal_argv_join(micro, ',');
                opal_argv_free(micro);
                opal_argv_append_nosize(&procs, tmp);
                free(tmp);
            }
        }
    }
    /* let the PMIx server generate the nodemap regex */
    if (NULL != list) {
        tmp = opal_argv_join(list, ',');
        opal_argv_free(list);
        list = NULL;
        if (OPAL_SUCCESS != (rc = opal_pmix.generate_regex(tmp, &regex))) {
            ORTE_ERROR_LOG(rc);
            free(tmp);
            OPAL_LIST_RELEASE(info);
            return rc;
        }
        free(tmp);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_NODE_MAP);
        kv->type = OPAL_STRING;
        kv->data.string = regex;
        opal_list_append(info, &kv->super);
    }

    /* let the PMIx server generate the procmap regex */
    if (NULL != procs) {
        tmp = opal_argv_join(procs, ';');
        opal_argv_free(procs);
        procs = NULL;
        if (OPAL_SUCCESS != (rc = opal_pmix.generate_ppn(tmp, &regex))) {
            ORTE_ERROR_LOG(rc);
            free(tmp);
            OPAL_LIST_RELEASE(info);
            return rc;
        }
        free(tmp);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_PROC_MAP);
        kv->type = OPAL_STRING;
        kv->data.string = regex;
        opal_list_append(info, &kv->super);
    }

    /* get our local node */
    if (NULL == (dmns = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(dmns->procs, ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    mynode = pptr->node;
    if (NULL == mynode) {
        /* cannot happen */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    /* pass our node ID */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NODEID);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = mynode->index;
    opal_list_append(info, &kv->super);

    /* pass our node size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NODE_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = mynode->num_procs;
    opal_list_append(info, &kv->super);

    /* pass the number of nodes in the job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NUM_NODES);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = map->num_nodes;
    opal_list_append(info, &kv->super);

    /* univ size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_UNIV_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->total_slots_alloc;
    opal_list_append(info, &kv->super);

    /* job size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOB_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_procs;
    opal_list_append(info, &kv->super);

    /* number of apps in this job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOB_NUM_APPS);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_apps;
    opal_list_append(info, &kv->super);

    /* local size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOCAL_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_local_procs;
    opal_list_append(info, &kv->super);

    /* max procs */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_MAX_PROCS);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->total_slots_alloc;
    opal_list_append(info, &kv->super);

    /* register any local clients */
    vpid = ORTE_VPID_MAX;
    for (i=0; i < mynode->procs->size; i++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) {
            continue;
        }
        if (pptr->name.jobid == jdata->jobid) {
            if (pptr->name.vpid < vpid) {
                vpid = pptr->name.vpid;
            }
            /* go ahead and register this client */
            if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
                                                                       (void*)pptr, NULL, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
    }
    /* pass the local ldr */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOCALLDR);
    kv->type = OPAL_VPID;
    kv->data.name.vpid = vpid;
    opal_list_append(info, &kv->super);

    /* for each proc in this job, create an object that
     * includes the info describing the proc so the recipient has a complete
     * picture. This allows procs to connect to each other without
     * any further info exchange, assuming the underlying transports
     * support it. We also pass all the proc-specific data here so
     * that each proc can lookup info about every other proc in the job */

    for (n=0; n < map->nodes->size; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* construct the list of local peers, while adding
         * each proc's locality info */
        list = NULL;
        procs = NULL;
        cpulist = NULL;
        peerlist = NULL;
        vpid = ORTE_VPID_MAX;
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            if (pptr->name.jobid == jdata->jobid) {
                opal_argv_append_nosize(&list, ORTE_VPID_PRINT(pptr->name.vpid));
                if (pptr->name.vpid < vpid) {
                    vpid = pptr->name.vpid;
                }
                /* note that we have to pass the cpuset for each local
                 * peer so locality can be computed */
                tmp = NULL;
                if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING)) {
                    if (NULL != tmp) {
                        opal_argv_append_nosize(&procs, tmp);
                        free(tmp);
                    } else {
                        opal_argv_append_nosize(&procs, "UNBOUND");
                    }
                } else {
                    opal_argv_append_nosize(&procs, "UNBOUND");
                }
            }
        }
        /* construct the list of peers for transmission */
        if (NULL != list) {
            peerlist = opal_argv_join(list, ',');
            opal_argv_free(list);
            list = NULL;
        }
        /* construct the list of cpusets for transmission */
        if (NULL != procs) {
            cpulist = opal_argv_join(procs, ':');
            opal_argv_free(procs);
            procs = NULL;
        }

        /* if this is me, then pass the peers and cpusets to myself
         * in order to maintain backward compatibility for the non-pmix
         * components in OPAL/pmix */
        if (node == mynode) {
            /* pass the list of peers */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(peerlist);
            opal_list_append(info, &kv->super);

            /* pass the list of cpusets */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(cpulist);
            opal_list_append(info, &kv->super);

        }

        /* now cycle across each proc on this node, passing all data that
         * varies by proc */
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            /* only consider procs from this job */
            if (pptr->name.jobid != jdata->jobid) {
                continue;
            }
            /* setup the proc map object */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_PROC_DATA);
            kv->type = OPAL_PTR;
            kv->data.ptr = OBJ_NEW(opal_list_t);
            opal_list_append(info, &kv->super);
            pmap = kv->data.ptr;

            /* must start with rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->name.vpid;
            opal_list_append(pmap, &kv->super);

            /* pass the list of peers */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(peerlist);
            opal_list_append(pmap, &kv->super);

            /* pass the list of cpusets */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_CPUSETS);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(cpulist);
            opal_list_append(pmap, &kv->super);

            /* appnum */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APPNUM);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = pptr->app_idx;
            opal_list_append(pmap, &kv->super);

            /* app ldr */
            app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APPLDR);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = app->first_rank;
            opal_list_append(pmap, &kv->super);

            /* global/univ rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->name.vpid + jdata->offset;
            opal_list_append(pmap, &kv->super);

            /* app rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APP_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->app_rank;
            opal_list_append(pmap, &kv->super);

            /* app size */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_APP_SIZE);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = app->num_procs;
            opal_list_append(info, &kv->super);

            /* local rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_RANK);
            kv->type = OPAL_UINT16;
            kv->data.uint16 = pptr->local_rank;
            opal_list_append(pmap, &kv->super);

            /* node rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_NODE_RANK);
            kv->type = OPAL_UINT16;
            kv->data.uint32 = pptr->node_rank;
            opal_list_append(pmap, &kv->super);

            /* hostname */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_HOSTNAME);
            kv->type = OPAL_STRING;
            kv->data.string = strdup(pptr->node->name);
            opal_list_append(pmap, &kv->super);

            /* node ID */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_NODEID);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = pptr->node->index;
            opal_list_append(pmap, &kv->super);
        }
        /* cleanup */
        if (NULL != cpulist) {
            free(cpulist);
        }
        if (NULL != peerlist) {
            free(peerlist);
        }
    }

    /* mark the job as registered */
    orte_set_attribute(&jdata->attributes, ORTE_JOB_NSPACE_REGISTERED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);

    /* pass it down */
    /* we are in an event, so no need to callback */
    rc = opal_pmix.server_register_nspace(jdata->jobid,
                                          jdata->num_local_procs,
                                          info, NULL, NULL);
    OPAL_LIST_RELEASE(info);

    return rc;
}
Ejemplo n.º 4
0
static void recv_data(int fd, short args, void *cbdata)
{
    bool found;
    int i, rc;
    orte_node_t *nd, *nd2;
    opal_list_t nds, ndtmp;
    opal_list_item_t *item, *itm;
    char recv_msg[8192];
    int nbytes, idx, sjob;
    char **alloc, *nodelist, *tpn;
    local_jobtracker_t *ptr, *jtrk;
    local_apptracker_t *aptrk;
    orte_app_context_t *app;
    orte_jobid_t jobid;
    orte_job_t *jdata;
    char **dash_host = NULL;

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation - data recvd",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));

    /* read the data from the socket and put it in the
     * nodes field of op
     */
    memset(recv_msg, 0, sizeof(recv_msg));
    nbytes = read(fd, recv_msg, sizeof(recv_msg) - 1);

    opal_output_verbose(2, orte_ras_base_framework.framework_output,
                        "%s ras:slurm: dynamic allocation msg: %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), recv_msg);

    /* check if we got something */
    if (0 == nbytes || 0 == strlen(recv_msg) || strstr(recv_msg, "failure") != NULL) {
        /* show an error here - basically, a "nothing was available"
         * message
         */
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true,
                       (0 == strlen(recv_msg)) ? "NO MSG" : recv_msg);
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        return;
    }

    /* break the message into its component parts, separated by colons */
    alloc = opal_argv_split(recv_msg, ':');

    /* the first section contains the ORTE jobid for this allocation */
    tpn = strchr(alloc[0], '=');
    orte_util_convert_string_to_jobid(&jobid, tpn+1);
    /* get the corresponding job object */
    jdata = orte_get_job_data_object(jobid);
    jtrk = NULL;
    /* find the associated tracking object */
    for (item = opal_list_get_first(&jobs);
         item != opal_list_get_end(&jobs);
         item = opal_list_get_next(item)) {
        ptr = (local_jobtracker_t*)item;
        if (ptr->jobid == jobid) {
            jtrk = ptr;
            break;
        }
    }
    if (NULL == jtrk) {
        orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, "NO JOB TRACKER");
        ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALLOC_FAILED);
        opal_argv_free(alloc);
        return;
    }

    /* stop the timeout event */
    opal_event_del(&jtrk->timeout_ev);

    /* cycle across all the remaining parts - each is the allocation for
     * an app in this job
     */
    OBJ_CONSTRUCT(&nds, opal_list_t);
    OBJ_CONSTRUCT(&ndtmp, opal_list_t);
    idx = -1;
    sjob = -1;
    nodelist = NULL;
    tpn = NULL;
    for (i=1; NULL != alloc[i]; i++) {
        if (ORTE_SUCCESS != parse_alloc_msg(alloc[i], &idx, &sjob, &nodelist, &tpn)) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            if (NULL != nodelist) {
                free(nodelist);
            }
            if (NULL != tpn) {
                free(tpn);
            }
            return;
        }
        if (idx < 0) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            orte_show_help("help-ras-slurm.txt", "slurm-dyn-alloc-failed", true, jtrk->cmd);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* release the current dash_host as that contained the *desired* allocation */
        orte_remove_attribute(&app->attributes, ORTE_APP_DASH_HOST);
        /* track the Slurm jobid */
        if (NULL == (aptrk = (local_apptracker_t*)opal_pointer_array_get_item(&jtrk->apps, idx))) {
            aptrk = OBJ_NEW(local_apptracker_t);
            opal_pointer_array_set_item(&jtrk->apps, idx, aptrk);
        }
        aptrk->sjob = sjob;
        /* since the nodelist/tpn may contain regular expressions, parse them */
        if (ORTE_SUCCESS != (rc = orte_ras_slurm_discover(nodelist, tpn, &ndtmp))) {
            ORTE_ERROR_LOG(rc);
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOC_FAILED);
            opal_argv_free(alloc);
            free(nodelist);
            free(tpn);
            return;
        }
        /* transfer the discovered nodes to our node list, and construct
         * the new dash_host entry to match what was allocated
         */
        while (NULL != (item = opal_list_remove_first(&ndtmp))) {
            nd = (orte_node_t*)item;
            opal_argv_append_nosize(&dash_host, nd->name);
            /* check for duplicates */
            found = false;
            for (itm = opal_list_get_first(&nds);
                 itm != opal_list_get_end(&nds);
                 itm = opal_list_get_next(itm)) {
                nd2 = (orte_node_t*)itm;
                if (0 == strcmp(nd->name, nd2->name)) {
                    found = true;
                    nd2->slots += nd->slots;
                    OBJ_RELEASE(item);
                    break;
                }
            }
            if (!found) {
                /* append the new node to our list */
                opal_list_append(&nds, item);
            }
        }
        /* cleanup */
        free(nodelist);
        free(tpn);
    }
    /* cleanup */
    opal_argv_free(alloc);
    OBJ_DESTRUCT(&ndtmp);
    if (NULL != dash_host) {
        tpn = opal_argv_join(dash_host, ',');
        orte_set_attribute(&app->attributes, ORTE_APP_DASH_HOST, ORTE_ATTR_LOCAL, (void*)tpn, OPAL_STRING);
        opal_argv_free(dash_host);
        free(tpn);
    }

    if (opal_list_is_empty(&nds)) {
        /* if we get here, then we were able to contact slurm,
         * which means we are in an actively managed cluster.
         * However, slurm indicated that nothing is currently
         * available that meets our requirements. This is a fatal
         * situation - we do NOT have the option of running on
         * user-specified hosts as the cluster is managed.
         */
        OBJ_DESTRUCT(&nds);
        orte_show_help("help-ras-base.txt", "ras-base:no-allocation", true);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
    }

    /* store the found nodes */
    if (ORTE_SUCCESS != (rc = orte_ras_base_node_insert(&nds, jdata))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&nds);
        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }
    OBJ_DESTRUCT(&nds);

    /* default to no-oversubscribe-allowed for managed systems */
    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
        ORTE_SET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping, ORTE_MAPPING_NO_OVERSUBSCRIBE);
    }
    /* flag that the allocation is managed */
    orte_managed_allocation = true;
    /* move the job along */
    ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_ALLOCATION_COMPLETE);
    /* all done */
    return;
}
Ejemplo n.º 5
0
/* stuff proc attributes for sending back to a proc */
int orte_pmix_server_register_nspace(orte_job_t *jdata, bool force)
{
    int rc;
    orte_proc_t *pptr;
    int i, k, n;
    opal_list_t *info, *pmap;
    opal_value_t *kv;
    orte_node_t *node, *mynode;
    opal_vpid_t vpid;
    char **list, **procs, **micro, *tmp, *regex;
    orte_job_t *dmns;
    orte_job_map_t *map;
    orte_app_context_t *app;
    uid_t uid;
    gid_t gid;
    opal_list_t *cache;
    hwloc_obj_t machine;

    opal_output_verbose(2, orte_pmix_server_globals.output,
                        "%s register nspace for %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* setup the info list */
    info = OBJ_NEW(opal_list_t);
    uid = geteuid();
    gid = getegid();

    /* pass our nspace/rank */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SERVER_NSPACE);
    kv->data.string = strdup(ORTE_JOBID_PRINT(ORTE_PROC_MY_NAME->jobid));
    kv->type = OPAL_STRING;
    opal_list_append(info, &kv->super);

    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SERVER_RANK);
    kv->data.uint32 = ORTE_PROC_MY_NAME->vpid;
    kv->type = OPAL_UINT32;
    opal_list_append(info, &kv->super);

    /* jobid */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOBID);
    kv->data.string = strdup(ORTE_JOBID_PRINT(jdata->jobid));
    kv->type = OPAL_STRING;
    opal_list_append(info, &kv->super);

    /* offset */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NPROC_OFFSET);
    kv->data.uint32 = jdata->offset;
    kv->type = OPAL_UINT32;
    opal_list_append(info, &kv->super);

    /* check for cached values to add to the job info */
    cache = NULL;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE, (void**)&cache, OPAL_PTR) &&
        NULL != cache) {
        while (NULL != (kv = (opal_value_t*)opal_list_remove_first(cache))) {
            opal_list_append(info, &kv->super);
        }
        orte_remove_attribute(&jdata->attributes, ORTE_JOB_INFO_CACHE);
        OBJ_RELEASE(cache);
    }

    /* assemble the node and proc map info */
    list = NULL;
    procs = NULL;
    map = jdata->map;
    for (i=0; i < map->nodes->size; i++) {
        micro = NULL;
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, i))) {
            opal_argv_append_nosize(&list, node->name);
            /* assemble all the ranks for this job that are on this node */
            for (k=0; k < node->procs->size; k++) {
                if (NULL != (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, k))) {
                    if (jdata->jobid == pptr->name.jobid) {
                        opal_argv_append_nosize(&micro, ORTE_VPID_PRINT(pptr->name.vpid));
                    }
                }
            }
            /* assemble the rank/node map */
            if (NULL != micro) {
                tmp = opal_argv_join(micro, ',');
                opal_argv_free(micro);
                opal_argv_append_nosize(&procs, tmp);
                free(tmp);
            }
        }
    }
    /* let the PMIx server generate the nodemap regex */
    if (NULL != list) {
        tmp = opal_argv_join(list, ',');
        opal_argv_free(list);
        list = NULL;
        if (OPAL_SUCCESS != (rc = opal_pmix.generate_regex(tmp, &regex))) {
            ORTE_ERROR_LOG(rc);
            free(tmp);
            OPAL_LIST_RELEASE(info);
            return rc;
        }
        free(tmp);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_NODE_MAP);
        kv->type = OPAL_STRING;
        kv->data.string = regex;
        opal_list_append(info, &kv->super);
    }

    /* let the PMIx server generate the procmap regex */
    if (NULL != procs) {
        tmp = opal_argv_join(procs, ';');
        opal_argv_free(procs);
        procs = NULL;
        if (OPAL_SUCCESS != (rc = opal_pmix.generate_ppn(tmp, &regex))) {
            ORTE_ERROR_LOG(rc);
            free(tmp);
            OPAL_LIST_RELEASE(info);
            return rc;
        }
        free(tmp);
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_PROC_MAP);
        kv->type = OPAL_STRING;
        kv->data.string = regex;
        opal_list_append(info, &kv->super);
    }

    /* get our local node */
    if (NULL == (dmns = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(dmns->procs, ORTE_PROC_MY_NAME->vpid))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    mynode = pptr->node;
    if (NULL == mynode) {
        /* cannot happen */
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OPAL_LIST_RELEASE(info);
        return ORTE_ERR_NOT_FOUND;
    }
    /* pass our node ID */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NODEID);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = mynode->index;
    opal_list_append(info, &kv->super);

    /* pass our node size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NODE_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = mynode->num_procs;
    opal_list_append(info, &kv->super);

    /* pass the number of nodes in the job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_NUM_NODES);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = map->num_nodes;
    opal_list_append(info, &kv->super);

    /* univ size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_UNIV_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->total_slots_alloc;
    opal_list_append(info, &kv->super);

    /* job size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOB_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_procs;
    opal_list_append(info, &kv->super);

    /* number of apps in this job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_JOB_NUM_APPS);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_apps;
    opal_list_append(info, &kv->super);

    /* local size */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOCAL_SIZE);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->num_local_procs;
    opal_list_append(info, &kv->super);

    /* max procs */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_MAX_PROCS);
    kv->type = OPAL_UINT32;
    kv->data.uint32 = jdata->total_slots_alloc;
    opal_list_append(info, &kv->super);

    /* topology signature */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_TOPOLOGY_SIGNATURE);
    kv->type = OPAL_STRING;
    kv->data.string = strdup(orte_topo_signature);
    opal_list_append(info, &kv->super);

    /* total available physical memory */
    machine = hwloc_get_next_obj_by_type (opal_hwloc_topology, HWLOC_OBJ_MACHINE, NULL);
    if (NULL != machine) {
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_AVAIL_PHYS_MEMORY);
        kv->type = OPAL_UINT64;
#if HWLOC_API_VERSION < 0x20000
        kv->data.uint64 = machine->memory.total_memory;
#else
        kv->data.uint64 = machine->total_memory;
#endif
        opal_list_append(info, &kv->super);
    }

    /* pass the mapping policy used for this job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_MAPBY);
    kv->type = OPAL_STRING;
    kv->data.string = strdup(orte_rmaps_base_print_mapping(jdata->map->mapping));
    opal_list_append(info, &kv->super);

    /* pass the ranking policy used for this job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_RANKBY);
    kv->type = OPAL_STRING;
    kv->data.string = strdup(orte_rmaps_base_print_ranking(jdata->map->ranking));
    opal_list_append(info, &kv->super);

    /* pass the binding policy used for this job */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_BINDTO);
    kv->type = OPAL_STRING;
    kv->data.string = strdup(opal_hwloc_base_print_binding(jdata->map->binding));
    opal_list_append(info, &kv->super);



    /* register any local clients */
    vpid = ORTE_VPID_MAX;
    micro = NULL;
    for (i=0; i < mynode->procs->size; i++) {
        if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(mynode->procs, i))) {
            continue;
        }
        if (pptr->name.jobid == jdata->jobid) {
            opal_argv_append_nosize(&micro, ORTE_VPID_PRINT(pptr->name.vpid));
            if (pptr->name.vpid < vpid) {
                vpid = pptr->name.vpid;
            }
            /* go ahead and register this client */
            if (OPAL_SUCCESS != (rc = opal_pmix.server_register_client(&pptr->name, uid, gid,
                                                                       (void*)pptr, NULL, NULL))) {
                ORTE_ERROR_LOG(rc);
            }
        }
    }
    if (NULL != micro) {
        /* pass the local peers */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_LOCAL_PEERS);
        kv->type = OPAL_STRING;
        kv->data.string = opal_argv_join(micro, ',');
        opal_argv_free(micro);
        opal_list_append(info, &kv->super);
    }

    /* pass the local ldr */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_LOCALLDR);
    kv->type = OPAL_VPID;
    kv->data.name.vpid = vpid;
    opal_list_append(info, &kv->super);

    /* for each proc in this job, create an object that
     * includes the info describing the proc so the recipient has a complete
     * picture. This allows procs to connect to each other without
     * any further info exchange, assuming the underlying transports
     * support it. We also pass all the proc-specific data here so
     * that each proc can lookup info about every other proc in the job */

    for (n=0; n < map->nodes->size; n++) {
        if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, n))) {
            continue;
        }
        /* cycle across each proc on this node, passing all data that
         * varies by proc */
        for (i=0; i < node->procs->size; i++) {
            if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                continue;
            }
            /* only consider procs from this job */
            if (pptr->name.jobid != jdata->jobid) {
                continue;
            }
            /* setup the proc map object */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_PROC_DATA);
            kv->type = OPAL_PTR;
            kv->data.ptr = OBJ_NEW(opal_list_t);
            opal_list_append(info, &kv->super);
            pmap = kv->data.ptr;

            /* must start with rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->name.vpid;
            opal_list_append(pmap, &kv->super);

            /* location, for local procs */
            if (node == mynode) {
                tmp = NULL;
                if (orte_get_attribute(&pptr->attributes, ORTE_PROC_CPU_BITMAP, (void**)&tmp, OPAL_STRING) &&
                    NULL != tmp) {
                    kv = OBJ_NEW(opal_value_t);
                    kv->key = strdup(OPAL_PMIX_LOCALITY_STRING);
                    kv->type = OPAL_STRING;
                    kv->data.string = opal_hwloc_base_get_locality_string(opal_hwloc_topology, tmp);
                    opal_list_append(pmap, &kv->super);
                    free(tmp);
                } else {
                    /* the proc is not bound */
                    kv = OBJ_NEW(opal_value_t);
                    kv->key = strdup(OPAL_PMIX_LOCALITY_STRING);
                    kv->type = OPAL_STRING;
                    kv->data.string = NULL;
                    opal_list_append(pmap, &kv->super);
                }
            }

            /* global/univ rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_GLOBAL_RANK);
            kv->type = OPAL_VPID;
            kv->data.name.vpid = pptr->name.vpid + jdata->offset;
            opal_list_append(pmap, &kv->super);

            if (1 < jdata->num_apps) {
                /* appnum */
                kv = OBJ_NEW(opal_value_t);
                kv->key = strdup(OPAL_PMIX_APPNUM);
                kv->type = OPAL_UINT32;
                kv->data.uint32 = pptr->app_idx;
                opal_list_append(pmap, &kv->super);

                /* app ldr */
                app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
                kv = OBJ_NEW(opal_value_t);
                kv->key = strdup(OPAL_PMIX_APPLDR);
                kv->type = OPAL_VPID;
                kv->data.name.vpid = app->first_rank;
                opal_list_append(pmap, &kv->super);

                /* app rank */
                kv = OBJ_NEW(opal_value_t);
                kv->key = strdup(OPAL_PMIX_APP_RANK);
                kv->type = OPAL_VPID;
                kv->data.name.vpid = pptr->app_rank;
                opal_list_append(pmap, &kv->super);

                /* app size */
                kv = OBJ_NEW(opal_value_t);
                kv->key = strdup(OPAL_PMIX_APP_SIZE);
                kv->type = OPAL_UINT32;
                kv->data.uint32 = app->num_procs;
                opal_list_append(info, &kv->super);
            }

            /* local rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_LOCAL_RANK);
            kv->type = OPAL_UINT16;
            kv->data.uint16 = pptr->local_rank;
            opal_list_append(pmap, &kv->super);

            /* node rank */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_NODE_RANK);
            kv->type = OPAL_UINT16;
            kv->data.uint32 = pptr->node_rank;
            opal_list_append(pmap, &kv->super);

            /* node ID */
            kv = OBJ_NEW(opal_value_t);
            kv->key = strdup(OPAL_PMIX_NODEID);
            kv->type = OPAL_UINT32;
            kv->data.uint32 = pptr->node->index;
            opal_list_append(pmap, &kv->super);

            if (map->num_nodes < orte_hostname_cutoff) {
                kv = OBJ_NEW(opal_value_t);
                kv->key = strdup(OPAL_PMIX_HOSTNAME);
                kv->type = OPAL_STRING;
                kv->data.string = strdup(pptr->node->name);
                opal_list_append(pmap, &kv->super);
            }
        }
    }

    /* mark the job as registered */
    orte_set_attribute(&jdata->attributes, ORTE_JOB_NSPACE_REGISTERED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);

    /* pass it down */
    /* we are in an event, so no need to callback */
    rc = opal_pmix.server_register_nspace(jdata->jobid,
                                          jdata->num_local_procs,
                                          info, NULL, NULL);
    OPAL_LIST_RELEASE(info);

    /* if the user has connected us to an external server, then we must
     * assume there is going to be some cross-mpirun exchange, and so
     * we protect against that situation by publishing the job info
     * for this job - this allows any subsequent "connect" to retrieve
     * the job info */
    if (NULL != orte_data_server_uri) {
        opal_buffer_t buf;

        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &jdata, 1, ORTE_JOB))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return rc;
        }
        info = OBJ_NEW(opal_list_t);
        /* create a key-value with the key being the string jobid
         * and the value being the byte object */
        kv = OBJ_NEW(opal_value_t);
        orte_util_convert_jobid_to_string(&kv->key, jdata->jobid);
        kv->type = OPAL_BYTE_OBJECT;
        opal_dss.unload(&buf, (void**)&kv->data.bo.bytes, &kv->data.bo.size);
        OBJ_DESTRUCT(&buf);
        opal_list_append(info, &kv->super);

        /* set the range to be session */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_RANGE);
        kv->type = OPAL_UINT;
        kv->data.uint = OPAL_PMIX_RANGE_SESSION;
        opal_list_append(info, &kv->super);

        /* set the persistence to be app */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_PERSISTENCE);
        kv->type = OPAL_INT;
        kv->data.integer = OPAL_PMIX_PERSIST_APP;
        opal_list_append(info, &kv->super);

        /* add our effective userid to the directives */
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_USERID);
        kv->type = OPAL_UINT32;
        kv->data.uint32 = geteuid();
        opal_list_append(info, &kv->super);

        /* now publish it */
        if (ORTE_SUCCESS != (rc = pmix_server_publish_fn(ORTE_PROC_MY_NAME,
                                                         info, mycbfunc, info))) {
            ORTE_ERROR_LOG(rc);
        }
    }

    return rc;
}
Ejemplo n.º 6
0
int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
                                                        orte_proc_t *proc,
                                                        opal_list_t *local_snapshots)
{
    int exit_status = ORTE_SUCCESS;
    opal_list_item_t *item = NULL;
    orte_std_cntr_t i_app;
    int argc = 0;
    orte_app_context_t *cur_app_context = NULL;
    orte_app_context_t *new_app_context = NULL;
    orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
    char *reference_fmt_str = NULL;
    char *location_str = NULL;
    char *cache_location_str = NULL;
    char *ref_location_fmt_str = NULL;
    char *tmp_str = NULL;
    char *global_snapshot_ref = NULL;
    char *global_snapshot_seq = NULL;
    char *sload;

    /*
     * Get the snapshot restart command for this process
     * JJH CLEANUP: Pass in the vpid_snapshot, so we don't have to look it up every time?
     */
    for(item  = opal_list_get_first(local_snapshots);
        item != opal_list_get_end(local_snapshots);
        item  = opal_list_get_next(item) ) {
        vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;
        if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
                                                       &vpid_snapshot->process_name,
                                                       &proc->name) ) {
            break;
        }
        else {
            vpid_snapshot = NULL;
        }
    }

    if( NULL == vpid_snapshot ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
                         &reference_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_LOC,
                         &location_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
                         &ref_location_fmt_str);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_REF,
                         &global_snapshot_ref);
    orte_sstore.get_attr(vpid_snapshot->ss_handle,
                         SSTORE_METADATA_GLOBAL_SNAP_SEQ,
                         &global_snapshot_seq);

    /*
     * Find current app_context
     */
    cur_app_context = NULL;
    for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
        cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps,
                                                                            i_app);
        if( NULL == cur_app_context ) {
            continue;
        }
        if(proc->app_idx == cur_app_context->idx) {
            break;
        }
    }

    if( NULL == cur_app_context ) {
        ORTE_ERROR_LOG(ORTE_ERROR);
        exit_status = ORTE_ERROR;
        goto cleanup;
    }

    /*
     * if > 1 processes in this app context
     *   Create a new app_context
     *   Copy over attributes
     *   Add it to the job_t data structure
     *   Associate it with this process in the job
     * else
     *   Reuse this app_context
     */
    if( cur_app_context->num_procs > 1 ) {

        /* Create a new app_context */
        opal_dss.copy((void**)&new_app_context, cur_app_context, ORTE_APP_CONTEXT);

        /* clear unused attributes */
        new_app_context->idx                    = cur_app_context->idx;
        free(new_app_context->app);
        new_app_context->app                    = NULL;
        new_app_context->num_procs              = 1;
        opal_argv_free(new_app_context->argv);
        new_app_context->argv                   = NULL;

        orte_remove_attribute(&new_app_context->attributes, ORTE_APP_PRELOAD_BIN);

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&sload,
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);
        orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
        free(sload);

        /* Add it to the job_t data structure */
        /*current_global_jobdata->num_apps++; */
        new_app_context->idx = (jobdata->num_apps);
        proc->app_idx = new_app_context->idx;

        opal_pointer_array_add(jobdata->apps, new_app_context);
        ++(jobdata->num_apps);

        /* Remove association with the old app_context */
        --(cur_app_context->num_procs);
    }
    else {
        new_app_context = cur_app_context;

        /* Cleanout old stuff */
        free(new_app_context->app);
        new_app_context->app = NULL;

        opal_argv_free(new_app_context->argv);
        new_app_context->argv = NULL;

        asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
        asprintf(&sload,
                 "%s:%s:%s:%s:%s:%s",
                 location_str,
                 global_snapshot_ref,
                 tmp_str,
                 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
                 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
                 global_snapshot_seq);
        orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
        free(sload);
    }

    /*
     * Update the app_context with the restart informaiton
     */
    new_app_context->app = strdup("opal-restart");
    opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);
    opal_argv_append(&argc, &(new_app_context->argv), "-l");
    opal_argv_append(&argc, &(new_app_context->argv), location_str);
    opal_argv_append(&argc, &(new_app_context->argv), "-m");
    opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);
    opal_argv_append(&argc, &(new_app_context->argv), "-r");
    if( NULL != tmp_str ) {
        free(tmp_str);
        tmp_str = NULL;
    }
    asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
    opal_argv_append(&argc, &(new_app_context->argv), tmp_str);

 cleanup:
    if( NULL != tmp_str) {
        free(tmp_str);
        tmp_str = NULL;
    }
    if( NULL != location_str ) {
        free(location_str);
        location_str = NULL;
    }
    if( NULL != cache_location_str ) {
        free(cache_location_str);
        cache_location_str = NULL;
    }
    if( NULL != reference_fmt_str ) {
        free(reference_fmt_str);
        reference_fmt_str = NULL;
    }
    if( NULL != ref_location_fmt_str ) {
        free(ref_location_fmt_str);
        ref_location_fmt_str = NULL;
    }

    return exit_status;
}