Exemplo n.º 1
0
Arquivo: mpiexec.c Projeto: adk9/hydra
static void signal_cb(int signum)
{
    struct HYD_cmd cmd;
    static int sigint_count = 0;
    int sent, closed;

    HYDU_FUNC_ENTER();

    /* SIGALRM is a special signal that indicates that a checkpoint
     * needs to be initiated */
    if (signum == SIGALRM) {
        if (HYD_server_info.user_global.ckpoint_prefix == NULL) {
            HYDU_dump(stderr, "No checkpoint prefix provided\n");
            return;
        }

#if HAVE_ALARM
        if (HYD_ui_mpich_info.ckpoint_int != -1)
            alarm(HYD_ui_mpich_info.ckpoint_int);
#endif /* HAVE_ALARM */

        cmd.type = HYD_CKPOINT;
        HYDU_sock_write(HYD_server_info.cmd_pipe[1], &cmd, sizeof(cmd), &sent, &closed,
                        HYDU_SOCK_COMM_MSGWAIT);

        goto fn_exit;
    }

    cmd.type = HYD_SIGNAL;
    cmd.signum = signum;

    /* SIGINT is a partially special signal. The first time we see it,
     * we will send it to the processes. The next time, we will treat
     * it as a SIGKILL (user convenience to force kill processes). */
    if (signum == SIGINT && ++sigint_count > 1)
        cmd.type = HYD_CLEANUP;
    else if (signum == SIGINT) {
        /* First Ctrl-C */
        HYDU_dump(stdout, "Sending Ctrl-C to processes as requested\n");
        HYDU_dump(stdout, "Press Ctrl-C again to force abort\n");
    }

    HYDU_sock_write(HYD_server_info.cmd_pipe[1], &cmd, sizeof(cmd), &sent, &closed,
                    HYDU_SOCK_COMM_MSGWAIT);

  fn_exit:
    HYDU_FUNC_EXIT();
    return;
}
Exemplo n.º 2
0
static HYD_status cmd_response(int fd, int pid, const char *cmd)
{
    struct HYD_pmcd_hdr hdr;
    int sent, closed;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    HYD_pmcd_init_header(&hdr);
    hdr.cmd = PMI_RESPONSE;
    hdr.pid = pid;
    hdr.pmi_version = 1;
    hdr.buflen = strlen(cmd);
    status = HYDU_sock_write(fd, &hdr, sizeof(hdr), &sent, &closed);
    HYDU_ERR_POP(status, "unable to send PMI_RESPONSE header to proxy\n");
    HYDU_ASSERT(!closed, status);

    if (HYD_server_info.user_global.debug) {
        HYDU_dump(stdout, "PMI response to fd %d pid %d: %s", fd, pid, cmd);
    }

    status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed);
    HYDU_ERR_POP(status, "unable to send response to command\n");
    HYDU_ASSERT(!closed, status);

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 3
0
static HYD_status send_cmd_downstream(int fd, const char *cmd)
{
    char cmdlen[7];
    int sent, closed;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    MPL_snprintf(cmdlen, 7, "%6u", (unsigned) strlen(cmd));
    status = HYDU_sock_write(fd, cmdlen, 6, &sent, &closed, HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "error writing PMI line\n");
    /* FIXME: We cannot abort when we are not able to send data
     * downstream. The upper layer needs to handle this based on
     * whether we want to abort or not.*/
    HYDU_ASSERT(!closed, status);

    if (HYD_pmcd_pmip.user_global.debug) {
        HYDU_dump(stdout, "PMI response: %s\n", cmd);
    }

    status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "error writing PMI line\n");
    HYDU_ASSERT(!closed, status);

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 4
0
static HYD_status handle_pmi_cmd(int fd, int pgid, int pid, char *buf, int pmi_version)
{
    char **args = NULL, *cmd = NULL;
    struct HYD_pmcd_pmi_handle *h;
    int i;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    if (pmi_version == 1)
        HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v1;
    else
        HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v2;

    if (HYD_server_info.user_global.debug)
        HYDU_dump(stdout, "[pgid: %d] got PMI command: %s\n", pgid, buf);

    HYDU_MALLOC(args, char **, MAX_PMI_ARGS * sizeof(char *), status);
    for (i = 0; i < MAX_PMI_ARGS; i++)
        args[i] = NULL;

    status = HYD_pmcd_pmi_parse_pmi_cmd(buf, pmi_version, &cmd, args);
    HYDU_ERR_POP(status, "unable to parse PMI command\n");

#if defined ENABLE_PROFILING
    if (HYD_server_info.enable_profiling)
        HYD_server_info.num_pmi_calls++;
#endif /* ENABLE_PROFILING */

    h = HYD_pmcd_pmi_handle;
    while (h->handler) {
        if (!strcmp(cmd, h->cmd)) {
            status = h->handler(fd, pid, pgid, args);
            HYDU_ERR_POP(status, "PMI handler returned error\n");
            break;
        }
        h++;
    }
    if (!h->handler) {
        /* We don't understand the command */
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                            "Unrecognized PMI command: %s | cleaning up processes\n", cmd);
    }

  fn_exit:
    if (cmd)
        HYDU_FREE(cmd);
    if (args) {
        HYDU_free_strlist(args);
        HYDU_free(args);
    }
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 5
0
HYD_status HYDTI_bscd_ssh_store_launch_time(char *hostname)
{
    int i, oldest, time_left;
    struct timeval now;
    struct HYDT_bscd_ssh_time *e;
    HYD_status status = HYD_SUCCESS;

    for (e = HYDT_bscd_ssh_time; e; e = e->next)
        if (!strcmp(hostname, e->hostname))
            break;

    if (e == NULL) {    /* Couldn't find an element for this host */
        status = create_element(hostname, &e);
        HYDU_ERR_POP(status, "unable to create ssh time element\n");
    }

    /* Search for an unset element to store the current time */
    for (i = 0; i < HYDT_bscd_ssh_limit; i++) {
        if (e->init_time[i].tv_sec == 0 && e->init_time[i].tv_usec == 0) {
            gettimeofday(&e->init_time[i], NULL);
            goto fn_exit;
        }
    }

    /* No free element found; wait for the oldest element to turn
     * older */
    oldest = 0;
    for (i = 0; i < HYDT_bscd_ssh_limit; i++)
        if (older(e->init_time[i], e->init_time[oldest]))
            oldest = i;

    gettimeofday(&now, NULL);
    time_left = HYDT_bscd_ssh_limit_time - now.tv_sec + e->init_time[oldest].tv_sec;

    /* A better approach will be to make progress here, but that would
     * mean that we need to deal with nested calls to the demux engine
     * and process launches. */
    if (time_left > 0) {
        if (HYDT_bscd_ssh_warnings)
            HYDU_dump(stdout, "WARNING: too many ssh connections to %s; waiting %d seconds\n",
                      hostname, time_left);
        sleep(time_left);
    }

    /* Store the current time in the oldest element */
    gettimeofday(&e->init_time[oldest], NULL);

  fn_exit:
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 6
0
static HYD_status send_cmd_upstream(const char *start, int fd, char *args[])
{
    int i, sent, closed;
    struct HYD_string_stash stash;
    char *buf = NULL;
    struct HYD_pmcd_hdr hdr;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    HYD_STRING_STASH_INIT(stash);
    HYD_STRING_STASH(stash, MPL_strdup(start), status);
    for (i = 0; args[i]; i++) {
        HYD_STRING_STASH(stash, MPL_strdup(args[i]), status);
        if (args[i + 1])
            HYD_STRING_STASH(stash, MPL_strdup(";"), status);
    }

    HYD_STRING_SPIT(stash, buf, status);

    HYD_pmcd_init_header(&hdr);
    hdr.cmd = PMI_CMD;
    hdr.pid = fd;
    hdr.buflen = strlen(buf);
    hdr.pmi_version = 2;
    status =
        HYDU_sock_write(HYD_pmcd_pmip.upstream.control, &hdr, sizeof(hdr), &sent, &closed,
                        HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "unable to send PMI header upstream\n");
    HYDU_ASSERT(!closed, status);

    if (HYD_pmcd_pmip.user_global.debug) {
        HYDU_dump(stdout, "forwarding command (%s) upstream\n", buf);
    }

    status = HYDU_sock_write(HYD_pmcd_pmip.upstream.control, buf, hdr.buflen, &sent, &closed,
                             HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "unable to send PMI command upstream\n");
    HYDU_ASSERT(!closed, status);

  fn_exit:
    if (buf)
        MPL_free(buf);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 7
0
static HYD_status cmd_response(int fd, int pid, char *cmd)
{
    char cmdlen[7];
    struct HYD_pmcd_hdr hdr;
    int sent, closed;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    HYD_pmcd_init_header(&hdr);
    hdr.cmd = PMI_RESPONSE;
    hdr.pid = pid;
    hdr.pmi_version = 2;
    hdr.buflen = 6 + strlen(cmd);
    status = HYDU_sock_write(fd, &hdr, sizeof(hdr), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "unable to send PMI_RESPONSE header to proxy\n");
    HYDU_ASSERT(!closed, status);

    HYDU_snprintf(cmdlen, 7, "%6u", (unsigned) strlen(cmd));
    status = HYDU_sock_write(fd, cmdlen, 6, &sent, &closed, HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "error writing PMI line\n");
    HYDU_ASSERT(!closed, status);

    if (HYD_server_info.user_global.debug) {
        HYDU_dump(stdout, "PMI response to fd %d pid %d: %s\n", fd, pid, cmd);
    }

    status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "error writing PMI line\n");
    HYDU_ASSERT(!closed, status);

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 8
0
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[])
{
    struct HYD_pg *pg;
    struct HYD_pmcd_pmi_pg_scratch *pg_scratch;
    struct HYD_proxy *proxy;
    struct HYD_pmcd_token *tokens;
    struct HYD_exec *exec_list = NULL, *exec;
    struct HYD_env *env;
    struct HYD_node *node;

    char key[PMI_MAXKEYLEN], *val;
    int nprocs, preput_num, info_num, ret;
    char *execname, *path = NULL;

    struct HYD_pmcd_token_segment *segment_list = NULL;

    int token_count, i, j, k, new_pgid, total_spawns;
    int argcnt, num_segments;
    char *control_port, *proxy_args[HYD_NUM_TMP_STRINGS] = { NULL };
    char *tmp[HYD_NUM_TMP_STRINGS];
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    for (i = 0; args[i]; i++)
        mcmd_args[mcmd_num_args++] = HYDU_strdup(args[i]);
    mcmd_args[mcmd_num_args] = NULL;

    status = HYD_pmcd_pmi_args_to_tokens(mcmd_args, &tokens, &token_count);
    HYDU_ERR_POP(status, "unable to convert args to tokens\n");


    /* Here's the order of things we do:
     *
     *   1. Break the token list into multiple segments, each segment
     *      corresponding to a command. Each command represents
     *      information for one executable.
     *
     *   2. Allocate a process group for the new set of spawned
     *      processes
     *
     *   3. Get all the common keys and deal with them
     *
     *   4. Create an executable list based on the segments.
     *
     *   5. Create a proxy list using the created executable list and
     *      spawn it.
     */

    /* Break the token list into multiple segments and create an
     * executable list based on the segments. */
    val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "totspawns");
    HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                        "unable to find token: totspawns\n");
    total_spawns = atoi(val);

    HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *,
                total_spawns * sizeof(struct HYD_pmcd_token_segment), status);

    segment_tokens(tokens, token_count, segment_list, &num_segments);

    if (num_segments != total_spawns) {
        /* We didn't read the entire PMI string; wait for the rest to
         * arrive */
        goto fn_exit;
    }
    else {
        /* Got the entire PMI string; free the arguments and reset */
        HYDU_free_strlist(mcmd_args);
        mcmd_num_args = 0;
    }

    /* Allocate a new process group */
    for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next);
    new_pgid = pg->pgid + 1;

    status = HYDU_alloc_pg(&pg->next, new_pgid);
    HYDU_ERR_POP(status, "unable to allocate process group\n");

    pg = pg->next;

    proxy = HYD_pmcd_pmi_find_proxy(fd);
    HYDU_ASSERT(proxy, status);

    pg->spawner_pg = proxy->pg;

    for (j = 0; j < total_spawns; j++) {
        /* For each segment, we create an exec structure */
        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "nprocs");
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: nprocs\n");
        nprocs = atoi(val);
        pg->pg_process_count += nprocs;

        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "argcnt");
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: argcnt\n");
        argcnt = atoi(val);

        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "info_num");
        if (val)
            info_num = atoi(val);
        else
            info_num = 0;

        if (exec_list == NULL) {
            status = HYDU_alloc_exec(&exec_list);
            HYDU_ERR_POP(status, "unable to allocate exec\n");
            exec_list->appnum = 0;
            exec = exec_list;
        }
        else {
            for (exec = exec_list; exec->next; exec = exec->next);
            status = HYDU_alloc_exec(&exec->next);
            HYDU_ERR_POP(status, "unable to allocate exec\n");
            exec->next->appnum = exec->appnum + 1;
            exec = exec->next;
        }

        /* Info keys */
        for (i = 0; i < info_num; i++) {
            char *info_key, *info_val;

            HYDU_snprintf(key, PMI_MAXKEYLEN, "info_key_%d", i);
            val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                                 segment_list[j].token_count, key);
            HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                                "unable to find token: %s\n", key);
            info_key = val;

            HYDU_snprintf(key, PMI_MAXKEYLEN, "info_val_%d", i);
            val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                                 segment_list[j].token_count, key);
            HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                                "unable to find token: %s\n", key);
            info_val = val;

            if (!strcmp(info_key, "path")) {
                path = HYDU_strdup(info_val);
            }
            else if (!strcmp(info_key, "wdir")) {
                exec->wdir = HYDU_strdup(info_val);
            }
            else if (!strcmp(info_key, "host")) {
                status = HYDU_process_mfile_token(info_val, 1, &pg->user_node_list);
                HYDU_ERR_POP(status, "error create node list\n");
            }
            else if (!strcmp(info_key, "hostfile")) {
                status =
                    HYDU_parse_hostfile(info_val, &pg->user_node_list, HYDU_process_mfile_token);
                HYDU_ERR_POP(status, "error parsing hostfile\n");
            }
            else {
                /* Unrecognized info key; ignore */
            }
        }

        status = HYDU_correct_wdir(&exec->wdir);
        HYDU_ERR_POP(status, "unable to correct wdir\n");

        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "execname");
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: execname\n");
        if (path == NULL)
            execname = HYDU_strdup(val);
        else {
            i = 0;
            tmp[i++] = HYDU_strdup(path);
            tmp[i++] = HYDU_strdup("/");
            tmp[i++] = HYDU_strdup(val);
            tmp[i++] = NULL;

            status = HYDU_str_alloc_and_join(tmp, &execname);
            HYDU_ERR_POP(status, "error while joining strings\n");
            HYDU_free_strlist(tmp);
        }

        i = 0;
        exec->exec[i++] = execname;
        for (k = 0; k < argcnt; k++) {
            HYDU_snprintf(key, PMI_MAXKEYLEN, "arg%d", k + 1);
            val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                                 segment_list[j].token_count, key);
            HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                                "unable to find token: %s\n", key);
            exec->exec[i++] = HYDU_strdup(val);
        }
        exec->exec[i++] = NULL;

        exec->proc_count = nprocs;

        /* It is not clear what kind of environment needs to get
         * passed to the spawned process. Don't set anything here, and
         * let the proxy do whatever it does by default. */
        exec->env_prop = NULL;

        status = HYDU_env_create(&env, "PMI_SPAWNED", "1");
        HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n");
        exec->user_env = env;
    }

    status = HYD_pmcd_pmi_alloc_pg_scratch(pg);
    HYDU_ERR_POP(status, "unable to allocate pg scratch space\n");

    if (pg->user_node_list) {
        pg->pg_core_count = 0;
        for (i = 0, node = pg->user_node_list; node; node = node->next, i++) {
            pg->pg_core_count += node->core_count;
            node->node_id = i;
        }
    }
    else {
        pg->pg_core_count = HYD_server_info.pg_list.pg_core_count;
    }

    pg->pg_process_count = 0;
    for (exec = exec_list; exec; exec = exec->next)
        pg->pg_process_count += exec->proc_count;

    pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch;

    /* Get the common keys and deal with them */
    val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preput_num");
    if (val)
        preput_num = atoi(val);
    else
        preput_num = 0;

    for (i = 0; i < preput_num; i++) {
        char *preput_key, *preput_val;

        HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_key_%d", i);
        val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key);
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: %s\n", key);
        preput_key = val;

        HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_val_%d", i);
        val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key);
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: %s\n", key);
        preput_val = val;

        status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret);
        HYDU_ERR_POP(status, "unable to add keypair to kvs\n");
    }

    /* Create the proxy list */
    if (pg->user_node_list) {
        status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg);
        HYDU_ERR_POP(status, "error creating proxy list\n");
    }
    else {
        status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg);
        HYDU_ERR_POP(status, "error creating proxy list\n");
    }
    HYDU_free_exec_list(exec_list);

    status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface,
                                                 HYD_server_info.local_hostname,
                                                 HYD_server_info.port_range, &control_port,
                                                 HYD_pmcd_pmiserv_control_listen_cb,
                                                 (void *) (size_t) new_pgid);
    HYDU_ERR_POP(status, "unable to create PMI port\n");
    if (HYD_server_info.user_global.debug)
        HYDU_dump(stdout, "Got a control port string of %s\n", control_port);

    /* Go to the last PG */
    for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next);

    status = HYD_pmcd_pmi_fill_in_proxy_args(proxy_args, control_port, new_pgid);
    HYDU_ERR_POP(status, "unable to fill in proxy arguments\n");
    HYDU_FREE(control_port);

    status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg);
    HYDU_ERR_POP(status, "unable to fill in executable arguments\n");

    status = HYDT_bsci_launch_procs(proxy_args, pg->proxy_list, NULL);
    HYDU_ERR_POP(status, "launcher cannot launch processes\n");

    {
        char *cmd_str[HYD_NUM_TMP_STRINGS], *cmd;

        i = 0;
        cmd_str[i++] = HYDU_strdup("cmd=spawn_result rc=0");
        cmd_str[i++] = HYDU_strdup("\n");
        cmd_str[i++] = NULL;

        status = HYDU_str_alloc_and_join(cmd_str, &cmd);
        HYDU_ERR_POP(status, "unable to join strings\n");
        HYDU_free_strlist(cmd_str);

        status = cmd_response(fd, pid, cmd);
        HYDU_ERR_POP(status, "error writing PMI line\n");
        HYDU_FREE(cmd);
    }

  fn_exit:
    HYD_pmcd_pmi_free_tokens(tokens, token_count);
    HYDU_free_strlist(proxy_args);
    if (segment_list)
        HYDU_FREE(segment_list);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 9
0
HYD_status HYDT_bscd_slurm_launch_procs(char **args, struct HYD_proxy *proxy_list,
                                        int *control_fd)
{
    int num_hosts, idx, i;
    int *pid, *fd_list;
    char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL;
    char *path = NULL, *extra_arg_list = NULL, *extra_arg;
    struct HYD_proxy *proxy;
    struct HYDT_topo_cpuset_t cpuset;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* We use the following priority order for the executable path:
     * (1) user-specified; (2) search in path; (3) Hard-coded
     * location */
    if (HYDT_bsci_info.launcher_exec)
        path = HYDU_strdup(HYDT_bsci_info.launcher_exec);
    if (!path)
        path = HYDU_find_full_path("srun");
    if (!path)
        path = HYDU_strdup("/usr/bin/srun");

    idx = 0;
    targs[idx++] = HYDU_strdup(path);

    if (strcmp(HYDT_bsci_info.rmk, "slurm")) {
        targs[idx++] = HYDU_strdup("--nodelist");

        status = proxy_list_to_node_str(proxy_list, &node_list_str);
        HYDU_ERR_POP(status, "unable to build a node list string\n");

        targs[idx++] = HYDU_strdup(node_list_str);
    }

    num_hosts = 0;
    for (proxy = proxy_list; proxy; proxy = proxy->next)
        num_hosts++;

    targs[idx++] = HYDU_strdup("-N");
    targs[idx++] = HYDU_int_to_str(num_hosts);

    targs[idx++] = HYDU_strdup("-n");
    targs[idx++] = HYDU_int_to_str(num_hosts);

    MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list);
    if (extra_arg_list) {
        extra_arg = strtok(extra_arg_list, " ");
        while (extra_arg) {
            targs[idx++] = HYDU_strdup(extra_arg);
            extra_arg = strtok(NULL, " ");
        }
    }

    /* Fill in the remaining arguments */
    /* We do not need to create a quoted version of the string for
     * SLURM. It seems to be internally quoting it anyway. */
    for (i = 0; args[i]; i++)
        targs[idx++] = HYDU_strdup(args[i]);

    /* Increase pid list to accommodate the new pid */
    HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status);
    for (i = 0; i < HYD_bscu_pid_count; i++)
        pid[i] = HYD_bscu_pid_list[i];
    HYDU_FREE(HYD_bscu_pid_list);
    HYD_bscu_pid_list = pid;

    /* Increase fd list to accommodate these new fds */
    HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status);
    for (i = 0; i < HYD_bscu_fd_count; i++)
        fd_list[i] = HYD_bscu_fd_list[i];
    HYDU_FREE(HYD_bscu_fd_list);
    HYD_bscu_fd_list = fd_list;

    /* append proxy ID as -1 */
    targs[idx++] = HYDU_int_to_str(-1);
    targs[idx++] = NULL;

    if (HYDT_bsci_info.debug) {
        HYDU_dump(stdout, "Launch arguments: ");
        HYDU_print_strlist(targs);
    }

    HYDT_topo_cpuset_zero(&cpuset);
    status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr,
                                 &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset);
    HYDU_ERR_POP(status, "create process returned error\n");

    HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout;
    HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr;

    status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN,
                                  (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb);
    HYDU_ERR_POP(status, "demux returned error registering fd\n");

    status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN,
                                  (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb);
    HYDU_ERR_POP(status, "demux returned error registering fd\n");

  fn_exit:
    if (node_list_str)
        HYDU_FREE(node_list_str);
    HYDU_free_strlist(targs);
    if (path)
        HYDU_FREE(path);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 10
0
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[])
{
    struct HYD_pg *pg;
    struct HYD_pmcd_pmi_pg_scratch *pg_scratch;
    struct HYD_proxy *proxy;
    struct HYD_pmcd_token *tokens;
    struct HYD_exec *exec_list = NULL, *exec;
    struct HYD_env *env;
    struct HYD_node *node;

    char *thrid;
    char key[PMI_MAXKEYLEN], *val;
    int maxprocs, preputcount, infokeycount, ret;
    int ncmds;
    char *execname, *path = NULL;

    struct HYD_pmcd_token_segment *segment_list = NULL;

    int token_count, i, j, k, new_pgid;
    int argcnt, num_segments;
    struct HYD_string_stash proxy_stash;
    char *control_port;
    struct HYD_string_stash stash;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count);
    HYDU_ERR_POP(status, "unable to convert args to tokens\n");

    thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid");


    /* Here's the order of things we do:
     *
     *   1. Break the token list into multiple segments, each segment
     *      corresponding to a command. Each command represents
     *      information for one executable.
     *
     *   2. Allocate a process group for the new set of spawned
     *      processes
     *
     *   3. Get all the common keys and deal with them
     *
     *   4. Create an executable list based on the segments.
     *
     *   5. Create a proxy list using the created executable list and
     *      spawn it.
     */

    val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "ncmds");
    HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: ncmds\n");
    ncmds = atoi(val);

    HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *,
                (ncmds + 1) * sizeof(struct HYD_pmcd_token_segment), status);
    segment_tokens(tokens, token_count, segment_list, &num_segments);
    HYDU_ASSERT((ncmds + 1) == num_segments, status);


    /* Allocate a new process group */
    for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next);
    new_pgid = pg->pgid + 1;

    status = HYDU_alloc_pg(&pg->next, new_pgid);
    HYDU_ERR_POP(status, "unable to allocate process group\n");

    pg = pg->next;

    proxy = HYD_pmcd_pmi_find_proxy(fd);
    HYDU_ASSERT(proxy, status);

    pg->spawner_pg = proxy->pg;

    for (j = 1; j <= ncmds; j++) {
        /* For each segment, we create an exec structure */
        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "maxprocs");
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: maxprocs\n");
        maxprocs = atoi(val);
        pg->pg_process_count += maxprocs;

        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "argc");
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: argc\n");
        argcnt = atoi(val);

        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "infokeycount");
        if (val)
            infokeycount = atoi(val);
        else
            infokeycount = 0;

        if (exec_list == NULL) {
            status = HYDU_alloc_exec(&exec_list);
            HYDU_ERR_POP(status, "unable to allocate exec\n");
            exec_list->appnum = 0;
            exec = exec_list;
        }
        else {
            for (exec = exec_list; exec->next; exec = exec->next);
            status = HYDU_alloc_exec(&exec->next);
            HYDU_ERR_POP(status, "unable to allocate exec\n");
            exec->next->appnum = exec->appnum + 1;
            exec = exec->next;
        }

        /* Info keys */
        for (i = 0; i < infokeycount; i++) {
            char *info_key, *info_val;

            HYDU_snprintf(key, PMI_MAXKEYLEN, "infokey%d", i);
            val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                                 segment_list[j].token_count, key);
            HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                                "unable to find token: %s\n", key);
            info_key = val;

            HYDU_snprintf(key, PMI_MAXKEYLEN, "infoval%d", i);
            val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                                 segment_list[j].token_count, key);
            HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                                "unable to find token: %s\n", key);
            info_val = val;

            if (!strcmp(info_key, "path")) {
                path = HYDU_strdup(info_val);
            }
            else if (!strcmp(info_key, "wdir")) {
                exec->wdir = HYDU_strdup(info_val);
            }
            else if (!strcmp(info_key, "host") || !strcmp(info_key, "hosts")) {
                char *host = strtok(info_val, ",");
                while (host) {
                    status = HYDU_process_mfile_token(host, 1, &pg->user_node_list);
                    HYDU_ERR_POP(status, "error creating node list\n");
                    host = strtok(NULL, ",");
                }
            }
            else if (!strcmp(info_key, "hostfile")) {
                status = HYDU_parse_hostfile(info_val, &pg->user_node_list,
                                             HYDU_process_mfile_token);
                HYDU_ERR_POP(status, "error parsing hostfile\n");
            }
            else {
                /* Unrecognized info key; ignore */
            }
        }

        status = HYDU_correct_wdir(&exec->wdir);
        HYDU_ERR_POP(status, "unable to correct wdir\n");

        val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                             segment_list[j].token_count, "subcmd");
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: subcmd\n");
        if (path == NULL)
            execname = HYDU_strdup(val);
        else {
            HYD_STRING_STASH_INIT(stash);
            HYD_STRING_STASH(stash, HYDU_strdup(path), status);
            HYD_STRING_STASH(stash, HYDU_strdup("/"), status);
            HYD_STRING_STASH(stash, HYDU_strdup(val), status);

            HYD_STRING_SPIT(stash, execname, status);
        }

        i = 0;
        exec->exec[i++] = execname;
        for (k = 0; k < argcnt; k++) {
            HYDU_snprintf(key, PMI_MAXKEYLEN, "argv%d", k);
            val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx],
                                                 segment_list[j].token_count, key);
            HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                                "unable to find token: %s\n", key);
            exec->exec[i++] = HYDU_strdup(val);
        }
        exec->exec[i++] = NULL;

        exec->proc_count = maxprocs;

        /* It is not clear what kind of environment needs to get
         * passed to the spawned process. Don't set anything here, and
         * let the proxy do whatever it does by default. */
        exec->env_prop = NULL;

        status = HYDU_env_create(&env, "PMI_SPAWNED", "1");
        HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n");
        exec->user_env = env;
    }

    status = HYD_pmcd_pmi_alloc_pg_scratch(pg);
    HYDU_ERR_POP(status, "unable to allocate pg scratch space\n");

    pg->pg_process_count = 0;
    for (exec = exec_list; exec; exec = exec->next)
        pg->pg_process_count += exec->proc_count;

    pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch;

    /* Get the common keys and deal with them */
    val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preputcount");
    HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                        "unable to find token: preputcount\n");
    preputcount = atoi(val);

    for (i = 0; i < preputcount; i++) {
        char *preput_key, *preput_val;

        HYDU_snprintf(key, PMI_MAXKEYLEN, "ppkey%d", i);
        val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key);
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: %s\n", key);
        preput_key = val;

        HYDU_snprintf(key, PMI_MAXKEYLEN, "ppval%d", i);
        val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key);
        HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR,
                            "unable to find token: %s\n", key);
        preput_val = val;

        status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret);
        HYDU_ERR_POP(status, "unable to add keypair to kvs\n");
    }

    /* Create the proxy list */
    if (pg->user_node_list) {
        status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg);
        HYDU_ERR_POP(status, "error creating proxy list\n");
    }
    else {
        status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg);
        HYDU_ERR_POP(status, "error creating proxy list\n");
    }
    HYDU_free_exec_list(exec_list);

    if (pg->user_node_list) {
        pg->pg_core_count = 0;
        for (i = 0, node = pg->user_node_list; node; node = node->next, i++)
            pg->pg_core_count += node->core_count;
    }
    else {
        pg->pg_core_count = 0;
        for (proxy = pg->proxy_list; proxy; proxy = proxy->next)
            pg->pg_core_count += proxy->node->core_count;
    }

    status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface,
                                                 HYD_server_info.localhost,
                                                 HYD_server_info.port_range, &control_port,
                                                 HYD_pmcd_pmiserv_control_listen_cb,
                                                 (void *) (size_t) new_pgid);
    HYDU_ERR_POP(status, "unable to create PMI port\n");
    if (HYD_server_info.user_global.debug)
        HYDU_dump(stdout, "Got a control port string of %s\n", control_port);

    /* Go to the last PG */
    for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next);

    status = HYD_pmcd_pmi_fill_in_proxy_args(&proxy_stash, control_port, new_pgid);
    HYDU_ERR_POP(status, "unable to fill in proxy arguments\n");
    HYDU_FREE(control_port);

    status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg);
    HYDU_ERR_POP(status, "unable to fill in executable arguments\n");

    status = HYDT_bsci_launch_procs(proxy_stash.strlist, pg->proxy_list, NULL);
    HYDU_ERR_POP(status, "launcher cannot launch processes\n");

    {
        char *cmd;

        HYD_STRING_STASH_INIT(stash);
        HYD_STRING_STASH(stash, HYDU_strdup("cmd=spawn-response;"), status);
        if (thrid) {
            HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status);
            HYD_STRING_STASH(stash, HYDU_strdup(thrid), status);
            HYD_STRING_STASH(stash, HYDU_strdup(";"), status);
        }
        HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status);
        HYD_STRING_STASH(stash, HYDU_strdup("jobid="), status);
        HYD_STRING_STASH(stash, HYDU_strdup(pg_scratch->kvs->kvsname), status);
        HYD_STRING_STASH(stash, HYDU_strdup(";"), status);
        HYD_STRING_STASH(stash, HYDU_strdup("nerrs=0;"), status);

        HYD_STRING_SPIT(stash, cmd, status);

        status = cmd_response(fd, pid, cmd);
        HYDU_ERR_POP(status, "send command failed\n");
        HYDU_FREE(cmd);
    }

  fn_exit:
    HYD_pmcd_pmi_free_tokens(tokens, token_count);
    HYD_STRING_STASH_FREE(proxy_stash);
    if (segment_list)
        HYDU_FREE(segment_list);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
Exemplo n.º 11
0
HYD_status HYDT_bscd_pbs_wait_for_completion(int timeout)
{
    int time_elapsed;
    int events_count, spawned_count;
    int idx, ierr;
    struct timeval start_tval, curr_tval;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Allocate memory for taskobits[] */
    HYDU_MALLOC(HYDT_bscd_pbs_sys->taskobits, int *,
                HYDT_bscd_pbs_sys->size * sizeof(int), status);
    spawned_count = HYDT_bscd_pbs_sys->spawned_count;

    /*
     * FIXME: We rely on gettimeofday here. This needs to detect the
     * timer type available and use that. Probably more of an MPL
     * functionality than Hydra's.
     */
    gettimeofday(&start_tval, NULL);

    /* Register with TM to be notified the obituary of the spawning process. */
    for (idx = 0; idx < spawned_count; idx++) {
        /*
         * Get a TM event which will be returned by tm_poll() when
         * the process labelled by taskID dies
         */
        ierr = tm_obit(HYDT_bscd_pbs_sys->taskIDs[idx],
                       HYDT_bscd_pbs_sys->taskobits + idx, HYDT_bscd_pbs_sys->events + idx);
        if (ierr != TM_SUCCESS)
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "tm_obit() fails with TM err=%d.\n", ierr);
        if (HYDT_bscd_pbs_sys->events[idx] == TM_ERROR_EVENT)
            HYDU_error_printf("tm_obit(Task %d) returns error.\n",
                              HYDT_bscd_pbs_sys->taskIDs[idx]);
        if (HYDT_bscd_pbs_sys->events[idx] == TM_NULL_EVENT)
            HYDU_error_printf("Task %d already exits with status %d\n",
                              HYDT_bscd_pbs_sys->taskIDs[idx],
                              HYDT_bscd_pbs_sys->taskobits[idx]);
    }

    /* Poll if the spawned process has exited */
    events_count = 0;
    /* Substract all the processes that have already exited */
    for (idx = 0; idx < spawned_count; idx++) {
        if (HYDT_bscd_pbs_sys->events[idx] == TM_NULL_EVENT)
            events_count++;
    }
    /* Polling for the remaining alive processes till they all exit */
    while (events_count < spawned_count) {
        tm_event_t event = -1;
        int poll_err;
        ierr = tm_poll(TM_NULL_EVENT, &event, 0, &poll_err);
        if (ierr != TM_SUCCESS)
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "tm_poll(obit_event) fails with err=%d.\n", ierr);
        if (event != TM_NULL_EVENT) {
            for (idx = 0; idx < spawned_count; idx++) {
                if (HYDT_bscd_pbs_sys->events[idx] == event) {
                    if (HYDT_bsci_info.debug) {
                        HYDU_dump(stdout,
                                  "PBS_DEBUG: Event %d received, task %d exits with status %d.\n",
                                  event, HYDT_bscd_pbs_sys->taskIDs[idx],
                                  HYDT_bscd_pbs_sys->taskobits[idx]);
                        /*
                         * HYDU_error_printf("DEBUG: Event %d received, task %d exits with status %d.\n", event, HYDT_bscd_pbs_sys->taskIDs[idx], HYDT_bscd_pbs_sys->taskobits[idx]);
                         */
                    }
                    events_count++;
                    break;      /* break from for(idx<spawned_count) loop */
                }
            }
        }

        /* Check if time is up */
        if (timeout > 0) {
            gettimeofday(&curr_tval, NULL);
            time_elapsed = curr_tval.tv_sec - start_tval.tv_sec;
            if (time_elapsed > timeout) {
                status = HYD_TIMED_OUT;
                goto fn_exit;
            }
        }
    }

    if (HYDT_bsci_info.debug) {
        HYDU_dump(stdout, "\nPBS_DEBUG: Done with polling obit events!\n");
    }

    /* Loop till all sockets have closed */
fn_exit:
    HYDU_FUNC_EXIT();
    return status;

fn_fail:
    goto fn_exit;
}
Exemplo n.º 12
0
Arquivo: mpiexec.c Projeto: adk9/hydra
int main(int argc, char **argv)
{
    struct HYD_proxy *proxy;
    struct HYD_exec *exec;
    struct HYD_node *node;
    int exit_status = 0, i, timeout, reset_rmk, global_core_count;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Initialize engines that don't require use to know anything
     * about the user preferences first */
    status = HYDU_dbg_init("mpiexec");
    HYDU_ERR_POP(status, "unable to initialization debugging\n");

    status = HYDU_set_common_signals(signal_cb);
    HYDU_ERR_POP(status, "unable to set signal\n");

    status = HYDT_ftb_init();
    HYDU_ERR_POP(status, "unable to initialize FTB\n");

    /* Get user preferences */
    status = HYD_uii_mpx_get_parameters(argv);
    HYDU_ERR_POP(status, "error parsing parameters\n");

    /* Now we initialize engines that require us to know user
     * preferences */
#if HAVE_ALARM
    if (HYD_ui_mpich_info.ckpoint_int != -1)
        alarm(HYD_ui_mpich_info.ckpoint_int);
#endif /* HAVE_ALARM */

    /* The demux engine should be initialized before any sockets are
     * created, since it checks for STDIN's validity.  If STDIN was
     * closed and we opened a socket that got the same fd as STDIN,
     * this test will not be possible. */
    status = HYDT_dmx_init(&HYD_server_info.user_global.demux);
    HYDU_ERR_POP(status, "unable to initialize the demux engine\n");

    status =
        HYDT_bsci_init(HYD_server_info.user_global.rmk, HYD_server_info.user_global.launcher,
                       HYD_server_info.user_global.launcher_exec,
                       HYD_server_info.user_global.enablex, HYD_server_info.user_global.debug);
    HYDU_ERR_POP(status, "unable to initialize the bootstrap server\n");

    reset_rmk = 0;

    if (HYD_server_info.node_list == NULL) {
        /* Node list is not created yet. The user might not have
         * provided the host file. Query the RMK. */
        status = HYDT_bsci_query_node_list(&HYD_server_info.node_list);
        HYDU_ERR_POP(status, "unable to query the RMK for a node list\n");

        if (HYD_server_info.node_list == NULL) {
            char localhost[MAX_HOSTNAME_LEN] = { 0 };

            /* The RMK didn't give us anything back; use localhost */
            status = HYDU_gethostname(localhost);
            HYDU_ERR_POP(status, "unable to get local hostname\n");

            status = HYDU_add_to_node_list(localhost, 1, &HYD_server_info.node_list);
            HYDU_ERR_POP(status, "unable to add to node list\n");

            reset_rmk = 1;
        }
    }

    /*
     * If this is a checkpoint-restart, if the user specified the
     * number of processes, we already have a dummy executable. If the
     * number of processes came from the RMK, our executable list is
     * still NULL; a dummy executable needs to be created.
     */
    if (HYD_uii_mpx_exec_list == NULL) {
        HYDU_ASSERT(HYD_server_info.user_global.ckpoint_prefix, status);

        /* create a dummy executable */
        status = HYDU_alloc_exec(&HYD_uii_mpx_exec_list);
        HYDU_ERR_POP(status, "unable to allocate exec\n");
        HYD_uii_mpx_exec_list->appnum = 0;
    }

    if (HYD_server_info.user_global.debug)
        for (node = HYD_server_info.node_list; node; node = node->next)
            HYDU_dump_noprefix(stdout, "host: %s\n", node->hostname);

    /* Reset the host list to use only the number of processes per
     * node as specified by the ppn option. */
    if (HYD_ui_mpich_info.ppn != -1) {
        for (node = HYD_server_info.node_list; node; node = node->next)
            node->core_count = HYD_ui_mpich_info.ppn;
        reset_rmk = 1;
    }

    /* The RMK returned a node list. See if the user requested us to
     * manipulate it in some way */
    if (HYD_ui_mpich_info.sort_order != NONE) {
        qsort_node_list();
        reset_rmk = 1;
    }

    if (reset_rmk) {
        /* Reassign node IDs to each node */
        for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++)
            node->node_id = i;

        /* Reinitialize the bootstrap server with the "user" RMK, so
         * it knows that we are not using the node list provided by
         * the RMK */
        status = HYDT_bsci_finalize();
        HYDU_ERR_POP(status, "unable to finalize bootstrap device\n");

        status = HYDT_bsci_init("user", HYDT_bsci_info.launcher, HYDT_bsci_info.launcher_exec,
                                HYDT_bsci_info.enablex, HYDT_bsci_info.debug);
        HYDU_ERR_POP(status, "unable to reinitialize the bootstrap server\n");
    }

    /* If the number of processes is not given, we allocate all the
     * available nodes to each executable */
    HYD_server_info.pg_list.pg_process_count = 0;
    for (exec = HYD_uii_mpx_exec_list; exec; exec = exec->next) {
        if (exec->proc_count == -1) {
            global_core_count = 0;
            for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++)
                global_core_count += node->core_count;
            exec->proc_count = global_core_count;
        }
        HYD_server_info.pg_list.pg_process_count += exec->proc_count;
    }

    status = HYDU_list_inherited_env(&HYD_server_info.user_global.global_env.inherited);
    HYDU_ERR_POP(status, "unable to get the inherited env list\n");

    status = HYDU_create_proxy_list(HYD_uii_mpx_exec_list, HYD_server_info.node_list,
                                    &HYD_server_info.pg_list);
    HYDU_ERR_POP(status, "unable to create proxy list\n");

    /* calculate the core count used by the PG */
    HYD_server_info.pg_list.pg_core_count = 0;
    for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next)
        HYD_server_info.pg_list.pg_core_count += proxy->node->core_count;

    /* If the user didn't specify a local hostname, try to find one in
     * the list of nodes passed to us */
    if (HYD_server_info.localhost == NULL) {
        /* See if the node list contains a remotely accessible localhost */
        for (node = HYD_server_info.node_list; node; node = node->next) {
            int is_local, remote_access;

            status = HYDU_sock_is_local(node->hostname, &is_local);
            HYDU_ERR_POP(status, "unable to check if %s is local\n", node->hostname);

            if (is_local) {
                status = HYDU_sock_remote_access(node->hostname, &remote_access);
                HYDU_ERR_POP(status, "unable to check if %s is remotely accessible\n",
                             node->hostname);

                if (remote_access)
                    break;
            }
        }

        if (node)
            HYD_server_info.localhost = HYDU_strdup(node->hostname);
        else {
            HYDU_MALLOC(HYD_server_info.localhost, char *, MAX_HOSTNAME_LEN, status);
            status = HYDU_gethostname(HYD_server_info.localhost);
            HYDU_ERR_POP(status, "unable to get local hostname\n");
        }
    }

    if (HYD_server_info.user_global.debug)
        HYD_uiu_print_params();

    if (MPL_env2int("MPIEXEC_TIMEOUT", &timeout) == 0)
        timeout = -1;   /* Infinite timeout */

    if (HYD_server_info.user_global.debug)
        HYDU_dump(stdout, "Timeout set to %d (-1 means infinite)\n", timeout);

    /* Check if the user wants us to use a port within a certain
     * range. */
    if (MPL_env2str("MPIEXEC_PORTRANGE", (const char **) &HYD_server_info.port_range) ||
        MPL_env2str("MPIEXEC_PORT_RANGE", (const char **) &HYD_server_info.port_range) ||
        MPL_env2str("MPICH_PORT_RANGE", (const char **) &HYD_server_info.port_range))
        HYD_server_info.port_range = HYDU_strdup(HYD_server_info.port_range);

    /* Add the stdout/stderr callback handlers */
    HYD_server_info.stdout_cb = HYD_uiu_stdout_cb;
    HYD_server_info.stderr_cb = HYD_uiu_stderr_cb;

    /* Create a pipe connection to wake up the process manager */
    if (pipe(HYD_server_info.cmd_pipe) < 0)
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "pipe error\n");

    /* Launch the processes */
    status = HYD_pmci_launch_procs();
    HYDU_ERR_POP(status, "process manager returned error launching processes\n");

    /* Wait for their completion */
    status = HYD_pmci_wait_for_completion(timeout);
    HYDU_ERR_POP(status, "process manager error waiting for completion\n");

    /* Check for the exit status for all the processes */
    if (HYD_ui_mpich_info.print_all_exitcodes)
        HYDU_dump(stdout, "Exit codes: ");
    exit_status = 0;
    for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next) {
        if (proxy->exit_status == NULL) {
            /* We didn't receive the exit status for this proxy */
            continue;
        }

        if (HYD_ui_mpich_info.print_all_exitcodes)
            HYDU_dump_noprefix(stdout, "[%s] ", proxy->node->hostname);

        for (i = 0; i < proxy->proxy_process_count; i++) {
            if (HYD_ui_mpich_info.print_all_exitcodes) {
                HYDU_dump_noprefix(stdout, "%d", proxy->exit_status[i]);
                if (i < proxy->proxy_process_count - 1)
                    HYDU_dump_noprefix(stdout, ",");
            }

            exit_status |= proxy->exit_status[i];
        }

        if (HYD_ui_mpich_info.print_all_exitcodes)
            HYDU_dump_noprefix(stdout, "\n");
    }

    /* Call finalize functions for lower layers to cleanup their resources */
    status = HYD_pmci_finalize();
    HYDU_ERR_POP(status, "process manager error on finalize\n");

    status = HYDT_ftb_finalize();
    HYDU_ERR_POP(status, "error finalizing FTB\n");

#if defined ENABLE_PROFILING
    if (HYD_server_info.enable_profiling) {
        HYDU_dump_noprefix(stdout, "\n");
        HYD_DRAW_LINE(80);
        HYDU_dump(stdout, "Number of PMI calls seen by the server: %d\n",
                  HYD_server_info.num_pmi_calls);
        HYD_DRAW_LINE(80);
        HYDU_dump_noprefix(stdout, "\n");
    }
#endif /* ENABLE_PROFILING */

    /* Free the mpiexec params */
    HYD_uiu_free_params();
    HYDU_free_exec_list(HYD_uii_mpx_exec_list);

  fn_exit:
    HYDU_dbg_finalize();
    HYDU_FUNC_EXIT();
    if (status == HYD_GRACEFUL_ABORT)
        return 0;
    else if (status != HYD_SUCCESS)
        return -1;
    else if (WIFSIGNALED(exit_status)) {
        printf("YOUR APPLICATION TERMINATED WITH THE EXIT STRING: %s (signal %d)\n",
               strsignal(WTERMSIG(exit_status)), WTERMSIG(exit_status));
        printf("This typically refers to a problem with your application.\n");
        printf("Please see the FAQ page for debugging suggestions\n");
        return exit_status;
    }
    else if (WIFEXITED(exit_status)) {
        return WEXITSTATUS(exit_status);
    }
    else if (WIFSTOPPED(exit_status)) {
        return WSTOPSIG(exit_status);
    }
    else {
        return exit_status;
    }

  fn_fail:
    goto fn_exit;
}