static void signal_cb(int signum) { struct HYD_cmd cmd; static int sigint_count = 0; int sent, closed; HYDU_FUNC_ENTER(); /* SIGALRM is a special signal that indicates that a checkpoint * needs to be initiated */ if (signum == SIGALRM) { if (HYD_server_info.user_global.ckpoint_prefix == NULL) { HYDU_dump(stderr, "No checkpoint prefix provided\n"); return; } #if HAVE_ALARM if (HYD_ui_mpich_info.ckpoint_int != -1) alarm(HYD_ui_mpich_info.ckpoint_int); #endif /* HAVE_ALARM */ cmd.type = HYD_CKPOINT; HYDU_sock_write(HYD_server_info.cmd_pipe[1], &cmd, sizeof(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); goto fn_exit; } cmd.type = HYD_SIGNAL; cmd.signum = signum; /* SIGINT is a partially special signal. The first time we see it, * we will send it to the processes. The next time, we will treat * it as a SIGKILL (user convenience to force kill processes). */ if (signum == SIGINT && ++sigint_count > 1) cmd.type = HYD_CLEANUP; else if (signum == SIGINT) { /* First Ctrl-C */ HYDU_dump(stdout, "Sending Ctrl-C to processes as requested\n"); HYDU_dump(stdout, "Press Ctrl-C again to force abort\n"); } HYDU_sock_write(HYD_server_info.cmd_pipe[1], &cmd, sizeof(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); fn_exit: HYDU_FUNC_EXIT(); return; }
static HYD_status cmd_response(int fd, int pid, const char *cmd) { struct HYD_pmcd_hdr hdr; int sent, closed; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYD_pmcd_init_header(&hdr); hdr.cmd = PMI_RESPONSE; hdr.pid = pid; hdr.pmi_version = 1; hdr.buflen = strlen(cmd); status = HYDU_sock_write(fd, &hdr, sizeof(hdr), &sent, &closed); HYDU_ERR_POP(status, "unable to send PMI_RESPONSE header to proxy\n"); HYDU_ASSERT(!closed, status); if (HYD_server_info.user_global.debug) { HYDU_dump(stdout, "PMI response to fd %d pid %d: %s", fd, pid, cmd); } status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed); HYDU_ERR_POP(status, "unable to send response to command\n"); HYDU_ASSERT(!closed, status); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status send_cmd_downstream(int fd, const char *cmd) { char cmdlen[7]; int sent, closed; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); MPL_snprintf(cmdlen, 7, "%6u", (unsigned) strlen(cmd)); status = HYDU_sock_write(fd, cmdlen, 6, &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "error writing PMI line\n"); /* FIXME: We cannot abort when we are not able to send data * downstream. The upper layer needs to handle this based on * whether we want to abort or not.*/ HYDU_ASSERT(!closed, status); if (HYD_pmcd_pmip.user_global.debug) { HYDU_dump(stdout, "PMI response: %s\n", cmd); } status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_ASSERT(!closed, status); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status handle_pmi_cmd(int fd, int pgid, int pid, char *buf, int pmi_version) { char **args = NULL, *cmd = NULL; struct HYD_pmcd_pmi_handle *h; int i; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (pmi_version == 1) HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v1; else HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v2; if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "[pgid: %d] got PMI command: %s\n", pgid, buf); HYDU_MALLOC(args, char **, MAX_PMI_ARGS * sizeof(char *), status); for (i = 0; i < MAX_PMI_ARGS; i++) args[i] = NULL; status = HYD_pmcd_pmi_parse_pmi_cmd(buf, pmi_version, &cmd, args); HYDU_ERR_POP(status, "unable to parse PMI command\n"); #if defined ENABLE_PROFILING if (HYD_server_info.enable_profiling) HYD_server_info.num_pmi_calls++; #endif /* ENABLE_PROFILING */ h = HYD_pmcd_pmi_handle; while (h->handler) { if (!strcmp(cmd, h->cmd)) { status = h->handler(fd, pid, pgid, args); HYDU_ERR_POP(status, "PMI handler returned error\n"); break; } h++; } if (!h->handler) { /* We don't understand the command */ HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "Unrecognized PMI command: %s | cleaning up processes\n", cmd); } fn_exit: if (cmd) HYDU_FREE(cmd); if (args) { HYDU_free_strlist(args); HYDU_free(args); } HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDTI_bscd_ssh_store_launch_time(char *hostname) { int i, oldest, time_left; struct timeval now; struct HYDT_bscd_ssh_time *e; HYD_status status = HYD_SUCCESS; for (e = HYDT_bscd_ssh_time; e; e = e->next) if (!strcmp(hostname, e->hostname)) break; if (e == NULL) { /* Couldn't find an element for this host */ status = create_element(hostname, &e); HYDU_ERR_POP(status, "unable to create ssh time element\n"); } /* Search for an unset element to store the current time */ for (i = 0; i < HYDT_bscd_ssh_limit; i++) { if (e->init_time[i].tv_sec == 0 && e->init_time[i].tv_usec == 0) { gettimeofday(&e->init_time[i], NULL); goto fn_exit; } } /* No free element found; wait for the oldest element to turn * older */ oldest = 0; for (i = 0; i < HYDT_bscd_ssh_limit; i++) if (older(e->init_time[i], e->init_time[oldest])) oldest = i; gettimeofday(&now, NULL); time_left = HYDT_bscd_ssh_limit_time - now.tv_sec + e->init_time[oldest].tv_sec; /* A better approach will be to make progress here, but that would * mean that we need to deal with nested calls to the demux engine * and process launches. */ if (time_left > 0) { if (HYDT_bscd_ssh_warnings) HYDU_dump(stdout, "WARNING: too many ssh connections to %s; waiting %d seconds\n", hostname, time_left); sleep(time_left); } /* Store the current time in the oldest element */ gettimeofday(&e->init_time[oldest], NULL); fn_exit: return status; fn_fail: goto fn_exit; }
static HYD_status send_cmd_upstream(const char *start, int fd, char *args[]) { int i, sent, closed; struct HYD_string_stash stash; char *buf = NULL; struct HYD_pmcd_hdr hdr; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, MPL_strdup(start), status); for (i = 0; args[i]; i++) { HYD_STRING_STASH(stash, MPL_strdup(args[i]), status); if (args[i + 1]) HYD_STRING_STASH(stash, MPL_strdup(";"), status); } HYD_STRING_SPIT(stash, buf, status); HYD_pmcd_init_header(&hdr); hdr.cmd = PMI_CMD; hdr.pid = fd; hdr.buflen = strlen(buf); hdr.pmi_version = 2; status = HYDU_sock_write(HYD_pmcd_pmip.upstream.control, &hdr, sizeof(hdr), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "unable to send PMI header upstream\n"); HYDU_ASSERT(!closed, status); if (HYD_pmcd_pmip.user_global.debug) { HYDU_dump(stdout, "forwarding command (%s) upstream\n", buf); } status = HYDU_sock_write(HYD_pmcd_pmip.upstream.control, buf, hdr.buflen, &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "unable to send PMI command upstream\n"); HYDU_ASSERT(!closed, status); fn_exit: if (buf) MPL_free(buf); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status cmd_response(int fd, int pid, char *cmd) { char cmdlen[7]; struct HYD_pmcd_hdr hdr; int sent, closed; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYD_pmcd_init_header(&hdr); hdr.cmd = PMI_RESPONSE; hdr.pid = pid; hdr.pmi_version = 2; hdr.buflen = 6 + strlen(cmd); status = HYDU_sock_write(fd, &hdr, sizeof(hdr), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "unable to send PMI_RESPONSE header to proxy\n"); HYDU_ASSERT(!closed, status); HYDU_snprintf(cmdlen, 7, "%6u", (unsigned) strlen(cmd)); status = HYDU_sock_write(fd, cmdlen, 6, &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_ASSERT(!closed, status); if (HYD_server_info.user_global.debug) { HYDU_dump(stdout, "PMI response to fd %d pid %d: %s\n", fd, pid, cmd); } status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_ASSERT(!closed, status); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[]) { struct HYD_pg *pg; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_proxy *proxy; struct HYD_pmcd_token *tokens; struct HYD_exec *exec_list = NULL, *exec; struct HYD_env *env; struct HYD_node *node; char key[PMI_MAXKEYLEN], *val; int nprocs, preput_num, info_num, ret; char *execname, *path = NULL; struct HYD_pmcd_token_segment *segment_list = NULL; int token_count, i, j, k, new_pgid, total_spawns; int argcnt, num_segments; char *control_port, *proxy_args[HYD_NUM_TMP_STRINGS] = { NULL }; char *tmp[HYD_NUM_TMP_STRINGS]; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); for (i = 0; args[i]; i++) mcmd_args[mcmd_num_args++] = HYDU_strdup(args[i]); mcmd_args[mcmd_num_args] = NULL; status = HYD_pmcd_pmi_args_to_tokens(mcmd_args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); /* Here's the order of things we do: * * 1. Break the token list into multiple segments, each segment * corresponding to a command. Each command represents * information for one executable. * * 2. Allocate a process group for the new set of spawned * processes * * 3. Get all the common keys and deal with them * * 4. Create an executable list based on the segments. * * 5. Create a proxy list using the created executable list and * spawn it. */ /* Break the token list into multiple segments and create an * executable list based on the segments. 
*/ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "totspawns"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: totspawns\n"); total_spawns = atoi(val); HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *, total_spawns * sizeof(struct HYD_pmcd_token_segment), status); segment_tokens(tokens, token_count, segment_list, &num_segments); if (num_segments != total_spawns) { /* We didn't read the entire PMI string; wait for the rest to * arrive */ goto fn_exit; } else { /* Got the entire PMI string; free the arguments and reset */ HYDU_free_strlist(mcmd_args); mcmd_num_args = 0; } /* Allocate a new process group */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); new_pgid = pg->pgid + 1; status = HYDU_alloc_pg(&pg->next, new_pgid); HYDU_ERR_POP(status, "unable to allocate process group\n"); pg = pg->next; proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg->spawner_pg = proxy->pg; for (j = 0; j < total_spawns; j++) { /* For each segment, we create an exec structure */ val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "nprocs"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: nprocs\n"); nprocs = atoi(val); pg->pg_process_count += nprocs; val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "argcnt"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: argcnt\n"); argcnt = atoi(val); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "info_num"); if (val) info_num = atoi(val); else info_num = 0; if (exec_list == NULL) { status = HYDU_alloc_exec(&exec_list); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec_list->appnum = 0; exec = exec_list; } else { for (exec = exec_list; exec->next; exec = exec->next); status = HYDU_alloc_exec(&exec->next); HYDU_ERR_POP(status, "unable 
to allocate exec\n"); exec->next->appnum = exec->appnum + 1; exec = exec->next; } /* Info keys */ for (i = 0; i < info_num; i++) { char *info_key, *info_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "info_key_%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "info_val_%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_val = val; if (!strcmp(info_key, "path")) { path = HYDU_strdup(info_val); } else if (!strcmp(info_key, "wdir")) { exec->wdir = HYDU_strdup(info_val); } else if (!strcmp(info_key, "host")) { status = HYDU_process_mfile_token(info_val, 1, &pg->user_node_list); HYDU_ERR_POP(status, "error create node list\n"); } else if (!strcmp(info_key, "hostfile")) { status = HYDU_parse_hostfile(info_val, &pg->user_node_list, HYDU_process_mfile_token); HYDU_ERR_POP(status, "error parsing hostfile\n"); } else { /* Unrecognized info key; ignore */ } } status = HYDU_correct_wdir(&exec->wdir); HYDU_ERR_POP(status, "unable to correct wdir\n"); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "execname"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: execname\n"); if (path == NULL) execname = HYDU_strdup(val); else { i = 0; tmp[i++] = HYDU_strdup(path); tmp[i++] = HYDU_strdup("/"); tmp[i++] = HYDU_strdup(val); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &execname); HYDU_ERR_POP(status, "error while joining strings\n"); HYDU_free_strlist(tmp); } i = 0; exec->exec[i++] = execname; for (k = 0; k < argcnt; k++) { HYDU_snprintf(key, PMI_MAXKEYLEN, "arg%d", k + 1); val = 
HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); exec->exec[i++] = HYDU_strdup(val); } exec->exec[i++] = NULL; exec->proc_count = nprocs; /* It is not clear what kind of environment needs to get * passed to the spawned process. Don't set anything here, and * let the proxy do whatever it does by default. */ exec->env_prop = NULL; status = HYDU_env_create(&env, "PMI_SPAWNED", "1"); HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n"); exec->user_env = env; } status = HYD_pmcd_pmi_alloc_pg_scratch(pg); HYDU_ERR_POP(status, "unable to allocate pg scratch space\n"); if (pg->user_node_list) { pg->pg_core_count = 0; for (i = 0, node = pg->user_node_list; node; node = node->next, i++) { pg->pg_core_count += node->core_count; node->node_id = i; } } else { pg->pg_core_count = HYD_server_info.pg_list.pg_core_count; } pg->pg_process_count = 0; for (exec = exec_list; exec; exec = exec->next) pg->pg_process_count += exec->proc_count; pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch; /* Get the common keys and deal with them */ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preput_num"); if (val) preput_num = atoi(val); else preput_num = 0; for (i = 0; i < preput_num; i++) { char *preput_key, *preput_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_key_%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_val_%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_val = val; status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to 
kvs\n"); } /* Create the proxy list */ if (pg->user_node_list) { status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } else { status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } HYDU_free_exec_list(exec_list); status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface, HYD_server_info.local_hostname, HYD_server_info.port_range, &control_port, HYD_pmcd_pmiserv_control_listen_cb, (void *) (size_t) new_pgid); HYDU_ERR_POP(status, "unable to create PMI port\n"); if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "Got a control port string of %s\n", control_port); /* Go to the last PG */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); status = HYD_pmcd_pmi_fill_in_proxy_args(proxy_args, control_port, new_pgid); HYDU_ERR_POP(status, "unable to fill in proxy arguments\n"); HYDU_FREE(control_port); status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg); HYDU_ERR_POP(status, "unable to fill in executable arguments\n"); status = HYDT_bsci_launch_procs(proxy_args, pg->proxy_list, NULL); HYDU_ERR_POP(status, "launcher cannot launch processes\n"); { char *cmd_str[HYD_NUM_TMP_STRINGS], *cmd; i = 0; cmd_str[i++] = HYDU_strdup("cmd=spawn_result rc=0"); cmd_str[i++] = HYDU_strdup("\n"); cmd_str[i++] = NULL; status = HYDU_str_alloc_and_join(cmd_str, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(cmd_str); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_free_strlist(proxy_args); if (segment_list) HYDU_FREE(segment_list); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_slurm_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int num_hosts, idx, i; int *pid, *fd_list; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("srun"); if (!path) path = HYDU_strdup("/usr/bin/srun"); idx = 0; targs[idx++] = HYDU_strdup(path); if (strcmp(HYDT_bsci_info.rmk, "slurm")) { targs[idx++] = HYDU_strdup("--nodelist"); status = proxy_list_to_node_str(proxy_list, &node_list_str); HYDU_ERR_POP(status, "unable to build a node list string\n"); targs[idx++] = HYDU_strdup(node_list_str); } num_hosts = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) num_hosts++; targs[idx++] = HYDU_strdup("-N"); targs[idx++] = HYDU_int_to_str(num_hosts); targs[idx++] = HYDU_strdup("-n"); targs[idx++] = HYDU_int_to_str(num_hosts); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ /* We do not need to create a quoted version of the string for * SLURM. It seems to be internally quoting it anyway. 
*/ for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Increase pid list to accommodate the new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; if (HYDT_bsci_info.debug) { HYDU_dump(stdout, "Launch arguments: "); HYDU_print_strlist(targs); } HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[]) { struct HYD_pg *pg; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_proxy *proxy; struct HYD_pmcd_token *tokens; struct HYD_exec *exec_list = NULL, *exec; struct HYD_env *env; struct HYD_node *node; char *thrid; char key[PMI_MAXKEYLEN], *val; int maxprocs, preputcount, infokeycount, ret; int ncmds; char *execname, *path = NULL; struct HYD_pmcd_token_segment *segment_list = NULL; int token_count, i, j, k, new_pgid; int argcnt, num_segments; struct HYD_string_stash proxy_stash; char *control_port; struct HYD_string_stash stash; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); /* Here's the order of things we do: * * 1. Break the token list into multiple segments, each segment * corresponding to a command. Each command represents * information for one executable. * * 2. Allocate a process group for the new set of spawned * processes * * 3. Get all the common keys and deal with them * * 4. Create an executable list based on the segments. * * 5. Create a proxy list using the created executable list and * spawn it. 
*/ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "ncmds"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: ncmds\n"); ncmds = atoi(val); HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *, (ncmds + 1) * sizeof(struct HYD_pmcd_token_segment), status); segment_tokens(tokens, token_count, segment_list, &num_segments); HYDU_ASSERT((ncmds + 1) == num_segments, status); /* Allocate a new process group */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); new_pgid = pg->pgid + 1; status = HYDU_alloc_pg(&pg->next, new_pgid); HYDU_ERR_POP(status, "unable to allocate process group\n"); pg = pg->next; proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg->spawner_pg = proxy->pg; for (j = 1; j <= ncmds; j++) { /* For each segment, we create an exec structure */ val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "maxprocs"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: maxprocs\n"); maxprocs = atoi(val); pg->pg_process_count += maxprocs; val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "argc"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: argc\n"); argcnt = atoi(val); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "infokeycount"); if (val) infokeycount = atoi(val); else infokeycount = 0; if (exec_list == NULL) { status = HYDU_alloc_exec(&exec_list); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec_list->appnum = 0; exec = exec_list; } else { for (exec = exec_list; exec->next; exec = exec->next); status = HYDU_alloc_exec(&exec->next); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec->next->appnum = exec->appnum + 1; exec = exec->next; } /* Info keys */ for (i = 0; i < infokeycount; i++) { char *info_key, *info_val; HYDU_snprintf(key, PMI_MAXKEYLEN, 
"infokey%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "infoval%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_val = val; if (!strcmp(info_key, "path")) { path = HYDU_strdup(info_val); } else if (!strcmp(info_key, "wdir")) { exec->wdir = HYDU_strdup(info_val); } else if (!strcmp(info_key, "host") || !strcmp(info_key, "hosts")) { char *host = strtok(info_val, ","); while (host) { status = HYDU_process_mfile_token(host, 1, &pg->user_node_list); HYDU_ERR_POP(status, "error creating node list\n"); host = strtok(NULL, ","); } } else if (!strcmp(info_key, "hostfile")) { status = HYDU_parse_hostfile(info_val, &pg->user_node_list, HYDU_process_mfile_token); HYDU_ERR_POP(status, "error parsing hostfile\n"); } else { /* Unrecognized info key; ignore */ } } status = HYDU_correct_wdir(&exec->wdir); HYDU_ERR_POP(status, "unable to correct wdir\n"); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "subcmd"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: subcmd\n"); if (path == NULL) execname = HYDU_strdup(val); else { HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup(path), status); HYD_STRING_STASH(stash, HYDU_strdup("/"), status); HYD_STRING_STASH(stash, HYDU_strdup(val), status); HYD_STRING_SPIT(stash, execname, status); } i = 0; exec->exec[i++] = execname; for (k = 0; k < argcnt; k++) { HYDU_snprintf(key, PMI_MAXKEYLEN, "argv%d", k); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, 
HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); exec->exec[i++] = HYDU_strdup(val); } exec->exec[i++] = NULL; exec->proc_count = maxprocs; /* It is not clear what kind of environment needs to get * passed to the spawned process. Don't set anything here, and * let the proxy do whatever it does by default. */ exec->env_prop = NULL; status = HYDU_env_create(&env, "PMI_SPAWNED", "1"); HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n"); exec->user_env = env; } status = HYD_pmcd_pmi_alloc_pg_scratch(pg); HYDU_ERR_POP(status, "unable to allocate pg scratch space\n"); pg->pg_process_count = 0; for (exec = exec_list; exec; exec = exec->next) pg->pg_process_count += exec->proc_count; pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch; /* Get the common keys and deal with them */ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preputcount"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: preputcount\n"); preputcount = atoi(val); for (i = 0; i < preputcount; i++) { char *preput_key, *preput_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "ppkey%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "ppval%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_val = val; status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to kvs\n"); } /* Create the proxy list */ if (pg->user_node_list) { status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } else { status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } 
HYDU_free_exec_list(exec_list); if (pg->user_node_list) { pg->pg_core_count = 0; for (i = 0, node = pg->user_node_list; node; node = node->next, i++) pg->pg_core_count += node->core_count; } else { pg->pg_core_count = 0; for (proxy = pg->proxy_list; proxy; proxy = proxy->next) pg->pg_core_count += proxy->node->core_count; } status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface, HYD_server_info.localhost, HYD_server_info.port_range, &control_port, HYD_pmcd_pmiserv_control_listen_cb, (void *) (size_t) new_pgid); HYDU_ERR_POP(status, "unable to create PMI port\n"); if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "Got a control port string of %s\n", control_port); /* Go to the last PG */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); status = HYD_pmcd_pmi_fill_in_proxy_args(&proxy_stash, control_port, new_pgid); HYDU_ERR_POP(status, "unable to fill in proxy arguments\n"); HYDU_FREE(control_port); status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg); HYDU_ERR_POP(status, "unable to fill in executable arguments\n"); status = HYDT_bsci_launch_procs(proxy_stash.strlist, pg->proxy_list, NULL); HYDU_ERR_POP(status, "launcher cannot launch processes\n"); { char *cmd; HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=spawn-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status); HYD_STRING_STASH(stash, HYDU_strdup("jobid="), status); HYD_STRING_STASH(stash, HYDU_strdup(pg_scratch->kvs->kvsname), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); HYD_STRING_STASH(stash, HYDU_strdup("nerrs=0;"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); 
HYD_STRING_STASH_FREE(proxy_stash); if (segment_list) HYDU_FREE(segment_list); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_pbs_wait_for_completion(int timeout) { int time_elapsed; int events_count, spawned_count; int idx, ierr; struct timeval start_tval, curr_tval; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* Allocate memory for taskobits[] */ HYDU_MALLOC(HYDT_bscd_pbs_sys->taskobits, int *, HYDT_bscd_pbs_sys->size * sizeof(int), status); spawned_count = HYDT_bscd_pbs_sys->spawned_count; /* * FIXME: We rely on gettimeofday here. This needs to detect the * timer type available and use that. Probably more of an MPL * functionality than Hydra's. */ gettimeofday(&start_tval, NULL); /* Register with TM to be notified the obituary of the spawning process. */ for (idx = 0; idx < spawned_count; idx++) { /* * Get a TM event which will be returned by tm_poll() when * the process labelled by taskID dies */ ierr = tm_obit(HYDT_bscd_pbs_sys->taskIDs[idx], HYDT_bscd_pbs_sys->taskobits + idx, HYDT_bscd_pbs_sys->events + idx); if (ierr != TM_SUCCESS) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "tm_obit() fails with TM err=%d.\n", ierr); if (HYDT_bscd_pbs_sys->events[idx] == TM_ERROR_EVENT) HYDU_error_printf("tm_obit(Task %d) returns error.\n", HYDT_bscd_pbs_sys->taskIDs[idx]); if (HYDT_bscd_pbs_sys->events[idx] == TM_NULL_EVENT) HYDU_error_printf("Task %d already exits with status %d\n", HYDT_bscd_pbs_sys->taskIDs[idx], HYDT_bscd_pbs_sys->taskobits[idx]); } /* Poll if the spawned process has exited */ events_count = 0; /* Substract all the processes that have already exited */ for (idx = 0; idx < spawned_count; idx++) { if (HYDT_bscd_pbs_sys->events[idx] == TM_NULL_EVENT) events_count++; } /* Polling for the remaining alive processes till they all exit */ while (events_count < spawned_count) { tm_event_t event = -1; int poll_err; ierr = tm_poll(TM_NULL_EVENT, &event, 0, &poll_err); if (ierr != TM_SUCCESS) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "tm_poll(obit_event) fails with err=%d.\n", ierr); if (event != TM_NULL_EVENT) { for (idx = 0; idx < spawned_count; 
idx++) { if (HYDT_bscd_pbs_sys->events[idx] == event) { if (HYDT_bsci_info.debug) { HYDU_dump(stdout, "PBS_DEBUG: Event %d received, task %d exits with status %d.\n", event, HYDT_bscd_pbs_sys->taskIDs[idx], HYDT_bscd_pbs_sys->taskobits[idx]); /* * HYDU_error_printf("DEBUG: Event %d received, task %d exits with status %d.\n", event, HYDT_bscd_pbs_sys->taskIDs[idx], HYDT_bscd_pbs_sys->taskobits[idx]); */ } events_count++; break; /* break from for(idx<spawned_count) loop */ } } } /* Check if time is up */ if (timeout > 0) { gettimeofday(&curr_tval, NULL); time_elapsed = curr_tval.tv_sec - start_tval.tv_sec; if (time_elapsed > timeout) { status = HYD_TIMED_OUT; goto fn_exit; } } } if (HYDT_bsci_info.debug) { HYDU_dump(stdout, "\nPBS_DEBUG: Done with polling obit events!\n"); } /* Loop till all sockets have closed */ fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * mpiexec entry point.
 *
 * Sequence, as visible in this function:
 *   1. initialize debugging, signal handling and FTB;
 *   2. parse the user parameters from argv (argc is not used here);
 *   3. initialize the demux engine and the bootstrap server;
 *   4. build or adjust the node list, then derive process counts and
 *      the proxy list;
 *   5. pick a local hostname, read timeout and port-range settings
 *      from the environment;
 *   6. launch the processes, wait for completion, and collect the
 *      per-process exit statuses;
 *   7. finalize the lower layers and free the parameters.
 *
 * Return value (see fn_exit):
 *   - 0 when status is HYD_GRACEFUL_ABORT;
 *   - -1 for any other non-success status;
 *   - otherwise a value derived from exit_status, the bitwise OR of
 *     all received process exit statuses, decoded with the
 *     WIFSIGNALED/WIFEXITED/WIFSTOPPED macros.
 *
 * NOTE(review): the HYDU_ERR_POP / HYDU_ERR_SETANDJUMP / HYDU_ASSERT /
 * HYDU_MALLOC macros are defined elsewhere; they presumably jump to
 * fn_fail on error -- confirm against their definitions.
 */
int main(int argc, char **argv)
{
    struct HYD_proxy *proxy;
    struct HYD_exec *exec;
    struct HYD_node *node;
    int exit_status = 0, i, timeout, reset_rmk, global_core_count;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Initialize engines that don't require us to know anything
     * about the user preferences first */
    status = HYDU_dbg_init("mpiexec");
    HYDU_ERR_POP(status, "unable to initialization debugging\n");

    status = HYDU_set_common_signals(signal_cb);
    HYDU_ERR_POP(status, "unable to set signal\n");

    status = HYDT_ftb_init();
    HYDU_ERR_POP(status, "unable to initialize FTB\n");

    /* Get user preferences */
    status = HYD_uii_mpx_get_parameters(argv);
    HYDU_ERR_POP(status, "error parsing parameters\n");

    /* Now we initialize engines that require us to know user
     * preferences */
#if HAVE_ALARM
    /* A checkpoint interval of -1 means no alarm is armed */
    if (HYD_ui_mpich_info.ckpoint_int != -1)
        alarm(HYD_ui_mpich_info.ckpoint_int);
#endif /* HAVE_ALARM */

    /* The demux engine should be initialized before any sockets are
     * created, since it checks for STDIN's validity. If STDIN was
     * closed and we opened a socket that got the same fd as STDIN,
     * this test will not be possible. */
    status = HYDT_dmx_init(&HYD_server_info.user_global.demux);
    HYDU_ERR_POP(status, "unable to initialize the demux engine\n");

    status = HYDT_bsci_init(HYD_server_info.user_global.rmk,
                            HYD_server_info.user_global.launcher,
                            HYD_server_info.user_global.launcher_exec,
                            HYD_server_info.user_global.enablex,
                            HYD_server_info.user_global.debug);
    HYDU_ERR_POP(status, "unable to initialize the bootstrap server\n");

    /* Set to 1 whenever the node list ends up differing from what the
     * RMK supplied; triggers the "user" RMK re-initialization below */
    reset_rmk = 0;

    if (HYD_server_info.node_list == NULL) {
        /* Node list is not created yet. The user might not have
         * provided the host file. Query the RMK. */
        status = HYDT_bsci_query_node_list(&HYD_server_info.node_list);
        HYDU_ERR_POP(status, "unable to query the RMK for a node list\n");

        if (HYD_server_info.node_list == NULL) {
            char localhost[MAX_HOSTNAME_LEN] = { 0 };

            /* The RMK didn't give us anything back; use localhost */
            status = HYDU_gethostname(localhost);
            HYDU_ERR_POP(status, "unable to get local hostname\n");

            status = HYDU_add_to_node_list(localhost, 1, &HYD_server_info.node_list);
            HYDU_ERR_POP(status, "unable to add to node list\n");

            reset_rmk = 1;
        }
    }

    /*
     * If this is a checkpoint-restart, if the user specified the
     * number of processes, we already have a dummy executable. If the
     * number of processes came from the RMK, our executable list is
     * still NULL; a dummy executable needs to be created.
     */
    if (HYD_uii_mpx_exec_list == NULL) {
        /* An empty executable list is only expected together with a
         * checkpoint prefix */
        HYDU_ASSERT(HYD_server_info.user_global.ckpoint_prefix, status);

        /* create a dummy executable */
        status = HYDU_alloc_exec(&HYD_uii_mpx_exec_list);
        HYDU_ERR_POP(status, "unable to allocate exec\n");
        HYD_uii_mpx_exec_list->appnum = 0;
    }

    /* In debug mode, print the hostname of every node in the list */
    if (HYD_server_info.user_global.debug)
        for (node = HYD_server_info.node_list; node; node = node->next)
            HYDU_dump_noprefix(stdout, "host: %s\n", node->hostname);

    /* Reset the host list to use only the number of processes per
     * node as specified by the ppn option. */
    if (HYD_ui_mpich_info.ppn != -1) {
        for (node = HYD_server_info.node_list; node; node = node->next)
            node->core_count = HYD_ui_mpich_info.ppn;
        reset_rmk = 1;
    }

    /* The RMK returned a node list. See if the user requested us to
     * manipulate it in some way */
    if (HYD_ui_mpich_info.sort_order != NONE) {
        qsort_node_list();
        reset_rmk = 1;
    }

    if (reset_rmk) {
        /* Reassign node IDs to each node */
        for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++)
            node->node_id = i;

        /* Reinitialize the bootstrap server with the "user" RMK, so
         * it knows that we are not using the node list provided by
         * the RMK */
        status = HYDT_bsci_finalize();
        HYDU_ERR_POP(status, "unable to finalize bootstrap device\n");

        status = HYDT_bsci_init("user", HYDT_bsci_info.launcher, HYDT_bsci_info.launcher_exec,
                                HYDT_bsci_info.enablex, HYDT_bsci_info.debug);
        HYDU_ERR_POP(status, "unable to reinitialize the bootstrap server\n");
    }

    /* If the number of processes is not given, we allocate all the
     * available nodes to each executable */
    HYD_server_info.pg_list.pg_process_count = 0;
    for (exec = HYD_uii_mpx_exec_list; exec; exec = exec->next) {
        if (exec->proc_count == -1) {
            /* Sum the core counts over the whole node list; i is
             * incremented here but not otherwise used in this loop */
            global_core_count = 0;
            for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++)
                global_core_count += node->core_count;
            exec->proc_count = global_core_count;
        }
        HYD_server_info.pg_list.pg_process_count += exec->proc_count;
    }

    status = HYDU_list_inherited_env(&HYD_server_info.user_global.global_env.inherited);
    HYDU_ERR_POP(status, "unable to get the inherited env list\n");

    status = HYDU_create_proxy_list(HYD_uii_mpx_exec_list, HYD_server_info.node_list,
                                    &HYD_server_info.pg_list);
    HYDU_ERR_POP(status, "unable to create proxy list\n");

    /* calculate the core count used by the PG */
    HYD_server_info.pg_list.pg_core_count = 0;
    for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next)
        HYD_server_info.pg_list.pg_core_count += proxy->node->core_count;

    /* If the user didn't specify a local hostname, try to find one in
     * the list of nodes passed to us */
    if (HYD_server_info.localhost == NULL) {
        /* See if the node list contains a remotely accessible localhost */
        for (node = HYD_server_info.node_list; node; node = node->next) {
            int is_local, remote_access;

            status = HYDU_sock_is_local(node->hostname, &is_local);
            HYDU_ERR_POP(status, "unable to check if %s is local\n", node->hostname);

            if (is_local) {
                status = HYDU_sock_remote_access(node->hostname, &remote_access);
                HYDU_ERR_POP(status, "unable to check if %s is remotely accessible\n",
                             node->hostname);

                /* Leaving the loop early keeps node pointing at the match */
                if (remote_access)
                    break;
            }
        }

        /* node is non-NULL only if the loop above stopped on a match */
        if (node)
            HYD_server_info.localhost = HYDU_strdup(node->hostname);
        else {
            /* No suitable node found; fall back to HYDU_gethostname() */
            HYDU_MALLOC(HYD_server_info.localhost, char *, MAX_HOSTNAME_LEN, status);
            status = HYDU_gethostname(HYD_server_info.localhost);
            HYDU_ERR_POP(status, "unable to get local hostname\n");
        }
    }

    if (HYD_server_info.user_global.debug)
        HYD_uiu_print_params();

    /* NOTE(review): a zero return from MPL_env2int is treated here as
     * "MPIEXEC_TIMEOUT not provided" -- confirm against MPL docs */
    if (MPL_env2int("MPIEXEC_TIMEOUT", &timeout) == 0)
        timeout = -1;   /* Infinite timeout */

    if (HYD_server_info.user_global.debug)
        HYDU_dump(stdout, "Timeout set to %d (-1 means infinite)\n", timeout);

    /* Check if the user wants us to use a port within a certain
     * range. The three variable names are tried in order; the value
     * found is duplicated with HYDU_strdup before being stored. */
    if (MPL_env2str("MPIEXEC_PORTRANGE", (const char **) &HYD_server_info.port_range) ||
        MPL_env2str("MPIEXEC_PORT_RANGE", (const char **) &HYD_server_info.port_range) ||
        MPL_env2str("MPICH_PORT_RANGE", (const char **) &HYD_server_info.port_range))
        HYD_server_info.port_range = HYDU_strdup(HYD_server_info.port_range);

    /* Add the stdout/stderr callback handlers */
    HYD_server_info.stdout_cb = HYD_uiu_stdout_cb;
    HYD_server_info.stderr_cb = HYD_uiu_stderr_cb;

    /* Create a pipe connection to wake up the process manager;
     * signal_cb writes its commands to cmd_pipe[1] */
    if (pipe(HYD_server_info.cmd_pipe) < 0)
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "pipe error\n");

    /* Launch the processes */
    status = HYD_pmci_launch_procs();
    HYDU_ERR_POP(status, "process manager returned error launching processes\n");

    /* Wait for their completion */
    status = HYD_pmci_wait_for_completion(timeout);
    HYDU_ERR_POP(status, "process manager error waiting for completion\n");

    /* Check for the exit status for all the processes */
    if (HYD_ui_mpich_info.print_all_exitcodes)
        HYDU_dump(stdout, "Exit codes: ");
    exit_status = 0;
    for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next) {
        if (proxy->exit_status == NULL) {
            /* We didn't receive the exit status for this proxy */
            continue;
        }

        if (HYD_ui_mpich_info.print_all_exitcodes)
            HYDU_dump_noprefix(stdout, "[%s] ", proxy->node->hostname);

        for (i = 0; i < proxy->proxy_process_count; i++) {
            if (HYD_ui_mpich_info.print_all_exitcodes) {
                HYDU_dump_noprefix(stdout, "%d", proxy->exit_status[i]);
                /* Comma-separate all but the last value on the line */
                if (i < proxy->proxy_process_count - 1)
                    HYDU_dump_noprefix(stdout, ",");
            }

            /* Statuses are merged with bitwise OR, so the combined
             * value is non-zero if any process status is non-zero */
            exit_status |= proxy->exit_status[i];
        }

        if (HYD_ui_mpich_info.print_all_exitcodes)
            HYDU_dump_noprefix(stdout, "\n");
    }

    /* Call finalize functions for lower layers to cleanup their resources */
    status = HYD_pmci_finalize();
    HYDU_ERR_POP(status, "process manager error on finalize\n");

    status = HYDT_ftb_finalize();
    HYDU_ERR_POP(status, "error finalizing FTB\n");

#if defined ENABLE_PROFILING
    /* Optional report of the PMI call counter */
    if (HYD_server_info.enable_profiling) {
        HYDU_dump_noprefix(stdout, "\n");
        HYD_DRAW_LINE(80);
        HYDU_dump(stdout, "Number of PMI calls seen by the server: %d\n",
                  HYD_server_info.num_pmi_calls);
        HYD_DRAW_LINE(80);
        HYDU_dump_noprefix(stdout, "\n");
    }
#endif /* ENABLE_PROFILING */

    /* Free the mpiexec params */
    HYD_uiu_free_params();
    HYDU_free_exec_list(HYD_uii_mpx_exec_list);

  fn_exit:
    /* Reached on both the success and the failure path; the return
     * value of HYDU_dbg_finalize() is not checked */
    HYDU_dbg_finalize();
    HYDU_FUNC_EXIT();
    if (status == HYD_GRACEFUL_ABORT)
        return 0;
    else if (status != HYD_SUCCESS)
        return -1;
    else if (WIFSIGNALED(exit_status)) {
        /* Report the terminating signal, then return the raw merged
         * status rather than a decoded value */
        printf("YOUR APPLICATION TERMINATED WITH THE EXIT STRING: %s (signal %d)\n",
               strsignal(WTERMSIG(exit_status)), WTERMSIG(exit_status));
        printf("This typically refers to a problem with your application.\n");
        printf("Please see the FAQ page for debugging suggestions\n");
        return exit_status;
    }
    else if (WIFEXITED(exit_status)) {
        return WEXITSTATUS(exit_status);
    }
    else if (WIFSTOPPED(exit_status)) {
        return WSTOPSIG(exit_status);
    }
    else {
        return exit_status;
    }

  fn_fail:
    goto fn_exit;
}