int main(int argc, char **argv) { struct HYD_proxy *proxy; struct HYD_exec *exec; struct HYD_node *node; int exit_status = 0, i, timeout, reset_rmk, global_core_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* Initialize engines that don't require use to know anything * about the user preferences first */ status = HYDU_dbg_init("mpiexec"); HYDU_ERR_POP(status, "unable to initialization debugging\n"); status = HYDU_set_common_signals(signal_cb); HYDU_ERR_POP(status, "unable to set signal\n"); status = HYDT_ftb_init(); HYDU_ERR_POP(status, "unable to initialize FTB\n"); /* Get user preferences */ status = HYD_uii_mpx_get_parameters(argv); HYDU_ERR_POP(status, "error parsing parameters\n"); /* Now we initialize engines that require us to know user * preferences */ #if HAVE_ALARM if (HYD_ui_mpich_info.ckpoint_int != -1) alarm(HYD_ui_mpich_info.ckpoint_int); #endif /* HAVE_ALARM */ /* The demux engine should be initialized before any sockets are * created, since it checks for STDIN's validity. If STDIN was * closed and we opened a socket that got the same fd as STDIN, * this test will not be possible. */ status = HYDT_dmx_init(&HYD_server_info.user_global.demux); HYDU_ERR_POP(status, "unable to initialize the demux engine\n"); status = HYDT_bsci_init(HYD_server_info.user_global.rmk, HYD_server_info.user_global.launcher, HYD_server_info.user_global.launcher_exec, HYD_server_info.user_global.enablex, HYD_server_info.user_global.debug); HYDU_ERR_POP(status, "unable to initialize the bootstrap server\n"); reset_rmk = 0; if (HYD_server_info.node_list == NULL) { /* Node list is not created yet. The user might not have * provided the host file. Query the RMK. */ status = HYDT_bsci_query_node_list(&HYD_server_info.node_list); HYDU_ERR_POP(status, "unable to query the RMK for a node list\n"); if (HYD_server_info.node_list == NULL) { char localhost[MAX_HOSTNAME_LEN] = { 0 }; /* The RMK didn't give us anything back; use localhost */ status = HYDU_gethostname(localhost); HYDU_ERR_POP(status, "unable to get local hostname\n"); status = HYDU_add_to_node_list(localhost, 1, &HYD_server_info.node_list); HYDU_ERR_POP(status, "unable to add to node list\n"); reset_rmk = 1; } } /* * If this is a checkpoint-restart, if the user specified the * number of processes, we already have a dummy executable. If the * number of processes came from the RMK, our executable list is * still NULL; a dummy executable needs to be created. */ if (HYD_uii_mpx_exec_list == NULL) { HYDU_ASSERT(HYD_server_info.user_global.ckpoint_prefix, status); /* create a dummy executable */ status = HYDU_alloc_exec(&HYD_uii_mpx_exec_list); HYDU_ERR_POP(status, "unable to allocate exec\n"); HYD_uii_mpx_exec_list->appnum = 0; } if (HYD_server_info.user_global.debug) for (node = HYD_server_info.node_list; node; node = node->next) HYDU_dump_noprefix(stdout, "host: %s\n", node->hostname); /* Reset the host list to use only the number of processes per * node as specified by the ppn option. */ if (HYD_ui_mpich_info.ppn != -1) { for (node = HYD_server_info.node_list; node; node = node->next) node->core_count = HYD_ui_mpich_info.ppn; reset_rmk = 1; } /* The RMK returned a node list. See if the user requested us to * manipulate it in some way */ if (HYD_ui_mpich_info.sort_order != NONE) { qsort_node_list(); reset_rmk = 1; } if (reset_rmk) { /* Reassign node IDs to each node */ for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++) node->node_id = i; /* Reinitialize the bootstrap server with the "user" RMK, so * it knows that we are not using the node list provided by * the RMK */ status = HYDT_bsci_finalize(); HYDU_ERR_POP(status, "unable to finalize bootstrap device\n"); status = HYDT_bsci_init("user", HYDT_bsci_info.launcher, HYDT_bsci_info.launcher_exec, HYDT_bsci_info.enablex, HYDT_bsci_info.debug); HYDU_ERR_POP(status, "unable to reinitialize the bootstrap server\n"); } /* If the number of processes is not given, we allocate all the * available nodes to each executable */ HYD_server_info.pg_list.pg_process_count = 0; for (exec = HYD_uii_mpx_exec_list; exec; exec = exec->next) { if (exec->proc_count == -1) { global_core_count = 0; for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++) global_core_count += node->core_count; exec->proc_count = global_core_count; } HYD_server_info.pg_list.pg_process_count += exec->proc_count; } status = HYDU_list_inherited_env(&HYD_server_info.user_global.global_env.inherited); HYDU_ERR_POP(status, "unable to get the inherited env list\n"); status = HYDU_create_proxy_list(HYD_uii_mpx_exec_list, HYD_server_info.node_list, &HYD_server_info.pg_list); HYDU_ERR_POP(status, "unable to create proxy list\n"); /* calculate the core count used by the PG */ HYD_server_info.pg_list.pg_core_count = 0; for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next) HYD_server_info.pg_list.pg_core_count += proxy->node->core_count; /* If the user didn't specify a local hostname, try to find one in * the list of nodes passed to us */ if (HYD_server_info.localhost == NULL) { /* See if the node list contains a remotely accessible localhost */ for (node = HYD_server_info.node_list; node; node = node->next) { int is_local, remote_access; status = HYDU_sock_is_local(node->hostname, &is_local); HYDU_ERR_POP(status, "unable to check if %s is local\n", node->hostname); if (is_local) { status = HYDU_sock_remote_access(node->hostname, &remote_access); HYDU_ERR_POP(status, "unable to check if %s is remotely accessible\n", node->hostname); if (remote_access) break; } } if (node) HYD_server_info.localhost = HYDU_strdup(node->hostname); else { HYDU_MALLOC(HYD_server_info.localhost, char *, MAX_HOSTNAME_LEN, status); status = HYDU_gethostname(HYD_server_info.localhost); HYDU_ERR_POP(status, "unable to get local hostname\n"); } } if (HYD_server_info.user_global.debug) HYD_uiu_print_params(); if (MPL_env2int("MPIEXEC_TIMEOUT", &timeout) == 0) timeout = -1; /* Infinite timeout */ if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "Timeout set to %d (-1 means infinite)\n", timeout); /* Check if the user wants us to use a port within a certain * range. */ if (MPL_env2str("MPIEXEC_PORTRANGE", (const char **) &HYD_server_info.port_range) || MPL_env2str("MPIEXEC_PORT_RANGE", (const char **) &HYD_server_info.port_range) || MPL_env2str("MPICH_PORT_RANGE", (const char **) &HYD_server_info.port_range)) HYD_server_info.port_range = HYDU_strdup(HYD_server_info.port_range); /* Add the stdout/stderr callback handlers */ HYD_server_info.stdout_cb = HYD_uiu_stdout_cb; HYD_server_info.stderr_cb = HYD_uiu_stderr_cb; /* Create a pipe connection to wake up the process manager */ if (pipe(HYD_server_info.cmd_pipe) < 0) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "pipe error\n"); /* Launch the processes */ status = HYD_pmci_launch_procs(); HYDU_ERR_POP(status, "process manager returned error launching processes\n"); /* Wait for their completion */ status = HYD_pmci_wait_for_completion(timeout); HYDU_ERR_POP(status, "process manager error waiting for completion\n"); /* Check for the exit status for all the processes */ if (HYD_ui_mpich_info.print_all_exitcodes) HYDU_dump(stdout, "Exit codes: "); exit_status = 0; for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next) { if (proxy->exit_status == NULL) { /* We didn't receive the exit status for this proxy */ continue; } if (HYD_ui_mpich_info.print_all_exitcodes) HYDU_dump_noprefix(stdout, "[%s] ", proxy->node->hostname); for (i = 0; i < proxy->proxy_process_count; i++) { if (HYD_ui_mpich_info.print_all_exitcodes) { HYDU_dump_noprefix(stdout, "%d", proxy->exit_status[i]); if (i < proxy->proxy_process_count - 1) HYDU_dump_noprefix(stdout, ","); } exit_status |= proxy->exit_status[i]; } if (HYD_ui_mpich_info.print_all_exitcodes) HYDU_dump_noprefix(stdout, "\n"); } /* Call finalize functions for lower layers to cleanup their resources */ status = HYD_pmci_finalize(); HYDU_ERR_POP(status, "process manager error on finalize\n"); status = HYDT_ftb_finalize(); HYDU_ERR_POP(status, "error finalizing FTB\n"); #if defined ENABLE_PROFILING if (HYD_server_info.enable_profiling) { HYDU_dump_noprefix(stdout, "\n"); HYD_DRAW_LINE(80); HYDU_dump(stdout, "Number of PMI calls seen by the server: %d\n", HYD_server_info.num_pmi_calls); HYD_DRAW_LINE(80); HYDU_dump_noprefix(stdout, "\n"); } #endif /* ENABLE_PROFILING */ /* Free the mpiexec params */ HYD_uiu_free_params(); HYDU_free_exec_list(HYD_uii_mpx_exec_list); fn_exit: HYDU_dbg_finalize(); HYDU_FUNC_EXIT(); if (status == HYD_GRACEFUL_ABORT) return 0; else if (status != HYD_SUCCESS) return -1; else if (WIFSIGNALED(exit_status)) { printf("YOUR APPLICATION TERMINATED WITH THE EXIT STRING: %s (signal %d)\n", strsignal(WTERMSIG(exit_status)), WTERMSIG(exit_status)); printf("This typically refers to a problem with your application.\n"); printf("Please see the FAQ page for debugging suggestions\n"); return exit_status; } else if (WIFEXITED(exit_status)) { return WEXITSTATUS(exit_status); } else if (WIFSTOPPED(exit_status)) { return WSTOPSIG(exit_status); } else { return exit_status; } fn_fail: goto fn_exit; }
HYD_status HYD_pmcd_pmip_get_params(char **t_argv) { char **argv = t_argv; static char dbg_prefix[2 * MAX_HOSTNAME_LEN]; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); argv++; do { /* Get the proxy arguments */ status = HYDU_parse_array(&argv, HYD_pmcd_pmip_match_table); HYDU_ERR_POP(status, "error parsing input array\n"); /* No more arguments left */ if (!(*argv)) break; } while (1); /* Verify the arguments we got */ if (HYD_pmcd_pmip.upstream.server_name == NULL) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "server name not available\n"); if (HYD_pmcd_pmip.upstream.server_port == -1) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "server port not available\n"); if (HYD_pmcd_pmip.user_global.demux == NULL) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "demux engine not available\n"); if (HYD_pmcd_pmip.user_global.debug == -1) HYD_pmcd_pmip.user_global.debug = 0; status = HYDT_bsci_init(HYD_pmcd_pmip.user_global.rmk, HYD_pmcd_pmip.user_global.launcher, HYD_pmcd_pmip.user_global.launcher_exec, 0 /* disable x */ , HYD_pmcd_pmip.user_global.debug); HYDU_ERR_POP(status, "proxy unable to initialize bootstrap server\n"); if (HYD_pmcd_pmip.local.id == -1) { /* We didn't get a proxy ID during launch; query the launcher * for it. */ status = HYDT_bsci_query_proxy_id(&HYD_pmcd_pmip.local.id); HYDU_ERR_POP(status, "unable to query launcher for proxy ID\n"); } if (HYD_pmcd_pmip.local.id == -1) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "proxy ID not available\n"); if (HYD_pmcd_pmip.local.pgid == -1) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "PG ID not available\n"); if (HYD_pmcd_pmip.local.retries == -1) HYD_pmcd_pmip.local.retries = 0; HYDU_dbg_finalize(); MPL_snprintf(dbg_prefix, 2 * MAX_HOSTNAME_LEN, "proxy:%d:%d", HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id); status = HYDU_dbg_init((const char *) dbg_prefix); HYDU_ERR_POP(status, "unable to initialization debugging\n"); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
int main(int argc, char **argv) { int i, count, pid, ret_status, sent, closed, ret, done; struct HYD_pmcd_hdr hdr; HYD_status status = HYD_SUCCESS; status = HYDU_dbg_init("proxy:unset"); HYDU_ERR_POP(status, "unable to initialization debugging\n"); status = HYDU_set_signal(SIGPIPE, signal_cb); HYDU_ERR_POP(status, "unable to set SIGPIPE\n"); status = HYDU_set_signal(SIGTSTP, signal_cb); HYDU_ERR_POP(status, "unable to set SIGTSTP\n"); status = HYDU_set_common_signals(signal_cb); HYDU_ERR_POP(status, "unable to set common signals\n"); status = init_params(); HYDU_ERR_POP(status, "Error initializing proxy params\n"); status = HYD_pmcd_pmip_get_params(argv); HYDU_ERR_POP(status, "bad parameters passed to the proxy\n"); status = HYDT_dmx_init(&HYD_pmcd_pmip.user_global.demux); HYDU_ERR_POP(status, "unable to initialize the demux engine\n"); status = HYDT_ftb_init(); HYDU_ERR_POP(status, "unable to initialize FTB\n"); /* See if HYDI_CONTROL_FD is set before trying to connect upstream */ ret = MPL_env2int("HYDI_CONTROL_FD", &HYD_pmcd_pmip.upstream.control); if (ret < 0) { HYDU_ERR_POP(status, "error reading HYDI_CONTROL_FD environment\n"); } else if (ret == 0) { status = HYDU_sock_connect(HYD_pmcd_pmip.upstream.server_name, HYD_pmcd_pmip.upstream.server_port, &HYD_pmcd_pmip.upstream.control, HYD_pmcd_pmip.local.retries, HYD_CONNECT_DELAY); HYDU_ERR_POP(status, "unable to connect to server %s at port %d (check for firewalls!)\n", HYD_pmcd_pmip.upstream.server_name, HYD_pmcd_pmip.upstream.server_port); } status = HYDU_sock_write(HYD_pmcd_pmip.upstream.control, &HYD_pmcd_pmip.local.id, sizeof(HYD_pmcd_pmip.local.id), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "unable to send the proxy ID to the server\n"); if (closed) goto fn_fail; status = HYDT_dmx_register_fd(1, &HYD_pmcd_pmip.upstream.control, HYD_POLLIN, NULL, HYD_pmcd_pmip_control_cmd_cb); HYDU_ERR_POP(status, "unable to register fd\n"); while (1) { /* Wait for some event to occur */ status = HYDT_dmx_wait_for_event(-1); HYDU_ERR_POP(status, "demux engine error waiting for event\n"); /* Check to see if there's any open read socket left; if there * are, we will just wait for more events. */ count = 0; for (i = 0; i < HYD_pmcd_pmip.local.proxy_process_count; i++) { if (HYD_pmcd_pmip.downstream.out[i] != HYD_FD_CLOSED) count++; if (HYD_pmcd_pmip.downstream.err[i] != HYD_FD_CLOSED) count++; if (count) break; } if (!count) break; } /* Now wait for the processes to finish */ done = 0; while (1) { pid = waitpid(-1, &ret_status, 0); /* Find the pid and mark it as complete. */ if (pid > 0) for (i = 0; i < HYD_pmcd_pmip.local.proxy_process_count; i++) if (HYD_pmcd_pmip.downstream.pid[i] == pid) { if (HYD_pmcd_pmip.downstream.forced_cleanup) { /* If it is a forced cleanup, the exit status * is either already set or we have to ignore * it */ if (HYD_pmcd_pmip.downstream.exit_status[i] == -1) HYD_pmcd_pmip.downstream.exit_status[i] = 0; else HYD_pmcd_pmip.downstream.exit_status[i] = ret_status; } else { HYD_pmcd_pmip.downstream.exit_status[i] = ret_status; } done++; } /* If no more processes are pending, break out */ if (done == HYD_pmcd_pmip.local.proxy_process_count) break; /* Check if there are any messages from the launcher */ status = HYDT_dmx_wait_for_event(0); HYDU_IGNORE_TIMEOUT(status); HYDU_ERR_POP(status, "demux engine error waiting for event\n"); } /* Send the exit status upstream */ HYD_pmcd_init_header(&hdr); hdr.cmd = EXIT_STATUS; status = HYDU_sock_write(HYD_pmcd_pmip.upstream.control, &hdr, sizeof(hdr), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "unable to send EXIT_STATUS command upstream\n"); HYDU_ASSERT(!closed, status); status = HYDU_sock_write(HYD_pmcd_pmip.upstream.control, HYD_pmcd_pmip.downstream.exit_status, HYD_pmcd_pmip.local.proxy_process_count * sizeof(int), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "unable to return exit status upstream\n"); HYDU_ASSERT(!closed, status); status = HYDT_dmx_deregister_fd(HYD_pmcd_pmip.upstream.control); HYDU_ERR_POP(status, "unable to deregister fd\n"); close(HYD_pmcd_pmip.upstream.control); status = HYDT_dmx_finalize(); HYDU_ERR_POP(status, "error returned from demux finalize\n"); status = HYDT_ftb_finalize(); HYDU_ERR_POP(status, "unable to finalize FTB\n"); status = HYDT_bsci_finalize(); HYDU_ERR_POP(status, "unable to finalize the bootstrap device\n"); /* cleanup the params structure */ cleanup_params(); fn_exit: HYDU_dbg_finalize(); return status; fn_fail: /* kill all processes */ HYD_pmcd_pmip_send_signal(SIGKILL); goto fn_exit; }