HYD_status HYDU_find_in_path(const char *execname, char **path) { char *tmp[HYD_NUM_TMP_STRINGS], *path_loc = NULL, *test_loc, *user_path; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* The executable is somewhere in the user's path. Find it. */ if (MPL_env2str("PATH", (const char **) &user_path)) user_path = MPL_strdup(user_path); if (user_path) { /* If the PATH environment exists */ status = get_abs_wd(strtok(user_path, ";:"), &test_loc); HYDU_ERR_POP(status, "error getting absolute working dir\n"); do { tmp[0] = MPL_strdup(test_loc); tmp[1] = MPL_strdup("/"); tmp[2] = MPL_strdup(execname); tmp[3] = NULL; status = HYDU_str_alloc_and_join(tmp, &path_loc); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); if (exists(path_loc)) { tmp[0] = MPL_strdup(test_loc); tmp[1] = MPL_strdup("/"); tmp[2] = NULL; status = HYDU_str_alloc_and_join(tmp, path); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); goto fn_exit; /* We are done */ } MPL_free(path_loc); path_loc = NULL; status = get_abs_wd(strtok(NULL, ";:"), &test_loc); HYDU_ERR_POP(status, "error getting absolute working dir\n"); } while (test_loc); } /* There is either no PATH environment or we could not find the * file in the PATH. Just return an empty path */ *path = MPL_strdup(""); fn_exit: if (user_path) MPL_free(user_path); if (path_loc) MPL_free(path_loc); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * HYDU_free_proxy_list - release every element of a proxy list.
 *
 * For each proxy the launch-info string list, the pid array, the
 * exit-status array and the exec list are released before the proxy
 * itself.  The 'node' pointer is only cleared, not freed.
 */
void HYDU_free_proxy_list(struct HYD_proxy *proxy_list)
{
    struct HYD_proxy *cur, *following;

    HYDU_FUNC_ENTER();

    for (cur = proxy_list; cur; cur = following) {
        /* Grab the successor first; 'cur' is gone by the end of the body */
        following = cur->next;

        cur->node = NULL;

        if (cur->exec_launch_info) {
            /* free the strings, then the array that held them */
            HYDU_free_strlist(cur->exec_launch_info);
            HYDU_FREE(cur->exec_launch_info);
        }

        if (cur->pid) {
            HYDU_FREE(cur->pid);
        }

        if (cur->exit_status) {
            HYDU_FREE(cur->exit_status);
        }

        HYDU_free_exec_list(cur->exec_list);

        HYDU_FREE(cur);
    }

    HYDU_FUNC_EXIT();
}
/*
 * HYDU_free_exec_list - release every element of an exec list.
 *
 * Each element's argument strings, working directory, environment
 * property string and user environment list are released before the
 * element itself.
 */
void HYDU_free_exec_list(struct HYD_exec *exec_list)
{
    struct HYD_exec *cur, *following;

    HYDU_FUNC_ENTER();

    for (cur = exec_list; cur; cur = following) {
        /* Save the link before the element is released */
        following = cur->next;

        HYDU_free_strlist(cur->exec);

        if (cur->wdir) {
            HYDU_FREE(cur->wdir);
        }

        if (cur->env_prop) {
            HYDU_FREE(cur->env_prop);
        }

        HYDU_env_free_list(cur->user_env);
        cur->user_env = NULL;

        HYDU_FREE(cur);
    }

    HYDU_FUNC_EXIT();
}
char *HYDU_find_full_path(const char *execname) { char *tmp[HYD_NUM_TMP_STRINGS] = { NULL }, *path = NULL, *test_path = NULL; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYDU_find_in_path(execname, &test_path); HYDU_ERR_POP(status, "error while searching for executable in user path\n"); if (test_path) { tmp[0] = MPL_strdup(test_path); tmp[1] = MPL_strdup(execname); tmp[2] = NULL; status = HYDU_str_alloc_and_join(tmp, &path); HYDU_ERR_POP(status, "error joining strings\n"); } fn_exit: HYDU_free_strlist(tmp); if (test_path) MPL_free(test_path); HYDU_FUNC_EXIT(); return path; fn_fail: goto fn_exit; }
/*
 * HYDU_correct_wdir - turn '*wdir' into an absolute working directory.
 *
 * wdir: in/out.  If '*wdir' is NULL it is replaced by the result of
 *       HYDU_getcwd().  If it does not start with '/', it is replaced
 *       by "<cwd>/<old value>" and the old string is released.  A
 *       value that already starts with '/' is left untouched.
 *
 * On failure '*wdir' still holds its original value: the old string is
 * only released after the replacement has been built successfully.
 */
HYD_status HYDU_correct_wdir(char **wdir)
{
    char *tmp[HYD_NUM_TMP_STRINGS], *abs_wdir = NULL;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    if (*wdir == NULL) {
        *wdir = HYDU_getcwd();
    }
    else if ((*wdir)[0] != '/') {
        tmp[0] = HYDU_getcwd();
        tmp[1] = HYDU_strdup("/");
        tmp[2] = HYDU_strdup(*wdir);
        tmp[3] = NULL;

        /* Join into a scratch pointer and release the pieces before
         * looking at the status, so a failed join neither leaks them
         * nor leaves the caller with a freed '*wdir' */
        status = HYDU_str_alloc_and_join(tmp, &abs_wdir);
        HYDU_free_strlist(tmp);
        HYDU_ERR_POP(status, "unable to join strings\n");

        HYDU_FREE(*wdir);
        *wdir = abs_wdir;
    }

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
static HYD_status handle_pmi_cmd(int fd, int pgid, int pid, char *buf, int pmi_version) { char **args = NULL, *cmd = NULL; struct HYD_pmcd_pmi_handle *h; int i; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (pmi_version == 1) HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v1; else HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v2; if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "[pgid: %d] got PMI command: %s\n", pgid, buf); HYDU_MALLOC(args, char **, MAX_PMI_ARGS * sizeof(char *), status); for (i = 0; i < MAX_PMI_ARGS; i++) args[i] = NULL; status = HYD_pmcd_pmi_parse_pmi_cmd(buf, pmi_version, &cmd, args); HYDU_ERR_POP(status, "unable to parse PMI command\n"); #if defined ENABLE_PROFILING if (HYD_server_info.enable_profiling) HYD_server_info.num_pmi_calls++; #endif /* ENABLE_PROFILING */ h = HYD_pmcd_pmi_handle; while (h->handler) { if (!strcmp(cmd, h->cmd)) { status = h->handler(fd, pid, pgid, args); HYDU_ERR_POP(status, "PMI handler returned error\n"); break; } h++; } if (!h->handler) { /* We don't understand the command */ HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "Unrecognized PMI command: %s | cleaning up processes\n", cmd); } fn_exit: if (cmd) HYDU_FREE(cmd); if (args) { HYDU_free_strlist(args); HYDU_free(args); } HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * HYDU_parse_hostfile - read a host file and feed its tokens to a callback.
 *
 * hostfile:      path of the file to read.
 * node_list:     passed through to 'process_token'; when non-NULL,
 *                '*node_list' is reset to NULL before parsing starts.
 * process_token: called once per token; 'newline' is non-zero for the
 *                first token of each line.
 *
 * Everything from the first '#' on a line is discarded, leading white
 * space is skipped, and blank or comment-only lines are ignored.
 *
 * Returns HYD_SUCCESS, HYD_INTERNAL_ERROR when the file cannot be
 * opened or a line cannot be tokenised, or the status returned by a
 * failing 'process_token'.  The file and the token list of the line
 * being processed are released on every path.
 */
HYD_status HYDU_parse_hostfile(const char *hostfile, struct HYD_node **node_list,
                               HYD_status(*process_token) (char *token, int newline,
                                                           struct HYD_node ** node_list))
{
    char line[HYD_TMP_STRLEN], **tokens = NULL;
    FILE *fp = NULL;
    int i;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    if ((fp = fopen(hostfile, "r")) == NULL) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                            "unable to open host file: %s\n", hostfile);
    }

    if (node_list)
        *node_list = NULL;

    while (fgets(line, HYD_TMP_STRLEN, fp)) {
        char *linep = line;

        /* Cut the line at the first '#'.  A line that *starts* with
         * '#' is left unmodified by strtok() and is caught below. */
        strtok(linep, "#");

        /* <ctype.h> functions require an unsigned char value */
        while (isspace((unsigned char) *linep))
            linep++;

        /* Ignore blank lines & comments */
        if ((*linep == '#') || (*linep == '\0'))
            continue;

        tokens = HYDU_str_to_strlist(linep);
        if (!tokens) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR,
                                "Unable to convert host file entry to strlist\n");
        }

        for (i = 0; tokens[i]; i++) {
            status = process_token(tokens[i], !i, node_list);
            HYDU_ERR_POP(status, "unable to process token\n");
        }

        HYDU_free_strlist(tokens);
        MPL_free(tokens);
        tokens = NULL;
    }

  fn_exit:
    /* Only non-NULL here when a token callback failed mid-line */
    if (tokens) {
        HYDU_free_strlist(tokens);
        MPL_free(tokens);
    }
    if (fp)
        fclose(fp);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
static HYD_status poke_progress(char *key) { struct HYD_pmcd_pmi_v2_reqs *req, *list_head = NULL, *list_tail = NULL; int i, count; HYD_status status = HYD_SUCCESS; for (count = 0, req = pending_reqs; req; req = req->next) count++; for (i = 0; i < count; i++) { /* Dequeue a request */ req = pending_reqs; if (pending_reqs) { pending_reqs = pending_reqs->next; req->next = NULL; } if (key && strcmp(key, req->key)) { /* If the key doesn't match the request, just queue it back */ if (list_head == NULL) { list_head = req; list_tail = req; } else { list_tail->next = req; req->prev = list_tail; list_tail = req; } } else { status = fn_kvs_get(req->fd, req->pid, req->pgid, req->args); HYDU_ERR_POP(status, "kvs_get returned error\n"); /* Free the dequeued request */ HYDU_free_strlist(req->args); HYDU_FREE(req); } } if (list_head) { list_tail->next = pending_reqs; pending_reqs = list_head; } fn_exit: return status; fn_fail: goto fn_exit; }
static HYD_status fn_publish_name(int fd, int pid, int pgid, char *args[]) { char *tmp[HYD_NUM_TMP_STRINGS], *cmd, *val; int i, token_count; struct HYD_pmcd_token *tokens; char *name, *port; int success = 0; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); if ((val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "service")) == NULL) HYDU_ERR_POP(status, "cannot find token: service\n"); name = HYDU_strdup(val); if ((val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "port")) == NULL) HYDU_ERR_POP(status, "cannot find token: port\n"); port = HYDU_strdup(val); status = HYD_pmcd_pmi_publish(name, port, &success); HYDU_ERR_POP(status, "error publishing service\n"); i = 0; if (success) tmp[i++] = HYDU_strdup("cmd=publish_result info=ok rc=0 msg=success\n"); else tmp[i++] = HYDU_strdup("cmd=publish_result info=ok rc=1 msg=key_already_present\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status proxy_list_to_node_str(struct HYD_proxy *proxy_list, char **node_list_str) { int i; char *tmp[HYD_NUM_TMP_STRINGS], *foo = NULL; struct HYD_proxy *proxy; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); i = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) { tmp[i++] = HYDU_strdup(proxy->node->hostname); if (proxy->node->next) tmp[i++] = HYDU_strdup(","); /* If we used up more than half of the array elements, merge * what we have so far */ if (i > (HYD_NUM_TMP_STRINGS / 2)) { tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &foo); HYDU_ERR_POP(status, "error joining strings\n"); i = 0; tmp[i++] = HYDU_strdup(foo); HYDU_FREE(foo); } } tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &foo); HYDU_ERR_POP(status, "error joining strings\n"); *node_list_str = foo; foo = NULL; fn_exit: HYDU_free_strlist(tmp); if (foo) HYDU_FREE(foo); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_lookup_name(int fd, int pid, int pgid, char *args[]) { char *tmp[HYD_NUM_TMP_STRINGS], *cmd, *name, *value; int i, token_count; struct HYD_pmcd_token *tokens; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); if ((name = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "service")) == NULL) HYDU_ERR_POP(status, "cannot find token: service\n"); status = HYD_pmcd_pmi_lookup(name, &value); HYDU_ERR_POP(status, "error while looking up service\n"); i = 0; tmp[i++] = HYDU_strdup("cmd=lookup_result info=ok"); if (value) { tmp[i++] = HYDU_strdup("value="); tmp[i++] = HYDU_strdup(value); tmp[i++] = HYDU_strdup(" rc=0 msg=success\n"); } else { tmp[i++] = HYDU_strdup(" rc=1 msg=service_not_found\n"); } tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_ll_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int idx, i, total_procs, node_count; int *pid, *fd_list, exec_idx; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg, quoted_exec_string[HYD_TMP_STRLEN]; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("poe"); if (!path) path = HYDU_strdup("/usr/bin/poe"); idx = 0; targs[idx++] = HYDU_strdup(path); if (!strcmp(HYDT_bsci_info.rmk, "ll")) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "ll does not support user-defined host lists\n"); } /* Check how many nodes are being passed for the launch */ status = HYDTI_bscd_ll_query_node_count(&total_procs); HYDU_ERR_POP(status, "unable to query for the node count\n"); node_count = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) node_count++; if (total_procs != node_count) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "processes to be launched have to cover all nodes\n"); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ exec_idx = idx; for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Create a quoted version of the exec string, which is only used * when the executable is not launched directly, but through an * actual launcher */ HYDU_snprintf(quoted_exec_string, HYD_TMP_STRLEN, "\"%s\"", targs[exec_idx]); HYDU_FREE(targs[exec_idx]); targs[exec_idx] = quoted_exec_string; /* Increase pid list to accommodate the 
new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_put(int fd, int pid, int pgid, char *args[]) { int i, ret; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; char *kvsname, *key, *val; char *tmp[HYD_NUM_TMP_STRINGS], *cmd; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); kvsname = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "kvsname"); HYDU_ERR_CHKANDJUMP(status, kvsname == NULL, HYD_INTERNAL_ERROR, "unable to find token: kvsname\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find token: key\n"); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "value"); if (val == NULL) { /* the user sent an empty string */ val = HYDU_strdup(""); } proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; if (strcmp(pg_scratch->kvs->kvs_name, kvsname)) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "kvsname (%s) does not match this group's kvs space (%s)\n", kvsname, pg_scratch->kvs->kvs_name); status = HYD_pmcd_pmi_add_kvs(key, val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to kvs\n"); i = 0; tmp[i++] = HYDU_strdup("cmd=put_result rc="); tmp[i++] = HYDU_int_to_str(ret); if (ret == 0) { tmp[i++] = HYDU_strdup(" msg=success"); } else { tmp[i++] = HYDU_strdup(" msg=duplicate_key"); tmp[i++] = HYDU_strdup(key); } tmp[i++] = HYDU_strdup("\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_get(int fd, int pid, int pgid, char *args[]) { int i; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pmcd_pmi_kvs_pair *run; char *kvsname, *key, *val; char *tmp[HYD_NUM_TMP_STRINGS], *cmd; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); kvsname = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "kvsname"); HYDU_ERR_CHKANDJUMP(status, kvsname == NULL, HYD_INTERNAL_ERROR, "unable to find token: kvsname\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find token: key\n"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; val = NULL; if (!strcmp(key, "PMI_dead_processes")) { val = pg_scratch->dead_processes; goto found_val; } if (strcmp(pg_scratch->kvs->kvs_name, kvsname)) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "kvsname (%s) does not match this group's kvs space (%s)\n", kvsname, pg_scratch->kvs->kvs_name); /* Try to find the key */ for (run = pg_scratch->kvs->key_pair; run; run = run->next) { if (!strcmp(run->key, key)) { val = run->val; break; } } found_val: i = 0; tmp[i++] = HYDU_strdup("cmd=get_result rc="); if (val) { tmp[i++] = HYDU_strdup("0 msg=success value="); tmp[i++] = HYDU_strdup(val); } else { tmp[i++] = HYDU_strdup("-1 msg=key_"); tmp[i++] = HYDU_strdup(key); tmp[i++] = HYDU_strdup("_not_found value=unknown"); } tmp[i++] = HYDU_strdup("\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); 
HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[]) { struct HYD_pg *pg; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_proxy *proxy; struct HYD_pmcd_token *tokens; struct HYD_exec *exec_list = NULL, *exec; struct HYD_env *env; struct HYD_node *node; char key[PMI_MAXKEYLEN], *val; int nprocs, preput_num, info_num, ret; char *execname, *path = NULL; struct HYD_pmcd_token_segment *segment_list = NULL; int token_count, i, j, k, new_pgid, total_spawns; int argcnt, num_segments; char *control_port, *proxy_args[HYD_NUM_TMP_STRINGS] = { NULL }; char *tmp[HYD_NUM_TMP_STRINGS]; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); for (i = 0; args[i]; i++) mcmd_args[mcmd_num_args++] = HYDU_strdup(args[i]); mcmd_args[mcmd_num_args] = NULL; status = HYD_pmcd_pmi_args_to_tokens(mcmd_args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); /* Here's the order of things we do: * * 1. Break the token list into multiple segments, each segment * corresponding to a command. Each command represents * information for one executable. * * 2. Allocate a process group for the new set of spawned * processes * * 3. Get all the common keys and deal with them * * 4. Create an executable list based on the segments. * * 5. Create a proxy list using the created executable list and * spawn it. */ /* Break the token list into multiple segments and create an * executable list based on the segments. 
*/ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "totspawns"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: totspawns\n"); total_spawns = atoi(val); HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *, total_spawns * sizeof(struct HYD_pmcd_token_segment), status); segment_tokens(tokens, token_count, segment_list, &num_segments); if (num_segments != total_spawns) { /* We didn't read the entire PMI string; wait for the rest to * arrive */ goto fn_exit; } else { /* Got the entire PMI string; free the arguments and reset */ HYDU_free_strlist(mcmd_args); mcmd_num_args = 0; } /* Allocate a new process group */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); new_pgid = pg->pgid + 1; status = HYDU_alloc_pg(&pg->next, new_pgid); HYDU_ERR_POP(status, "unable to allocate process group\n"); pg = pg->next; proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg->spawner_pg = proxy->pg; for (j = 0; j < total_spawns; j++) { /* For each segment, we create an exec structure */ val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "nprocs"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: nprocs\n"); nprocs = atoi(val); pg->pg_process_count += nprocs; val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "argcnt"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: argcnt\n"); argcnt = atoi(val); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "info_num"); if (val) info_num = atoi(val); else info_num = 0; if (exec_list == NULL) { status = HYDU_alloc_exec(&exec_list); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec_list->appnum = 0; exec = exec_list; } else { for (exec = exec_list; exec->next; exec = exec->next); status = HYDU_alloc_exec(&exec->next); HYDU_ERR_POP(status, "unable 
to allocate exec\n"); exec->next->appnum = exec->appnum + 1; exec = exec->next; } /* Info keys */ for (i = 0; i < info_num; i++) { char *info_key, *info_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "info_key_%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "info_val_%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_val = val; if (!strcmp(info_key, "path")) { path = HYDU_strdup(info_val); } else if (!strcmp(info_key, "wdir")) { exec->wdir = HYDU_strdup(info_val); } else if (!strcmp(info_key, "host")) { status = HYDU_process_mfile_token(info_val, 1, &pg->user_node_list); HYDU_ERR_POP(status, "error create node list\n"); } else if (!strcmp(info_key, "hostfile")) { status = HYDU_parse_hostfile(info_val, &pg->user_node_list, HYDU_process_mfile_token); HYDU_ERR_POP(status, "error parsing hostfile\n"); } else { /* Unrecognized info key; ignore */ } } status = HYDU_correct_wdir(&exec->wdir); HYDU_ERR_POP(status, "unable to correct wdir\n"); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "execname"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: execname\n"); if (path == NULL) execname = HYDU_strdup(val); else { i = 0; tmp[i++] = HYDU_strdup(path); tmp[i++] = HYDU_strdup("/"); tmp[i++] = HYDU_strdup(val); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &execname); HYDU_ERR_POP(status, "error while joining strings\n"); HYDU_free_strlist(tmp); } i = 0; exec->exec[i++] = execname; for (k = 0; k < argcnt; k++) { HYDU_snprintf(key, PMI_MAXKEYLEN, "arg%d", k + 1); val = 
HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); exec->exec[i++] = HYDU_strdup(val); } exec->exec[i++] = NULL; exec->proc_count = nprocs; /* It is not clear what kind of environment needs to get * passed to the spawned process. Don't set anything here, and * let the proxy do whatever it does by default. */ exec->env_prop = NULL; status = HYDU_env_create(&env, "PMI_SPAWNED", "1"); HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n"); exec->user_env = env; } status = HYD_pmcd_pmi_alloc_pg_scratch(pg); HYDU_ERR_POP(status, "unable to allocate pg scratch space\n"); if (pg->user_node_list) { pg->pg_core_count = 0; for (i = 0, node = pg->user_node_list; node; node = node->next, i++) { pg->pg_core_count += node->core_count; node->node_id = i; } } else { pg->pg_core_count = HYD_server_info.pg_list.pg_core_count; } pg->pg_process_count = 0; for (exec = exec_list; exec; exec = exec->next) pg->pg_process_count += exec->proc_count; pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch; /* Get the common keys and deal with them */ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preput_num"); if (val) preput_num = atoi(val); else preput_num = 0; for (i = 0; i < preput_num; i++) { char *preput_key, *preput_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_key_%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_val_%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_val = val; status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to 
kvs\n"); } /* Create the proxy list */ if (pg->user_node_list) { status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } else { status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } HYDU_free_exec_list(exec_list); status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface, HYD_server_info.local_hostname, HYD_server_info.port_range, &control_port, HYD_pmcd_pmiserv_control_listen_cb, (void *) (size_t) new_pgid); HYDU_ERR_POP(status, "unable to create PMI port\n"); if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "Got a control port string of %s\n", control_port); /* Go to the last PG */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); status = HYD_pmcd_pmi_fill_in_proxy_args(proxy_args, control_port, new_pgid); HYDU_ERR_POP(status, "unable to fill in proxy arguments\n"); HYDU_FREE(control_port); status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg); HYDU_ERR_POP(status, "unable to fill in executable arguments\n"); status = HYDT_bsci_launch_procs(proxy_args, pg->proxy_list, NULL); HYDU_ERR_POP(status, "launcher cannot launch processes\n"); { char *cmd_str[HYD_NUM_TMP_STRINGS], *cmd; i = 0; cmd_str[i++] = HYDU_strdup("cmd=spawn_result rc=0"); cmd_str[i++] = HYDU_strdup("\n"); cmd_str[i++] = NULL; status = HYDU_str_alloc_and_join(cmd_str, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(cmd_str); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_free_strlist(proxy_args); if (segment_list) HYDU_FREE(segment_list); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_slurm_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int num_hosts, idx, i; int *pid, *fd_list; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("srun"); if (!path) path = HYDU_strdup("/usr/bin/srun"); idx = 0; targs[idx++] = HYDU_strdup(path); if (strcmp(HYDT_bsci_info.rmk, "slurm")) { targs[idx++] = HYDU_strdup("--nodelist"); status = proxy_list_to_node_str(proxy_list, &node_list_str); HYDU_ERR_POP(status, "unable to build a node list string\n"); targs[idx++] = HYDU_strdup(node_list_str); } num_hosts = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) num_hosts++; targs[idx++] = HYDU_strdup("-N"); targs[idx++] = HYDU_int_to_str(num_hosts); targs[idx++] = HYDU_strdup("-n"); targs[idx++] = HYDU_int_to_str(num_hosts); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ /* We do not need to create a quoted version of the string for * SLURM. It seems to be internally quoting it anyway. 
*/ for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Increase pid list to accommodate the new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; if (HYDT_bsci_info.debug) { HYDU_dump(stdout, "Launch arguments: "); HYDU_print_strlist(targs); } HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYD_pmcd_pmi_parse_pmi_cmd(char *obuf, int pmi_version, char **pmi_cmd, char *args[]) { char *tbuf = NULL, *seg, *str1 = NULL, *cmd; char *buf; char *tmp[HYD_NUM_TMP_STRINGS], *targs[HYD_NUM_TMP_STRINGS]; const char *delim; int i, j, k; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* Make a copy of the original buffer */ buf = HYDU_strdup(obuf); if (buf[strlen(obuf) - 1] == '\n') buf[strlen(obuf) - 1] = '\0'; if (pmi_version == 1) { if (!strncmp(buf, "cmd=", strlen("cmd="))) delim = " "; else delim = "\n"; /* Here we only get PMI-1 commands or backward compatible * PMI-2 commands, so we always explicitly use the PMI-1 * delimiter. This allows us to get backward-compatible PMI-2 * commands interleaved with regular PMI-2 commands. */ tbuf = HYDU_strdup(buf); cmd = strtok(tbuf, delim); for (i = 0; i < HYD_NUM_TMP_STRINGS; i++) { targs[i] = strtok(NULL, delim); if (targs[i] == NULL) break; } /* Make a pass through targs and merge space separated * arguments which are actually part of the same key */ k = 0; for (i = 0; targs[i]; i++) { if (!strrchr(targs[i], ' ')) { /* no spaces */ args[k++] = HYDU_strdup(targs[i]); } else { /* space in the argument; each segment is either a new * key, or a space-separated part of the previous * key */ j = 0; seg = strtok(targs[i], " "); while (1) { if (!seg || strrchr(seg, '=')) { /* segment has an '='; it's a start of a new key */ if (j) { tmp[j++] = NULL; status = HYDU_str_alloc_and_join(tmp, &args[k++]); HYDU_ERR_POP(status, "error while joining strings\n"); HYDU_free_strlist(tmp); } j = 0; if (!seg) break; } else { /* no '='; part of the previous key */ tmp[j++] = HYDU_strdup(" "); } tmp[j++] = HYDU_strdup(seg); seg = strtok(NULL, " "); } } } args[k++] = NULL; } else { /* PMI-v2 */ delim = ";"; tbuf = HYDU_strdup(buf); cmd = strtok(tbuf, delim); for (i = 0; i < HYD_NUM_TMP_STRINGS; i++) { args[i] = strtok(NULL, delim); if (args[i] == NULL) break; args[i] = HYDU_strdup(args[i]); } } /* Search for the PMI command in 
our table */ status = HYDU_strsplit(cmd, &str1, pmi_cmd, '='); HYDU_ERR_POP(status, "string split returned error\n"); fn_exit: HYDU_FREE(buf); if (tbuf) HYDU_FREE(tbuf); if (str1) HYDU_FREE(str1); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status bcast_keyvals(int fd, int pid) { int keyval_count, arg_count, i, j; char **tmp = NULL, *cmd; struct HYD_pmcd_pmi_kvs_pair *run; struct HYD_proxy *proxy, *tproxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; /* find the number of keyvals */ keyval_count = 0; for (run = pg_scratch->kvs->key_pair; run; run = run->next) keyval_count++; keyval_count -= pg_scratch->keyval_dist_count; /* Each keyval has the following four items: 'key' '=' 'val' * '<space>'. Two additional items for the command at the start * and the NULL at the end. */ HYDU_MALLOC_OR_JUMP(tmp, char **, (4 * keyval_count + 3) * sizeof(char *), status); /* send all available keyvals downstream */ if (keyval_count) { arg_count = 1; i = 0; tmp[i++] = MPL_strdup("cmd=keyval_cache "); for (run = pg_scratch->kvs->key_pair, j = 0; run; run = run->next, j++) { if (j < pg_scratch->keyval_dist_count) continue; tmp[i++] = MPL_strdup(run->key); tmp[i++] = MPL_strdup("="); tmp[i++] = MPL_strdup(run->val); tmp[i++] = MPL_strdup(" "); arg_count++; if (arg_count >= MAX_PMI_INTERNAL_ARGS) { tmp[i++] = MPL_strdup("\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); pg_scratch->keyval_dist_count += (arg_count - 1); for (tproxy = proxy->pg->proxy_list; tproxy; tproxy = tproxy->next) { status = cmd_response(tproxy->control_fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); } MPL_free(cmd); i = 0; tmp[i++] = MPL_strdup("cmd=keyval_cache "); arg_count = 1; } } tmp[i++] = MPL_strdup("\n"); tmp[i++] = NULL; if (arg_count > 1) { status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); pg_scratch->keyval_dist_count += (arg_count - 1); for (tproxy = proxy->pg->proxy_list; tproxy; tproxy = 
tproxy->next) { status = cmd_response(tproxy->control_fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); } MPL_free(cmd); } HYDU_free_strlist(tmp); } fn_exit: if (tmp) MPL_free(tmp); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status group_to_nodes(char *str) { char *nodes, *tnodes, *tmp, *start_str, *end_str, **set; int start, end, i, j, k = 0; HYD_status status = HYD_SUCCESS; for (tmp = str; *tmp != '[' && *tmp != 0; tmp++); if (*tmp == 0) { /* only one node in the group */ status = HYDU_add_to_node_list(str, tasks_per_node[k++], &global_node_list); HYDU_ERR_POP(status, "unable to add to node list\n"); goto fn_exit; } /* more than one node in the group */ *tmp = 0; nodes = tmp + 1; for (tmp = nodes; *tmp != ']' && *tmp != 0; tmp++); *tmp = 0; /* remove the closing ']' */ /* Find the number of sets */ tnodes = MPL_strdup(nodes); tmp = strtok(tnodes, ","); for (i = 1; tmp; i++) tmp = strtok(NULL, ","); HYDU_MALLOC_OR_JUMP(set, char **, i * sizeof(char *), status); /* Find the actual node sets */ set[0] = strtok(nodes, ","); for (i = 1; set[i - 1]; i++) set[i] = strtok(NULL, ","); for (i = 0; set[i]; i++) { start_str = strtok(set[i], "-"); if ((end_str = strtok(NULL, "-")) == NULL) end_str = start_str; start = atoi(start_str); end = atoi(end_str); for (j = start; j <= end; j++) { char *node_str[HYD_NUM_TMP_STRINGS]; node_str[0] = MPL_strdup(str); node_str[1] = HYDU_int_to_str_pad(j, strlen(start_str)); node_str[2] = NULL; status = HYDU_str_alloc_and_join(node_str, &tmp); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(node_str); status = HYDU_add_to_node_list(tmp, tasks_per_node[k++], &global_node_list); HYDU_ERR_POP(status, "unable to add to node list\n"); } } fn_exit: return status; fn_fail: goto fn_exit; }
static HYD_status resolve_pattern_string(const char *pattern, char **str, int pgid, int proxy_id, int rank) { HYD_status status = HYD_SUCCESS; int i, pos, tpos; char *tmp[HYD_NUM_TMP_STRINGS] = { NULL }; struct HYD_pg *pg; struct HYD_proxy *proxy; HYDU_FUNC_ENTER(); *str = NULL; tpos = 0; pos = 0; i = 0; HYDU_MALLOC_OR_JUMP(tmp[i], char *, HYD_TMP_STRLEN, status); tmp[i][0] = '\0'; while (1) { HYDU_ASSERT(tpos < HYD_TMP_STRLEN, status); if (pattern[pos] != '%') { tmp[i][tpos++] = pattern[pos++]; if (pattern[pos - 1] == '\0') break; } else { ++pos; /* consume '%' */ if (pattern[pos] == '%') { tmp[i][tpos++] = pattern[pos++]; continue; } /* all remaining valid specifiers need a new temp string */ tmp[i][tpos] = '\0'; ++i; tpos = 0; HYDU_MALLOC_OR_JUMP(tmp[i], char *, HYD_TMP_STRLEN, status); tmp[i][0] = '\0'; switch (pattern[pos]) { case 'r': MPL_snprintf(tmp[i], HYD_TMP_STRLEN, "%d", rank); break; case 'g': MPL_snprintf(tmp[i], HYD_TMP_STRLEN, "%d", pgid); break; case 'p': MPL_snprintf(tmp[i], HYD_TMP_STRLEN, "%d", proxy_id); break; case 'h': for (pg = &HYD_server_info.pg_list; pg; pg = pg->next) if (pg->pgid == pgid) break; HYDU_ASSERT(pg, status); for (proxy = pg->proxy_list; proxy; proxy = proxy->next) if (proxy->proxy_id == proxy_id) break; HYDU_ASSERT(proxy, status); MPL_snprintf(tmp[i], HYD_TMP_STRLEN, "%s", proxy->node->hostname); break; case '\0': HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "dangling '%%' at end of pattern\n"); break; default: HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "unrecognized pattern specifier ('%c')\n", pattern[pos]); break; } ++pos; /* skip past fmt specifier */ ++i; tpos = 0; HYDU_MALLOC_OR_JUMP(tmp[i], char *, HYD_TMP_STRLEN, status); tmp[i][0] = '\0'; } } tmp[++i] = NULL; status = HYDU_str_alloc_and_join(tmp, str); HYDU_ERR_POP(status, "unable to join strings\n"); fn_exit: HYDU_free_strlist(tmp); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }