/*
 * Release every string stored in a NULL-terminated array of strings.
 *
 * strlist: array whose entries are handed to HYDU_FREE one by one until
 *          the first NULL entry is reached.  The array itself is not freed.
 */
void HYDU_free_strlist(char **strlist)
{
    char **entry;

    HYDU_FUNC_ENTER();

    /* Walk the array up to (not including) the terminating NULL */
    entry = strlist;
    while (*entry) {
        HYDU_FREE(*entry);
        entry++;
    }

    HYDU_FUNC_EXIT();
}
HYD_status HYDU_list_inherited_env(struct HYD_env **env_list) { char *env_str = NULL, *env_name; int i, ret; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); *env_list = NULL; i = 0; while (environ[i]) { env_str = HYDU_strdup(environ[i]); env_name = strtok(env_str, "="); status = HYDT_bsci_query_env_inherit(env_name, &ret); HYDU_ERR_POP(status, "error querying environment propagation\n"); HYDU_FREE(env_str); env_str = NULL; if (!ret) { i++; continue; } status = HYDU_append_env_str_to_list(environ[i], env_list); HYDU_ERR_POP(status, "unable to add env to list\n"); i++; } fn_exit: if (env_str) HYDU_FREE(env_str); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_name_lookup(int fd, int pid, int pgid, char *args[]) { struct HYD_string_stash stash; char *cmd, *thrid, *name, *value; int token_count; struct HYD_pmcd_token *tokens = NULL; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); if ((name = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "name")) == NULL) HYDU_ERR_POP(status, "cannot find token: name\n"); status = HYD_pmcd_pmi_lookup(name, &value); HYDU_ERR_POP(status, "error while looking up service\n"); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=name-lookup-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } if (value) { HYD_STRING_STASH(stash, HYDU_strdup("port="), status); HYD_STRING_STASH(stash, HYDU_strdup(value), status); HYD_STRING_STASH(stash, HYDU_strdup(";found=TRUE;rc=0;"), status); } else { HYD_STRING_STASH(stash, HYDU_strdup("found=FALSE;rc=1;"), status); } HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: if (tokens) HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * Free a singly linked list of nodes.
 *
 * For each node the hostname, user and local_binding strings are released
 * when they are set, followed by the node structure itself.
 *
 * node_list: head of the list; may be NULL, in which case nothing is done.
 */
void HYDU_free_node_list(struct HYD_node *node_list)
{
    struct HYD_node *cur, *following;

    for (cur = node_list; cur != NULL; cur = following) {
        /* Remember the successor before the current node is released */
        following = cur->next;

        if (cur->hostname) {
            HYDU_FREE(cur->hostname);
        }
        if (cur->user) {
            HYDU_FREE(cur->user);
        }
        if (cur->local_binding) {
            HYDU_FREE(cur->local_binding);
        }

        HYDU_FREE(cur);
    }
}
static HYD_status fn_name_unpublish(int fd, int pid, int pgid, char *args[]) { struct HYD_string_stash stash; char *cmd, *thrid, *name; int token_count, success; struct HYD_pmcd_token *tokens = NULL; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); if ((name = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "name")) == NULL) HYDU_ERR_POP(status, "cannot find token: name\n"); status = HYD_pmcd_pmi_unpublish(name, &success); HYDU_ERR_POP(status, "error unpublishing service\n"); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=name-unpublish-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } if (success) HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status); else { HYD_STRING_STASH(stash, HYDU_strdup("rc=1;errmsg=service_"), status); HYD_STRING_STASH(stash, HYDU_strdup(name), status); HYD_STRING_STASH(stash, HYDU_strdup("_not_found;"), status); } HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: if (tokens) HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status poke_progress(char *key) { struct HYD_pmcd_pmi_v2_reqs *req, *list_head = NULL, *list_tail = NULL; int i, count; HYD_status status = HYD_SUCCESS; for (count = 0, req = pending_reqs; req; req = req->next) count++; for (i = 0; i < count; i++) { /* Dequeue a request */ req = pending_reqs; if (pending_reqs) { pending_reqs = pending_reqs->next; req->next = NULL; } if (key && strcmp(key, req->key)) { /* If the key doesn't match the request, just queue it back */ if (list_head == NULL) { list_head = req; list_tail = req; } else { list_tail->next = req; req->prev = list_tail; list_tail = req; } } else { status = fn_kvs_get(req->fd, req->pid, req->pgid, req->args); HYDU_ERR_POP(status, "kvs_get returned error\n"); /* Free the dequeued request */ HYDU_free_strlist(req->args); HYDU_FREE(req); } } if (list_head) { list_tail->next = pending_reqs; pending_reqs = list_head; } fn_exit: return status; fn_fail: goto fn_exit; }
/*
 * Release everything held by a global environment descriptor.
 *
 * The system, user and inherited lists are handed to HYDU_env_free_list()
 * when they are set, and the `prop` member is released with HYDU_FREE
 * when it is set.  The structure itself is not freed.
 */
void HYDU_finalize_global_env(struct HYD_env_global *global_env)
{
    /* Environment lists */
    if (global_env->system) {
        HYDU_env_free_list(global_env->system);
    }

    if (global_env->user) {
        HYDU_env_free_list(global_env->user);
    }

    if (global_env->inherited) {
        HYDU_env_free_list(global_env->inherited);
    }

    /* Propagation setting */
    if (global_env->prop) {
        HYDU_FREE(global_env->prop);
    }
}
static HYD_status handle_pmi_cmd(int fd, int pgid, int pid, char *buf, int pmi_version) { char *args[MAX_PMI_ARGS], *cmd = NULL; struct HYD_pmcd_pmi_handle *h; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (pmi_version == 1) HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v1; else HYD_pmcd_pmi_handle = HYD_pmcd_pmi_v2; if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "[pgid: %d] got PMI command: %s\n", pgid, buf); status = HYD_pmcd_pmi_parse_pmi_cmd(buf, pmi_version, &cmd, args); HYDU_ERR_POP(status, "unable to parse PMI command\n"); #if defined ENABLE_PROFILING if (HYD_server_info.enable_profiling) HYD_server_info.num_pmi_calls++; #endif /* ENABLE_PROFILING */ h = HYD_pmcd_pmi_handle; while (h->handler) { if (!strcmp(cmd, h->cmd)) { status = h->handler(fd, pid, pgid, args); HYDU_ERR_POP(status, "PMI handler returned error\n"); break; } h++; } if (!h->handler) { /* We don't understand the command */ HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "Unrecognized PMI command: %s | cleaning up processes\n", cmd); } fn_exit: if (cmd) HYDU_FREE(cmd); HYDU_free_strlist(args); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_lsf_query_node_list(struct HYD_node **node_list) { char *hosts, *hostname, *num_procs_str, *thosts = NULL; int num_procs; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (MPL_env2str("LSB_MCPU_HOSTS", (const char **) &hosts) == 0) hosts = NULL; if (hosts == NULL) { *node_list = NULL; HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "No LSF node list found\n"); } else { hosts = HYDU_strdup(hosts); thosts = hosts; hostname = strtok(hosts, " "); while (1) { if (hostname == NULL) break; /* the even fields in the list should be the number of * cores */ num_procs_str = strtok(NULL, " "); HYDU_ASSERT(num_procs_str, status); num_procs = atoi(num_procs_str); status = HYDU_add_to_node_list(hostname, num_procs, node_list); HYDU_ERR_POP(status, "unable to add to node list\n"); hostname = strtok(NULL, " "); } if (thosts) HYDU_FREE(thosts); } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_publish_name(int fd, int pid, int pgid, char *args[]) { char *tmp[HYD_NUM_TMP_STRINGS], *cmd, *val; int i, token_count; struct HYD_pmcd_token *tokens; char *name, *port; int success = 0; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); if ((val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "service")) == NULL) HYDU_ERR_POP(status, "cannot find token: service\n"); name = HYDU_strdup(val); if ((val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "port")) == NULL) HYDU_ERR_POP(status, "cannot find token: port\n"); port = HYDU_strdup(val); status = HYD_pmcd_pmi_publish(name, port, &success); HYDU_ERR_POP(status, "error publishing service\n"); i = 0; if (success) tmp[i++] = HYDU_strdup("cmd=publish_result info=ok rc=0 msg=success\n"); else tmp[i++] = HYDU_strdup("cmd=publish_result info=ok rc=1 msg=key_already_present\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_lookup_name(int fd, int pid, int pgid, char *args[]) { char *tmp[HYD_NUM_TMP_STRINGS], *cmd, *name, *value; int i, token_count; struct HYD_pmcd_token *tokens; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); if ((name = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "service")) == NULL) HYDU_ERR_POP(status, "cannot find token: service\n"); status = HYD_pmcd_pmi_lookup(name, &value); HYDU_ERR_POP(status, "error while looking up service\n"); i = 0; tmp[i++] = HYDU_strdup("cmd=lookup_result info=ok"); if (value) { tmp[i++] = HYDU_strdup("value="); tmp[i++] = HYDU_strdup(value); tmp[i++] = HYDU_strdup(" rc=0 msg=success\n"); } else { tmp[i++] = HYDU_strdup(" rc=1 msg=service_not_found\n"); } tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * Free a singly linked list of process groups.
 *
 * For each group the proxy list and the user node list are released
 * through their dedicated helpers when they are set, followed by the
 * group structure itself.
 *
 * pg_list: head of the list; may be NULL, in which case nothing is done.
 */
void HYDU_free_pg_list(struct HYD_pg *pg_list)
{
    struct HYD_pg *cur, *following;

    for (cur = pg_list; cur != NULL; cur = following) {
        /* Remember the successor before the current group is released */
        following = cur->next;

        if (cur->proxy_list) {
            HYDU_free_proxy_list(cur->proxy_list);
        }
        if (cur->user_node_list) {
            HYDU_free_node_list(cur->user_node_list);
        }

        HYDU_FREE(cur);
    }
}
/*
 * Option handler for the control port setting.
 *
 * The value at **argv is expected in "<server name>:<port>" form.  The
 * name is duplicated into HYD_pmcd_pmip.upstream.server_name, the port is
 * converted with atoi() into HYD_pmcd_pmip.upstream.server_port, and
 * *argv is advanced past the consumed value.
 *
 * arg:  not used by this handler.
 * argv: cursor into the argument vector; advanced by one on success.
 *
 * Returns HYD_SUCCESS, or HYD_INTERNAL_ERROR when the server name has
 * already been set or the value does not contain both a name and a port.
 */
static HYD_status control_port_fn(char *arg, char ***argv)
{
    char *port = NULL;
    char *name_str, *port_str;
    HYD_status status = HYD_SUCCESS;

    HYDU_ERR_CHKANDJUMP(status, HYD_pmcd_pmip.upstream.server_name, HYD_INTERNAL_ERROR,
                        "duplicate control port setting\n");

    /* strtok() writes into its input, so split a private copy */
    port = HYDU_strdup(**argv);

    /* strtok() returns NULL when a field is absent; reject the value
     * instead of passing NULL on to HYDU_strdup()/atoi() */
    name_str = strtok(port, ":");
    HYDU_ERR_CHKANDJUMP(status, name_str == NULL, HYD_INTERNAL_ERROR,
                        "invalid control port setting\n");

    port_str = strtok(NULL, ":");
    HYDU_ERR_CHKANDJUMP(status, port_str == NULL, HYD_INTERNAL_ERROR,
                        "invalid control port setting\n");

    HYD_pmcd_pmip.upstream.server_name = HYDU_strdup(name_str);
    HYD_pmcd_pmip.upstream.server_port = atoi(port_str);

    (*argv)++;

  fn_exit:
    if (port)
        HYDU_FREE(port);
    return status;

  fn_fail:
    goto fn_exit;
}
HYD_status HYD_pmcd_pmi_add_kvs(const char *key, char *val, struct HYD_pmcd_pmi_kvs *kvs, int *ret) { struct HYD_pmcd_pmi_kvs_pair *key_pair, *run, *last; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYDU_MALLOC(key_pair, struct HYD_pmcd_pmi_kvs_pair *, sizeof(struct HYD_pmcd_pmi_kvs_pair), status); MPL_snprintf(key_pair->key, PMI_MAXKEYLEN, "%s", key); MPL_snprintf(key_pair->val, PMI_MAXVALLEN, "%s", val); key_pair->next = NULL; *ret = 0; if (kvs->key_pair == NULL) { kvs->key_pair = key_pair; } else { for (run = kvs->key_pair; run; run = run->next) { if (!strcmp(run->key, key_pair->key)) { /* duplicate key found */ *ret = -1; goto fn_fail; } last = run; } /* Add key_pair to end of list. */ last->next = key_pair; } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: HYDU_FREE(key_pair); goto fn_exit; }
HYD_status HYDT_topo_finalize(void) { HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* Finalize the topology library requested by the user */ #if defined HAVE_HWLOC if (!strcmp(HYDT_topo_info.topolib, "hwloc")) { status = HYDT_topo_hwloc_finalize(); HYDU_ERR_POP(status, "unable to finalize hwloc\n"); } #endif /* HAVE_HWLOC */ if (HYDT_topo_info.topolib) HYDU_FREE(HYDT_topo_info.topolib); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_slurm_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int num_hosts, idx, i; int *pid, *fd_list; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("srun"); if (!path) path = HYDU_strdup("/usr/bin/srun"); idx = 0; targs[idx++] = HYDU_strdup(path); if (strcmp(HYDT_bsci_info.rmk, "slurm")) { targs[idx++] = HYDU_strdup("--nodelist"); status = proxy_list_to_node_str(proxy_list, &node_list_str); HYDU_ERR_POP(status, "unable to build a node list string\n"); targs[idx++] = HYDU_strdup(node_list_str); } num_hosts = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) num_hosts++; targs[idx++] = HYDU_strdup("-N"); targs[idx++] = HYDU_int_to_str(num_hosts); targs[idx++] = HYDU_strdup("-n"); targs[idx++] = HYDU_int_to_str(num_hosts); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ /* We do not need to create a quoted version of the string for * SLURM. It seems to be internally quoting it anyway. 
*/ for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Increase pid list to accommodate the new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; if (HYDT_bsci_info.debug) { HYDU_dump(stdout, "Launch arguments: "); HYDU_print_strlist(targs); } HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_info_getjobattr(int fd, int pid, int pgid, char *args[]) { struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pmcd_pmi_kvs_pair *run; const char *key; char *thrid, *val, *cmd; struct HYD_string_stash stash; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find key token\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; val = NULL; if (!strcmp(key, "PMI_dead_processes")) val = pg_scratch->dead_processes; /* Try to find the key */ for (run = pg_scratch->kvs->key_pair; run; run = run->next) { if (!strcmp(run->key, key)) { val = run->val; break; } } HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=info-getjobattr-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } HYD_STRING_STASH(stash, HYDU_strdup("found="), status); if (val) { HYD_STRING_STASH(stash, HYDU_strdup("TRUE;value="), status); HYD_STRING_STASH(stash, HYDU_strdup(val), status); HYD_STRING_STASH(stash, HYDU_strdup(";rc=0;"), status); } else { HYD_STRING_STASH(stash, HYDU_strdup("FALSE;rc=0;"), status); } HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_kvs_put(int fd, int pid, int pgid, char *args[]) { struct HYD_string_stash stash; char *key, *val, *thrid, *cmd; int ret; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pmcd_token *tokens; int token_count; struct HYD_pmcd_pmi_v2_reqs *req; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find key token\n"); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "value"); if (val == NULL) { /* the user sent an empty string */ val = HYDU_strdup(""); } thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; status = HYD_pmcd_pmi_add_kvs(key, val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to put data into kvs\n"); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=kvs-put-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } HYD_STRING_STASH(stash, HYDU_strdup("rc="), status); HYD_STRING_STASH(stash, HYDU_int_to_str(ret), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); for (req = pending_reqs; req; req = req->next) { if (!strcmp(req->key, key)) { /* Poke the progress engine before exiting */ status = poke_progress(key); HYDU_ERR_POP(status, "poke progress error\n"); break; } } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; 
}
HYD_status HYDT_bscd_ll_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int idx, i, total_procs, node_count; int *pid, *fd_list, exec_idx; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg, quoted_exec_string[HYD_TMP_STRLEN]; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("poe"); if (!path) path = HYDU_strdup("/usr/bin/poe"); idx = 0; targs[idx++] = HYDU_strdup(path); if (!strcmp(HYDT_bsci_info.rmk, "ll")) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "ll does not support user-defined host lists\n"); } /* Check how many nodes are being passed for the launch */ status = HYDTI_bscd_ll_query_node_count(&total_procs); HYDU_ERR_POP(status, "unable to query for the node count\n"); node_count = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) node_count++; if (total_procs != node_count) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "processes to be launched have to cover all nodes\n"); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ exec_idx = idx; for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Create a quoted version of the exec string, which is only used * when the executable is not launched directly, but through an * actual launcher */ HYDU_snprintf(quoted_exec_string, HYD_TMP_STRLEN, "\"%s\"", targs[exec_idx]); HYDU_FREE(targs[exec_idx]); targs[exec_idx] = quoted_exec_string; /* Increase pid list to accommodate the 
new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_put(int fd, int pid, int pgid, char *args[]) { int i, ret; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; char *kvsname, *key, *val; char *tmp[HYD_NUM_TMP_STRINGS], *cmd; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); kvsname = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "kvsname"); HYDU_ERR_CHKANDJUMP(status, kvsname == NULL, HYD_INTERNAL_ERROR, "unable to find token: kvsname\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find token: key\n"); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "value"); if (val == NULL) { /* the user sent an empty string */ val = HYDU_strdup(""); } proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; if (strcmp(pg_scratch->kvs->kvs_name, kvsname)) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "kvsname (%s) does not match this group's kvs space (%s)\n", kvsname, pg_scratch->kvs->kvs_name); status = HYD_pmcd_pmi_add_kvs(key, val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to kvs\n"); i = 0; tmp[i++] = HYDU_strdup("cmd=put_result rc="); tmp[i++] = HYDU_int_to_str(ret); if (ret == 0) { tmp[i++] = HYDU_strdup(" msg=success"); } else { tmp[i++] = HYDU_strdup(" msg=duplicate_key"); tmp[i++] = HYDU_strdup(key); } tmp[i++] = HYDU_strdup("\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_kvs_get(int fd, int pid, int pgid, char *args[]) { int i, idx, found; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pg *pg; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_kvs_pair *run; char *key, *thrid, *cmd; struct HYD_string_stash stash; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find key token\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; found = 0; for (run = pg_scratch->kvs->key_pair; run; run = run->next) { if (!strcmp(run->key, key)) { found = 1; break; } } if (!found) { pg = proxy->pg; pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch; idx = -1; for (i = 0; i < pg->pg_process_count; i++) if (pg_scratch->ecount[i].fd == fd && pg_scratch->ecount[i].pid == pid) { idx = i; break; } HYDU_ASSERT(idx != -1, status); for (i = 0; i < pg->pg_process_count; i++) { if (pg_scratch->ecount[i].epoch < pg_scratch->ecount[idx].epoch) { /* We haven't reached a barrier yet; queue up request */ status = HYD_pmcd_pmi_v2_queue_req(fd, pid, pgid, args, key, &pending_reqs); HYDU_ERR_POP(status, "unable to queue request\n"); /* We are done */ goto fn_exit; } } } HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=kvs-get-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } if (found) { HYD_STRING_STASH(stash, HYDU_strdup("found=TRUE;value="), status); HYD_STRING_STASH(stash, HYDU_strdup(run->val), 
status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } else { HYD_STRING_STASH(stash, HYDU_strdup("found=FALSE;"), status); } HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_kvs_fence(int fd, int pid, int pgid, char *args[]) { struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_string_stash stash; char *cmd, *thrid; struct HYD_pmcd_token *tokens; int token_count, i; static int fence_count = 0; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; /* Try to find the epoch point of this process */ for (i = 0; i < proxy->pg->pg_process_count; i++) if (pg_scratch->ecount[i].fd == fd && pg_scratch->ecount[i].pid == pid) pg_scratch->ecount[i].epoch++; if (i == proxy->pg->pg_process_count) { /* couldn't find the current process; find a NULL entry */ for (i = 0; i < proxy->pg->pg_process_count; i++) if (pg_scratch->ecount[i].fd == HYD_FD_UNSET) break; pg_scratch->ecount[i].fd = fd; pg_scratch->ecount[i].pid = pid; pg_scratch->ecount[i].epoch = 1; } HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=kvs-fence-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fence_count++; if (fence_count % proxy->pg->pg_process_count == 0) { /* Poke the progress engine before exiting */ status = poke_progress(NULL); HYDU_ERR_POP(status, "poke progress error\n"); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[]) { struct HYD_pg *pg; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_proxy *proxy; struct HYD_pmcd_token *tokens; struct HYD_exec *exec_list = NULL, *exec; struct HYD_env *env; struct HYD_node *node; char key[PMI_MAXKEYLEN], *val; int nprocs, preput_num, info_num, ret; char *execname, *path = NULL; struct HYD_pmcd_token_segment *segment_list = NULL; int token_count, i, j, k, new_pgid, total_spawns; int argcnt, num_segments; char *control_port, *proxy_args[HYD_NUM_TMP_STRINGS] = { NULL }; char *tmp[HYD_NUM_TMP_STRINGS]; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); for (i = 0; args[i]; i++) mcmd_args[mcmd_num_args++] = HYDU_strdup(args[i]); mcmd_args[mcmd_num_args] = NULL; status = HYD_pmcd_pmi_args_to_tokens(mcmd_args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); /* Here's the order of things we do: * * 1. Break the token list into multiple segments, each segment * corresponding to a command. Each command represents * information for one executable. * * 2. Allocate a process group for the new set of spawned * processes * * 3. Get all the common keys and deal with them * * 4. Create an executable list based on the segments. * * 5. Create a proxy list using the created executable list and * spawn it. */ /* Break the token list into multiple segments and create an * executable list based on the segments. 
*/ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "totspawns"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: totspawns\n"); total_spawns = atoi(val); HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *, total_spawns * sizeof(struct HYD_pmcd_token_segment), status); segment_tokens(tokens, token_count, segment_list, &num_segments); if (num_segments != total_spawns) { /* We didn't read the entire PMI string; wait for the rest to * arrive */ goto fn_exit; } else { /* Got the entire PMI string; free the arguments and reset */ HYDU_free_strlist(mcmd_args); mcmd_num_args = 0; } /* Allocate a new process group */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); new_pgid = pg->pgid + 1; status = HYDU_alloc_pg(&pg->next, new_pgid); HYDU_ERR_POP(status, "unable to allocate process group\n"); pg = pg->next; proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg->spawner_pg = proxy->pg; for (j = 0; j < total_spawns; j++) { /* For each segment, we create an exec structure */ val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "nprocs"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: nprocs\n"); nprocs = atoi(val); pg->pg_process_count += nprocs; val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "argcnt"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: argcnt\n"); argcnt = atoi(val); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "info_num"); if (val) info_num = atoi(val); else info_num = 0; if (exec_list == NULL) { status = HYDU_alloc_exec(&exec_list); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec_list->appnum = 0; exec = exec_list; } else { for (exec = exec_list; exec->next; exec = exec->next); status = HYDU_alloc_exec(&exec->next); HYDU_ERR_POP(status, "unable 
to allocate exec\n"); exec->next->appnum = exec->appnum + 1; exec = exec->next; } /* Info keys */ for (i = 0; i < info_num; i++) { char *info_key, *info_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "info_key_%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "info_val_%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_val = val; if (!strcmp(info_key, "path")) { path = HYDU_strdup(info_val); } else if (!strcmp(info_key, "wdir")) { exec->wdir = HYDU_strdup(info_val); } else if (!strcmp(info_key, "host")) { status = HYDU_process_mfile_token(info_val, 1, &pg->user_node_list); HYDU_ERR_POP(status, "error create node list\n"); } else if (!strcmp(info_key, "hostfile")) { status = HYDU_parse_hostfile(info_val, &pg->user_node_list, HYDU_process_mfile_token); HYDU_ERR_POP(status, "error parsing hostfile\n"); } else { /* Unrecognized info key; ignore */ } } status = HYDU_correct_wdir(&exec->wdir); HYDU_ERR_POP(status, "unable to correct wdir\n"); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "execname"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: execname\n"); if (path == NULL) execname = HYDU_strdup(val); else { i = 0; tmp[i++] = HYDU_strdup(path); tmp[i++] = HYDU_strdup("/"); tmp[i++] = HYDU_strdup(val); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &execname); HYDU_ERR_POP(status, "error while joining strings\n"); HYDU_free_strlist(tmp); } i = 0; exec->exec[i++] = execname; for (k = 0; k < argcnt; k++) { HYDU_snprintf(key, PMI_MAXKEYLEN, "arg%d", k + 1); val = 
HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); exec->exec[i++] = HYDU_strdup(val); } exec->exec[i++] = NULL; exec->proc_count = nprocs; /* It is not clear what kind of environment needs to get * passed to the spawned process. Don't set anything here, and * let the proxy do whatever it does by default. */ exec->env_prop = NULL; status = HYDU_env_create(&env, "PMI_SPAWNED", "1"); HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n"); exec->user_env = env; } status = HYD_pmcd_pmi_alloc_pg_scratch(pg); HYDU_ERR_POP(status, "unable to allocate pg scratch space\n"); if (pg->user_node_list) { pg->pg_core_count = 0; for (i = 0, node = pg->user_node_list; node; node = node->next, i++) { pg->pg_core_count += node->core_count; node->node_id = i; } } else { pg->pg_core_count = HYD_server_info.pg_list.pg_core_count; } pg->pg_process_count = 0; for (exec = exec_list; exec; exec = exec->next) pg->pg_process_count += exec->proc_count; pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch; /* Get the common keys and deal with them */ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preput_num"); if (val) preput_num = atoi(val); else preput_num = 0; for (i = 0; i < preput_num; i++) { char *preput_key, *preput_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_key_%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "preput_val_%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_val = val; status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to 
kvs\n"); } /* Create the proxy list */ if (pg->user_node_list) { status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } else { status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } HYDU_free_exec_list(exec_list); status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface, HYD_server_info.local_hostname, HYD_server_info.port_range, &control_port, HYD_pmcd_pmiserv_control_listen_cb, (void *) (size_t) new_pgid); HYDU_ERR_POP(status, "unable to create PMI port\n"); if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "Got a control port string of %s\n", control_port); /* Go to the last PG */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); status = HYD_pmcd_pmi_fill_in_proxy_args(proxy_args, control_port, new_pgid); HYDU_ERR_POP(status, "unable to fill in proxy arguments\n"); HYDU_FREE(control_port); status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg); HYDU_ERR_POP(status, "unable to fill in executable arguments\n"); status = HYDT_bsci_launch_procs(proxy_args, pg->proxy_list, NULL); HYDU_ERR_POP(status, "launcher cannot launch processes\n"); { char *cmd_str[HYD_NUM_TMP_STRINGS], *cmd; i = 0; cmd_str[i++] = HYDU_strdup("cmd=spawn_result rc=0"); cmd_str[i++] = HYDU_strdup("\n"); cmd_str[i++] = NULL; status = HYDU_str_alloc_and_join(cmd_str, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(cmd_str); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_free_strlist(proxy_args); if (segment_list) HYDU_FREE(segment_list); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_get(int fd, int pid, int pgid, char *args[]) { int i; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pmcd_pmi_kvs_pair *run; char *kvsname, *key, *val; char *tmp[HYD_NUM_TMP_STRINGS], *cmd; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); kvsname = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "kvsname"); HYDU_ERR_CHKANDJUMP(status, kvsname == NULL, HYD_INTERNAL_ERROR, "unable to find token: kvsname\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find token: key\n"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; val = NULL; if (!strcmp(key, "PMI_dead_processes")) { val = pg_scratch->dead_processes; goto found_val; } if (strcmp(pg_scratch->kvs->kvs_name, kvsname)) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "kvsname (%s) does not match this group's kvs space (%s)\n", kvsname, pg_scratch->kvs->kvs_name); /* Try to find the key */ for (run = pg_scratch->kvs->key_pair; run; run = run->next) { if (!strcmp(run->key, key)) { val = run->val; break; } } found_val: i = 0; tmp[i++] = HYDU_strdup("cmd=get_result rc="); if (val) { tmp[i++] = HYDU_strdup("0 msg=success value="); tmp[i++] = HYDU_strdup(val); } else { tmp[i++] = HYDU_strdup("-1 msg=key_"); tmp[i++] = HYDU_strdup(key); tmp[i++] = HYDU_strdup("_not_found value=unknown"); } tmp[i++] = HYDU_strdup("\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); 
HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYD_pmcd_pmi_parse_pmi_cmd(char *obuf, int pmi_version, char **pmi_cmd, char *args[]) { char *tbuf = NULL, *seg, *str1 = NULL, *cmd; char *buf; char *tmp[HYD_NUM_TMP_STRINGS], *targs[HYD_NUM_TMP_STRINGS]; const char *delim; int i, j, k; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* Make a copy of the original buffer */ buf = HYDU_strdup(obuf); if (buf[strlen(obuf) - 1] == '\n') buf[strlen(obuf) - 1] = '\0'; if (pmi_version == 1) { if (!strncmp(buf, "cmd=", strlen("cmd="))) delim = " "; else delim = "\n"; /* Here we only get PMI-1 commands or backward compatible * PMI-2 commands, so we always explicitly use the PMI-1 * delimiter. This allows us to get backward-compatible PMI-2 * commands interleaved with regular PMI-2 commands. */ tbuf = HYDU_strdup(buf); cmd = strtok(tbuf, delim); for (i = 0; i < HYD_NUM_TMP_STRINGS; i++) { targs[i] = strtok(NULL, delim); if (targs[i] == NULL) break; } /* Make a pass through targs and merge space separated * arguments which are actually part of the same key */ k = 0; for (i = 0; targs[i]; i++) { if (!strrchr(targs[i], ' ')) { /* no spaces */ args[k++] = HYDU_strdup(targs[i]); } else { /* space in the argument; each segment is either a new * key, or a space-separated part of the previous * key */ j = 0; seg = strtok(targs[i], " "); while (1) { if (!seg || strrchr(seg, '=')) { /* segment has an '='; it's a start of a new key */ if (j) { tmp[j++] = NULL; status = HYDU_str_alloc_and_join(tmp, &args[k++]); HYDU_ERR_POP(status, "error while joining strings\n"); HYDU_free_strlist(tmp); } j = 0; if (!seg) break; } else { /* no '='; part of the previous key */ tmp[j++] = HYDU_strdup(" "); } tmp[j++] = HYDU_strdup(seg); seg = strtok(NULL, " "); } } } args[k++] = NULL; } else { /* PMI-v2 */ delim = ";"; tbuf = HYDU_strdup(buf); cmd = strtok(tbuf, delim); for (i = 0; i < HYD_NUM_TMP_STRINGS; i++) { args[i] = strtok(NULL, delim); if (args[i] == NULL) break; args[i] = HYDU_strdup(args[i]); } } /* Search for the PMI command in 
our table */ status = HYDU_strsplit(cmd, &str1, pmi_cmd, '='); HYDU_ERR_POP(status, "string split returned error\n"); fn_exit: HYDU_FREE(buf); if (tbuf) HYDU_FREE(tbuf); if (str1) HYDU_FREE(str1); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscu_wait_for_completion(int timeout) { int pid, ret, count, i, time_elapsed, time_left; struct timeval start, now; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* FIXME: We rely on gettimeofday here. This needs to detect the * timer type available and use that. Probably more of an MPL * functionality than Hydra's. */ gettimeofday(&start, NULL); /* Loop till all sockets have closed */ restart_wait: while (1) { count = 0; for (i = 0; i < HYD_bscu_fd_count; i++) { if (HYD_bscu_fd_list[i] == HYD_FD_CLOSED) continue; ret = HYDT_dmx_query_fd_registration(HYD_bscu_fd_list[i]); if (ret) { /* still registered */ count++; /* We still need to wait */ gettimeofday(&now, NULL); time_elapsed = (now.tv_sec - start.tv_sec); /* Ignore microsec granularity */ if (timeout > 0) { if (time_elapsed > timeout) { status = HYD_TIMED_OUT; goto fn_exit; } else time_left = timeout - time_elapsed; } else time_left = -1; status = HYDT_dmx_wait_for_event(time_left); HYDU_ERR_POP(status, "error waiting for event\n"); /* Check if any processes terminated badly; if they * did, return an error. 
*/ pid = waitpid(-1, &ret, WNOHANG); if (pid > 0) { /* Find the pid and mark it as complete */ for (i = 0; i < HYD_bscu_pid_count; i++) if (HYD_bscu_pid_list[i] == pid) { HYD_bscu_pid_list[i] = -1; break; } if (ret) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "one of the processes terminated badly; aborting\n"); } } goto restart_wait; } else HYD_bscu_fd_list[i] = HYD_FD_CLOSED; } if (count == 0) break; } /* Loop till all processes have completed */ while (1) { count = 0; for (i = 0; i < HYD_bscu_pid_count; i++) if (HYD_bscu_pid_list[i] != -1) count++; /* If there are no processes to wait, we are done */ if (count == 0) break; pid = waitpid(-1, &ret, WNOHANG); if (pid > 0) { /* Find the pid and mark it as complete */ for (i = 0; i < HYD_bscu_pid_count; i++) if (HYD_bscu_pid_list[i] == pid) { HYD_bscu_pid_list[i] = -1; break; } } } if (HYD_bscu_pid_list) { HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = NULL; HYD_bscu_pid_count = 0; } if (HYD_bscu_fd_list) { HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = NULL; HYD_bscu_fd_count = 0; } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * cleanup_params - release everything hanging off the global
 * HYD_pmcd_pmip state: user-global settings, the dynamically allocated
 * members of the system_global, upstream, downstream and local
 * sub-structures, the KVS list, the exec list, and finally the topology
 * layer.  Members that are NULL are skipped.
 */
static void cleanup_params(void)
{
    int idx;

    HYDU_finalize_user_global(&HYD_pmcd_pmip.user_global);

    /* System global */
    if (HYD_pmcd_pmip.system_global.pmi_fd) {
        HYDU_FREE(HYD_pmcd_pmip.system_global.pmi_fd);
    }
    if (HYD_pmcd_pmip.system_global.pmi_process_mapping) {
        HYDU_FREE(HYD_pmcd_pmip.system_global.pmi_process_mapping);
    }

    /* Upstream */
    if (HYD_pmcd_pmip.upstream.server_name) {
        HYDU_FREE(HYD_pmcd_pmip.upstream.server_name);
    }

    /* Downstream */
    if (HYD_pmcd_pmip.downstream.out) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.out);
    }
    if (HYD_pmcd_pmip.downstream.err) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.err);
    }
    if (HYD_pmcd_pmip.downstream.pid) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.pid);
    }
    if (HYD_pmcd_pmip.downstream.exit_status) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.exit_status);
    }
    if (HYD_pmcd_pmip.downstream.pmi_rank) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.pmi_rank);
    }
    if (HYD_pmcd_pmip.downstream.pmi_fd) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.pmi_fd);
    }
    if (HYD_pmcd_pmip.downstream.pmi_fd_active) {
        HYDU_FREE(HYD_pmcd_pmip.downstream.pmi_fd_active);
    }

    /* Local */
    if (HYD_pmcd_pmip.local.iface_ip_env_name) {
        HYDU_FREE(HYD_pmcd_pmip.local.iface_ip_env_name);
    }
    if (HYD_pmcd_pmip.local.hostname) {
        HYDU_FREE(HYD_pmcd_pmip.local.hostname);
    }
    if (HYD_pmcd_pmip.local.spawner_kvs_name) {
        HYDU_FREE(HYD_pmcd_pmip.local.spawner_kvs_name);
    }

    /* The checkpoint prefix list is NULL-terminated: free every entry,
     * then the array itself */
    if (HYD_pmcd_pmip.local.ckpoint_prefix_list) {
        idx = 0;
        while (HYD_pmcd_pmip.local.ckpoint_prefix_list[idx]) {
            HYDU_FREE(HYD_pmcd_pmip.local.ckpoint_prefix_list[idx]);
            idx++;
        }
        HYDU_FREE(HYD_pmcd_pmip.local.ckpoint_prefix_list);
    }

    HYD_pmcd_free_pmi_kvs_list(HYD_pmcd_pmip.local.kvs);

    /* Exec list */
    HYDU_free_exec_list(HYD_pmcd_pmip.exec_list);

    HYDT_topo_finalize();
}
static HYD_status fn_spawn(int fd, int pid, int pgid, char *args[]) { struct HYD_pg *pg; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_proxy *proxy; struct HYD_pmcd_token *tokens; struct HYD_exec *exec_list = NULL, *exec; struct HYD_env *env; struct HYD_node *node; char *thrid; char key[PMI_MAXKEYLEN], *val; int maxprocs, preputcount, infokeycount, ret; int ncmds; char *execname, *path = NULL; struct HYD_pmcd_token_segment *segment_list = NULL; int token_count, i, j, k, new_pgid; int argcnt, num_segments; struct HYD_string_stash proxy_stash; char *control_port; struct HYD_string_stash stash; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); /* Here's the order of things we do: * * 1. Break the token list into multiple segments, each segment * corresponding to a command. Each command represents * information for one executable. * * 2. Allocate a process group for the new set of spawned * processes * * 3. Get all the common keys and deal with them * * 4. Create an executable list based on the segments. * * 5. Create a proxy list using the created executable list and * spawn it. 
*/ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "ncmds"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: ncmds\n"); ncmds = atoi(val); HYDU_MALLOC(segment_list, struct HYD_pmcd_token_segment *, (ncmds + 1) * sizeof(struct HYD_pmcd_token_segment), status); segment_tokens(tokens, token_count, segment_list, &num_segments); HYDU_ASSERT((ncmds + 1) == num_segments, status); /* Allocate a new process group */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); new_pgid = pg->pgid + 1; status = HYDU_alloc_pg(&pg->next, new_pgid); HYDU_ERR_POP(status, "unable to allocate process group\n"); pg = pg->next; proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg->spawner_pg = proxy->pg; for (j = 1; j <= ncmds; j++) { /* For each segment, we create an exec structure */ val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "maxprocs"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: maxprocs\n"); maxprocs = atoi(val); pg->pg_process_count += maxprocs; val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "argc"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: argc\n"); argcnt = atoi(val); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "infokeycount"); if (val) infokeycount = atoi(val); else infokeycount = 0; if (exec_list == NULL) { status = HYDU_alloc_exec(&exec_list); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec_list->appnum = 0; exec = exec_list; } else { for (exec = exec_list; exec->next; exec = exec->next); status = HYDU_alloc_exec(&exec->next); HYDU_ERR_POP(status, "unable to allocate exec\n"); exec->next->appnum = exec->appnum + 1; exec = exec->next; } /* Info keys */ for (i = 0; i < infokeycount; i++) { char *info_key, *info_val; HYDU_snprintf(key, PMI_MAXKEYLEN, 
"infokey%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "infoval%d", i); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); info_val = val; if (!strcmp(info_key, "path")) { path = HYDU_strdup(info_val); } else if (!strcmp(info_key, "wdir")) { exec->wdir = HYDU_strdup(info_val); } else if (!strcmp(info_key, "host") || !strcmp(info_key, "hosts")) { char *host = strtok(info_val, ","); while (host) { status = HYDU_process_mfile_token(host, 1, &pg->user_node_list); HYDU_ERR_POP(status, "error creating node list\n"); host = strtok(NULL, ","); } } else if (!strcmp(info_key, "hostfile")) { status = HYDU_parse_hostfile(info_val, &pg->user_node_list, HYDU_process_mfile_token); HYDU_ERR_POP(status, "error parsing hostfile\n"); } else { /* Unrecognized info key; ignore */ } } status = HYDU_correct_wdir(&exec->wdir); HYDU_ERR_POP(status, "unable to correct wdir\n"); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, "subcmd"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: subcmd\n"); if (path == NULL) execname = HYDU_strdup(val); else { HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup(path), status); HYD_STRING_STASH(stash, HYDU_strdup("/"), status); HYD_STRING_STASH(stash, HYDU_strdup(val), status); HYD_STRING_SPIT(stash, execname, status); } i = 0; exec->exec[i++] = execname; for (k = 0; k < argcnt; k++) { HYDU_snprintf(key, PMI_MAXKEYLEN, "argv%d", k); val = HYD_pmcd_pmi_find_token_keyval(&tokens[segment_list[j].start_idx], segment_list[j].token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, 
HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); exec->exec[i++] = HYDU_strdup(val); } exec->exec[i++] = NULL; exec->proc_count = maxprocs; /* It is not clear what kind of environment needs to get * passed to the spawned process. Don't set anything here, and * let the proxy do whatever it does by default. */ exec->env_prop = NULL; status = HYDU_env_create(&env, "PMI_SPAWNED", "1"); HYDU_ERR_POP(status, "unable to create PMI_SPAWNED environment\n"); exec->user_env = env; } status = HYD_pmcd_pmi_alloc_pg_scratch(pg); HYDU_ERR_POP(status, "unable to allocate pg scratch space\n"); pg->pg_process_count = 0; for (exec = exec_list; exec; exec = exec->next) pg->pg_process_count += exec->proc_count; pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) pg->pg_scratch; /* Get the common keys and deal with them */ val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "preputcount"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: preputcount\n"); preputcount = atoi(val); for (i = 0; i < preputcount; i++) { char *preput_key, *preput_val; HYDU_snprintf(key, PMI_MAXKEYLEN, "ppkey%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_key = val; HYDU_snprintf(key, PMI_MAXKEYLEN, "ppval%d", i); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, key); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find token: %s\n", key); preput_val = val; status = HYD_pmcd_pmi_add_kvs(preput_key, preput_val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to kvs\n"); } /* Create the proxy list */ if (pg->user_node_list) { status = HYDU_create_proxy_list(exec_list, pg->user_node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } else { status = HYDU_create_proxy_list(exec_list, HYD_server_info.node_list, pg); HYDU_ERR_POP(status, "error creating proxy list\n"); } 
HYDU_free_exec_list(exec_list); if (pg->user_node_list) { pg->pg_core_count = 0; for (i = 0, node = pg->user_node_list; node; node = node->next, i++) pg->pg_core_count += node->core_count; } else { pg->pg_core_count = 0; for (proxy = pg->proxy_list; proxy; proxy = proxy->next) pg->pg_core_count += proxy->node->core_count; } status = HYDU_sock_create_and_listen_portstr(HYD_server_info.user_global.iface, HYD_server_info.localhost, HYD_server_info.port_range, &control_port, HYD_pmcd_pmiserv_control_listen_cb, (void *) (size_t) new_pgid); HYDU_ERR_POP(status, "unable to create PMI port\n"); if (HYD_server_info.user_global.debug) HYDU_dump(stdout, "Got a control port string of %s\n", control_port); /* Go to the last PG */ for (pg = &HYD_server_info.pg_list; pg->next; pg = pg->next); status = HYD_pmcd_pmi_fill_in_proxy_args(&proxy_stash, control_port, new_pgid); HYDU_ERR_POP(status, "unable to fill in proxy arguments\n"); HYDU_FREE(control_port); status = HYD_pmcd_pmi_fill_in_exec_launch_info(pg); HYDU_ERR_POP(status, "unable to fill in executable arguments\n"); status = HYDT_bsci_launch_procs(proxy_stash.strlist, pg->proxy_list, NULL); HYDU_ERR_POP(status, "launcher cannot launch processes\n"); { char *cmd; HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=spawn-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status); HYD_STRING_STASH(stash, HYDU_strdup("jobid="), status); HYD_STRING_STASH(stash, HYDU_strdup(pg_scratch->kvs->kvsname), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); HYD_STRING_STASH(stash, HYDU_strdup("nerrs=0;"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); 
HYD_STRING_STASH_FREE(proxy_stash); if (segment_list) HYDU_FREE(segment_list); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }