static HYD_status fn_info_putnodeattr(int fd, char *args[]) { struct HYD_string_stash stash; char *key, *val, *thrid, *cmd; struct HYD_pmcd_token *tokens = NULL; int token_count, ret; struct HYD_pmcd_pmi_v2_reqs *req; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find key token\n"); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "value"); HYDU_ERR_CHKANDJUMP(status, val == NULL, HYD_INTERNAL_ERROR, "unable to find value token\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); status = HYD_pmcd_pmi_add_kvs(key, val, HYD_pmcd_pmip.local.kvs, &ret); HYDU_ERR_POP(status, "unable to put data into kvs\n"); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, MPL_strdup("cmd=info-putnodeattr-response;"), status); if (thrid) { HYD_STRING_STASH(stash, MPL_strdup("thrid="), status); HYD_STRING_STASH(stash, MPL_strdup(thrid), status); HYD_STRING_STASH(stash, MPL_strdup(";"), status); } HYD_STRING_STASH(stash, MPL_strdup("rc="), status); HYD_STRING_STASH(stash, HYDU_int_to_str(ret), status); HYD_STRING_STASH(stash, MPL_strdup(";"), status); HYD_STRING_SPIT(stash, cmd, status); send_cmd_downstream(fd, cmd); MPL_free(cmd); for (req = pending_reqs; req; req = req->next) { if (!strcmp(req->key, key)) { /* Poke the progress engine before exiting */ status = poke_progress(key); HYDU_ERR_POP(status, "poke progress error\n"); break; } } fn_exit: if (tokens) HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * Open a listening socket, register it with the demux engine and produce a
 * "<host>:<port>" string describing it.
 *
 * iface       if non-NULL, the host part is the IP obtained from
 *             HYDU_sock_get_iface_ip(iface)
 * hostname    used as the host part when iface is NULL and this is non-NULL;
 *             otherwise the result of gethostname() is used
 * port_range  passed (as a private copy) to HYDU_sock_listen; may be NULL
 * port_str    out: newly allocated "<host>:<port>" string; only written on
 *             success and owned by the caller
 * callback    demux callback registered for HYD_POLLIN on the listening fd
 * userp       opaque pointer handed to the callback registration
 *
 * Returns HYD_SUCCESS or the status of the first failing step.
 */
HYD_status HYDU_sock_create_and_listen_portstr(char *iface, char *hostname, char *port_range,
                                               char **port_str,
                                               HYD_status(*callback) (int fd, HYD_event_t events,
                                                                      void *userp), void *userp)
{
    int listenfd;
    char *sport = NULL, *real_port_range = NULL, *ip = NULL;
    size_t port_str_len;
    uint16_t port;
    HYD_status status = HYD_SUCCESS;

    /* Listen on a port in the port range.  The callee gets a private copy of
     * the range so the caller's string is never touched. */
    port = 0;
    if (port_range) {
        real_port_range = MPL_strdup(port_range);
        HYDU_ERR_CHKANDJUMP(status, real_port_range == NULL, HYD_INTERNAL_ERROR,
                            "unable to duplicate port range\n");
    }
    status = HYDU_sock_listen(&listenfd, real_port_range, &port);
    HYDU_ERR_POP(status, "unable to listen on port\n");

    /* Register the listening socket with the demux engine */
    status = HYDT_dmx_register_fd(1, &listenfd, HYD_POLLIN, userp, callback);
    HYDU_ERR_POP(status, "unable to register fd\n");

    /* Create a port string for MPI processes to use to connect to */
    if (iface) {
        status = HYDU_sock_get_iface_ip(iface, &ip);
        HYDU_ERR_POP(status, "unable to get network interface IP\n");
    } else if (hostname) {
        ip = MPL_strdup(hostname);
    } else {
        char localhost[MAX_HOSTNAME_LEN] = { 0 };

        if (gethostname(localhost, MAX_HOSTNAME_LEN) < 0) {
            HYDU_ERR_SETANDJUMP(status, HYD_SOCK_ERROR, "unable to get local hostname\n");
        }
        /* gethostname() need not NUL-terminate a truncated name */
        localhost[MAX_HOSTNAME_LEN - 1] = '\0';
        ip = MPL_strdup(localhost);
    }
    /* strlen() below must not see a NULL pointer */
    HYDU_ERR_CHKANDJUMP(status, ip == NULL, HYD_INTERNAL_ERROR,
                        "unable to determine host part of port string\n");

    sport = HYDU_int_to_str(port);
    HYDU_ERR_CHKANDJUMP(status, sport == NULL, HYD_INTERNAL_ERROR,
                        "unable to convert port to string\n");

    /* host + ':' + port + terminating NUL */
    port_str_len = strlen(ip) + 1 + strlen(sport) + 1;
    HYDU_MALLOC_OR_JUMP(*port_str, char *, port_str_len, status);
    MPL_snprintf(*port_str, port_str_len, "%s:%s", ip, sport);

  fn_exit:
    /* All three temporaries are released on success and failure alike */
    if (sport)
        MPL_free(sport);
    if (real_port_range)
        MPL_free(real_port_range);
    if (ip)
        MPL_free(ip);
    return status;

  fn_fail:
    goto fn_exit;
}
HYD_status HYDT_bscd_ll_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int idx, i, total_procs, node_count; int *pid, *fd_list, exec_idx; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg, quoted_exec_string[HYD_TMP_STRLEN]; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("poe"); if (!path) path = HYDU_strdup("/usr/bin/poe"); idx = 0; targs[idx++] = HYDU_strdup(path); if (!strcmp(HYDT_bsci_info.rmk, "ll")) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "ll does not support user-defined host lists\n"); } /* Check how many nodes are being passed for the launch */ status = HYDTI_bscd_ll_query_node_count(&total_procs); HYDU_ERR_POP(status, "unable to query for the node count\n"); node_count = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) node_count++; if (total_procs != node_count) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "processes to be launched have to cover all nodes\n"); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ exec_idx = idx; for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Create a quoted version of the exec string, which is only used * when the executable is not launched directly, but through an * actual launcher */ HYDU_snprintf(quoted_exec_string, HYD_TMP_STRLEN, "\"%s\"", targs[exec_idx]); HYDU_FREE(targs[exec_idx]); targs[exec_idx] = quoted_exec_string; /* Increase pid list to accommodate the 
new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_put(int fd, int pid, int pgid, char *args[]) { int i, ret; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; char *kvsname, *key, *val; char *tmp[HYD_NUM_TMP_STRINGS], *cmd; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); kvsname = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "kvsname"); HYDU_ERR_CHKANDJUMP(status, kvsname == NULL, HYD_INTERNAL_ERROR, "unable to find token: kvsname\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find token: key\n"); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "value"); if (val == NULL) { /* the user sent an empty string */ val = HYDU_strdup(""); } proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; if (strcmp(pg_scratch->kvs->kvs_name, kvsname)) HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "kvsname (%s) does not match this group's kvs space (%s)\n", kvsname, pg_scratch->kvs->kvs_name); status = HYD_pmcd_pmi_add_kvs(key, val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to kvs\n"); i = 0; tmp[i++] = HYDU_strdup("cmd=put_result rc="); tmp[i++] = HYDU_int_to_str(ret); if (ret == 0) { tmp[i++] = HYDU_strdup(" msg=success"); } else { tmp[i++] = HYDU_strdup(" msg=duplicate_key"); tmp[i++] = HYDU_strdup(key); } tmp[i++] = HYDU_strdup("\n"); tmp[i++] = NULL; status = HYDU_str_alloc_and_join(tmp, &cmd); HYDU_ERR_POP(status, "unable to join strings\n"); HYDU_free_strlist(tmp); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_FREE(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_slurm_launch_procs(char **args, struct HYD_proxy *proxy_list, int *control_fd) { int num_hosts, idx, i; int *pid, *fd_list; char *targs[HYD_NUM_TMP_STRINGS], *node_list_str = NULL; char *path = NULL, *extra_arg_list = NULL, *extra_arg; struct HYD_proxy *proxy; struct HYDT_topo_cpuset_t cpuset; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* We use the following priority order for the executable path: * (1) user-specified; (2) search in path; (3) Hard-coded * location */ if (HYDT_bsci_info.launcher_exec) path = HYDU_strdup(HYDT_bsci_info.launcher_exec); if (!path) path = HYDU_find_full_path("srun"); if (!path) path = HYDU_strdup("/usr/bin/srun"); idx = 0; targs[idx++] = HYDU_strdup(path); if (strcmp(HYDT_bsci_info.rmk, "slurm")) { targs[idx++] = HYDU_strdup("--nodelist"); status = proxy_list_to_node_str(proxy_list, &node_list_str); HYDU_ERR_POP(status, "unable to build a node list string\n"); targs[idx++] = HYDU_strdup(node_list_str); } num_hosts = 0; for (proxy = proxy_list; proxy; proxy = proxy->next) num_hosts++; targs[idx++] = HYDU_strdup("-N"); targs[idx++] = HYDU_int_to_str(num_hosts); targs[idx++] = HYDU_strdup("-n"); targs[idx++] = HYDU_int_to_str(num_hosts); MPL_env2str("HYDRA_LAUNCHER_EXTRA_ARGS", (const char **) &extra_arg_list); if (extra_arg_list) { extra_arg = strtok(extra_arg_list, " "); while (extra_arg) { targs[idx++] = HYDU_strdup(extra_arg); extra_arg = strtok(NULL, " "); } } /* Fill in the remaining arguments */ /* We do not need to create a quoted version of the string for * SLURM. It seems to be internally quoting it anyway. 
*/ for (i = 0; args[i]; i++) targs[idx++] = HYDU_strdup(args[i]); /* Increase pid list to accommodate the new pid */ HYDU_MALLOC(pid, int *, (HYD_bscu_pid_count + 1) * sizeof(int), status); for (i = 0; i < HYD_bscu_pid_count; i++) pid[i] = HYD_bscu_pid_list[i]; HYDU_FREE(HYD_bscu_pid_list); HYD_bscu_pid_list = pid; /* Increase fd list to accommodate these new fds */ HYDU_MALLOC(fd_list, int *, (HYD_bscu_fd_count + 3) * sizeof(int), status); for (i = 0; i < HYD_bscu_fd_count; i++) fd_list[i] = HYD_bscu_fd_list[i]; HYDU_FREE(HYD_bscu_fd_list); HYD_bscu_fd_list = fd_list; /* append proxy ID as -1 */ targs[idx++] = HYDU_int_to_str(-1); targs[idx++] = NULL; if (HYDT_bsci_info.debug) { HYDU_dump(stdout, "Launch arguments: "); HYDU_print_strlist(targs); } HYDT_topo_cpuset_zero(&cpuset); status = HYDU_create_process(targs, NULL, NULL, &fd_stdout, &fd_stderr, &HYD_bscu_pid_list[HYD_bscu_pid_count++], cpuset); HYDU_ERR_POP(status, "create process returned error\n"); HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stdout; HYD_bscu_fd_list[HYD_bscu_fd_count++] = fd_stderr; status = HYDT_dmx_register_fd(1, &fd_stdout, HYD_POLLIN, (void *) (size_t) STDOUT_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); status = HYDT_dmx_register_fd(1, &fd_stderr, HYD_POLLIN, (void *) (size_t) STDERR_FILENO, HYDT_bscu_stdio_cb); HYDU_ERR_POP(status, "demux returned error registering fd\n"); fn_exit: if (node_list_str) HYDU_FREE(node_list_str); HYDU_free_strlist(targs); if (path) HYDU_FREE(path); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_kvs_put(int fd, int pid, int pgid, char *args[]) { struct HYD_string_stash stash; char *key, *val, *thrid, *cmd; int ret; struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pmcd_token *tokens; int token_count; struct HYD_pmcd_pmi_v2_reqs *req; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); key = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "key"); HYDU_ERR_CHKANDJUMP(status, key == NULL, HYD_INTERNAL_ERROR, "unable to find key token\n"); val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "value"); if (val == NULL) { /* the user sent an empty string */ val = HYDU_strdup(""); } thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; status = HYD_pmcd_pmi_add_kvs(key, val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to put data into kvs\n"); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=kvs-put-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } HYD_STRING_STASH(stash, HYDU_strdup("rc="), status); HYD_STRING_STASH(stash, HYDU_int_to_str(ret), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); for (req = pending_reqs; req; req = req->next) { if (!strcmp(req->key, key)) { /* Poke the progress engine before exiting */ status = poke_progress(key); HYDU_ERR_POP(status, "poke progress error\n"); break; } } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; 
}
static HYD_status fn_fullinit(int fd, char *args[]) { int id, i; char *rank_str; struct HYD_string_stash stash; char *cmd; struct HYD_pmcd_token *tokens; int token_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); rank_str = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "pmirank"); HYDU_ERR_CHKANDJUMP(status, rank_str == NULL, HYD_INTERNAL_ERROR, "unable to find pmirank token\n"); id = atoi(rank_str); /* Store the PMI_RANK to fd mapping */ for (i = 0; i < HYD_pmcd_pmip.local.proxy_process_count; i++) { if (HYD_pmcd_pmip.downstream.pmi_rank[i] == id) { HYD_pmcd_pmip.downstream.pmi_fd[i] = fd; HYD_pmcd_pmip.downstream.pmi_fd_active[i] = 1; break; } } HYDU_ASSERT(i < HYD_pmcd_pmip.local.proxy_process_count, status); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, MPL_strdup("cmd=fullinit-response;pmi-version=2;pmi-subversion=0;rank="), status); HYD_STRING_STASH(stash, HYDU_int_to_str(id), status); HYD_STRING_STASH(stash, MPL_strdup(";size="), status); HYD_STRING_STASH(stash, HYDU_int_to_str(HYD_pmcd_pmip.system_global.global_process_count), status); HYD_STRING_STASH(stash, MPL_strdup(";appnum=0"), status); if (HYD_pmcd_pmip.local.spawner_kvsname) { HYD_STRING_STASH(stash, MPL_strdup(";spawner-jobid="), status); HYD_STRING_STASH(stash, MPL_strdup(HYD_pmcd_pmip.local.spawner_kvsname), status); } if (HYD_pmcd_pmip.user_global.debug) { HYD_STRING_STASH(stash, MPL_strdup(";debugged=TRUE;pmiverbose=TRUE"), status); } else { HYD_STRING_STASH(stash, MPL_strdup(";debugged=FALSE;pmiverbose=FALSE"), status); } HYD_STRING_STASH(stash, MPL_strdup(";rc=0;"), status); HYD_STRING_SPIT(stash, cmd, status); send_cmd_downstream(fd, cmd); MPL_free(cmd); fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }