/*
 * Split a private copy of `str` at the first occurrence of `sep`.
 *
 *   str  - string to split; it is duplicated, never modified
 *   str1 - receives the duplicate, cut off just before the separator
 *   str2 - receives a duplicate of everything after the separator, or
 *          NULL when the separator does not occur in `str`
 *   sep  - separator character
 *
 * A NULL `str` takes the error path with HYD_INTERNAL_ERROR and leaves
 * *str1 and *str2 untouched.
 */
HYD_status HYDU_strsplit(char *str, char **str1, char **str2, char sep)
{
    char *cut;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    if (str == NULL) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "");
    }

    *str1 = HYDU_strdup(str);

    /* Walk to the separator or to the terminating NUL, whichever is first. */
    cut = *str1;
    while (*cut != '\0' && *cut != sep)
        cut++;

    if (*cut != '\0') {
        /* Duplicate the tail first, then terminate the head in place. */
        *str2 = HYDU_strdup(cut + 1);
        *cut = '\0';
    } else {
        /* End of the string: nothing follows */
        *str2 = NULL;
    }

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
HYD_status HYDT_topo_hwloc_bind(int idx) { int id; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); /* For processes where the user did not specify a binding unit, no binding is needed. */ if (!HYDT_topo_hwloc_info.user_binding || (idx < HYDT_topo_hwloc_info.num_bitmaps)) { id = idx % HYDT_topo_hwloc_info.num_bitmaps; if (HYDT_topo_info.debug) { /* Print the binding bitmaps for debugging. */ int i; char *binding; HYDU_MALLOC_OR_JUMP(binding, char *, HYDT_topo_hwloc_info.total_num_pus + 1, status); memset(binding, '\0', HYDT_topo_hwloc_info.total_num_pus + 1); for (i = 0; i < HYDT_topo_hwloc_info.total_num_pus; i++) { if (hwloc_bitmap_isset(HYDT_topo_hwloc_info.bitmap[id], i)) *(binding + i) = '1'; else *(binding + i) = '0'; } HYDU_dump_noprefix(stdout, "process %d binding: %s\n", idx, binding); MPL_free(binding); }
/*
 * Render `x` as a freshly allocated decimal string.
 *
 * Returns the new string, or NULL if the allocation fails (str stays
 * NULL on the failure path).  The caller owns the returned buffer.
 */
char *HYDU_size_t_to_str(size_t x)
{
    int len = 1;
    size_t rest;
    char *str = NULL;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Count decimal digits by dividing the value down.  Growing a power
     * of ten upwards instead wraps around for values that have the
     * maximum number of digits a size_t can hold, and the comparison
     * then never terminates. */
    for (rest = x; rest >= 10; rest /= 10)
        len++;

    /* Room for the terminating NUL. */
    len++;

    HYDU_MALLOC(str, char *, len, status);
    HYDU_ERR_POP(status, "unable to allocate memory\n");

    HYDU_snprintf(str, len, "%llu", (unsigned long long) x);

  fn_exit:
    HYDU_FUNC_EXIT();
    return str;

  fn_fail:
    goto fn_exit;
}
/*
 * Concatenate a NULL-terminated array of strings into one new buffer.
 *
 *   strlist - array of strings; the first NULL entry ends the list
 *   strjoin - receives the newly allocated concatenation
 *
 * The buffer is sized to the summed lengths plus one byte for the NUL.
 */
HYD_status HYDU_str_alloc_and_join(char **strlist, char **strjoin)
{
    int total = 0, used = 0, idx;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* First pass: measure. */
    idx = 0;
    while (strlist[idx] != NULL) {
        total += strlen(strlist[idx]);
        idx++;
    }

    HYDU_MALLOC(*strjoin, char *, total + 1, status);

    /* An empty list still yields a valid empty string. */
    (*strjoin)[0] = 0;

    /* Second pass: append each piece at the current write offset. */
    for (idx = 0; strlist[idx] != NULL; idx++) {
        HYDU_snprintf(*strjoin + used, total - used + 1, "%s", strlist[idx]);
        used += strlen(strlist[idx]);
    }

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Format an environment entry as the single-quoted string 'NAME=VALUE'.
 *
 *   env - entry to format; a NULL env_value is rendered as an empty value
 *   str - receives the newly allocated result
 *
 * Returns the status of the join.  The temporary pieces are released on
 * both the success and the failure path.
 */
HYD_status HYDU_env_to_str(struct HYD_env *env, char **str)
{
    int i;
    char *tmp[HYD_NUM_TMP_STRINGS];
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    i = 0;
    tmp[i++] = MPL_strdup("'");
    tmp[i++] = MPL_strdup(env->env_name);
    tmp[i++] = MPL_strdup("=");
    tmp[i++] = env->env_value ? MPL_strdup(env->env_value) : MPL_strdup("");
    tmp[i++] = MPL_strdup("'");
    tmp[i++] = NULL;

    status = HYDU_str_alloc_and_join(tmp, str);
    HYDU_ERR_POP(status, "unable to join strings\n");

  fn_exit:
    /* tmp is fully populated before anything can jump here, so the
     * pieces can be freed unconditionally; doing it here rather than
     * after the join keeps them from leaking when the join fails. */
    for (i = 0; tmp[i]; i++)
        MPL_free(tmp[i]);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
static HYD_status fn_name_publish(int fd, int pid, int pgid, char *args[]) { struct HYD_string_stash stash; char *cmd, *thrid, *val, *name = NULL, *port = NULL; int token_count, success; struct HYD_pmcd_token *tokens = NULL; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); thrid = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "thrid"); if ((val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "name")) == NULL) HYDU_ERR_POP(status, "cannot find token: name\n"); name = HYDU_strdup(val); if ((val = HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "port")) == NULL) HYDU_ERR_POP(status, "cannot find token: port\n"); port = HYDU_strdup(val); status = HYD_pmcd_pmi_publish(name, port, &success); HYDU_ERR_POP(status, "error publishing service\n"); HYD_STRING_STASH_INIT(stash); HYD_STRING_STASH(stash, HYDU_strdup("cmd=name-publish-response;"), status); if (thrid) { HYD_STRING_STASH(stash, HYDU_strdup("thrid="), status); HYD_STRING_STASH(stash, HYDU_strdup(thrid), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } if (!success) { HYD_STRING_STASH(stash, HYDU_strdup("rc=1;errmsg=duplicate_service_"), status); HYD_STRING_STASH(stash, HYDU_strdup(name), status); HYD_STRING_STASH(stash, HYDU_strdup(";"), status); } else HYD_STRING_STASH(stash, HYDU_strdup("rc=0;"), status); HYD_STRING_SPIT(stash, cmd, status); status = cmd_response(fd, pid, cmd); HYDU_ERR_POP(status, "send command failed\n"); HYDU_FREE(cmd); fn_exit: if (tokens) HYD_pmcd_pmi_free_tokens(tokens, token_count); if (name) HYDU_FREE(name); if (port) HYDU_FREE(port); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * Parse a "NAME=VALUE" string and append the pair to an environment list.
 *
 *   str      - text to parse; a private copy is split, `str` is untouched
 *   env_list - list handed to HYDU_append_env_to_list()
 *
 * Only the first '=' separates name from value, so values may contain
 * further '=' characters.
 */
HYD_status HYDU_append_env_str_to_list(const char *str, struct HYD_env **env_list)
{
    char *scratch = NULL;
    char *env_name, *value;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    scratch = MPL_strdup(str);
    value = scratch;

    /* don't use strtok, it will mangle env values that contain '=' */
    env_name = MPL_strsep(&value, "=");
    HYDU_ASSERT(env_name != NULL, status);

    /* After the split `value` is whatever MPL_strsep left behind the
     * first '=' in the scratch copy. */
    status = HYDU_append_env_to_list(env_name, value, env_list);
    HYDU_ERR_POP(status, "unable to append env to list\n");

  fn_exit:
    if (scratch != NULL)
        MPL_free(scratch);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Export an environment entry into the current process environment.
 *
 *   env       - entry to export; a NULL env_value is exported as ""
 *   overwrite - with HYD_ENV_OVERWRITE_FALSE, a variable for which
 *               MPL_env2str() reports a value is left untouched
 *
 * Returns the status of building the "NAME=VALUE" string.
 */
HYD_status HYDU_putenv(struct HYD_env *env, HYD_env_overwrite_t overwrite)
{
    /* Zeroed so the cleanup loop at fn_exit is safe on the early exit
     * below, before any piece has been created. */
    char *tmp[HYD_NUM_TMP_STRINGS] = { NULL };
    char *str;
    int i;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* If the variable is already set and the overwrite flag is false,
     * just exit */
    if (MPL_env2str(env->env_name, (const char **) &str) && overwrite == HYD_ENV_OVERWRITE_FALSE)
        goto fn_exit;

    i = 0;
    tmp[i++] = MPL_strdup(env->env_name);
    tmp[i++] = MPL_strdup("=");
    tmp[i++] = env->env_value ? MPL_strdup(env->env_value) : MPL_strdup("");
    tmp[i++] = NULL;

    status = HYDU_str_alloc_and_join(tmp, &str);
    HYDU_ERR_POP(status, "unable to join strings\n");

    /* NOTE(review): str is intentionally not freed here; presumably
     * MPL_putenv keeps the pointer the way POSIX putenv does - confirm. */
    MPL_putenv(str);

  fn_exit:
    /* Released on every path so the pieces do not leak when the join fails. */
    for (i = 0; tmp[i]; i++)
        MPL_free(tmp[i]);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Send a NULL-terminated string list over `fd`.
 *
 * What is written, in order: one int holding the value returned by
 * HYDU_strlist_lastidx(), then for every string an int byte count
 * (string length plus the terminating NUL) followed by that many bytes.
 * A peer that closes the connection mid-way trips the assertion.
 */
HYD_status HYDU_send_strlist(int fd, char **strlist)
{
    int idx, count, nbytes;
    int sent, closed;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Announce how many entries follow */
    count = HYDU_strlist_lastidx(strlist);
    status = HYDU_sock_write(fd, &count, sizeof(int), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT);
    HYDU_ERR_POP(status, "unable to write data to proxy\n");
    HYDU_ASSERT(!closed, status);

    /* Length-prefix each string so the receiver can parse the stream */
    idx = 0;
    while (strlist[idx]) {
        nbytes = strlen(strlist[idx]) + 1;

        status = HYDU_sock_write(fd, &nbytes, sizeof(int), &sent, &closed,
                                 HYDU_SOCK_COMM_MSGWAIT);
        HYDU_ERR_POP(status, "unable to write data to proxy\n");
        HYDU_ASSERT(!closed, status);

        status = HYDU_sock_write(fd, strlist[idx], nbytes, &sent, &closed,
                                 HYDU_SOCK_COMM_MSGWAIT);
        HYDU_ERR_POP(status, "unable to write data to proxy\n");
        HYDU_ASSERT(!closed, status);

        idx++;
    }

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
char *HYDU_find_full_path(const char *execname) { char *tmp[HYD_NUM_TMP_STRINGS] = { NULL }, *path = NULL, *test_path = NULL; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYDU_find_in_path(execname, &test_path); HYDU_ERR_POP(status, "error while searching for executable in user path\n"); if (test_path) { tmp[0] = MPL_strdup(test_path); tmp[1] = MPL_strdup(execname); tmp[2] = NULL; status = HYDU_str_alloc_and_join(tmp, &path); HYDU_ERR_POP(status, "error joining strings\n"); } fn_exit: HYDU_free_strlist(tmp); if (test_path) MPL_free(test_path); HYDU_FUNC_EXIT(); return path; fn_fail: goto fn_exit; }
static HYD_status send_cmd_downstream(int fd, const char *cmd) { char cmdlen[7]; int sent, closed; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); MPL_snprintf(cmdlen, 7, "%6u", (unsigned) strlen(cmd)); status = HYDU_sock_write(fd, cmdlen, 6, &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "error writing PMI line\n"); /* FIXME: We cannot abort when we are not able to send data * downstream. The upper layer needs to handle this based on * whether we want to abort or not.*/ HYDU_ASSERT(!closed, status); if (HYD_pmcd_pmip.user_global.debug) { HYDU_dump(stdout, "PMI response: %s\n", cmd); } status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed, HYDU_SOCK_COMM_MSGWAIT); HYDU_ERR_POP(status, "error writing PMI line\n"); HYDU_ASSERT(!closed, status); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status handle_rr_binding(void) { int i; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYDU_ASSERT(hwloc_initialized, status); /* initialize bitmaps */ HYDT_topo_hwloc_info.num_bitmaps = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); HYDU_MALLOC_OR_JUMP(HYDT_topo_hwloc_info.bitmap, hwloc_bitmap_t *, HYDT_topo_hwloc_info.num_bitmaps * sizeof(hwloc_bitmap_t), status); for (i = 0; i < HYDT_topo_hwloc_info.num_bitmaps; i++) { HYDT_topo_hwloc_info.bitmap[i] = hwloc_bitmap_alloc(); hwloc_bitmap_only(HYDT_topo_hwloc_info.bitmap[i], i); } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status create_env_file(const struct HYD_env *envlist, int num_ranks, int *ranks) { HYD_status status = HYD_SUCCESS; char filename[256]; FILE *f; const struct HYD_env *e; int ret; int r; HYDU_FUNC_ENTER(); for (r = 0; r < num_ranks; ++r) { MPL_snprintf(filename, sizeof(filename), "/tmp/hydra-env-file-%d:%d", (int) getpid(), ranks[r]); f = fopen(filename, "w"); HYDU_ERR_CHKANDJUMP(status, f == NULL, HYD_INTERNAL_ERROR, "fopen failed: %s\n", strerror(errno)); for (e = envlist; e; e = e->next) { fprintf(f, "%s=%s\n", e->env_name, e->env_value); } ret = fclose(f); HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "fclose failed: %s\n", strerror(errno)); } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_pbs_launcher_finalize(void) { int err; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); #if defined(HAVE_TM_H) err = tm_finalize(); HYDU_ERR_CHKANDJUMP(status, err != TM_SUCCESS, HYD_INTERNAL_ERROR, "error calling tm_finalize\n"); #endif /* HAVE_TM_H */ if (HYDT_bscd_pbs_sys) { if (HYDT_bscd_pbs_sys->task_id) HYDU_FREE(HYDT_bscd_pbs_sys->task_id); if (HYDT_bscd_pbs_sys->spawn_events) HYDU_FREE(HYDT_bscd_pbs_sys->spawn_events); HYDU_FREE(HYDT_bscd_pbs_sys); } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDTI_bscd_ll_query_node_count(int *count) { char *hostfile; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (MPL_env2str("LOADL_HOSTFILE", (const char **) &hostfile) == 0) hostfile = NULL; if (hostfile == NULL) { HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "No LL nodefile found\n"); } else { total_node_count = 0; status = HYDU_parse_hostfile(hostfile, NULL, process_mfile_count); HYDU_ERR_POP(status, "error parsing hostfile\n"); *count = total_node_count; } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * Release every node of an executable list.
 *
 * For each node this frees the argument strings, the working directory
 * and env_prop when set, the user environment list, and finally the
 * node itself.  A NULL list is a no-op.
 */
void HYDU_free_exec_list(struct HYD_exec *exec_list)
{
    struct HYD_exec *cur, *following;

    HYDU_FUNC_ENTER();

    for (cur = exec_list; cur != NULL; cur = following) {
        /* Remember the successor before the node is freed. */
        following = cur->next;

        HYDU_free_strlist(cur->exec);

        if (cur->wdir)
            HYDU_FREE(cur->wdir);
        if (cur->env_prop)
            HYDU_FREE(cur->env_prop);

        HYDU_env_free_list(cur->user_env);
        cur->user_env = NULL;

        HYDU_FREE(cur);
    }

    HYDU_FUNC_EXIT();
}
/*
 * Normalize a working directory in place.
 *
 *   wdir - in/out.  NULL is replaced with the current working
 *          directory; a path that does not start with '/' is replaced
 *          with "<cwd>/<path>"; a path starting with '/' is left alone.
 *
 * If building the joined path fails, *wdir keeps its original value.
 */
HYD_status HYDU_correct_wdir(char **wdir)
{
    /* Zeroed so the cleanup at fn_exit is safe on every path. */
    char *tmp[HYD_NUM_TMP_STRINGS] = { NULL };
    char *joined = NULL;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    if (*wdir == NULL) {
        *wdir = HYDU_getcwd();
    } else if ((*wdir)[0] != '/') {
        /* (*wdir)[0] reads the first character of the path.  The
         * unparenthesized spelling "*wdir[0]" happens to read the same
         * byte but parses as *(wdir[0]), which is easy to misread. */
        tmp[0] = HYDU_getcwd();
        tmp[1] = HYDU_strdup("/");
        tmp[2] = HYDU_strdup(*wdir);
        tmp[3] = NULL;

        /* Join into a scratch pointer first: the old path is released
         * only once the replacement exists, so a failed join cannot
         * leave *wdir pointing at freed memory. */
        status = HYDU_str_alloc_and_join(tmp, &joined);
        HYDU_ERR_POP(status, "unable to join strings\n");

        HYDU_FREE(*wdir);
        *wdir = joined;
    }

  fn_exit:
    /* Frees the pieces on success and on failure alike. */
    HYDU_free_strlist(tmp);
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Release every node of a proxy list.
 *
 * For each proxy this frees the launch-info strings and their array,
 * the pid and exit_status arrays when set, the executable list, and
 * the proxy itself.  The node pointer is only cleared, not freed.
 */
void HYDU_free_proxy_list(struct HYD_proxy *proxy_list)
{
    struct HYD_proxy *cur, *following;

    HYDU_FUNC_ENTER();

    for (cur = proxy_list; cur != NULL; cur = following) {
        /* Remember the successor before the proxy is freed. */
        following = cur->next;

        cur->node = NULL;

        if (cur->exec_launch_info) {
            /* Strings first, then the array that held them. */
            HYDU_free_strlist(cur->exec_launch_info);
            HYDU_FREE(cur->exec_launch_info);
        }

        if (cur->pid)
            HYDU_FREE(cur->pid);
        if (cur->exit_status)
            HYDU_FREE(cur->exit_status);

        HYDU_free_exec_list(cur->exec_list);

        HYDU_FREE(cur);
    }

    HYDU_FUNC_EXIT();
}
/*
 * Allocate a struct HYD_exec and set its fields to their initial state.
 *
 *   exec - receives the new object
 *
 * Initial state: empty argument list, no working directory, no
 * env_prop, no user environment, no successor, and -1 for proc_count
 * and appnum.  FINEGRAIN_MPI builds also set nfg to 1 and start_rank
 * to -1.
 */
HYD_status HYDU_alloc_exec(struct HYD_exec **exec)
{
    struct HYD_exec *fresh;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    HYDU_MALLOC(*exec, struct HYD_exec *, sizeof(struct HYD_exec), status);

    /* Work through a local alias; it points at the caller's object. */
    fresh = *exec;

    fresh->exec[0] = NULL;
    fresh->wdir = NULL;
    fresh->proc_count = -1;
#if defined(FINEGRAIN_MPI)
    fresh->nfg = 1;
    fresh->start_rank = -1;
#endif
    fresh->env_prop = NULL;
    fresh->user_env = NULL;
    fresh->appnum = -1;
    fresh->next = NULL;

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
static HYD_status fn_barrier_in(int fd, int pid, int pgid, char *args[]) { struct HYD_proxy *proxy, *tproxy; const char *cmd; int proxy_count; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); proxy_count = 0; for (tproxy = proxy->pg->proxy_list; tproxy; tproxy = tproxy->next) proxy_count++; proxy->pg->barrier_count++; if (proxy->pg->barrier_count == proxy_count) { proxy->pg->barrier_count = 0; cmd = "cmd=barrier_out\n"; for (tproxy = proxy->pg->proxy_list; tproxy; tproxy = tproxy->next) { status = cmd_response(tproxy->control_fd, pid, cmd); HYDU_ERR_POP(status, "error writing PMI line\n"); } } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status alloc_proxy(struct HYD_proxy **proxy, struct HYD_pg *pg, struct HYD_node *node) { HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYDU_MALLOC(*proxy, struct HYD_proxy *, sizeof(struct HYD_proxy), status); (*proxy)->node = node; (*proxy)->pg = pg; (*proxy)->proxy_id = -1; (*proxy)->exec_launch_info = NULL; (*proxy)->proxy_process_count = 0; (*proxy)->filler_processes = 0; (*proxy)->pid = NULL; (*proxy)->exit_status = NULL; (*proxy)->control_fd = HYD_FD_UNSET; (*proxy)->exec_list = NULL; (*proxy)->next = NULL; fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_bscd_sge_query_node_list(struct HYD_node **node_list) { char *hostfile; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (MPL_env2str("PE_HOSTFILE", (const char **) &hostfile) == 0) hostfile = NULL; if (hostfile == NULL) { *node_list = NULL; HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "No SGE nodefile found\n"); } else { status = HYDU_parse_hostfile(hostfile, node_list, process_mfile_token); HYDU_ERR_POP(status, "error parsing hostfile\n"); } fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status cmd_response(int fd, int pid, const char *cmd) { struct HYD_pmcd_hdr hdr; int sent, closed; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); HYD_pmcd_init_header(&hdr); hdr.cmd = PMI_RESPONSE; hdr.pid = pid; hdr.pmi_version = 1; hdr.buflen = strlen(cmd); status = HYDU_sock_write(fd, &hdr, sizeof(hdr), &sent, &closed); HYDU_ERR_POP(status, "unable to send PMI_RESPONSE header to proxy\n"); HYDU_ASSERT(!closed, status); if (HYD_server_info.user_global.debug) { HYDU_dump(stdout, "PMI response to fd %d pid %d: %s", fd, pid, cmd); } status = HYDU_sock_write(fd, cmd, strlen(cmd), &sent, &closed); HYDU_ERR_POP(status, "unable to send response to command\n"); HYDU_ASSERT(!closed, status); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_put(int fd, int pid, int pgid, char *args[]) { struct HYD_proxy *proxy; struct HYD_pmcd_pmi_pg_scratch *pg_scratch; struct HYD_pmcd_token *tokens; int token_count, i, ret; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); proxy = HYD_pmcd_pmi_find_proxy(fd); HYDU_ASSERT(proxy, status); pg_scratch = (struct HYD_pmcd_pmi_pg_scratch *) proxy->pg->pg_scratch; for (i = 0; i < token_count; i++) { status = HYD_pmcd_pmi_add_kvs(tokens[i].key, tokens[i].val, pg_scratch->kvs, &ret); HYDU_ERR_POP(status, "unable to add keypair to kvs\n"); } fn_exit: HYD_pmcd_pmi_free_tokens(tokens, token_count); HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
static HYD_status fn_abort(int fd, int pid, int pgid, char *args[]) { int token_count; struct HYD_pmcd_token *tokens; /* set a default exit code of 1 */ int exitcode = 1; HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); status = HYD_pmcd_pmi_args_to_tokens(args, &tokens, &token_count); HYDU_ERR_POP(status, "unable to convert args to tokens\n"); if (HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "exitcode") == NULL) HYDU_ERR_POP(status, "cannot find token: exitcode\n"); exitcode = atoi(HYD_pmcd_pmi_find_token_keyval(tokens, token_count, "exitcode")); fn_exit: /* clean everything up and exit */ status = HYDT_bsci_wait_for_completion(0); exit(exitcode); /* never get here */ HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
HYD_status HYDT_topo_bind(int idx) { HYD_status status = HYD_SUCCESS; HYDU_FUNC_ENTER(); if (idx < 0 || ignore_binding) goto fn_exit; #if defined HAVE_HWLOC if (!strcmp(HYDT_topo_info.topolib, "hwloc")) { status = HYDT_topo_hwloc_bind(idx); HYDU_ERR_POP(status, "HWLOC failure binding process to core\n"); goto fn_exit; } #endif /* HAVE_HWLOC */ HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "no topology library available\n"); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * Allocate a forwarding entry for the descriptor pair (in, out) and
 * switch both descriptors to non-blocking mode.
 *
 *   fwd_hash - receives the new entry, with empty buffer bookkeeping
 *              (offset and count zero) and no successor
 *   in, out  - descriptors recorded in the entry
 *
 * Returns the first failure from allocation or from
 * HYDU_sock_set_nonblock().
 */
static HYD_status alloc_fwd_hash(struct fwd_hash **fwd_hash, int in, int out)
{
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    HYDU_MALLOC_OR_JUMP(*fwd_hash, struct fwd_hash *, sizeof(struct fwd_hash), status);
    (*fwd_hash)->in = in;
    (*fwd_hash)->out = out;

    (*fwd_hash)->buf_offset = 0;
    (*fwd_hash)->buf_count = 0;

    (*fwd_hash)->next = NULL;

    /* Each descriptor gets its own message so a failure can be
     * attributed to the right one. */
    status = HYDU_sock_set_nonblock(in);
    HYDU_ERR_POP(status, "unable to set in-socket to non-blocking\n");

    status = HYDU_sock_set_nonblock(out);
    HYDU_ERR_POP(status, "unable to set out-socket to non-blocking\n");

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * Report whether an environment variable should be inherited under SGE.
 *
 *   env_name - variable name to look up
 *   ret      - receives the logical negation of
 *              HYDTI_bscd_in_env_list(env_name, <list below>);
 *              presumably that helper is non-zero on a match, making
 *              *ret 0 for listed names and 1 otherwise - confirm
 *              against the helper
 *
 * Always returns HYD_SUCCESS.
 */
HYD_status HYDT_bscd_sge_query_env_inherit(const char *env_name, int *ret)
{
    /* NULL-terminated list of names this launcher checks against. */
    const char *env_list[] = {
        "DISPLAY",
        "SGE_ROOT", "SGE_CELL", "SGE_DEBUG_LEVEL", "SGE_QMASTER_PORT",
        "SGE_O_HOME", "SGE_O_HOST", "SGE_O_LOGNAME", "SGE_O_MAIL",
        "SGE_O_PATH", "SGE_O_SHELL", "SGE_O_TZ", "SGE_O_WORKDIR",
        "SGE_ARCH", "SGE_CKPT_ENV", "SGE_CKPT_DIR",
        "SGE_STDERR_PATH", "SGE_STDOUT_PATH", "SGE_STDIN_PATH",
        "SGE_JOB_SPOOL_DIR",
        "SGE_TASK_ID", "SGE_TASK_FIRST", "SGE_TASK_LAST", "SGE_TASK_STEPSIZE",
        "SGE_BINARY_PATH", "SGE_JSV_TIMEOUT", "SGE_BINDING",
        "ARC", "ENVIRONMENT", "HOME", "HOSTNAME",
        "JOB_ID", "JOB_NAME", "JOB_SCRIPT",
        "LOGNAME", "NHOSTS", "NQUEUES", "NSLOTS",
        "PATH", "PE", "PE_HOSTFILE", "QUEUE", "REQUEST", "RESTARTED",
        "SHELL", "TMPDIR", "TMP", "TZ", "USER",
        NULL
    };
    int listed;

    HYDU_FUNC_ENTER();

    listed = HYDTI_bscd_in_env_list(env_name, env_list);
    *ret = !listed;

    HYDU_FUNC_EXIT();

    return HYD_SUCCESS;
}
/*
 * Event callback that forwards data from `fd` to a standard stream.
 *
 *   fd     - descriptor that became ready
 *   events - event mask; HYD_POLLHUP is treated like a closed peer
 *   userp  - carries the destination descriptor number, packed into
 *            the pointer value
 *
 * When the source is closed or hung up, `fd` is deregistered from the
 * demux engine, its slot in HYD_bscu_fd_list (if any) is marked
 * HYD_FD_CLOSED, and the descriptor is closed.
 */
HYD_status HYDT_bscu_stdio_cb(int fd, HYD_event_t events, void *userp)
{
    int stdfd, closed, slot;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    stdfd = (int) (size_t) userp;

    status = HYDU_sock_forward_stdio(fd, stdfd, &closed);
    HYDU_ERR_POP(status, "stdio forwarding error\n");

    /* Still open and no hang-up: nothing more to do for this event. */
    if (!closed && !(events & HYD_POLLHUP))
        goto fn_exit;

    /* connection has closed */
    status = HYDT_dmx_deregister_fd(fd);
    /* NOTE(review): this relies on HYDU_ERR_SETANDJUMP not jumping when
     * the status passed as the error code is HYD_SUCCESS - confirm
     * against the macro definition. */
    HYDU_ERR_SETANDJUMP(status, status, "error deregistering fd %d\n", fd);

    /* Mark the first matching slot as closed. */
    slot = 0;
    while (slot < HYD_bscu_fd_count) {
        if (HYD_bscu_fd_list[slot] == fd) {
            HYD_bscu_fd_list[slot] = HYD_FD_CLOSED;
            break;
        }
        slot++;
    }

    close(fd);

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
HYD_status HYDT_ckpoint_blcr_restart(const char *prefix, int pgid, int id, int ckpt_num, struct HYD_env *envlist, int num_ranks, int ranks[], int *in, int *out, int *err, int *pid) { HYD_status status = HYD_SUCCESS; int ret; int context_fd; cr_restart_handle_t cr_handle; cr_restart_args_t args; char filename[256]; char port_str[64]; int port; HYDU_FUNC_ENTER(); /* create listener socket for stdin/out/err */ status = create_stdinouterr_sock(&port); HYDU_ERR_POP(status, "failed to create stdin/out/err socket\n"); MPL_snprintf(port_str, sizeof(port_str), "%d", port); status = HYDU_append_env_to_list(STDINOUTERR_PORT_NAME, port_str, &envlist); HYDU_ERR_POP(status, "failed to add to env list\n"); status = create_env_file(envlist, num_ranks, ranks); if (status) HYDU_ERR_POP(status, "blcr restart\n"); /* open the checkpoint file */ MPL_snprintf(filename, sizeof(filename), "%s/context-num%d-%d-%d", prefix, ckpt_num, pgid, id); context_fd = open(filename, O_RDONLY /* | O_LARGEFILE */); HYDU_ERR_CHKANDJUMP(status, context_fd < 0, HYD_INTERNAL_ERROR, "open failed, %s\n", strerror(errno)); /* ... initialize the request structure */ cr_initialize_restart_args_t(&args); args.cr_fd = context_fd; args.cr_flags = CR_RSTRT_RESTORE_PID; /* ... issue the request */ ret = cr_request_restart(&args, &cr_handle); HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "cr_request_restart failed, %s\n", strerror(errno)); ret = close(context_fd); HYDU_ERR_CHKANDJUMP(status, ret, HYD_INTERNAL_ERROR, "close failed, %s\n", strerror(errno)); /* get fds for stdin/out/err sockets, and get pids of restarted processes */ status = wait_for_stdinouterr_sockets(num_ranks, ranks, in, out, err, pid); if (status) HYDU_ERR_POP(status, "blcr restart\n"); fn_exit: HYDU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }