/*
 * Search the job's environment to determine if the
 * user requested the MPS to be on or off.
 * Returns 0 for off, 1 for on, 2 for not requested,
 * 3 for error.
 */
static int _get_mps_request(stepd_step_rec_t *job)
{
    char *envval;

    /*
     * Determine what the user wants the MPS set to via the
     * CRAY_CUDA_MPS and CRAY_CUDA_PROXY variables. If neither is set,
     * do nothing.
     */
    if (!(envval = getenvp(job->env, CRAY_CUDA_MPS_ENV)) &&
        !(envval = getenvp(job->env, CRAY_CUDA_PROXY_ENV))) {
        debug2("No GPU action requested");
        return 2;
    }

    if (!strcasecmp(envval, "on") || !strcmp(envval, "1")) {
        debug2("GPU mps requested on");
        return 1;
    } else if (!strcasecmp(envval, "off") || !strcmp(envval, "0")) {
        debug2("GPU mps requested off");
        return 0;
    }

    CRAY_ERR("Couldn't parse %s value %s, expected on,off,0,1",
             CRAY_CUDA_MPS_ENV, envval);
    return 3;
}
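/*
 * Illustrative only (not from the Slurm sources): a minimal standalone sketch
 * of the same on/off/unset parsing convention, assuming plain getenv() in
 * place of Slurm's getenvp() and the literal names "CRAY_CUDA_MPS" and
 * "CRAY_CUDA_PROXY" standing in for the CRAY_CUDA_MPS_ENV /
 * CRAY_CUDA_PROXY_ENV macros.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

static int mps_request(void)
{
    const char *v = getenv("CRAY_CUDA_MPS");

    if (!v)
        v = getenv("CRAY_CUDA_PROXY");
    if (!v)
        return 2;                                   /* not requested */
    if (!strcasecmp(v, "on") || !strcmp(v, "1"))
        return 1;                                   /* requested on */
    if (!strcasecmp(v, "off") || !strcmp(v, "0"))
        return 0;                                   /* requested off */
    return 3;                                       /* parse error */
}

int main(void)
{
    printf("mps request = %d\n", mps_request());
    return 0;
}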
int p_mpi_hook_slurmstepd_task(const mpi_plugin_client_info_t *job,
                               char ***env)
{
    char *nodelist, *task_cnt;

    nodelist = getenvp(*env, "SLURM_NODELIST");
    if (nodelist) {
        char *host_str = NULL, *tmp;
        hostlist_t hl = hostlist_create(nodelist);

        while ((tmp = hostlist_shift(hl))) {
            if (host_str)
                xstrcat(host_str, ",");
            xstrcat(host_str, tmp);
            free(tmp);
        }
        hostlist_destroy(hl);
        env_array_overwrite_fmt(env, "SLURM_MPICH_NODELIST", "%s",
                                host_str);
        xfree(host_str);
    }

    task_cnt = getenvp(*env, "SLURM_TASKS_PER_NODE");
    if (task_cnt) {
        char *task_str = NULL, tmp_str[32];
        int i = 0, val, reps;

        /* Expand the compressed form of SLURM_TASKS_PER_NODE
         * (e.g. "2(x3),1") into a comma-separated list with one
         * entry per node. */
        while (task_cnt[i]) {
            if ((task_cnt[i] >= '0') && (task_cnt[i] <= '9'))
                val = atoi(&task_cnt[i]);
            else
                break;    /* bad parse */
            i++;
            while (task_cnt[i] && (task_cnt[i] != 'x') &&
                   (task_cnt[i] != ','))
                i++;
            if (task_cnt[i] == 'x') {
                i++;
                reps = atoi(&task_cnt[i]);
                while (task_cnt[i] && (task_cnt[i] != ','))
                    i++;
            } else
                reps = 1;
            if (task_cnt[i] == ',')
                i++;
            while (reps) {
                if (task_str)
                    xstrcat(task_str, ",");
                snprintf(tmp_str, sizeof(tmp_str), "%d", val);
                xstrcat(task_str, tmp_str);
                reps--;
            }
        }
        env_array_overwrite_fmt(env, "SLURM_MPICH_TASKS", "%s",
                                task_str);
        xfree(task_str);
    }

    return SLURM_SUCCESS;
}
/*
 * Determine the value of the env var 'name' (if it exists) and whether
 * or not the user wants to use its value as the job's soft rlimit.
 */
static int _get_env_val(char **env, const char *name, unsigned long *valp,
                        bool *u_req_propagate)
{
    char *val = NULL;
    char *p = NULL;

    xassert(env != NULL);
    xassert(name != NULL);

    if (!(val = getenvp(env, name)))
        return (-1);

    /*
     * The letter 'U' would have been prepended to the string value if the
     * user requested to have this rlimit propagated via 'srun --propagate'
     */
    if (*val == 'U') {
        *u_req_propagate = TRUE;
        debug2("_get_env_val: %s propagated by user option",
               &name[6]);
        val++;
    } else
        *u_req_propagate = FALSE;

    *valp = strtoul(val, &p, 10);
    if (p && (*p != '\0')) {
        error("Invalid %s env var, value = `%s'", name, val);
        return (-1);
    }

    return (0);
}
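/*
 * Illustrative only (not from the Slurm sources): a self-contained sketch of
 * the 'U'-prefix convention parsed above, assuming a hypothetical value such
 * as "U1024" arriving in a SLURM_RLIMIT_* variable.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const char *val = "U1024";    /* hypothetical SLURM_RLIMIT_CORE value */
    bool u_req_propagate = (*val == 'U');
    char *end = NULL;
    unsigned long limit;

    if (u_req_propagate)
        val++;                    /* skip the 'U' marker */
    limit = strtoul(val, &end, 10);
    if (end && (*end != '\0')) {
        fprintf(stderr, "invalid rlimit value\n");
        return 1;
    }
    printf("limit=%lu propagated_by_user=%d\n", limit, u_req_propagate);
    return 0;
}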
int p_mpi_hook_slurmstepd_task(const mpi_plugin_task_info_t *job,
                               char ***env)
{
    char addrbuf[1024];
    char *p;
    char *addr = getenvp(*env, "SLURM_LAUNCH_NODE_IPADDR");

    debug("Using mpi/mpich-gm");
    slurm_print_slurm_addr(job->self, addrbuf, sizeof(addrbuf));

    if ((p = strchr(addrbuf, ':')) != NULL)
        *p = '\0';

    env_array_overwrite_fmt(env, "GMPI_MASTER", "%s", addr);
    env_array_overwrite_fmt(env, "GMPI_SLAVE", "%s", addrbuf);
    env_array_overwrite_fmt(env, "GMPI_ID", "%u", job->gtaskid);
    if (!getenv("GMPI_RECV")) {
        env_array_overwrite_fmt(env, "GMPI_RECV", "%s", "hybrid");
    }

    env_array_overwrite_fmt(env, "MXMPI_MASTER", "%s", addr);
    env_array_overwrite_fmt(env, "MXMPI_ID", "%u", job->gtaskid);
    env_array_overwrite_fmt(env, "MXMPI_SLAVE", "%s", addrbuf);
    if (!getenv("MXMPI_RECV")) {
        env_array_overwrite_fmt(env, "MXMPI_RECV", "%s", "hybrid");
    }
    debug2("init for mpi rank %u", job->gtaskid);

    return SLURM_SUCCESS;
}
static int _setup_stepd_kvs(const stepd_step_rec_t *job, char ***env)
{
    int rc = SLURM_SUCCESS, i = 0, pp_cnt = 0;
    char *p, env_key[32], *ppkey, *ppval;

    kvs_seq = 1;
    rc = temp_kvs_init();
    if (rc != SLURM_SUCCESS)
        return rc;

    rc = kvs_init();
    if (rc != SLURM_SUCCESS)
        return rc;

    /* preput */
    p = getenvp(*env, PMI2_PREPUT_CNT_ENV);
    if (p) {
        pp_cnt = atoi(p);
    }

    for (i = 0; i < pp_cnt; i++) {
        snprintf(env_key, 32, PMI2_PPKEY_ENV"%d", i);
        p = getenvp(*env, env_key);
        ppkey = p;    /* getenvp will not modify p */
        snprintf(env_key, 32, PMI2_PPVAL_ENV"%d", i);
        p = getenvp(*env, env_key);
        ppval = p;
        kvs_put(ppkey, ppval);
    }

    /*
     * For PMI11.
     * A better logic would be to put PMI_process_mapping in KVS only if
     * the task distribution method is not "arbitrary", because in
     * "arbitrary" distribution the process mapping variable is not
     * correct. MPICH2 may deduce the clique info from the hostnames,
     * but that is rather costly.
     */
    kvs_put("PMI_process_mapping", job_info.proc_mapping);

    return SLURM_SUCCESS;
}
int p_mpi_hook_slurmstepd_task(const mpi_plugin_task_info_t *job,
                               char ***env)
{
    int i;
    char *processes = NULL;
    char *addr = getenvp(*env, "SLURM_LAUNCH_NODE_IPADDR");

    debug("Using mpi/mvapich");
    env_array_overwrite_fmt(env, "MPIRUN_HOST", "%s", addr);
    env_array_overwrite_fmt(env, "MPIRUN_RANK", "%u", job->gtaskid);
    env_array_overwrite_fmt(env, "MPIRUN_MPD", "0");

    debug2("init for mpi rank %u", job->gtaskid);

    if (getenvp(*env, "SLURM_NEED_MVAPICH_MPIRUN_PROCESSES")) {
        /*
         * Fake MPIRUN_PROCESSES env var -- we don't need this for
         * SLURM at this time. (what a waste)
         */
        for (i = 0; i < job->ntasks; i++)
            xstrcat(processes, "x:");

        env_array_overwrite_fmt(env, "MPIRUN_PROCESSES", "%s",
                                processes);
    }

    /*
     * Some mvapich versions will ignore MPIRUN_PROCESSES if
     * the following env var is set.
     */
    env_array_overwrite_fmt(env, "NOT_USE_TOTALVIEW", "1");

    /*
     * Set VIADEV_ENABLE_AFFINITY=0 so that mvapich doesn't
     * override SLURM's CPU affinity. (Unless this var is
     * already set in the user env.)
     */
    if (!getenvp(*env, "VIADEV_ENABLE_AFFINITY"))
        env_array_overwrite_fmt(env, "VIADEV_ENABLE_AFFINITY", "0");

    return SLURM_SUCCESS;
}
void pmixp_server_init_pp(char ***env)
{
    char *env_ptr = NULL;
    int tmp_int;

    slurm_mutex_init(&_pmixp_pp_lock);

    /* check if we want to run ping-pong */
    if (!(env_ptr = getenvp(*env, PMIXP_PP_ON))) {
        return;
    }
    if (!xstrcmp("1", env_ptr) || !xstrcmp("true", env_ptr)) {
        _pmixp_pp_on = true;
    }

    if ((env_ptr = getenvp(*env, PMIXP_PP_SAMETHR))) {
        if (!xstrcmp("1", env_ptr) || !xstrcmp("true", env_ptr)) {
            _pmixp_pp_same_thr = true;
        }
    }

    if ((env_ptr = getenvp(*env, PMIXP_PP_LOW))) {
        if (_consists_from_digits(env_ptr)) {
            tmp_int = atoi(env_ptr);
            _pmixp_pp_low = tmp_int < PMIXP_PP_PWR2_MAX ?
                tmp_int : PMIXP_PP_PWR2_MAX;
        }
    }

    if ((env_ptr = getenvp(*env, PMIXP_PP_UP))) {
        if (_consists_from_digits(env_ptr)) {
            tmp_int = atoi(env_ptr);
            _pmixp_pp_up = tmp_int < PMIXP_PP_PWR2_MAX ?
                tmp_int : PMIXP_PP_PWR2_MAX;
        }
    }

    if ((env_ptr = getenvp(*env, PMIXP_PP_SITER))) {
        if (_consists_from_digits(env_ptr)) {
            _pmixp_pp_siter = atoi(env_ptr);
        }
    }

    if ((env_ptr = getenvp(*env, PMIXP_PP_LITER))) {
        if (_consists_from_digits(env_ptr)) {
            _pmixp_pp_liter = atoi(env_ptr);
        }
    }

    if ((env_ptr = getenvp(*env, PMIXP_PP_BOUND))) {
        if (_consists_from_digits(env_ptr)) {
            _pmixp_pp_bound = atoi(env_ptr);
        }
    }
}
int mpi_hook_slurmstepd_init(char ***env)
{
    char *mpi_type = getenvp(*env, "SLURM_MPI_TYPE");

    debug("mpi type = %s", mpi_type);

    if (_mpi_init(mpi_type) == SLURM_ERROR)
        return SLURM_ERROR;

    unsetenvp(*env, "SLURM_MPI_TYPE");

    return SLURM_SUCCESS;
}
/*
 * Given a program name, translate it to a fully qualified pathname as needed
 * based upon the PATH environment variable and current working directory.
 * Returns an xmalloc()'d string that must be xfree()'d.
 */
extern char *_build_path(char *fname, char **prog_env, char *cwd)
{
    char *path_env = NULL, *dir;
    char *file_name;
    struct stat stat_buf;
    int len = PATH_MAX;

    if (!fname)
        return NULL;

    file_name = (char *) xmalloc(len);

    /* check if already absolute path */
    if (fname[0] == '/') {
        /* copy and ensure null termination */
        strlcpy(file_name, fname, len);
        return file_name;
    }

    if (fname[0] == '.') {
        if (cwd) {
            snprintf(file_name, len, "%s/%s", cwd, fname);
        } else {
            dir = (char *) xmalloc(len);
            if (!getcwd(dir, len))
                error("getcwd failed: %m");
            snprintf(file_name, len, "%s/%s", dir, fname);
            xfree(dir);
        }
        return file_name;
    }

    /* search for the file using PATH environment variable */
    path_env = xstrdup(getenvp(prog_env, "PATH"));
    dir = strtok(path_env, ":");
    while (dir) {
        snprintf(file_name, len, "%s/%s", dir, fname);
        if ((stat(file_name, &stat_buf) == 0) &&
            (!S_ISDIR(stat_buf.st_mode)))
            break;
        dir = strtok(NULL, ":");
    }
    if (dir == NULL)    /* not found */
        strlcpy(file_name, fname, len);

    xfree(path_env);
    return file_name;
}
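/*
 * Illustrative only (not from the Slurm sources): a standalone sketch of the
 * same PATH-search idea, assuming plain libc (getenv/strdup/free) instead of
 * the xmalloc/xstrdup helpers used by _build_path() above.
 */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>

static char *search_path(const char *fname, char *buf, size_t buflen)
{
    char *path = getenv("PATH");
    char *copy, *dir;
    struct stat st;

    if (!path)
        return NULL;
    copy = strdup(path);
    for (dir = strtok(copy, ":"); dir; dir = strtok(NULL, ":")) {
        snprintf(buf, buflen, "%s/%s", dir, fname);
        if ((stat(buf, &st) == 0) && !S_ISDIR(st.st_mode))
            break;    /* found a regular (non-directory) match */
    }
    free(copy);
    return dir ? buf : NULL;
}

int main(void)
{
    char buf[PATH_MAX];
    char *p = search_path("ls", buf, sizeof(buf));

    printf("%s\n", p ? p : "(not found)");
    return 0;
}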
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
                     bitstr_t *usable_gres, bool *already_seen,
                     int *local_inx, bool reset, bool is_job)
{
    char *global_list = NULL, *local_list = NULL;
    char *slurm_env_var = NULL;

    if (is_job)
        slurm_env_var = "SLURM_JOB_NICS";
    else
        slurm_env_var = "SLURM_STEP_NICS";

    if (*already_seen) {
        global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
        local_list = xstrdup(getenvp(*env_ptr,
                                     "OMPI_MCA_btl_openib_if_include"));
    }

    common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
                        usable_gres, "mlx4_", local_inx,
                        &local_list, &global_list, reset, is_job);

    if (global_list) {
        env_array_overwrite(env_ptr, slurm_env_var, global_list);
        xfree(global_list);
    }

    if (local_list) {
        env_array_overwrite(env_ptr, "OMPI_MCA_btl_openib_if_include",
                            local_list);
        xfree(local_list);
        *already_seen = true;
    }
}
/* Set umask using value of env var SLURM_UMASK */
extern int set_umask(stepd_step_rec_t *job)
{
    mode_t mask;
    char *val;

    if (!(val = getenvp(job->env, "SLURM_UMASK"))) {
        if (job->stepid != SLURM_EXTERN_CONT)
            debug("Couldn't find SLURM_UMASK in environment");
        return SLURM_ERROR;
    }

    mask = strtol(val, (char **)NULL, 8);
    umask(mask);
    return SLURM_SUCCESS;
}
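/*
 * Illustrative only (not from the Slurm sources): SLURM_UMASK carries an
 * octal string, which is why the parse above uses base 8. A minimal
 * standalone sketch with a hard-coded value in place of the environment
 * lookup:
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
    const char *val = "0022";    /* hypothetical SLURM_UMASK value */
    mode_t mask = (mode_t) strtol(val, (char **)NULL, 8);

    umask(mask);
    printf("umask set to %04o\n", (unsigned int) mask);
    return 0;
}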
static void _make_tmpdir(slurmd_job_t *job)
{
    char *tmpdir;

    if (!(tmpdir = getenvp(job->env, "TMPDIR")))
        setenvf(&job->env, "TMPDIR", "/tmp");    /* task may want it set */
    else if (mkdir(tmpdir, 0700) < 0) {
        struct stat st;
        int mkdir_errno = errno;

        if (stat(tmpdir, &st)) {    /* does the file exist? */
            /* show why we were not able to create it */
            error("Unable to create TMPDIR [%s]: %s",
                  tmpdir, strerror(mkdir_errno));
        } else if (!S_ISDIR(st.st_mode)) {    /* is it a directory? */
            error("TMPDIR [%s] is not a directory", tmpdir);
        }
        /* eaccess() wasn't introduced until glibc 2.4, but euidaccess()
         * has been around for a while. So to make sure we still work
         * with older systems we include this check.
         */
#if defined(__FreeBSD__)
#define __GLIBC__ (1)
#define __GLIBC_PREREQ(a,b) (1)
#endif
#if defined __GLIBC__ && __GLIBC_PREREQ(2, 4)
        else if (eaccess(tmpdir, X_OK|W_OK))    /* check permissions */
#else
        else if (euidaccess(tmpdir, X_OK|W_OK))
#endif
            error("TMPDIR [%s] is not writeable", tmpdir);
        else
            return;

        error("Setting TMPDIR to /tmp");
        setenvf(&job->env, "TMPDIR", "/tmp");
    }

    return;
}
extern int slurm_ckpt_stepd_prefork(stepd_step_rec_t *job)
{
    char *old_env = NULL, *new_env = NULL, *ptr = NULL, *save_ptr = NULL;

    /*
     * I was thinking that a thread can be created here to
     * communicate with the tasks via sockets/pipes.
     * Maybe this is not needed - we can modify MVAPICH2
     */

    /* set LD_PRELOAD for batch script shell */
    //if (job->batch) {
    old_env = getenvp(job->env, "LD_PRELOAD");
    if (old_env) {
        /* search and replace all libcr_run and libcr_omit;
         * the old env value is messed up --
         * it will be replaced */
        while ((ptr = strtok_r(old_env, " :", &save_ptr))) {
            old_env = NULL;
            if (!ptr)
                break;
            if (!strncmp(ptr, "libcr_run.so", 12) ||
                !strncmp(ptr, "libcr_omit.so", 13))
                continue;
            xstrcat(new_env, ptr);
            xstrcat(new_env, ":");
        }
    }
    ptr = xstrdup("libcr_run.so");
    if (new_env)
        xstrfmtcat(ptr, ":%s", new_env);
    setenvf(&job->env, "LD_PRELOAD", ptr);
    xfree(new_env);
    xfree(ptr);
    //}

    return SLURM_SUCCESS;
}
static void _set_rlimits(char **env)
{
    slurm_rlimits_info_t *rli;
    char env_name[25] = "SLURM_RLIMIT_";
    char *env_value, *p;
    struct rlimit r;
    //unsigned long env_num;
    rlim_t env_num;

    for (rli = get_slurm_rlimits_info(); rli->name; rli++) {
        if (rli->propagate_flag != PROPAGATE_RLIMITS)
            continue;
        strcpy(&env_name[sizeof("SLURM_RLIMIT_") - 1], rli->name);
        env_value = getenvp(env, env_name);
        if (env_value == NULL)
            continue;
        unsetenvp(env, env_name);
        if (getrlimit(rli->resource, &r) < 0) {
            error("getrlimit(%s): %m", env_name + 6);
            continue;
        }
        env_num = strtol(env_value, &p, 10);
        if (p && (p[0] != '\0')) {
            error("Invalid environment %s value %s",
                  env_name, env_value);
            continue;
        }
        if (r.rlim_cur == env_num)
            continue;
        r.rlim_cur = (rlim_t) env_num;
        if (setrlimit(rli->resource, &r) < 0) {
            error("setrlimit(%s): %m", env_name + 6);
            continue;
        }
    }
}
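/*
 * Illustrative only (not from the Slurm sources): the loop above applies one
 * soft limit per propagated SLURM_RLIMIT_* variable. A standalone sketch of
 * the same getrlimit/setrlimit step for a single limit, assuming the
 * hypothetical value "4096" arrived in SLURM_RLIMIT_NOFILE.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

int main(void)
{
    const char *env_value = "4096";    /* hypothetical SLURM_RLIMIT_NOFILE */
    struct rlimit r;
    rlim_t want;
    char *p = NULL;

    if (getrlimit(RLIMIT_NOFILE, &r) < 0) {
        perror("getrlimit");
        return 1;
    }
    want = (rlim_t) strtol(env_value, &p, 10);
    if (p && (p[0] != '\0')) {
        fprintf(stderr, "invalid value\n");
        return 1;
    }
    if (r.rlim_cur != want) {
        r.rlim_cur = want;
        if (setrlimit(RLIMIT_NOFILE, &r) < 0)
            perror("setrlimit");
    }
    printf("RLIMIT_NOFILE soft limit now %llu\n",
           (unsigned long long) r.rlim_cur);
    return 0;
}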
static int _setup_stepd_job_info(const stepd_step_rec_t *job, char ***env)
{
    char *p;
    int i;

    memset(&job_info, 0, sizeof(job_info));

    job_info.jobid = job->jobid;
    job_info.stepid = job->stepid;
    job_info.nnodes = job->nnodes;
    job_info.nodeid = job->nodeid;
    job_info.ntasks = job->ntasks;
    job_info.ltasks = job->node_tasks;
    job_info.gtids = xmalloc(job->node_tasks * sizeof(uint32_t));
    for (i = 0; i < job->node_tasks; i++) {
        job_info.gtids[i] = job->task[i]->gtid;
    }

    p = getenvp(*env, PMI2_PMI_DEBUGGED_ENV);
    if (p) {
        job_info.pmi_debugged = atoi(p);
    } else {
        job_info.pmi_debugged = 0;
    }

    p = getenvp(*env, PMI2_SPAWN_SEQ_ENV);
    if (p) {    /* spawned */
        job_info.spawn_seq = atoi(p);
        unsetenvp(*env, PMI2_SPAWN_SEQ_ENV);
        p = getenvp(*env, PMI2_SPAWNER_JOBID_ENV);
        job_info.spawner_jobid = xstrdup(p);
        unsetenvp(*env, PMI2_SPAWNER_JOBID_ENV);
    } else {
        job_info.spawn_seq = 0;
        job_info.spawner_jobid = NULL;
    }

    p = getenvp(*env, PMI2_PMI_JOBID_ENV);
    if (p) {
        job_info.pmi_jobid = xstrdup(p);
        unsetenvp(*env, PMI2_PMI_JOBID_ENV);
    } else {
        xstrfmtcat(job_info.pmi_jobid, "%u.%u", job->jobid,
                   job->stepid);
    }

    p = getenvp(*env, PMI2_STEP_NODES_ENV);
    if (!p) {
        error("mpi/pmi2: unable to find nodes in job environment");
        return SLURM_ERROR;
    } else {
        job_info.step_nodelist = xstrdup(p);
        unsetenvp(*env, PMI2_STEP_NODES_ENV);
    }

    /*
     * How to get the mapping info from stepd directly?
     * The task distribution info is in the launch_tasks_request_msg_t,
     * but it is not stored in the stepd_step_rec_t.
     */
    p = getenvp(*env, PMI2_PROC_MAPPING_ENV);
    if (!p) {
        error("PMI2_PROC_MAPPING_ENV not found");
        return SLURM_ERROR;
    } else {
        job_info.proc_mapping = xstrdup(p);
        unsetenvp(*env, PMI2_PROC_MAPPING_ENV);
    }

    job_info.job_env = env_array_copy((const char **)*env);

    job_info.MPIR_proctable = NULL;
    job_info.srun_opt = NULL;

    return SLURM_SUCCESS;
}
static int _setup_stepd_tree_info(const stepd_step_rec_t *job, char ***env)
{
    hostlist_t hl;
    char srun_host[64];
    uint16_t port;
    char *p;
    int tree_width;

    /* job info available */

    memset(&tree_info, 0, sizeof(tree_info));

    hl = hostlist_create(job_info.step_nodelist);
    p = hostlist_nth(hl, job_info.nodeid);    /* strdup-ed */
    tree_info.this_node = xstrdup(p);
    free(p);

    /* this only controls the upward communication tree width */
    p = getenvp(*env, PMI2_TREE_WIDTH_ENV);
    if (p) {
        tree_width = atoi(p);
        if (tree_width < 2) {
            info("invalid PMI2 tree width value (%d) detected. "
                 "fallback to default value.", tree_width);
            tree_width = slurm_get_tree_width();
        }
    } else {
        tree_width = slurm_get_tree_width();
    }

    /* TODO: cannot launch 0 tasks on node */

    /*
     * In tree position calculation, the root of the tree is srun
     * with id 0. A stepd's id will be its nodeid plus 1.
     */
    reverse_tree_info(job_info.nodeid + 1, job_info.nnodes + 1,
                      tree_width, &tree_info.parent_id,
                      &tree_info.num_children, &tree_info.depth,
                      &tree_info.max_depth);
    tree_info.parent_id--;    /* restore real nodeid */
    if (tree_info.parent_id < 0) {    /* parent is srun */
        tree_info.parent_node = NULL;
    } else {
        p = hostlist_nth(hl, tree_info.parent_id);
        tree_info.parent_node = xstrdup(p);
        free(p);
    }
    hostlist_destroy(hl);

    tree_info.pmi_port = 0;    /* not used */

    p = getenvp(*env, "SLURM_SRUN_COMM_HOST");
    if (!p) {
        error("mpi/pmi2: unable to find srun comm ifhn in env");
        return SLURM_ERROR;
    } else {
        strncpy(srun_host, p, 64);
    }

    p = getenvp(*env, PMI2_SRUN_PORT_ENV);
    if (!p) {
        error("mpi/pmi2: unable to find srun pmi2 port in env");
        return SLURM_ERROR;
    } else {
        port = atoi(p);
        unsetenvp(*env, PMI2_SRUN_PORT_ENV);
    }
    tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
    slurm_set_addr(tree_info.srun_addr, port, srun_host);

    /* init kvs seq to 0. TODO: reduce array size */
    tree_info.children_kvs_seq = xmalloc(sizeof(uint32_t) *
                                         job_info.nnodes);

    return SLURM_SUCCESS;
}
static int _env_set(char ***env)
{
    char *p = NULL;

    xassert(_pmixp_job_info.hostname);

    _pmixp_job_info.server_addr_unfmt = slurm_get_slurmd_spooldir(NULL);

    _pmixp_job_info.lib_tmpdir = slurm_conf_expand_slurmd_path(
        _pmixp_job_info.server_addr_unfmt,
        _pmixp_job_info.hostname);

    xstrfmtcat(_pmixp_job_info.server_addr_unfmt,
               "/stepd.slurm.pmix.%d.%d",
               pmixp_info_jobid(), pmixp_info_stepid());

    _pmixp_job_info.spool_dir = xstrdup(_pmixp_job_info.lib_tmpdir);

    /* ----------- Temp directories settings ------------- */
    xstrfmtcat(_pmixp_job_info.lib_tmpdir, "/pmix.%d.%d/",
               pmixp_info_jobid(), pmixp_info_stepid());

    /* save client temp directory if requested
     * TODO: We want to get the TmpFS value as well if it exists.
     * Need to sync with the SLURM developers.
     */
    p = getenvp(*env, PMIXP_TMPDIR_CLI);

    if (p)
        _pmixp_job_info.cli_tmpdir_base = xstrdup(p);
    else
        _pmixp_job_info.cli_tmpdir_base = slurm_get_tmp_fs(
            _pmixp_job_info.hostname);

    _pmixp_job_info.cli_tmpdir =
        xstrdup_printf("%s/spmix_appdir_%d.%d",
                       _pmixp_job_info.cli_tmpdir_base,
                       pmixp_info_jobid(), pmixp_info_stepid());

    /* ----------- Timeout setting ------------- */
    /* TODO: it would also be nice to have a cluster-wide setting in SLURM */
    _pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT;
    p = getenvp(*env, PMIXP_TIMEOUT);
    if (NULL != p) {
        int tmp;
        tmp = atoi(p);
        if (tmp > 0) {
            _pmixp_job_info.timeout = tmp;
        }
    }

    /* ----------- Forward PMIX settings ------------- */
    /* FIXME: this may be intrusive, as the PMIx library will create
     * lots of output files in /tmp by default.
     * Somebody may find this useful, or an annoyance.
     */
    p = getenvp(*env, PMIXP_PMIXLIB_DEBUG);
    if (NULL != p) {
        setenv(PMIXP_PMIXLIB_DEBUG, p, 1);
        /* output into a file since we are in slurmstepd
         * and stdout is muted.
         * One needs to check TMPDIR for the results.
         */
        setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1);
    }

    return SLURM_SUCCESS;
}
static int _env_set(char ***env)
{
    char *p = NULL;

    /* ----------- Temp directories settings ------------- */
    /*
     * FIXME: It is dangerous to set this from the user environment.
     * I was using this to debug in Linux containers.
     * On real hardware each node has its own separate /tmp directory.
     */

    /* set server temp directory - change this process environment */
    p = getenvp(*env, PMIXP_TMPDIR_SRV);
    if (NULL != p) {
        setenv(PMIXP_OS_TMPDIR_ENV, p, 1);
    }

    p = getenv(PMIXP_OS_TMPDIR_ENV);
    if (NULL == p) {
        p = PMIXP_TMPDIR_DEFAULT;
    }
    _pmixp_job_info.lib_tmpdir = xstrdup_printf("%s/pmix.%d.%d/", p,
                                                pmixp_info_jobid(),
                                                pmixp_info_stepid());

    /* save client temp directory if requested
     * TODO: We want to get the TmpFS value as well if it exists.
     * Need to sync with the SLURM developers.
     */
    p = getenvp(*env, PMIXP_TMPDIR_CLI);
    if (NULL != p) {
        _pmixp_job_info.cli_tmpdir = xstrdup(p);
    } else {
        p = slurm_get_tmp_fs();
        if (NULL != p) {
            _pmixp_job_info.cli_tmpdir = p;
        }
    }

    /* ----------- Timeout setting ------------- */
    /* TODO: it would also be nice to have a cluster-wide setting in SLURM */
    _pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT;
    p = getenvp(*env, PMIXP_TIMEOUT);
    if (NULL != p) {
        int tmp;
        tmp = atoi(p);
        if (tmp > 0) {
            _pmixp_job_info.timeout = tmp;
        }
    }

    /* ----------- Forward PMIX settings ------------- */
    /* FIXME: this may be intrusive, as the PMIx library will create
     * lots of output files in /tmp by default.
     * Somebody may find this useful, or an annoyance.
     */
    p = getenvp(*env, PMIXP_PMIXLIB_DEBUG);
    if (NULL != p) {
        setenv(PMIXP_PMIXLIB_DEBUG, p, 1);
        /* output into a file since we are in slurmstepd
         * and stdout is muted.
         * One needs to check TMPDIR for the results.
         */
        setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1);
    }

    return SLURM_SUCCESS;
}
static int _resources_set(char ***env)
{
    char *p = NULL;

    /* Initialize all memory pointers that would be allocated to NULL,
     * so in case of an error exit we will know what to xfree
     */
    _pmixp_job_info.job_hl = hostlist_create("");
    _pmixp_job_info.step_hl = hostlist_create("");
    _pmixp_job_info.hostname = NULL;

    /* Save step host list */
    p = getenvp(*env, PMIXP_STEP_NODES_ENV);
    if (!p) {
        PMIXP_ERROR_NO(ENOENT, "Environment variable %s not found",
                       PMIXP_STEP_NODES_ENV);
        goto err_exit;
    }
    hostlist_push(_pmixp_job_info.step_hl, p);

    /* Extract our node name */
    p = hostlist_nth(_pmixp_job_info.step_hl, _pmixp_job_info.node_id);
    _pmixp_job_info.hostname = xstrdup(p);
    free(p);

    /* Determine job-wide node id and job-wide node count */
    p = getenvp(*env, PMIXP_JOB_NODES_ENV);
    if (p == NULL) {
        p = getenvp(*env, PMIXP_JOB_NODES_ENV_DEP);
        if (p == NULL) {
            /* shouldn't happen if we are under SLURM! */
            PMIXP_ERROR_NO(ENOENT,
                           "Neither of nodelist environment variables: %s OR %s was found!",
                           PMIXP_JOB_NODES_ENV,
                           PMIXP_JOB_NODES_ENV_DEP);
            goto err_exit;
        }
    }
    hostlist_push(_pmixp_job_info.job_hl, p);
    _pmixp_job_info.nnodes_job = hostlist_count(_pmixp_job_info.job_hl);
    _pmixp_job_info.node_id_job = hostlist_find(_pmixp_job_info.job_hl,
                                                _pmixp_job_info.hostname);

    /* FIXME!! ------------------------------------------------------- */
    /* TODO: _get_task_count does not always work well.
    if (_get_task_count(env, &_pmixp_job_info.ntasks_job,
                        &_pmixp_job_info.ncpus_job) < 0) {
        _pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
        _pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;
    }
    xassert(_pmixp_job_info.ntasks <= _pmixp_job_info.ntasks_job);
    */
    _pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
    _pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;

    /* Save task-to-node mapping */
    p = getenvp(*env, PMIXP_SLURM_MAPPING_ENV);
    if (p == NULL) {
        /* Direct modex won't work */
        PMIXP_ERROR_NO(ENOENT, "No %s environment variable found!",
                       PMIXP_SLURM_MAPPING_ENV);
        goto err_exit;
    }

    _pmixp_job_info.task_map_packed = xstrdup(p);

    return SLURM_SUCCESS;
err_exit:
    hostlist_destroy(_pmixp_job_info.job_hl);
    hostlist_destroy(_pmixp_job_info.step_hl);
    if (NULL != _pmixp_job_info.hostname) {
        xfree(_pmixp_job_info.hostname);
    }
    return SLURM_ERROR;
}
static int _handle_signal_container(int fd, stepd_step_rec_t *job, uid_t uid)
{
    int rc = SLURM_SUCCESS;
    int errnum = 0;
    int sig;
    static int msg_sent = 0;
    char *ptr = NULL;
    int target_node_id = 0;
    stepd_step_task_info_t *task;
    uint32_t i;

    safe_read(fd, &sig, sizeof(int));
    debug("_handle_signal_container for step=%u.%u uid=%d signal=%d",
          job->jobid, job->stepid, (int) uid, sig);
    if ((uid != job->uid) && !_slurm_authorized_user(uid)) {
        error("signal container req from uid %ld for step=%u.%u "
              "owned by uid %ld",
              (long)uid, job->jobid, job->stepid, (long)job->uid);
        rc = -1;
        errnum = EPERM;
        goto done;
    }

    /*
     * Sanity checks
     */
    if (job->cont_id == 0) {
        debug("step %u.%u invalid container [cont_id:%"PRIu64"]",
              job->jobid, job->stepid, job->cont_id);
        rc = -1;
        errnum = ESLURMD_JOB_NOTRUNNING;
        goto done;
    }

    if ((sig == SIGTERM) || (sig == SIGKILL)) {
        /* cycle thru the tasks and mark those that have not
         * called abort and/or terminated as killed_by_cmd
         */
        for (i = 0; i < job->node_tasks; i++) {
            if (NULL == (task = job->task[i])) {
                continue;
            }
            if (task->aborted || task->exited) {
                continue;
            }
            /* mark that this task is going to be killed by
             * cmd so we ignore its exit status - otherwise,
             * we will probably report the final exit status
             * as SIGKILL
             */
            task->killed_by_cmd = true;
        }
    }

    ptr = getenvp(job->env, "SLURM_STEP_KILLED_MSG_NODE_ID");
    if (ptr)
        target_node_id = atoi(ptr);
    if ((job->nodeid == target_node_id) && (msg_sent == 0) &&
        (job->state < SLURMSTEPD_STEP_ENDING)) {
        time_t now = time(NULL);
        char entity[24], time_str[24];

        if (job->stepid == SLURM_BATCH_SCRIPT) {
            snprintf(entity, sizeof(entity), "JOB %u", job->jobid);
        } else {
            snprintf(entity, sizeof(entity), "STEP %u.%u",
                     job->jobid, job->stepid);
        }
        slurm_make_time_str(&now, time_str, sizeof(time_str));

        /* Not really errors,
         * but we want messages displayed by default
         */
        if (sig == SIG_TIME_LIMIT) {
            error("*** %s CANCELLED AT %s DUE TO TIME LIMIT ***",
                  entity, time_str);
            msg_sent = 1;
        } else if (sig == SIG_PREEMPTED) {
            error("*** %s CANCELLED AT %s DUE TO PREEMPTION ***",
                  entity, time_str);
            msg_sent = 1;
        } else if (sig == SIG_NODE_FAIL) {
            error("*** %s CANCELLED AT %s DUE TO NODE FAILURE ***",
                  entity, time_str);
            msg_sent = 1;
        } else if (sig == SIG_FAILURE) {
            error("*** %s FAILED (non-zero exit code or other "
                  "failure mode) ***", entity);
            msg_sent = 1;
        } else if ((sig == SIGTERM) || (sig == SIGKILL)) {
            error("*** %s CANCELLED AT %s ***", entity, time_str);
            msg_sent = 1;
        }
    }
    if ((sig == SIG_TIME_LIMIT) || (sig == SIG_NODE_FAIL) ||
        (sig == SIG_PREEMPTED)  || (sig == SIG_FAILURE))
        goto done;

    if (sig == SIG_ABORT) {
        sig = SIGKILL;
        job->aborted = true;
    }

    pthread_mutex_lock(&suspend_mutex);
    if (suspended && (sig != SIGKILL)) {
        rc = -1;
        errnum = ESLURMD_STEP_SUSPENDED;
        pthread_mutex_unlock(&suspend_mutex);
        goto done;
    }

    if (sig == SIG_DEBUG_WAKE) {
        int i;
        for (i = 0; i < job->node_tasks; i++)
            pdebug_wake_process(job, job->task[i]->pid);
        pthread_mutex_unlock(&suspend_mutex);
        goto done;
    }

    /*
     * Signal the container
     */
    if (proctrack_g_signal(job->cont_id, sig) < 0) {
        rc = -1;
        errnum = errno;
        verbose("Error sending signal %d to %u.%u: %m",
                sig, job->jobid, job->stepid);
    } else {
        verbose("Sent signal %d to %u.%u",
                sig, job->jobid, job->stepid);
    }
    pthread_mutex_unlock(&suspend_mutex);

done:
    /* Send the return code and errnum */
    safe_write(fd, &rc, sizeof(int));
    safe_write(fd, &errnum, sizeof(int));
    return SLURM_SUCCESS;
rwfail:
    return SLURM_FAILURE;
}
/* allocate resources to track PMIX_Ring state */
int pmix_ring_init(const pmi2_job_info_t *job, char ***env)
{
    int i;
    int rc = SLURM_SUCCESS;

    /* this is called by each stepd process, and each stepd has
     * at least one application process, so
     * pmix_app_children > 0 and pmix_ring_children > 0 */

    /* allow user to override default tree width via variable */
    char *p = getenvp(*env, PMIX_RING_TREE_WIDTH_ENV);
    if (p) {
        int width = atoi(p);
        if (width >= 2) {
            pmix_stepd_width = width;
        } else {
            info("Invalid %s value detected (%d), using (%d).",
                 PMIX_RING_TREE_WIDTH_ENV, width, pmix_stepd_width);
        }
    }

    /* allocate hostlist so we can map a stepd rank to a hostname */
    pmix_stepd_hostlist = hostlist_create(job->step_nodelist);

    /* record our rank in the stepd tree */
    pmix_stepd_rank = job->nodeid;

    /* record number of ranks in stepd tree */
    pmix_stepd_ranks = job->nnodes;

    /* record number of application children we serve */
    pmix_app_children = job->ltasks;

    /* compute number of stepd children */
    int min_child = pmix_stepd_rank * pmix_stepd_width + 1;
    int max_child = pmix_stepd_rank * pmix_stepd_width + pmix_stepd_width;
    if (min_child >= pmix_stepd_ranks) {
        min_child = pmix_stepd_ranks;
    }
    if (max_child >= pmix_stepd_ranks) {
        max_child = pmix_stepd_ranks - 1;
    }
    pmix_stepd_children = max_child - min_child + 1;

    /* record number of children we have (includes app procs and stepds) */
    pmix_ring_children = pmix_app_children + pmix_stepd_children;

    /* allocate a structure to record ring_in message from each child */
    pmix_ring_msgs = (pmix_ring_msg *) xmalloc(pmix_ring_children *
                                               sizeof(pmix_ring_msg));

    /* initialize messages */
    for (i = 0; i < pmix_ring_children; i++) {
        pmix_ring_msgs[i].count = 0;
        pmix_ring_msgs[i].left = NULL;
        pmix_ring_msgs[i].right = NULL;
    }

    /* initialize count */
    pmix_ring_count = 0;

    return rc;
}
/*
 * Substitute the path option for a batch job. These options should mirror
 * those used with "srun" (parsed in fname_create found in
 * src/srun/libsrun/fname.c).
 */
static void _batch_path_check(char **p, char **q, char **name,
                              unsigned int wid, stepd_step_rec_t *job,
                              int taskid)
{
    switch (**p) {
    case 'a':    /* '%a' => array task id */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%0*u", wid, job->array_task_id);
        *q = ++(*p);
        break;
    case 'A':    /* '%A' => array master job id */
        xmemcat(*name, *q, *p - 1);
        if (job->array_task_id == NO_VAL)
            xstrfmtcat(*name, "%0*u", wid, job->jobid);
        else
            xstrfmtcat(*name, "%0*u", wid, job->array_job_id);
        *q = ++(*p);
        break;
    case 'J':    /* '%J' => jobid.stepid */
    case 'j':    /* '%j' => jobid */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%0*u", wid, job->jobid);
        if ((**p == 'J') && (job->stepid != NO_VAL))
            xstrfmtcat(*name, ".%u", job->stepid);
        *q = ++(*p);
        break;
    case 'n':    /* '%n' => nodeid */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%0*u", wid, job->nodeid);
        *q = ++(*p);
        break;
    case 'N':    /* '%N' => node name */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%s", conf->hostname);
        *q = ++(*p);
        break;
    case 's':    /* '%s' => step id */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%0*u", wid, job->stepid);
        *q = ++(*p);
        break;
    case 't':    /* '%t' => taskid */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%0*u", wid, taskid);
        *q = ++(*p);
        break;
    case 'u':    /* '%u' => user name */
        if (!job->user_name)
            job->user_name = uid_to_string(job->uid);
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%s", job->user_name);
        *q = ++(*p);
        break;
    case 'x':    /* '%x' => job name */
        xmemcat(*name, *q, *p - 1);
        xstrfmtcat(*name, "%s", getenvp(job->env, "SLURM_JOB_NAME"));
        *q = ++(*p);
        break;
    default:
        break;
    }
}
static void _set_env(char ***env_ptr, void *gres_ptr, int node_inx,
                     bitstr_t *usable_gres, bool *already_seen,
                     int *local_inx, bool reset, bool is_job)
{
    char *global_list = NULL, *local_list = NULL, *perc_env = NULL;
    char perc_str[64], *slurm_env_var = NULL;
    uint64_t count_on_dev, gres_per_node = 0, percentage;
    int global_id = -1;

    if (is_job)
        slurm_env_var = "SLURM_JOB_GRES";
    else
        slurm_env_var = "SLURM_STEP_GRES";

    if (*already_seen) {
        global_list = xstrdup(getenvp(*env_ptr, slurm_env_var));
        local_list = xstrdup(getenvp(*env_ptr,
                                     "CUDA_VISIBLE_DEVICES"));
        perc_env = xstrdup(getenvp(*env_ptr,
                                   "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"));
    }

    common_gres_set_env(gres_devices, env_ptr, gres_ptr, node_inx,
                        usable_gres, "", local_inx, &gres_per_node,
                        &local_list, &global_list, reset, is_job,
                        &global_id);

    if (perc_env) {
        env_array_overwrite(env_ptr,
                            "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
                            perc_env);
        xfree(perc_env);
    } else if (gres_per_node && mps_info) {
        count_on_dev = _get_dev_count(global_id);
        if (count_on_dev > 0) {
            percentage = (gres_per_node * 100) / count_on_dev;
            percentage = MAX(percentage, 1);
        } else
            percentage = 0;
        snprintf(perc_str, sizeof(perc_str), "%"PRIu64, percentage);
        env_array_overwrite(env_ptr,
                            "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
                            perc_str);
    } else if (gres_per_node) {
        error("%s: mps_info list is NULL", __func__);
        snprintf(perc_str, sizeof(perc_str), "%"PRIu64, gres_per_node);
        env_array_overwrite(env_ptr,
                            "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE",
                            perc_str);
    }

    if (global_list) {
        env_array_overwrite(env_ptr, slurm_env_var, global_list);
        xfree(global_list);
    }

    if (local_list) {
        /*
         * CUDA_VISIBLE_DEVICES is relative to the MPS server.
         * With only one GPU under the control of MPS, the device
         * number will always be "0".
         */
        env_array_overwrite(env_ptr, "CUDA_VISIBLE_DEVICES", "0");
        env_array_overwrite(env_ptr, "GPU_DEVICE_ORDINAL", "0");
        xfree(local_list);
        *already_seen = true;
    }
}
/*
 * Set in "dest" the environment variables strings relevant to a SLURM batch
 * job allocation, overwriting any environment variables of the same name.
 * If the address pointed to by "dest" is NULL, memory will automatically be
 * xmalloc'ed. The array is terminated by a NULL pointer, and thus is
 * suitable for use by execle() and other env_array_* functions.
 *
 * Sets the variables:
 *   SLURM_JOB_ID
 *   SLURM_JOB_NUM_NODES
 *   SLURM_JOB_NODELIST
 *   SLURM_JOB_CPUS_PER_NODE
 *   ENVIRONMENT=BATCH
 *   HOSTNAME
 *   LOADLBATCH (AIX only)
 *
 * Sets OBSOLETE variables (needed for MPI, do not remove):
 *   SLURM_JOBID
 *   SLURM_NNODES
 *   SLURM_NODELIST
 *   SLURM_NTASKS
 *   SLURM_TASKS_PER_NODE
 */
extern int env_array_for_batch_job(char ***dest,
                                   const batch_job_launch_msg_t *batch,
                                   const char *node_name)
{
    char *tmp = NULL;
    uint32_t num_nodes = 0;
    uint32_t num_cpus = 0;
    int i;
    slurm_step_layout_t *step_layout = NULL;
    uint32_t num_tasks = batch->ntasks;
    uint16_t cpus_per_task;
    uint16_t task_dist;
    uint32_t cluster_flags = slurmdb_setup_cluster_flags();

    _setup_particulars(cluster_flags, dest, batch->select_jobinfo);

    /* There is no explicit node count in the batch structure,
     * so we need to calculate the node count. */
    for (i = 0; i < batch->num_cpu_groups; i++) {
        num_nodes += batch->cpu_count_reps[i];
        num_cpus += batch->cpu_count_reps[i] * batch->cpus_per_node[i];
    }

    env_array_overwrite_fmt(dest, "SLURM_JOB_ID", "%u", batch->job_id);
    env_array_overwrite_fmt(dest, "SLURM_JOB_NUM_NODES", "%u", num_nodes);
    if (cluster_flags & CLUSTER_FLAG_BG)
        env_array_overwrite_fmt(dest, "SLURM_BG_NUM_NODES",
                                "%u", num_nodes);

    env_array_overwrite_fmt(dest, "SLURM_JOB_NODELIST", "%s",
                            batch->nodes);

    tmp = uint32_compressed_to_str(batch->num_cpu_groups,
                                   batch->cpus_per_node,
                                   batch->cpu_count_reps);
    env_array_overwrite_fmt(dest, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp);
    xfree(tmp);

    env_array_overwrite_fmt(dest, "ENVIRONMENT", "BATCH");
    if (node_name)
        env_array_overwrite_fmt(dest, "HOSTNAME", "%s", node_name);

    /* OBSOLETE, but needed by MPI, do not remove */
    env_array_overwrite_fmt(dest, "SLURM_JOBID", "%u", batch->job_id);
    env_array_overwrite_fmt(dest, "SLURM_NNODES", "%u", num_nodes);
    env_array_overwrite_fmt(dest, "SLURM_NODELIST", "%s", batch->nodes);

    if ((batch->cpus_per_task != 0) &&
        (batch->cpus_per_task != (uint16_t) NO_VAL))
        cpus_per_task = batch->cpus_per_task;
    else
        cpus_per_task = 1;    /* default value */
    if (cpus_per_task > 1) {
        env_array_overwrite_fmt(dest, "SLURM_CPUS_PER_TASK", "%u",
                                cpus_per_task);
    }

    if (num_tasks) {
        env_array_overwrite_fmt(dest, "SLURM_NTASKS", "%u",
                                num_tasks);
        /* keep around for old scripts */
        env_array_overwrite_fmt(dest, "SLURM_NPROCS", "%u",
                                num_tasks);
    } else {
        num_tasks = num_cpus / cpus_per_task;
    }

    if ((tmp = getenvp(*dest, "SLURM_ARBITRARY_NODELIST"))) {
        task_dist = SLURM_DIST_ARBITRARY;
    } else {
        tmp = batch->nodes;
        task_dist = SLURM_DIST_BLOCK;
    }

    if (!(step_layout = slurm_step_layout_create(tmp,
                                                 batch->cpus_per_node,
                                                 batch->cpu_count_reps,
                                                 num_nodes,
                                                 num_tasks,
                                                 cpus_per_task,
                                                 task_dist,
                                                 (uint16_t)NO_VAL)))
        return SLURM_ERROR;

    tmp = _uint16_array_to_str(step_layout->node_cnt,
                               step_layout->tasks);
    slurm_step_layout_destroy(step_layout);
    env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", "%s", tmp);
    xfree(tmp);

    return SLURM_SUCCESS;
}
int Set(int argc, char *argv[])
{
    char *file, *envp, buf[CMDLINESIZE];
    int opt, decimal, setop, i;

    setop = SET_NOOP;
    file = (char *)0;
    envp = (char *)0;
    decimal = 1;
    while ((opt = getopt(argc, argv, "ab:cdef:iox")) != -1) {
        switch (opt) {
        case 'a':    /* logical and */
            setop = SET_AND;
            decimal = 0;
            break;
        case 'b':
            ChangeConsoleBaudrate(atoi(optarg));
            return(CMD_SUCCESS);
        case 'c':    /* clear environment */
            clearenv();
            break;
        case 'd':    /* decrement */
            setop = SET_DECR;
            break;
        case 'e':
            envp = getenvp();
            break;
#if INCLUDE_TFS
        case 'f':    /* build script from environment */
            envToExec(optarg);
            return(0);
#endif
        case 'i':    /* increment */
            setop = SET_INCR;
            break;
        case 'o':    /* logical or */
            setop = SET_OR;
            decimal = 0;
            break;
        case 'x':
            decimal = 0;
            break;
        default:
            return(CMD_PARAM_ERROR);
        }
    }

    if (!shell_vars) {
        printf("No memory allocated for environment.\n");
        return(CMD_FAILURE);
    }

    if (setop != SET_NOOP) {    /* Do some operation on a shell variable */
        char *varval;
        unsigned long value, opval;

        /* For -i & -d, if value is not specified, then assume 1. */
        if (argc == optind + 1) {
            if ((setop == SET_INCR) || (setop == SET_DECR))
                opval = 1;
            else
                return(CMD_PARAM_ERROR);
        } else if (argc == optind + 2)
            opval = strtoul(argv[optind + 1], 0, 0);
        else
            return(CMD_PARAM_ERROR);

        varval = getenv(argv[optind]);
        if (!varval) {
            printf("%s: not found\n", argv[optind]);
            return(CMD_FAILURE);
        }
        value = strtoul(varval, (char **)0, 0);
        switch (setop) {
        case SET_INCR:
            value += opval;
            break;
        case SET_DECR:
            value -= opval;
            break;
        case SET_AND:
            value &= opval;
            break;
        case SET_OR:
            value |= opval;
            break;
        }
        if (decimal)
            sprintf(buf, "%ld", value);
        else
            sprintf(buf, "0x%lx", value);
        setenv(argv[optind], buf);
    } else if (argc == optind) {
        /* display all variables */
        shell_print();
    } else if (argc == (optind + 1)) {
        /* run EE or clear one var or set envp */
#if INCLUDE_EE
        switch (setEE(argv[optind])) {
        case 1:
            return(CMD_SUCCESS);
        case -1:
            return(CMD_FAILURE);
        }
#endif
        if (envp)
            shell_sprintf(argv[optind], "0x%lx", (ulong)envp);
        else
            setenv(argv[optind], 0);
    } else if (argc >= (optind + 2)) {
        /* Set a specific variable */
        buf[0] = 0;
        for (i = optind + 1; i < argc; i++) {
            if ((strlen(buf) + strlen(argv[i]) + 2) >= sizeof(buf)) {
                printf("String too large\n");
                break;
            }
            strcat(buf, argv[i]);
            if (i != (argc - 1))
                strcat(buf, " ");
        }
        if (!decimal)
            shell_sprintf(argv[optind], "0x%lx", atoi(buf));
        else
            setenv(argv[optind], buf);
    } else
        return(CMD_PARAM_ERROR);

    return(CMD_SUCCESS);
}