/*
 * Block until every process inside the given container has exited.
 *
 * Once this returns SLURM_SUCCESS the container is considered destroyed;
 * calling slurm_container_destroy() afterwards is unnecessary and in fact
 * triggers undefined behavior.
 *
 * Return SLURM_SUCCESS or SLURM_ERROR.
 */
extern int slurm_container_wait(uint64_t cont_id)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.wait(cont_id);

	return rc;
}
/*
 * Get container ID for given process ID
 *
 * Returns zero if no container is found for the given pid, or if the
 * proctrack plugin could not be initialized.
 */
extern uint64_t slurm_container_find(pid_t pid)
{
	/* Return 0 ("not found"), not SLURM_ERROR: the return type is
	 * unsigned, so SLURM_ERROR (-1) would reach the caller as a huge
	 * bogus container ID instead of the documented sentinel. */
	if (slurm_proctrack_init() < 0)
		return 0;

	return (*(ops.find_cont)) (pid);
}
/*
 * Return "true" if the container "cont_id" contains the process with
 * ID "pid". Returns "false" if the proctrack plugin could not be
 * initialized.
 */
extern bool slurm_container_has_pid(uint64_t cont_id, pid_t pid)
{
	/* Return false, not SLURM_ERROR: SLURM_ERROR (-1) converts to
	 * true for a bool return, falsely claiming the pid is present. */
	if (slurm_proctrack_init() < 0)
		return false;

	return (*(ops.has_pid)) (cont_id, pid);
}
/*
 * Add a process to the specified container
 * job IN - slurmd_job_t structure
 * pid IN - process ID to be added to the container
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *	or in slurm_container_create()
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_add(slurmd_job_t * job, pid_t pid)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.add(job, pid);

	return rc;
}
/*
 * Signal all processes within a container
 * cont_id IN - container ID as returned by slurm_container_create()
 * signal IN - signal to send, if zero then perform error checking
 *	but do not send signal
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_signal(uint64_t cont_id, int signal)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.signal(cont_id, signal);

	return rc;
}
/*
 * Deliver a signal to every process in the container identified by
 * cont_id. Returns a SLURM errno (SLURM_ERROR if the proctrack plugin
 * cannot be initialized).
 */
extern int proctrack_g_signal(uint64_t cont_id, int signal)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.signal(cont_id, signal);

	return rc;
}
/*
 * Create a container
 * job IN - slurmd_job_t structure
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *	or in slurm_container_add()
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_create(slurmd_job_t * job)
{
	/* Return SLURM_ERROR, not 0: the contract is "Returns a SLURM
	 * errno" and 0 == SLURM_SUCCESS, which would report success when
	 * the proctrack plugin failed to initialize. */
	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	return (*(ops.create)) (job);
}
/*
 * Destroy a container, any processes within the container are not effected
 * cont_id IN - container ID as returned by proctrack_g_create()
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_destroy(uint64_t cont_id)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.destroy(cont_id);

	return rc;
}
/*
 * Add a process to the specified container
 * job IN - stepd_step_rec_t structure
 * pid IN - process ID to be added to the container
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *	or in proctrack_g_create()
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_add(stepd_step_rec_t * job, pid_t pid)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.add(job, pid);

	return rc;
}
/*
 * Create a container
 * job IN - stepd_step_rec_t structure
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *	or in proctrack_g_add()
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_create(stepd_step_rec_t * job)
{
	/* Return SLURM_ERROR, not 0: the contract is "Returns a SLURM
	 * errno" and 0 == SLURM_SUCCESS, which would report success when
	 * the proctrack plugin failed to initialize. */
	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	return (*(ops.create)) (job);
}
/*
 * Destroy a container, any processes within the container are not effected
 * cont_id IN - container ID as returned by slurm_container_create()
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_destroy(uint64_t cont_id)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = g_proctrack_context->ops.destroy(cont_id);

	return rc;
}
/*
 * Get all process IDs within a container.
 *
 * IN cont_id - Container ID.
 * OUT pids - a pointer to an xmalloc'ed array of process ids, of
 *	length "npids". Caller must free array with xfree().
 * OUT npids - number of process IDs in the returned "pids" array.
 *
 * Return SLURM_SUCCESS if container exists (npids may be zero, and
 * pids NULL), return SLURM_ERROR if container does not exist, or
 * plugin does not implement the call.
 */
extern int proctrack_g_get_pids(uint64_t cont_id, pid_t **pids, int *npids)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = ops.get_pids(cont_id, pids, npids);

	return rc;
}
/*
 * Get all process IDs within a container.
 *
 * IN cont_id - Container ID.
 * OUT pids - a pointer to an xmalloc'ed array of process ids, of
 *	length "npids". Caller must free array with xfree().
 * OUT npids - number of process IDs in the returned "pids" array.
 *
 * Return SLURM_SUCCESS if container exists (npids may be zero, and
 * pids NULL), return SLURM_ERROR if container does not exist, or
 * plugin does not implement the call.
 */
extern int slurm_container_get_pids(uint64_t cont_id, pid_t ** pids,
				    int *npids)
{
	int rc = SLURM_ERROR;

	if (slurm_proctrack_init() >= 0)
		rc = g_proctrack_context->ops.get_pids(cont_id, pids, npids);

	return rc;
}
/* * Signal all processes within a container * cont_id IN - container ID as returned by proctrack_g_create() * signal IN - signal to send, if zero then perform error checking * but do not send signal * * Returns a SLURM errno. */ extern int proctrack_g_signal(uint64_t cont_id, int signal) { if (slurm_proctrack_init() < 0) return SLURM_ERROR; if (signal == SIGKILL) { pid_t *pids = NULL; int i, j, npids = 0, hung_pids = 0; char *stat_fname = NULL; if (proctrack_g_get_pids(cont_id, &pids, &npids) == SLURM_SUCCESS) { /* NOTE: proctrack_g_get_pids() is not supported * by the proctrack/pgid plugin */ for (j = 0; j < 2; j++) { if (j) sleep(2); hung_pids = 0; for (i = 0; i < npids; i++) { if (!pids[i]) continue; xstrfmtcat(stat_fname, "/proc/%d/stat", (int) pids[i]); if (_test_core_dumping(stat_fname)) { debug("Process %d continuing " "core dump", (int) pids[i]); hung_pids++; } else { /* Don't test this PID again */ pids[i] = 0; } xfree(stat_fname); } if (hung_pids == 0) break; } xfree(pids); if (hung_pids) { info("Defering sending signal, processes in " "job are currently core dumping"); _spawn_signal_thread(cont_id, signal); return SLURM_SUCCESS; } } } return (*(ops.signal)) (cont_id, signal); }
/*
 * Add a process to the specified container
 * job IN - stepd_step_rec_t structure
 * pid IN - process ID to be added to the container
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *	or in proctrack_g_create()
 *
 * Returns a Slurm errno.
 */
extern int proctrack_g_add(stepd_step_rec_t * job, pid_t pid)
{
	const int max_retry = 3;
	int attempt = 0, rc;

	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	/* Some plugins fail transiently when adding a pid, so retry a
	 * few times (with a 1 second pause) before giving up. */
	for (;;) {
		rc = (*(ops.add)) (job, pid);
		if (rc == SLURM_SUCCESS)
			break;
		if (attempt++ > max_retry)
			break;
		debug("%s: %u.%u couldn't add pid %u, sleeping and trying again",
		      __func__, job->jobid, job->stepid, pid);
		sleep(1);
	}

	return rc;
}
/*
 * Perform one-time slurmd daemon initialization: parse the command line,
 * read configuration, load plugins (select, gres, topology, proctrack,
 * task, auth, spank), raise resource limits, set up the credential
 * verifier, spool directory, working directory, group cache, and verify
 * the slurmstepd binary exists.
 *
 * Returns SLURM_SUCCESS, or SLURM_FAILURE if any required step fails
 * (fatal() is called for a missing/invalid slurmstepd, which does not
 * return).
 */
static int _slurmd_init(void)
{
	struct rlimit rlim;
	slurm_ctl_conf_t *cf;
	struct stat stat_buf;
	uint32_t cpu_cnt;

	/*
	 * Process commandline arguments first, since one option may be
	 * an alternate location for the slurm config file.
	 */
	_process_cmdline(*conf->argc, *conf->argv);

	/*
	 * Build nodes table like in slurmctld
	 * This is required by the topology stack
	 * Node tables setup must preceed _read_config() so that the
	 * proper hostname is set.
	 */
	slurm_conf_init(conf->conffile);
	init_node_conf();
	/* slurm_select_init() must be called before
	 * build_all_nodeline_info() to be called with proper argument. */
	if (slurm_select_init(1) != SLURM_SUCCESS )
		return SLURM_FAILURE;
	build_all_nodeline_info(true);
	build_all_frontend_info(true);

	/*
	 * Read global slurm config file, override necessary values from
	 * defaults and command line.
	 */
	_read_config();

	/* Larger of configured CPU count and the detected block map size */
	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);

	if ((gres_plugin_init() != SLURM_SUCCESS) ||
	    (gres_plugin_node_config_load(cpu_cnt) != SLURM_SUCCESS))
		return SLURM_FAILURE;
	if (slurm_topo_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;

	/*
	 * Get and set slurmd topology information
	 * Build node hash table first to speed up the topo build
	 */
	rehash_node();
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * Check for cpu frequency set capabilities on this node
	 */
	cpu_freq_init(conf);

	_print_conf();

	/* Required plugin stacks; any failure aborts initialization. */
	if (slurm_proctrack_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurmd_task_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (spank_slurmd_init() < 0)
		return SLURM_FAILURE;

	/* Raise soft resource limits to their hard maximums. */
	if (getrlimit(RLIMIT_CPU, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CPU, &rlim);
		if (rlim.rlim_max != RLIM_INFINITY) {
			error("Slurmd process CPU time limit is %d seconds",
			      (int) rlim.rlim_max);
		}
	}

	if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_NOFILE, &rlim);
	}
#ifndef NDEBUG
	/* Allow core dumps from slurmd itself in debug builds only */
	if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CORE, &rlim);
	}
#endif /* !NDEBUG */

	/*
	 * Create a context for verifying slurm job credentials
	 */
	if (!(conf->vctx = slurm_cred_verifier_ctx_create(conf->pubkey)))
		return SLURM_FAILURE;
	if (!strcmp(conf->select_type, "select/serial")) {
		/* Only cache credential for 5 seconds with select/serial
		 * for shorter cache searches and higher throughput */
		slurm_cred_ctx_set(conf->vctx, SLURM_CRED_OPT_EXPIRY_WINDOW, 5);
	}

	/*
	 * Create slurmd spool directory if necessary.
	 */
	if (_set_slurmd_spooldir() < 0) {
		error("Unable to initialize slurmd spooldir");
		return SLURM_FAILURE;
	}

	if (conf->cleanstart) {
		/*
		 * Need to kill any running slurmd's here
		 */
		_kill_old_slurmd();

		stepd_cleanup_sockets(conf->spooldir, conf->node_name);
		_stepd_cleanup_batch_dirs(conf->spooldir, conf->node_name);
	}

	if (conf->daemonize) {
		bool success = false;

		/* Prefer the log file's directory as the working directory
		 * (so core files land somewhere writable), then the spool
		 * directory, then /var/tmp as a last resort. */
		if (conf->logfile && (conf->logfile[0] == '/')) {
			char *slash_ptr, *work_dir;
			work_dir = xstrdup(conf->logfile);
			slash_ptr = strrchr(work_dir, '/');
			if (slash_ptr == work_dir)
				work_dir[1] = '\0';
			else
				slash_ptr[0] = '\0';
			if ((access(work_dir, W_OK) != 0) ||
			    (chdir(work_dir) < 0)) {
				error("Unable to chdir to %s", work_dir);
			} else
				success = true;
			xfree(work_dir);
		}

		if (!success) {
			if ((access(conf->spooldir, W_OK) != 0) ||
			    (chdir(conf->spooldir) < 0)) {
				error("Unable to chdir to %s", conf->spooldir);
			} else
				success = true;
		}

		if (!success) {
			if ((access("/var/tmp", W_OK) != 0) ||
			    (chdir("/var/tmp") < 0)) {
				error("chdir(/var/tmp): %m");
				return SLURM_FAILURE;
			} else
				info("chdir to /var/tmp");
		}
	}

	/*
	 * Cache the group access list
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	if ((devnull = open_cloexec("/dev/null", O_RDWR)) < 0) {
		error("Unable to open /dev/null: %m");
		return SLURM_FAILURE;
	}

	/* make sure we have slurmstepd installed */
	if (stat(conf->stepd_loc, &stat_buf))
		fatal("Unable to find slurmstepd file at %s",
		      conf->stepd_loc);
	if (!S_ISREG(stat_buf.st_mode))
		fatal("slurmstepd not a file at %s", conf->stepd_loc);

	return SLURM_SUCCESS;
}