void step_terminate_monitor_start(uint32_t jobid, uint32_t stepid)
{
	slurm_ctl_conf_t *conf;
	pthread_attr_t attr;

	pthread_mutex_lock(&lock);
	if (running_flag) {
		pthread_mutex_unlock(&lock);
		return;
	}

	conf = slurm_conf_lock();
	if (conf->unkillable_program == NULL) {
		/* do nothing */
		slurm_conf_unlock();
		pthread_mutex_unlock(&lock);
		return;
	}
	timeout = conf->unkillable_timeout;
	program_name = xstrdup(conf->unkillable_program);
	slurm_conf_unlock();

	slurm_attr_init(&attr);
	pthread_create(&tid, &attr, monitor, NULL);
	slurm_attr_destroy(&attr);

	running_flag = 1;
	recorded_jobid = jobid;
	recorded_stepid = stepid;

	pthread_mutex_unlock(&lock);

	return;
}
void step_terminate_monitor_start(stepd_step_rec_t *job)
{
	slurm_ctl_conf_t *conf;
	pthread_attr_t attr;

	slurm_mutex_lock(&lock);
	if (running_flag) {
		slurm_mutex_unlock(&lock);
		return;
	}

	conf = slurm_conf_lock();
	timeout = conf->unkillable_timeout;
	program_name = xstrdup(conf->unkillable_program);
	slurm_conf_unlock();

	slurm_attr_init(&attr);
	pthread_create(&tid, &attr, _monitor, job);
	slurm_attr_destroy(&attr);

	running_flag = 1;
	recorded_jobid = job->jobid;
	recorded_stepid = job->stepid;

	slurm_mutex_unlock(&lock);

	return;
}
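/*
 * Usage sketch (illustrative, not from the original source): how a
 * slurmstepd code path might wrap a blocking step-termination sequence
 * with the monitor started above (stepd_step_rec_t variant).  The
 * step_terminate_monitor_stop() counterpart and the _kill_all_tasks()
 * helper are assumed here for illustration only.
 */
static void _terminate_step_example(stepd_step_rec_t *job)
{
	/* Arm the UnkillableStepProgram/UnkillableStepTimeout watchdog */
	step_terminate_monitor_start(job);

	/* Signal the tasks and wait for them to exit (assumed helper) */
	_kill_all_tasks(job);

	/* Disarm the watchdog once the step is known to be gone */
	step_terminate_monitor_stop();
}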
/* Initialize power_save module parameters. * Return 0 on valid configuration to run power saving, * otherwise log the problem and return -1 */ static int _init_power_config(void) { slurm_ctl_conf_t *conf = slurm_conf_lock(); last_config = slurmctld_conf.last_update; idle_time = conf->suspend_time - 1; suspend_rate = conf->suspend_rate; resume_timeout = conf->resume_timeout; resume_rate = conf->resume_rate; slurmd_timeout = conf->slurmd_timeout; suspend_timeout = conf->suspend_timeout; _clear_power_config(); if (conf->suspend_program) suspend_prog = xstrdup(conf->suspend_program); if (conf->resume_program) resume_prog = xstrdup(conf->resume_program); if (conf->suspend_exc_nodes) exc_nodes = xstrdup(conf->suspend_exc_nodes); if (conf->suspend_exc_parts) exc_parts = xstrdup(conf->suspend_exc_parts); slurm_conf_unlock(); if (idle_time < 0) { /* not an error */ debug("power_save module disabled, SuspendTime < 0"); return -1; } if (suspend_rate < 0) { error("power_save module disabled, SuspendRate < 0"); return -1; } if (resume_rate < 0) { error("power_save module disabled, ResumeRate < 0"); return -1; } if (suspend_prog == NULL) { error("power_save module disabled, NULL SuspendProgram"); return -1; } else if (!_valid_prog(suspend_prog)) { error("power_save module disabled, invalid SuspendProgram %s", suspend_prog); return -1; } if (resume_prog == NULL) { error("power_save module disabled, NULL ResumeProgram"); return -1; } else if (!_valid_prog(resume_prog)) { error("power_save module disabled, invalid ResumeProgram %s", resume_prog); return -1; } return 0; }
static bool _slurm_authorized_user()
{
	uid_t uid, slurm_user_id;
	slurm_ctl_conf_t *conf;

	conf = slurm_conf_lock();
	slurm_user_id = (uid_t)conf->slurm_user_id;
	slurm_conf_unlock();

	uid = getuid();

	return ((uid == (uid_t)0) || (uid == slurm_user_id));
}
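/*
 * Usage sketch (illustrative, not from the original source): gate a
 * privileged request on _slurm_authorized_user() above, so only root or
 * the configured SlurmUser may proceed.  The function name and the errno
 * value returned here are illustrative choices, not the original callers.
 */
static int _handle_privileged_request_example(void)
{
	if (!_slurm_authorized_user()) {
		error("Security violation: privileged request from uid %u",
		      (unsigned int) getuid());
		return ESLURM_USER_ID_MISSING;	/* illustrative errno */
	}
	/* ... perform the privileged operation ... */
	return SLURM_SUCCESS;
}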
/* run a suspend or resume program
 * prog IN	- program to run
 * arg IN	- program arguments, the hostlist expression */
static pid_t _run_prog(char *prog, char *arg)
{
	int i;
	char program[1024], arg0[1024], arg1[1024], *pname;
	pid_t child;
	slurm_ctl_conf_t *ctlconf;

	if (prog == NULL)	/* disabled, useful for testing */
		return -1;

	strncpy(program, prog, sizeof(program));
	program[sizeof(program) - 1] = '\0';	/* strncpy may not terminate */
	pname = strrchr(program, '/');
	if (pname == NULL)
		pname = program;
	else
		pname++;
	strncpy(arg0, pname, sizeof(arg0));
	arg0[sizeof(arg0) - 1] = '\0';
	strncpy(arg1, arg, sizeof(arg1));
	arg1[sizeof(arg1) - 1] = '\0';

	child = fork();
	if (child == 0) {
		for (i = 0; i < 128; i++)
			close(i);
#ifdef SETPGRP_TWO_ARGS
		setpgrp(0, 0);
#else
		setpgrp();
#endif
		ctlconf = slurm_conf_lock();
		setenv("SLURM_CONF", ctlconf->slurm_conf, 1);
		slurm_conf_unlock();
		execl(program, arg0, arg1, (char *) NULL);
		exit(1);
	} else if (child < 0) {
		error("fork: %m");
	} else {
		/* save the pid */
		for (i = 0; i < PID_CNT; i++) {
			if (child_pid[i])
				continue;
			child_pid[i] = child;
			child_time[i] = time(NULL);
			break;
		}
		if (i == PID_CNT)
			error("power_save: filled child_pid array");
	}
	return child;
}
/* run a suspend or resume program
 * prog IN	- program to run
 * arg1 IN	- first program argument, the hostlist expression
 * arg2 IN	- second program argument or NULL */
static pid_t _run_prog(char *prog, char *arg1, char *arg2)
{
	int i;
	char *argv[4], *pname;
	pid_t child;
	slurm_ctl_conf_t *ctlconf;

	if (prog == NULL)	/* disabled, useful for testing */
		return -1;

	pname = strrchr(prog, '/');
	if (pname == NULL)
		argv[0] = prog;
	else
		argv[0] = pname + 1;
	argv[1] = arg1;
	argv[2] = arg2;
	argv[3] = NULL;

	child = fork();
	if (child == 0) {
		for (i = 0; i < 1024; i++)
			(void) close(i);
#ifdef SETPGRP_TWO_ARGS
		setpgrp(0, 0);
#else
		setpgrp();
#endif
		ctlconf = slurm_conf_lock();
		setenv("SLURM_CONF", ctlconf->slurm_conf, 1);
		slurm_conf_unlock();
		execv(prog, argv);
		exit(1);
	} else if (child < 0) {
		error("fork: %m");
	} else {
		/* save the pid */
		for (i = 0; i < PID_CNT; i++) {
			if (child_pid[i])
				continue;
			child_pid[i] = child;
			child_time[i] = time(NULL);
			break;
		}
		if (i == PID_CNT)
			error("power_save: filled child_pid array");
	}
	return child;
}
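/*
 * Usage sketch (illustrative, not from the original source): how a
 * power_save caller might hand a hostlist expression to the newer,
 * three-argument _run_prog() above.  suspend_prog is the global set up by
 * _init_power_config(); the second program argument is passed as NULL here
 * because this excerpt does not show what the real callers put there.
 */
static void _do_suspend_example(char *host_list)
{
	pid_t pid = _run_prog(suspend_prog, host_list, NULL);

	if (pid < 0)
		debug("power_save: SuspendProgram disabled, nothing started");
	else
		debug("power_save: suspending %s, pid %d",
		      host_list, (int) pid);
}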
/*****************************************************************************\
 * spawn message handler thread
\*****************************************************************************/
extern int spawn_msg_thread(void)
{
	pthread_attr_t thread_attr_msg;
	slurm_ctl_conf_t *conf;
	/* Locks: Read configuration */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };

	lock_slurmctld(config_read_lock);
	conf = slurm_conf_lock();
	sched_port = conf->dynalloc_port;
	slurm_conf_unlock();
	unlock_slurmctld(config_read_lock);

	if (sched_port == 0) {
		error("DynAllocPort == 0, not spawning communication thread");
		return SLURM_ERROR;
	}

	slurm_mutex_lock(&thread_flag_mutex);
	if (thread_running) {
		error("dynalloc thread already running, not starting another");
		slurm_mutex_unlock(&thread_flag_mutex);
		return SLURM_ERROR;
	}

	slurm_attr_init(&thread_attr_msg);
	if (pthread_create(&msg_thread_id, &thread_attr_msg,
			   _msg_thread, NULL))
		fatal("pthread_create %m");
	else
		info("dynalloc: msg thread create successful!");

	slurm_attr_destroy(&thread_attr_msg);
	thread_running = true;
	slurm_mutex_unlock(&thread_flag_mutex);
	return SLURM_SUCCESS;
}
int main(int argc, char **argv)
{
	int status = 1;
	slurm_ctl_conf_t *slurm_ctl_conf_ptr = slurm_conf_lock();

	slurm_ctl_conf_ptr->msg_timeout = 3;
	slurm_conf_unlock();

#if SLURM_VERSION_NUMBER < SLURM_VERSION_NUM(18,8,0)
	for (uint32_t i = 1; i < 2; i++) {
#else
	for (uint32_t i = 0; i < slurm_ctl_conf_ptr->control_cnt; i++) {
#endif
		printf("slurm_ping(%i) == %i\n", i, slurm_ping(i));
		if (slurm_ping(i) == SLURM_SUCCESS) {
			status = 0;
			break;
		}
	}

	slurm_conf_destroy();

	return status;
}
static void _reconfigure(void) { List steps; ListIterator i; slurm_ctl_conf_t *cf; step_loc_t *stepd; bool did_change; _reconfig = 0; slurm_conf_reinit(conf->conffile); _read_config(); /* * Rebuild topology information and refresh slurmd topo infos */ slurm_topo_build_config(); _set_topo_info(); /* * In case the administrator changed the cpu frequency set capabilities * on this node, rebuild the cpu frequency table information */ cpu_freq_init(conf); _print_conf(); /* * Make best effort at changing to new public key */ slurm_cred_ctx_key_update(conf->vctx, conf->pubkey); /* * Reinitialize the groups cache */ cf = slurm_conf_lock(); if (cf->group_info & GROUP_CACHE) init_gids_cache(1); else init_gids_cache(0); slurm_conf_unlock(); /* send reconfig to each stepd so they can refresh their log * file handle */ steps = stepd_available(conf->spooldir, conf->node_name); i = list_iterator_create(steps); while ((stepd = list_next(i))) { int fd; fd = stepd_connect(stepd->directory, stepd->nodename, stepd->jobid, stepd->stepid); if (fd == -1) continue; if (stepd_reconfig(fd) != SLURM_SUCCESS) debug("Reconfig jobid=%u.%u failed: %m", stepd->jobid, stepd->stepid); close(fd); } list_iterator_destroy(i); list_destroy(steps); gres_plugin_reconfig(&did_change); (void) switch_g_reconfig(); container_g_reconfig(); if (did_change) { uint32_t cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size); (void) gres_plugin_node_config_load(cpu_cnt); send_registration_msg(SLURM_SUCCESS, false); } /* reconfigure energy */ acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL); /* * XXX: reopen slurmd port? */ }
/*
 * Connect to a slurmstepd process by way of its unix domain socket.
 *
 * Both "directory" and "nodename" may be null, in which case stepd_connect
 * will attempt to determine them on its own.  If you are using multiple
 * slurmd on one node (unusual outside of development environments), you
 * will get one of the local NodeNames more-or-less at random.
 *
 * Returns a socket descriptor for the opened socket on success,
 * and -1 on error.
 */
int stepd_connect(const char *directory, const char *nodename,
		  uint32_t jobid, uint32_t stepid)
{
	int req = REQUEST_CONNECT;
	int fd = -1;
	int rc;
	void *auth_cred;
	Buf buffer;
	int len;

	if (nodename == NULL) {
		if (!(nodename = _guess_nodename()))
			return -1;
	}
	if (directory == NULL) {
		slurm_ctl_conf_t *cf;

		cf = slurm_conf_lock();
		directory = slurm_conf_expand_slurmd_path(
			cf->slurmd_spooldir, nodename);
		slurm_conf_unlock();
	}

	buffer = init_buf(0);

	/* Create an auth credential */
	auth_cred = g_slurm_auth_create(NULL, 2, NULL);
	if (auth_cred == NULL) {
		error("Creating authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}

	/* Pack the auth credential; report any error before the
	 * credential is destroyed */
	rc = g_slurm_auth_pack(auth_cred, buffer);
	if (rc) {
		error("Packing authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
		(void) g_slurm_auth_destroy(auth_cred);
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}
	(void) g_slurm_auth_destroy(auth_cred);

	/* Connect to the step */
	fd = _step_connect(directory, nodename, jobid, stepid);
	if (fd == -1)
		goto fail1;

	safe_write(fd, &req, sizeof(int));
	len = size_buf(buffer);
	safe_write(fd, &len, sizeof(int));
	safe_write(fd, get_buf_data(buffer), len);

	safe_read(fd, &rc, sizeof(int));
	if (rc < 0) {
		error("slurmstepd refused authentication: %m");
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto rwfail;
	}

	free_buf(buffer);
	return fd;

rwfail:
	close(fd);
fail1:
	free_buf(buffer);
	return -1;
}
extern void launch_common_set_stdio_fds(srun_job_t *job, slurm_step_io_fds_t *cio_fds) { bool err_shares_out = false; int file_flags; if (opt.open_mode == OPEN_MODE_APPEND) file_flags = O_CREAT|O_WRONLY|O_APPEND; else if (opt.open_mode == OPEN_MODE_TRUNCATE) file_flags = O_CREAT|O_WRONLY|O_APPEND|O_TRUNC; else { slurm_ctl_conf_t *conf; conf = slurm_conf_lock(); if (conf->job_file_append) file_flags = O_CREAT|O_WRONLY|O_APPEND; else file_flags = O_CREAT|O_WRONLY|O_APPEND|O_TRUNC; slurm_conf_unlock(); } /* * create stdin file descriptor */ if (_is_local_file(job->ifname)) { if ((job->ifname->name == NULL) || (job->ifname->taskid != -1)) { cio_fds->in.fd = STDIN_FILENO; } else { cio_fds->in.fd = open(job->ifname->name, O_RDONLY); if (cio_fds->in.fd == -1) { error("Could not open stdin file: %m"); exit(error_exit); } } if (job->ifname->type == IO_ONE) { cio_fds->in.taskid = job->ifname->taskid; cio_fds->in.nodeid = slurm_step_layout_host_id( launch_common_get_slurm_step_layout(job), job->ifname->taskid); } } /* * create stdout file descriptor */ if (_is_local_file(job->ofname)) { if ((job->ofname->name == NULL) || (job->ofname->taskid != -1)) { cio_fds->out.fd = STDOUT_FILENO; } else { cio_fds->out.fd = open(job->ofname->name, file_flags, 0644); if (cio_fds->out.fd == -1) { error("Could not open stdout file: %m"); exit(error_exit); } } if (job->ofname->name != NULL && job->efname->name != NULL && !strcmp(job->ofname->name, job->efname->name)) { err_shares_out = true; } } /* * create seperate stderr file descriptor only if stderr is not sharing * the stdout file descriptor */ if (err_shares_out) { debug3("stdout and stderr sharing a file"); cio_fds->err.fd = cio_fds->out.fd; cio_fds->err.taskid = cio_fds->out.taskid; } else if (_is_local_file(job->efname)) { if ((job->efname->name == NULL) || (job->efname->taskid != -1)) { cio_fds->err.fd = STDERR_FILENO; } else { cio_fds->err.fd = open(job->efname->name, file_flags, 0644); if (cio_fds->err.fd == -1) { error("Could not open stderr file: %m"); exit(error_exit); } } } }
/* * _set_collectors call the split_hostlist API on the all nodes hostlist * to set the node to be used as a collector for unsolicited node aggregation. * * If this node is a forwarding node (first node in any hostlist), * then its collector and backup are the ControlMachine and it's backup. * * Otherwise, we find the hostlist containing this node. * The forwarding node in that hostlist becomes a collector, the next node * which is not this node becomes the backup. * That list is split, we iterate through it and searching for a list in * which this node is a forwarding node. If found, we set the collector and * backup, else this process is repeated. */ static void _set_collectors(char *this_node_name) { slurm_ctl_conf_t *conf; hostlist_t nodes; hostlist_t* hll = NULL; char *parent = NULL, *backup = NULL; char addrbuf[32]; int i, j, f = -1; int hl_count = 0; uint16_t parent_port; uint16_t backup_port; bool found = false; bool ctldparent = true; #ifdef HAVE_FRONT_END return; /* on a FrontEnd system this would never be useful. */ #endif if (!run_in_daemon("slurmd")) return; /* Only compute nodes have collectors */ /* Set the initial iteration, collector is controller, * full list is split */ xassert(this_node_name); conf = slurm_conf_lock(); nodes = _get_all_nodes(); parent = strdup(conf->control_addr); if (conf->backup_addr) { backup = strdup(conf->backup_addr); } parent_port = conf->slurmctld_port; backup_port = parent_port; slurm_conf_unlock(); while (!found) { if ( route_g_split_hostlist(nodes, &hll, &hl_count) ) { error("unable to split forward hostlist"); goto clean; /* collector addrs remains null */ } /* Find which hostlist contains this node */ for (i=0; i < hl_count; i++) { f = hostlist_find(hll[i], this_node_name); if (f != -1) break; } if (i == hl_count) { fatal("ROUTE -- %s not found in node_record_table", this_node_name); } if (f == 0) { /* we are a forwarded to node, * so our parent is parent */ if (hostlist_count(hll[i]) > 1) this_is_collector = true; xfree(msg_collect_node); msg_collect_node = xmalloc(sizeof(slurm_addr_t)); if (ctldparent) slurm_set_addr(msg_collect_node, parent_port, parent); else { slurm_conf_get_addr(parent, msg_collect_node); msg_collect_node->sin_port = htons(parent_port); } if (debug_flags & DEBUG_FLAG_ROUTE) { slurm_print_slurm_addr(msg_collect_node, addrbuf, 32); info("ROUTE -- message collector address is %s", addrbuf); } xfree(msg_collect_backup); if (backup) { msg_collect_backup = xmalloc(sizeof(slurm_addr_t)); if (ctldparent) { slurm_set_addr(msg_collect_backup, backup_port, backup); } else { slurm_conf_get_addr(backup, msg_collect_backup); msg_collect_backup->sin_port = htons(backup_port); } if (debug_flags & DEBUG_FLAG_ROUTE) { slurm_print_slurm_addr( msg_collect_backup, addrbuf, 32); info("ROUTE -- message collector backup" " address is %s", addrbuf); } } else { if (debug_flags & DEBUG_FLAG_ROUTE) { info("ROUTE -- no message collector " "backup"); } } found = true; goto clean; } /* We are not a forwarding node, the first node in this list * will split the forward_list. * We also know that the forwarding node is not a controller. 
* * clean up parent context */ ctldparent = false; hostlist_destroy(nodes); if (parent) free(parent); if (backup) free(backup); nodes = hostlist_copy(hll[i]); for (j=0; j < hl_count; j++) { hostlist_destroy(hll[j]); } xfree(hll); /* set our parent, backup, and continue search */ parent = hostlist_shift(nodes); backup = hostlist_nth(nodes, 0); if (strcmp(backup, this_node_name) == 0) { free(backup); backup = NULL; if (hostlist_count(nodes) > 1) backup = hostlist_nth(nodes, 1); } parent_port = slurm_conf_get_port(parent); if (backup) { backup_port = slurm_conf_get_port(backup); } else backup_port = 0; } clean: if (debug_flags & DEBUG_FLAG_ROUTE) { if (this_is_collector) info("ROUTE -- %s is a collector node", this_node_name); else info("ROUTE -- %s is a leaf node", this_node_name); } hostlist_destroy(nodes); if (parent) free(parent); if (backup) free(backup); for (i=0; i < hl_count; i++) { hostlist_destroy(hll[i]); } xfree(hll); }
/*
 * Set either the current frequency (speed)
 * or the min/max/governor based on the --cpu-freq parameter
 */
static void _cpu_freq_setup_data(stepd_step_rec_t *job, int cpx)
{
	uint32_t freq;

	if (   (job->cpu_freq_min == NO_VAL || job->cpu_freq_min == 0)
	    && (job->cpu_freq_max == NO_VAL || job->cpu_freq_max == 0)
	    && (job->cpu_freq_gov == NO_VAL || job->cpu_freq_gov == 0)) {
		/* If no --cpu-freq, use default governor from conf file. */
		slurm_ctl_conf_t *conf = slurm_conf_lock();
		job->cpu_freq_gov = conf->cpu_freq_def;
		slurm_conf_unlock();

		if (job->cpu_freq_gov == NO_VAL)
			return;
	}

	/* Get current state */
	if (_cpu_freq_current_state(cpx) == SLURM_FAILURE)
		return;

	if (job->cpu_freq_min == NO_VAL &&
	    job->cpu_freq_max != NO_VAL &&
	    job->cpu_freq_gov == NO_VAL) {
		/* Pre version 15.08 behavior */
		freq = _cpu_freq_freqspec_num(job->cpu_freq_max, cpx);
		cpufreq[cpx].new_frequency = freq;
		goto newfreq;
	}
	if (job->cpu_freq_gov == CPU_FREQ_USERSPACE) {
		_cpu_freq_govspec_string(job->cpu_freq_gov, cpx);
		if (job->cpu_freq_max == NO_VAL) {
			return; /* pre version 15.08 behavior. */
		}
		/* Power capping */
		freq = _cpu_freq_freqspec_num(job->cpu_freq_max, cpx);
		cpufreq[cpx].new_frequency = freq;
		freq = _cpu_freq_freqspec_num(job->cpu_freq_min, cpx);
		cpufreq[cpx].new_min_freq = freq;
		goto newfreq;
	}
	if (job->cpu_freq_min != NO_VAL && job->cpu_freq_max != NO_VAL) {
		freq = _cpu_freq_freqspec_num(job->cpu_freq_min, cpx);
		cpufreq[cpx].new_min_freq = freq;
		freq = _cpu_freq_freqspec_num(job->cpu_freq_max, cpx);
		cpufreq[cpx].new_max_freq = freq;
	}
	if (job->cpu_freq_gov != NO_VAL) {
		_cpu_freq_govspec_string(job->cpu_freq_gov, cpx);
	}

newfreq:
	/* Make sure a 'new' frequency is within scaling min/max */
	if (cpufreq[cpx].new_frequency != NO_VAL) {
		if (cpufreq[cpx].new_frequency < cpufreq[cpx].org_min_freq) {
			cpufreq[cpx].new_min_freq = cpufreq[cpx].new_frequency;
		}
		if (cpufreq[cpx].new_frequency > cpufreq[cpx].org_max_freq) {
			cpufreq[cpx].new_max_freq = cpufreq[cpx].new_frequency;
		}
	}
}
/* * Scan for available running slurm step daemons by checking * "directory" for unix domain sockets with names beginning in "nodename". * * Both "directory" and "nodename" may be null, in which case stepd_available * will attempt to determine them on its own. If you are using multiple * slurmd on one node (unusual outside of development environments), you * will get one of the local NodeNames more-or-less at random. * * Returns a List of pointers to step_loc_t structures. */ List stepd_available(const char *directory, const char *nodename) { List l; DIR *dp; struct dirent *ent; regex_t re; struct stat stat_buf; if (nodename == NULL) { if (!(nodename = _guess_nodename())) return NULL; } if (directory == NULL) { slurm_ctl_conf_t *cf; cf = slurm_conf_lock(); directory = slurm_conf_expand_slurmd_path( cf->slurmd_spooldir, nodename); slurm_conf_unlock(); } l = list_create((ListDelF) _free_step_loc_t); if (_sockname_regex_init(&re, nodename) == -1) goto done; /* * Make sure that "directory" exists and is a directory. */ if (stat(directory, &stat_buf) < 0) { error("Domain socket directory %s: %m", directory); goto done; } else if (!S_ISDIR(stat_buf.st_mode)) { error("%s is not a directory", directory); goto done; } if ((dp = opendir(directory)) == NULL) { error("Unable to open directory: %m"); goto done; } while ((ent = readdir(dp)) != NULL) { step_loc_t *loc; uint32_t jobid, stepid; if (_sockname_regex(&re, ent->d_name, &jobid, &stepid) == 0) { debug4("found jobid = %u, stepid = %u", jobid, stepid); loc = xmalloc(sizeof(step_loc_t)); loc->directory = xstrdup(directory); loc->nodename = xstrdup(nodename); loc->jobid = jobid; loc->stepid = stepid; list_append(l, (void *)loc); } } closedir(dp); done: regfree(&re); return l; }
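/*
 * Usage sketch (illustrative, not from the original source): enumerate the
 * running step daemons with stepd_available() above and open a connection
 * to each one, mirroring the loop used by _reconfigure().  NULL is passed
 * for "directory" and "nodename" so both are resolved from the slurm
 * configuration; the four-argument stepd_connect() variant shown earlier
 * (without protocol_version) is assumed here.
 */
static void _walk_local_steps_example(void)
{
	List steps = stepd_available(NULL, NULL);
	ListIterator i;
	step_loc_t *stepd;

	if (!steps)
		return;

	i = list_iterator_create(steps);
	while ((stepd = list_next(i))) {
		int fd = stepd_connect(stepd->directory, stepd->nodename,
				       stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;
		debug("connected to step %u.%u", stepd->jobid, stepd->stepid);
		close(fd);
	}
	list_iterator_destroy(i);
	list_destroy(steps);
}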
/*****************************************************************************\
 * message handler thread
\*****************************************************************************/
static void *_msg_thread(void *no_data)
{
	slurm_fd_t sock_fd = -1, new_fd;
	slurm_addr_t cli_addr;
	char *msg;
	slurm_ctl_conf_t *conf;
	int i;
	/* Locks: Write configuration, job, node, and partition */
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	conf = slurm_conf_lock();
	sched_port = conf->schedport;
	slurm_conf_unlock();

	/* Wait until configuration is completely loaded */
	lock_slurmctld(config_write_lock);
	unlock_slurmctld(config_write_lock);

	/* If SchedulerPort is already taken, keep trying to open it
	 * once per minute. Slurmctld will continue to function
	 * during this interval even if nothing can be scheduled. */
	for (i = 0; (!thread_shutdown); i++) {
		if (i > 0)
			sleep(60);
		sock_fd = slurm_init_msg_engine_port(sched_port);
		if (sock_fd != SLURM_SOCKET_ERROR)
			break;
		error("wiki: slurm_init_msg_engine_port %u %m", sched_port);
		error("wiki: Unable to communicate with Moab");
	}

	/* Process incoming RPCs until told to shutdown */
	while (!thread_shutdown) {
		if ((new_fd = slurm_accept_msg_conn(sock_fd, &cli_addr))
		    == SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("wiki: slurm_accept_msg_conn %m");
			continue;
		}
		if (thread_shutdown) {
			close(new_fd);
			break;
		}
		/* It would be nice to create a pthread for each new
		 * RPC, but that leaks memory on some systems when
		 * done from a plugin.
		 * FIXME: Maintain a pool of pthreads and reuse them. */
		err_code = 0;
		err_msg = "";
		msg = _recv_msg(new_fd);
		if (msg) {
			_proc_msg(new_fd, msg);
			xfree(msg);
		}
		slurm_close_accepted_conn(new_fd);
	}

	if (sock_fd > 0)
		(void) slurm_shutdown_msg_engine(sock_fd);
	pthread_exit((void *) 0);
	return NULL;
}
/*****************************************************************************\ * parse_wiki_config - Results go into global variables * RET SLURM_SUCESS or error code * * wiki_conf options * JobPriority=hold|run * AuthKey=number \*****************************************************************************/ extern int parse_wiki_config(void) { s_p_options_t options[] = { {"AuthKey", S_P_STRING}, {"EHost", S_P_STRING}, {"EHostBackup", S_P_STRING}, {"EPort", S_P_UINT16}, {"ExcludePartitions", S_P_STRING}, {"HidePartitionJobs", S_P_STRING}, {"HidePartitionNodes", S_P_STRING}, {"HostFormat", S_P_UINT16}, {"JobAggregationTime", S_P_UINT16}, {"JobPriority", S_P_STRING}, {NULL} }; s_p_hashtbl_t *tbl; char *exclude_partitions, *hide_partitions, *hide_part_nodes; char *key = NULL, *priority_mode = NULL, *wiki_conf; struct stat buf; slurm_ctl_conf_t *conf; int i; /* Set default values */ for (i=0; i<EXC_PART_CNT; i++) exclude_part_ptr[i] = NULL; for (i=0; i<HIDE_PART_CNT; i++) hide_part_ptr[i] = NULL; for (i=0; i<HIDE_PART_CNT; i++) hide_part_nodes_ptr[i] = NULL; conf = slurm_conf_lock(); strncpy(e_host, conf->control_addr, sizeof(e_host)); if (conf->backup_addr) { strncpy(e_host_bu, conf->backup_addr, sizeof(e_host)); } kill_wait = conf->kill_wait; slurm_conf_unlock(); wiki_conf = get_extra_conf_path("wiki.conf"); if ((wiki_conf == NULL) || (stat(wiki_conf, &buf) == -1)) { debug("No wiki.conf file (%s)", wiki_conf); xfree(wiki_conf); return SLURM_SUCCESS; } debug("Reading wiki.conf file (%s)",wiki_conf); tbl = s_p_hashtbl_create(options); if (s_p_parse_file(tbl, NULL, wiki_conf, false) == SLURM_ERROR) fatal("something wrong with opening/reading wiki.conf file"); if (! s_p_get_string(&key, "AuthKey", tbl)) debug("Warning: No wiki_conf AuthKey specified"); else { strncpy(auth_key, key, sizeof(auth_key)); xfree(key); } if ( s_p_get_string(&key, "EHost", tbl)) { strncpy(e_host, key, sizeof(e_host)); xfree(key); } else debug("wiki: Using ControlAddr for EHost value"); if ( s_p_get_string(&key, "EHostBackup", tbl)) { strncpy(e_host_bu, key, sizeof(e_host_bu)); xfree(key); } s_p_get_uint16(&e_port, "EPort", tbl); if (s_p_get_uint16(&job_aggregation_time, "JobAggregationTime", tbl)) error("JobAggregationTime not used by sched/wiki"); if (s_p_get_uint16(&host_format, "HostFormat", tbl)) error("HostFormat not used by sched/wiki"); if (s_p_get_string(&exclude_partitions, "ExcludePartitions", tbl)) { char *tok = NULL, *tok_p = NULL; tok = strtok_r(exclude_partitions, ",", &tok_p); i = 0; while (tok) { if (i >= EXC_PART_CNT) { error("ExcludePartitions has too many entries " "skipping %s and later entries", tok); break; } exclude_part_ptr[i] = find_part_record(tok); if (exclude_part_ptr[i]) i++; else error("ExcludePartitions %s not found", tok); tok = strtok_r(NULL, ",", &tok_p); } } if (s_p_get_string(&hide_partitions, "HidePartitionJobs", tbl)) { char *tok = NULL, *tok_p = NULL; tok = strtok_r(hide_partitions, ",", &tok_p); i = 0; while (tok) { if (i >= HIDE_PART_CNT) { error("HidePartitionJobs has too many entries " "skipping %s and later entries", tok); break; } hide_part_ptr[i] = find_part_record(tok); if (hide_part_ptr[i]) i++; else error("HidePartitionJobs %s not found", tok); tok = strtok_r(NULL, ",", &tok_p); } } if (s_p_get_string(&hide_part_nodes, "HidePartitionNodes", tbl)) { char *tok = NULL, *tok_p = NULL; tok = strtok_r(hide_part_nodes, ",", &tok_p); i = 0; while (tok) { if (i >= HIDE_PART_CNT) { error("HidePartitionNodes has too many entries " "skipping %s and later entries", tok); break; } 
hide_part_nodes_ptr[i] = find_part_record(tok); if (hide_part_nodes_ptr[i]) i++; else error("HidePartitionNodes %s not found", tok); tok = strtok_r(NULL, ",", &tok_p); } } if (s_p_get_string(&priority_mode, "JobPriority", tbl)) { if (strcasecmp(priority_mode, "hold") == 0) init_prio_mode = PRIO_HOLD; else if (strcasecmp(priority_mode, "run") == 0) init_prio_mode = PRIO_DECREMENT; else error("Invalid value for JobPriority in wiki.conf"); xfree(priority_mode); } s_p_hashtbl_destroy(tbl); xfree(wiki_conf); #if _DEBUG info("AuthKey = %s", auth_key); info("EHost = %s", e_host); info("EHostBackup = %s", e_host_bu); info("EPort = %u", e_port); info("JobAggregationTime = %u sec", job_aggregation_time); info("JobPriority = %s", init_prio_mode ? "run" : "hold"); info("KillWait = %u sec", kill_wait); for (i=0; i<EXC_PART_CNT; i++) { if (!exclude_part_ptr[i]) continue; info("ExcludePartitions = %s", exclude_part_ptr[i]->name); } for (i=0; i<HIDE_PART_CNT; i++) { if (!hide_part_ptr[i]) continue; info("HidePartitionJobs = %s", hide_part_ptr[i]->name); } for (i=0; i<HIDE_PART_CNT; i++) { if (!hide_part_nodes_ptr[i]) continue; info("HidePartitionNodes = %s", hide_part_nodes_ptr[i]->name); } #endif return SLURM_SUCCESS; }
static void _print_conf(void) { slurm_ctl_conf_t *cf; char *str, time_str[32]; int i; cf = slurm_conf_lock(); debug3("NodeName = %s", conf->node_name); debug3("TopoAddr = %s", conf->node_topo_addr); debug3("TopoPattern = %s", conf->node_topo_pattern); if (cf->group_info & GROUP_CACHE) i = 1; else i = 0; debug3("CacheGroups = %d", i); debug3("Confile = `%s'", conf->conffile); debug3("Debug = %d", cf->slurmd_debug); debug3("CPUs = %-2u (CF: %2u, HW: %2u)", conf->cpus, conf->conf_cpus, conf->actual_cpus); debug3("Boards = %-2u (CF: %2u, HW: %2u)", conf->boards, conf->conf_boards, conf->actual_boards); debug3("Sockets = %-2u (CF: %2u, HW: %2u)", conf->sockets, conf->conf_sockets, conf->actual_sockets); debug3("Cores = %-2u (CF: %2u, HW: %2u)", conf->cores, conf->conf_cores, conf->actual_cores); debug3("Threads = %-2u (CF: %2u, HW: %2u)", conf->threads, conf->conf_threads, conf->actual_threads); secs2time_str((time_t)conf->up_time, time_str, sizeof(time_str)); debug3("UpTime = %u = %s", conf->up_time, time_str); str = xmalloc(conf->block_map_size*5); str[0] = '\0'; for (i = 0; i < conf->block_map_size; i++) { char id[10]; sprintf(id, "%u,", conf->block_map[i]); strcat(str, id); } str[strlen(str)-1] = '\0'; /* trim trailing "," */ debug3("Block Map = %s", str); str[0] = '\0'; for (i = 0; i < conf->block_map_size; i++) { char id[10]; sprintf(id, "%u,", conf->block_map_inv[i]); strcat(str, id); } str[strlen(str)-1] = '\0'; /* trim trailing "," */ debug3("Inverse Map = %s", str); xfree(str); debug3("RealMemory = %u", conf->real_memory_size); debug3("TmpDisk = %u", conf->tmp_disk_space); debug3("Epilog = `%s'", conf->epilog); debug3("Logfile = `%s'", cf->slurmd_logfile); debug3("HealthCheck = `%s'", conf->health_check_program); debug3("NodeName = %s", conf->node_name); debug3("NodeAddr = %s", conf->node_addr); debug3("Port = %u", conf->port); debug3("Prolog = `%s'", conf->prolog); debug3("TmpFS = `%s'", conf->tmpfs); debug3("Public Cert = `%s'", conf->pubkey); debug3("Slurmstepd = `%s'", conf->stepd_loc); debug3("Spool Dir = `%s'", conf->spooldir); debug3("Pid File = `%s'", conf->pidfile); debug3("Slurm UID = %u", conf->slurm_user_id); debug3("TaskProlog = `%s'", conf->task_prolog); debug3("TaskEpilog = `%s'", conf->task_epilog); debug3("TaskPluginParam = %u", conf->task_plugin_param); debug3("Use PAM = %u", conf->use_pam); slurm_conf_unlock(); }
/* Initialize power_save module parameters. * Return 0 on valid configuration to run power saving, * otherwise log the problem and return -1 */ static int _init_power_config(void) { slurm_ctl_conf_t *conf = slurm_conf_lock(); last_config = slurmctld_conf.last_update; idle_time = conf->suspend_time - 1; suspend_rate = conf->suspend_rate; resume_timeout = conf->resume_timeout; resume_rate = conf->resume_rate; slurmd_timeout = conf->slurmd_timeout; suspend_timeout = conf->suspend_timeout; _clear_power_config(); if (conf->suspend_program) suspend_prog = xstrdup(conf->suspend_program); if (conf->resume_program) resume_prog = xstrdup(conf->resume_program); if (conf->suspend_exc_nodes) exc_nodes = xstrdup(conf->suspend_exc_nodes); if (conf->suspend_exc_parts) exc_parts = xstrdup(conf->suspend_exc_parts); slurm_conf_unlock(); if (idle_time < 0) { /* not an error */ debug("power_save module disabled, SuspendTime < 0"); return -1; } if (suspend_rate < 0) { error("power_save module disabled, SuspendRate < 0"); return -1; } if (resume_rate < 0) { error("power_save module disabled, ResumeRate < 0"); return -1; } if (suspend_prog == NULL) { error("power_save module disabled, NULL SuspendProgram"); return -1; } else if (!_valid_prog(suspend_prog)) { error("power_save module disabled, invalid SuspendProgram %s", suspend_prog); return -1; } if (resume_prog == NULL) { error("power_save module disabled, NULL ResumeProgram"); return -1; } else if (!_valid_prog(resume_prog)) { error("power_save module disabled, invalid ResumeProgram %s", resume_prog); return -1; } if (exc_nodes && (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) { error("power_save module disabled, " "invalid SuspendExcNodes %s", exc_nodes); return -1; } if (exc_parts) { char *tmp = NULL, *one_part = NULL, *part_list = NULL; struct part_record *part_ptr = NULL; int rc = 0; part_list = xstrdup(exc_parts); one_part = strtok_r(part_list, ",", &tmp); while (one_part != NULL) { part_ptr = find_part_record(one_part); if (!part_ptr) { error("power_save module disabled, " "invalid SuspendExcPart %s", one_part); rc = -1; break; } if (exc_node_bitmap) bit_or(exc_node_bitmap, part_ptr->node_bitmap); else exc_node_bitmap = bit_copy(part_ptr-> node_bitmap); one_part = strtok_r(NULL, ",", &tmp); } xfree(part_list); if (rc) return rc; } if (exc_node_bitmap) { char *tmp = bitmap2node_name(exc_node_bitmap); debug("power_save module, excluded nodes %s", tmp); xfree(tmp); } return 0; }
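/*
 * Usage sketch (illustrative, not from the original source): how a
 * power_save entry point might gate its main loop on _init_power_config()
 * above.  The shutdown test and the _do_power_work() helper are
 * placeholders; the real loop structure is not shown in this excerpt.
 */
static void *_power_save_thread_example(void *arg)
{
	if (_init_power_config() < 0)
		return NULL;	/* misconfigured; power saving disabled */

	while (!slurmctld_config.shutdown_time) {	/* assumed shutdown flag */
		sleep(1);
		_do_power_work();	/* hypothetical helper */
	}
	return NULL;
}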
static int _slurmd_init(void) { struct rlimit rlim; slurm_ctl_conf_t *cf; struct stat stat_buf; uint32_t cpu_cnt; /* * Process commandline arguments first, since one option may be * an alternate location for the slurm config file. */ _process_cmdline(*conf->argc, *conf->argv); /* * Build nodes table like in slurmctld * This is required by the topology stack * Node tables setup must preceed _read_config() so that the * proper hostname is set. */ slurm_conf_init(conf->conffile); init_node_conf(); /* slurm_select_init() must be called before * build_all_nodeline_info() to be called with proper argument. */ if (slurm_select_init(1) != SLURM_SUCCESS ) return SLURM_FAILURE; build_all_nodeline_info(true); build_all_frontend_info(true); /* * Read global slurm config file, override necessary values from * defaults and command line. */ _read_config(); cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size); if ((gres_plugin_init() != SLURM_SUCCESS) || (gres_plugin_node_config_load(cpu_cnt) != SLURM_SUCCESS)) return SLURM_FAILURE; if (slurm_topo_init() != SLURM_SUCCESS) return SLURM_FAILURE; /* * Get and set slurmd topology information * Build node hash table first to speed up the topo build */ rehash_node(); slurm_topo_build_config(); _set_topo_info(); /* * Check for cpu frequency set capabilities on this node */ cpu_freq_init(conf); _print_conf(); if (slurm_proctrack_init() != SLURM_SUCCESS) return SLURM_FAILURE; if (slurmd_task_init() != SLURM_SUCCESS) return SLURM_FAILURE; if (slurm_auth_init(NULL) != SLURM_SUCCESS) return SLURM_FAILURE; if (spank_slurmd_init() < 0) return SLURM_FAILURE; if (getrlimit(RLIMIT_CPU, &rlim) == 0) { rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_CPU, &rlim); if (rlim.rlim_max != RLIM_INFINITY) { error("Slurmd process CPU time limit is %d seconds", (int) rlim.rlim_max); } } if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) { rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_NOFILE, &rlim); } #ifndef NDEBUG if (getrlimit(RLIMIT_CORE, &rlim) == 0) { rlim.rlim_cur = rlim.rlim_max; setrlimit(RLIMIT_CORE, &rlim); } #endif /* !NDEBUG */ /* * Create a context for verifying slurm job credentials */ if (!(conf->vctx = slurm_cred_verifier_ctx_create(conf->pubkey))) return SLURM_FAILURE; if (!strcmp(conf->select_type, "select/serial")) { /* Only cache credential for 5 seconds with select/serial * for shorter cache searches and higher throughput */ slurm_cred_ctx_set(conf->vctx, SLURM_CRED_OPT_EXPIRY_WINDOW, 5); } /* * Create slurmd spool directory if necessary. 
*/ if (_set_slurmd_spooldir() < 0) { error("Unable to initialize slurmd spooldir"); return SLURM_FAILURE; } if (conf->cleanstart) { /* * Need to kill any running slurmd's here */ _kill_old_slurmd(); stepd_cleanup_sockets(conf->spooldir, conf->node_name); _stepd_cleanup_batch_dirs(conf->spooldir, conf->node_name); } if (conf->daemonize) { bool success = false; if (conf->logfile && (conf->logfile[0] == '/')) { char *slash_ptr, *work_dir; work_dir = xstrdup(conf->logfile); slash_ptr = strrchr(work_dir, '/'); if (slash_ptr == work_dir) work_dir[1] = '\0'; else slash_ptr[0] = '\0'; if ((access(work_dir, W_OK) != 0) || (chdir(work_dir) < 0)) { error("Unable to chdir to %s", work_dir); } else success = true; xfree(work_dir); } if (!success) { if ((access(conf->spooldir, W_OK) != 0) || (chdir(conf->spooldir) < 0)) { error("Unable to chdir to %s", conf->spooldir); } else success = true; } if (!success) { if ((access("/var/tmp", W_OK) != 0) || (chdir("/var/tmp") < 0)) { error("chdir(/var/tmp): %m"); return SLURM_FAILURE; } else info("chdir to /var/tmp"); } } /* * Cache the group access list */ cf = slurm_conf_lock(); if (cf->group_info & GROUP_CACHE) init_gids_cache(1); else init_gids_cache(0); slurm_conf_unlock(); if ((devnull = open_cloexec("/dev/null", O_RDWR)) < 0) { error("Unable to open /dev/null: %m"); return SLURM_FAILURE; } /* make sure we have slurmstepd installed */ if (stat(conf->stepd_loc, &stat_buf)) fatal("Unable to find slurmstepd file at %s", conf->stepd_loc); if (!S_ISREG(stat_buf.st_mode)) fatal("slurmstepd not a file at %s", conf->stepd_loc); return SLURM_SUCCESS; }
int main (int argc, char **argv)
{
	log_options_t logopt = LOG_OPTS_STDERR_ONLY;
	slurm_ctl_conf_t *conf = NULL;
	shares_response_msg_t resp;

	log_init(xbasename(argv[0]), logopt, 0, NULL);
	xfree(slurmctld_conf.priority_type);
	//logopt.stderr_level += 5;
	logopt.prefix_level = 1;
	log_alter(logopt, 0, NULL);
	print_fields_have_header = 0;
	print_fields_parsable_print = PRINT_FIELDS_PARSABLE_ENDING;

	conf = slurm_conf_lock();
	/* force priority type to be multifactor */
	xfree(conf->priority_type);
	conf->priority_type = xstrdup("priority/multifactor");
	conf->priority_flags = PRIORITY_FLAGS_FAIR_TREE;
	/* force accounting type to be slurmdbd (It doesn't really talk
	 * to any database, but needs this to work with fairshare
	 * calculation). */
	xfree(conf->accounting_storage_type);
	conf->accounting_storage_type = xstrdup("accounting_storage/slurmdbd");
	/* set up a known environment to test against.  Since we are
	 * only concerned about the fairshare we won't look at the other
	 * factors here. */
	conf->priority_decay_hl = 1;
	conf->priority_favor_small = 0;
	conf->priority_max_age = conf->priority_decay_hl;
	conf->priority_reset_period = 0;
	conf->priority_weight_age = 0;
	conf->priority_weight_fs = 10000;
	conf->priority_weight_js = 0;
	conf->priority_weight_part = 0;
	conf->priority_weight_qos = 0;
	slurm_conf_unlock();

	/* we don't want to do any decay here so make the save state
	 * to /dev/null */
	xfree(slurmctld_conf.state_save_location);
	slurmctld_conf.state_save_location = "/dev/null";

	/* now set up the association tree */
	_setup_assoc_list();

	/* now set up the job list */
	job_list = list_create(_list_delete_job);

	/* now init the priorities of the associations */
	if (slurm_priority_init() != SLURM_SUCCESS)
		fatal("failed to initialize priority plugin");
	/* on some systems that don't have multiple cores we need to
	 * sleep to make sure the thread gets started. */
	sleep(1);
	memset(&resp, 0, sizeof(shares_response_msg_t));
	resp.assoc_shares_list = assoc_mgr_get_shares(NULL, 0, NULL, NULL);
	process(&resp, 0);

	/* free memory */
	if (slurm_priority_fini() != SLURM_SUCCESS)
		fatal("failed to finalize priority plugin");
	if (job_list)
		list_destroy(job_list);
	if (resp.assoc_shares_list)
		list_destroy(resp.assoc_shares_list);
	if (assoc_mgr_assoc_list)
		list_destroy(assoc_mgr_assoc_list);
	if (assoc_mgr_qos_list)
		list_destroy(assoc_mgr_qos_list);

	return 0;
}
/* * Read the slurm configuration file (slurm.conf) and substitute some * values into the slurmd configuration in preference of the defaults. */ static void _read_config(void) { char *path_pubkey = NULL; slurm_ctl_conf_t *cf = NULL; uint16_t tmp16 = 0; #ifndef HAVE_FRONT_END bool cr_flag = false, gang_flag = false; #endif cf = slurm_conf_lock(); slurm_mutex_lock(&conf->config_mutex); if (conf->conffile == NULL) conf->conffile = xstrdup(cf->slurm_conf); conf->slurm_user_id = cf->slurm_user_id; conf->cr_type = cf->select_type_param; path_pubkey = xstrdup(cf->job_credential_public_certificate); if (!conf->logfile) conf->logfile = xstrdup(cf->slurmd_logfile); #ifndef HAVE_FRONT_END if (!strcmp(cf->select_type, "select/cons_res")) cr_flag = true; if (cf->preempt_mode & PREEMPT_MODE_GANG) gang_flag = true; #endif slurm_conf_unlock(); /* node_name may already be set from a command line parameter */ if (conf->node_name == NULL) conf->node_name = slurm_conf_get_nodename(conf->hostname); /* if we didn't match the form of the hostname already * stored in conf->hostname, check to see if we match any * valid aliases */ if (conf->node_name == NULL) conf->node_name = slurm_conf_get_aliased_nodename(); if (conf->node_name == NULL) conf->node_name = slurm_conf_get_nodename("localhost"); if (conf->node_name == NULL) fatal("Unable to determine this slurmd's NodeName"); _massage_pathname(&conf->logfile); /* set node_addr if relevant */ if ((conf->node_addr == NULL) && (conf->node_addr = slurm_conf_get_nodeaddr(conf->hostname)) && (strcmp(conf->node_addr, conf->hostname) == 0)) { xfree(conf->node_addr); /* Sets to NULL */ } conf->port = slurm_conf_get_port(conf->node_name); slurm_conf_get_cpus_bsct(conf->node_name, &conf->conf_cpus, &conf->conf_boards, &conf->conf_sockets, &conf->conf_cores, &conf->conf_threads); /* store hardware properties in slurmd_config */ xfree(conf->block_map); xfree(conf->block_map_inv); _update_logging(); _update_nice(); get_cpuinfo(&conf->actual_cpus, &conf->actual_boards, &conf->actual_sockets, &conf->actual_cores, &conf->actual_threads, &conf->block_map_size, &conf->block_map, &conf->block_map_inv); #ifdef HAVE_FRONT_END /* * When running with multiple frontends, the slurmd S:C:T values are not * relevant, hence ignored by both _register_front_ends (sets all to 1) * and validate_nodes_via_front_end (uses slurm.conf values). * Report actual hardware configuration, irrespective of FastSchedule. */ conf->cpus = conf->actual_cpus; conf->boards = conf->actual_boards; conf->sockets = conf->actual_sockets; conf->cores = conf->actual_cores; conf->threads = conf->actual_threads; #else /* If the actual resources on a node differ than what is in * the configuration file and we are using * cons_res or gang scheduling we have to use what is in the * configuration file because the slurmctld creates bitmaps * for scheduling before these nodes check in. 
*/ if (((cf->fast_schedule == 0) && !cr_flag && !gang_flag) || ((cf->fast_schedule == 1) && (conf->actual_cpus < conf->conf_cpus))) { conf->cpus = conf->actual_cpus; conf->boards = conf->actual_boards; conf->sockets = conf->actual_sockets; conf->cores = conf->actual_cores; conf->threads = conf->actual_threads; } else { conf->cpus = conf->conf_cpus; conf->boards = conf->conf_boards; conf->sockets = conf->conf_sockets; conf->cores = conf->conf_cores; conf->threads = conf->conf_threads; } if ((conf->cpus != conf->actual_cpus) || (conf->sockets != conf->actual_sockets) || (conf->cores != conf->actual_cores) || (conf->threads != conf->actual_threads)) { if (cf->fast_schedule) { info("Node configuration differs from hardware: " "CPUs=%u:%u(hw) Boards=%u:%u(hw) " "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) " "ThreadsPerCore=%u:%u(hw)", conf->cpus, conf->actual_cpus, conf->boards, conf->actual_boards, conf->sockets, conf->actual_sockets, conf->cores, conf->actual_cores, conf->threads, conf->actual_threads); } else if ((cf->fast_schedule == 0) && (cr_flag || gang_flag)) { error("You are using cons_res or gang scheduling with " "Fastschedule=0 and node configuration differs " "from hardware. The node configuration used " "will be what is in the slurm.conf because of " "the bitmaps the slurmctld must create before " "the slurmd registers.\n" " CPUs=%u:%u(hw) Boards=%u:%u(hw) " "SocketsPerBoard=%u:%u(hw) CoresPerSocket=%u:%u(hw) " "ThreadsPerCore=%u:%u(hw)", conf->cpus, conf->actual_cpus, conf->boards, conf->actual_boards, conf->sockets, conf->actual_sockets, conf->cores, conf->actual_cores, conf->threads, conf->actual_threads); } } #endif get_memory(&conf->real_memory_size); get_up_time(&conf->up_time); cf = slurm_conf_lock(); get_tmp_disk(&conf->tmp_disk_space, cf->tmp_fs); _free_and_set(&conf->epilog, xstrdup(cf->epilog)); _free_and_set(&conf->prolog, xstrdup(cf->prolog)); _free_and_set(&conf->tmpfs, xstrdup(cf->tmp_fs)); _free_and_set(&conf->health_check_program, xstrdup(cf->health_check_program)); _free_and_set(&conf->spooldir, xstrdup(cf->slurmd_spooldir)); _massage_pathname(&conf->spooldir); _free_and_set(&conf->pidfile, xstrdup(cf->slurmd_pidfile)); _massage_pathname(&conf->pidfile); _free_and_set(&conf->select_type, xstrdup(cf->select_type)); _free_and_set(&conf->task_prolog, xstrdup(cf->task_prolog)); _free_and_set(&conf->task_epilog, xstrdup(cf->task_epilog)); _free_and_set(&conf->pubkey, path_pubkey); conf->debug_flags = cf->debug_flags; conf->propagate_prio = cf->propagate_prio_process; _free_and_set(&conf->job_acct_gather_freq, xstrdup(cf->job_acct_gather_freq)); conf->acct_freq_task = (uint16_t)NO_VAL; tmp16 = acct_gather_parse_freq(PROFILE_TASK, conf->job_acct_gather_freq); if (tmp16 != -1) conf->acct_freq_task = tmp16; _free_and_set(&conf->acct_gather_energy_type, xstrdup(cf->acct_gather_energy_type)); _free_and_set(&conf->acct_gather_filesystem_type, xstrdup(cf->acct_gather_filesystem_type)); _free_and_set(&conf->acct_gather_infiniband_type, xstrdup(cf->acct_gather_infiniband_type)); _free_and_set(&conf->acct_gather_profile_type, xstrdup(cf->acct_gather_profile_type)); _free_and_set(&conf->job_acct_gather_type, xstrdup(cf->job_acct_gather_type)); if ( (conf->node_name == NULL) || (conf->node_name[0] == '\0') ) fatal("Node name lookup failure"); if (cf->control_addr == NULL) fatal("Unable to establish controller machine"); if (cf->slurmctld_port == 0) fatal("Unable to establish controller port"); conf->slurmd_timeout = cf->slurmd_timeout; conf->use_pam = cf->use_pam; 
conf->task_plugin_param = cf->task_plugin_param; slurm_mutex_unlock(&conf->config_mutex); slurm_conf_unlock(); }
extern void create_daemon_popup(GtkAction *action, gpointer user_data) { GtkWidget *popup = gtk_dialog_new_with_buttons( "SLURM Daemons running", GTK_WINDOW(user_data), GTK_DIALOG_DESTROY_WITH_PARENT, GTK_STOCK_CLOSE, GTK_RESPONSE_OK, NULL); int update = 0; slurm_ctl_conf_info_msg_t *conf; char me[MAX_SLURM_NAME], *b, *c, *n; int actld = 0, ctld = 0, d = 0; GtkTreeStore *treestore = _local_create_treestore_2cols(popup, 300, 100); GtkTreeIter iter; g_signal_connect(G_OBJECT(popup), "delete_event", G_CALLBACK(_delete_popup), NULL); g_signal_connect(G_OBJECT(popup), "response", G_CALLBACK(_delete_popup), NULL); slurm_conf_init(NULL); conf = slurm_conf_lock(); gethostname_short(me, MAX_SLURM_NAME); if ((b = conf->backup_controller)) { if ((strcmp(b, me) == 0) || (strcasecmp(b, "localhost") == 0)) ctld = 1; } if ((c = conf->control_machine)) { actld = 1; if ((strcmp(c, me) == 0) || (strcasecmp(c, "localhost") == 0)) ctld = 1; } slurm_conf_unlock(); if ((n = slurm_conf_get_nodename(me))) { d = 1; xfree(n); } else if ((n = slurm_conf_get_aliased_nodename())) { d = 1; xfree(n); } else if ((n = slurm_conf_get_nodename("localhost"))) { d = 1; xfree(n); } if (actld && ctld) add_display_treestore_line(update, treestore, &iter, "Slurmctld", "1"); if (actld && d) add_display_treestore_line(update, treestore, &iter, "Slurmd", "1"); gtk_widget_show_all(popup); return; }
/*
 * Connect to a slurmstepd process by way of its unix domain socket.
 *
 * Both "directory" and "nodename" may be null, in which case stepd_connect
 * will attempt to determine them on its own.  If you are using multiple
 * slurmd on one node (unusual outside of development environments), you
 * will get one of the local NodeNames more-or-less at random.
 *
 * Returns a socket descriptor for the opened socket on success,
 * and -1 on error.
 */
int stepd_connect(const char *directory, const char *nodename,
		  uint32_t jobid, uint32_t stepid,
		  uint16_t *protocol_version)
{
	int req = REQUEST_CONNECT;
	int fd = -1;
	int rc;
	void *auth_cred;
	Buf buffer;
	int len;

	*protocol_version = 0;

	if (nodename == NULL) {
		if (!(nodename = _guess_nodename()))
			return -1;
	}
	if (directory == NULL) {
		slurm_ctl_conf_t *cf;

		cf = slurm_conf_lock();
		directory = slurm_conf_expand_slurmd_path(
			cf->slurmd_spooldir, nodename);
		slurm_conf_unlock();
	}

	buffer = init_buf(0);

	/* Create an auth credential */
	auth_cred = g_slurm_auth_create(NULL, 2, NULL);
	if (auth_cred == NULL) {
		error("Creating authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(NULL)));
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}

	/* Pack the auth credential; report any error before the
	 * credential is destroyed */
	rc = g_slurm_auth_pack(auth_cred, buffer);
	if (rc) {
		error("Packing authentication credential: %s",
		      g_slurm_auth_errstr(g_slurm_auth_errno(auth_cred)));
		(void) g_slurm_auth_destroy(auth_cred);
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto fail1;
	}
	(void) g_slurm_auth_destroy(auth_cred);

	/* Connect to the step */
	fd = _step_connect(directory, nodename, jobid, stepid);
	if (fd == -1)
		goto fail1;

	safe_write(fd, &req, sizeof(int));
	len = size_buf(buffer);
	safe_write(fd, &len, sizeof(int));
	safe_write(fd, get_buf_data(buffer), len);

	safe_read(fd, &rc, sizeof(int));
	if (rc < 0) {
		error("slurmstepd refused authentication: %m");
		slurm_seterrno(SLURM_PROTOCOL_AUTHENTICATION_ERROR);
		goto rwfail;
	} else if (rc) {
		*protocol_version = rc;
	} else {
		/* On older versions of Slurm (< 14.11) SLURM_SUCCESS
		 * was returned here instead of the protocol version.
		 * This can be removed when we are 2 versions past
		 * 14.11. */
		slurmstepd_info_t *stepd_info = stepd_get_info(fd);
		*protocol_version = stepd_info->protocol_version;
		xfree(stepd_info);
	}

	free_buf(buffer);
	return fd;

rwfail:
	close(fd);
fail1:
	free_buf(buffer);
	return -1;
}
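/*
 * Usage sketch (illustrative, not from the original source): calling the
 * newer stepd_connect() above, which also reports the protocol version
 * spoken by the slurmstepd on the other end of the socket.  The wrapper
 * name is hypothetical; the caller remains responsible for close(fd).
 */
static int _connect_step_example(uint32_t jobid, uint32_t stepid)
{
	uint16_t protocol_version = 0;
	int fd = stepd_connect(NULL, NULL, jobid, stepid, &protocol_version);

	if (fd == -1) {
		error("unable to connect to step %u.%u: %m", jobid, stepid);
		return -1;
	}
	debug("step %u.%u speaks protocol version %hu",
	      jobid, stepid, protocol_version);
	return fd;
}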