static int
_slurmd_init(void)
{
	struct rlimit rlim;
	slurm_ctl_conf_t *cf;
	struct stat stat_buf;
	uint32_t cpu_cnt;

	/*
	 * Process commandline arguments first, since one option may be
	 * an alternate location for the slurm config file.
	 */
	_process_cmdline(*conf->argc, *conf->argv);

	/*
	 * Build nodes table like in slurmctld.
	 * This is required by the topology stack.
	 * Node table setup must precede _read_config() so that the
	 * proper hostname is set.
	 */
	slurm_conf_init(conf->conffile);
	init_node_conf();
	/* slurm_select_init() must be called before
	 * build_all_nodeline_info() so that the node table is built
	 * with the proper select plugin argument. */
	if (slurm_select_init(1) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	build_all_nodeline_info(true);
	build_all_frontend_info(true);

	/*
	 * Read global slurm config file, override necessary values from
	 * defaults and command line.
	 */
	_read_config();

	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);

	if ((gres_plugin_init() != SLURM_SUCCESS) ||
	    (gres_plugin_node_config_load(cpu_cnt) != SLURM_SUCCESS))
		return SLURM_FAILURE;

	if (slurm_topo_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;

	/*
	 * Get and set slurmd topology information.
	 * Build node hash table first to speed up the topo build.
	 */
	rehash_node();
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * Check for cpu frequency set capabilities on this node
	 */
	cpu_freq_init(conf);

	_print_conf();

	if (slurm_proctrack_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurmd_task_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (spank_slurmd_init() < 0)
		return SLURM_FAILURE;

	if (getrlimit(RLIMIT_CPU, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CPU, &rlim);
		if (rlim.rlim_max != RLIM_INFINITY) {
			error("Slurmd process CPU time limit is %d seconds",
			      (int) rlim.rlim_max);
		}
	}

	if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_NOFILE, &rlim);
	}

#ifndef NDEBUG
	if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CORE, &rlim);
	}
#endif /* !NDEBUG */

	/*
	 * Create a context for verifying slurm job credentials
	 */
	if (!(conf->vctx = slurm_cred_verifier_ctx_create(conf->pubkey)))
		return SLURM_FAILURE;
	if (!strcmp(conf->select_type, "select/serial")) {
		/* Only cache credential for 5 seconds with select/serial
		 * for shorter cache searches and higher throughput */
		slurm_cred_ctx_set(conf->vctx,
				   SLURM_CRED_OPT_EXPIRY_WINDOW, 5);
	}
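	/*
	 * Note: conf->vctx is used from here on to verify job credentials
	 * signed by slurmctld; conf->pubkey names the matching public key
	 * file. The 5-second expiry window above is a throughput tuning
	 * specific to select/serial; other select plugins keep the
	 * default window.
	 */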
	/*
	 * Create slurmd spool directory if necessary.
	 */
	if (_set_slurmd_spooldir() < 0) {
		error("Unable to initialize slurmd spooldir");
		return SLURM_FAILURE;
	}

	if (conf->cleanstart) {
		/*
		 * Need to kill any running slurmd's here
		 */
		_kill_old_slurmd();

		stepd_cleanup_sockets(conf->spooldir, conf->node_name);
		_stepd_cleanup_batch_dirs(conf->spooldir, conf->node_name);
	}

	if (conf->daemonize) {
		bool success = false;

		if (conf->logfile && (conf->logfile[0] == '/')) {
			char *slash_ptr, *work_dir;
			work_dir = xstrdup(conf->logfile);
			slash_ptr = strrchr(work_dir, '/');
			if (slash_ptr == work_dir)
				work_dir[1] = '\0';
			else
				slash_ptr[0] = '\0';
			if ((access(work_dir, W_OK) != 0) ||
			    (chdir(work_dir) < 0)) {
				error("Unable to chdir to %s", work_dir);
			} else
				success = true;
			xfree(work_dir);
		}

		if (!success) {
			if ((access(conf->spooldir, W_OK) != 0) ||
			    (chdir(conf->spooldir) < 0)) {
				error("Unable to chdir to %s",
				      conf->spooldir);
			} else
				success = true;
		}

		if (!success) {
			if ((access("/var/tmp", W_OK) != 0) ||
			    (chdir("/var/tmp") < 0)) {
				error("chdir(/var/tmp): %m");
				return SLURM_FAILURE;
			} else
				info("chdir to /var/tmp");
		}
	}

	/*
	 * Cache the group access list
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	if ((devnull = open_cloexec("/dev/null", O_RDWR)) < 0) {
		error("Unable to open /dev/null: %m");
		return SLURM_FAILURE;
	}

	/* make sure we have slurmstepd installed */
	if (stat(conf->stepd_loc, &stat_buf))
		fatal("Unable to find slurmstepd file at %s",
		      conf->stepd_loc);
	if (!S_ISREG(stat_buf.st_mode))
		fatal("slurmstepd not a file at %s", conf->stepd_loc);

	return SLURM_SUCCESS;
}
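/*
 * Illustrative caller (a sketch of how slurmd's main() typically uses
 * this routine; not necessarily verbatim from any release):
 *
 *	if (_slurmd_init() < 0) {
 *		error("slurmd initialization failed");
 *		fflush(NULL);
 *		exit(1);
 *	}
 */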
static void
_reconfigure(void)
{
	List steps;
	ListIterator i;
	slurm_ctl_conf_t *cf;
	step_loc_t *stepd;
	bool did_change;

	_reconfig = 0;
	slurm_conf_reinit(conf->conffile);
	_read_config();

	/*
	 * Rebuild topology information and refresh slurmd topo infos
	 */
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * In case the administrator changed the cpu frequency set
	 * capabilities on this node, rebuild the cpu frequency table
	 * information
	 */
	cpu_freq_init(conf);

	_print_conf();

	/*
	 * Make best effort at changing to new public key
	 */
	slurm_cred_ctx_key_update(conf->vctx, conf->pubkey);

	/*
	 * Reinitialize the groups cache
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	/* send reconfig to each stepd so they can refresh their log
	 * file handle
	 */
	steps = stepd_available(conf->spooldir, conf->node_name);
	i = list_iterator_create(steps);
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;
		if (stepd_reconfig(fd) != SLURM_SUCCESS)
			debug("Reconfig jobid=%u.%u failed: %m",
			      stepd->jobid, stepd->stepid);
		close(fd);
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	gres_plugin_reconfig(&did_change);
	(void) switch_g_reconfig();
	container_g_reconfig();
	if (did_change) {
		uint32_t cpu_cnt = MAX(conf->conf_cpus,
				       conf->block_map_size);
		(void) gres_plugin_node_config_load(cpu_cnt);
		send_registration_msg(SLURM_SUCCESS, false);
	}

	/* reconfigure energy */
	acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL);

	/*
	 * XXX: reopen slurmd port?
	 */
}
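/*
 * Sketch of how _reconfigure() is typically driven (assumed wiring,
 * shown for illustration): a SIGHUP handler only records the request,
 * and the main message loop performs the actual reconfiguration
 * outside signal context:
 *
 *	static void _hup_handler(int signum)
 *	{
 *		if (signum == SIGHUP)
 *			_reconfig = 1;
 *	}
 *
 *	// ...later, in the main processing loop:
 *	if (_reconfig)
 *		_reconfigure();
 */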
/*
 * This function handles the initialization information from slurmd
 * sent by _send_slurmstepd_init() in src/slurmd/slurmd/req.c.
 */
static int
_init_from_slurmd(int sock, char **argv,
		  slurm_addr_t **_cli, slurm_addr_t **_self,
		  slurm_msg_t **_msg, int *_ngids, gid_t **_gids)
{
	char *incoming_buffer = NULL;
	Buf buffer;
	int step_type;
	int len, proto;
	slurm_addr_t *cli = NULL;
	slurm_addr_t *self = NULL;
	slurm_msg_t *msg = NULL;
	int ngids = 0;
	gid_t *gids = NULL;
	uint16_t port;
	char buf[16];
	log_options_t lopts = LOG_OPTS_INITIALIZER;

	log_init(argv[0], lopts, LOG_DAEMON, NULL);

	/* receive job type from slurmd */
	safe_read(sock, &step_type, sizeof(int));
	debug3("step_type = %d", step_type);

	/* receive reverse-tree info from slurmd */
	slurm_mutex_lock(&step_complete.lock);
	safe_read(sock, &step_complete.rank, sizeof(int));
	safe_read(sock, &step_complete.parent_rank, sizeof(int));
	safe_read(sock, &step_complete.children, sizeof(int));
	safe_read(sock, &step_complete.depth, sizeof(int));
	safe_read(sock, &step_complete.max_depth, sizeof(int));
	safe_read(sock, &step_complete.parent_addr, sizeof(slurm_addr_t));
	step_complete.bits = bit_alloc(step_complete.children);
	step_complete.jobacct = jobacctinfo_create(NULL);
	slurm_mutex_unlock(&step_complete.lock);

	/* receive conf from slurmd */
	if ((conf = read_slurmd_conf_lite(sock)) == NULL)
		fatal("Failed to read conf from slurmd");
	log_alter(conf->log_opts, 0, conf->logfile);
	log_set_timefmt(conf->log_fmt);

	debug2("debug level is %d.", conf->debug_level);

	switch_g_slurmd_step_init();

	slurm_get_ip_str(&step_complete.parent_addr, &port, buf, 16);
	debug3("slurmstepd rank %d, parent address = %s, port = %u",
	       step_complete.rank, buf, port);

	/* receive cli from slurmd */
	safe_read(sock, &len, sizeof(int));
	incoming_buffer = xmalloc(sizeof(char) * len);
	safe_read(sock, incoming_buffer, len);
	buffer = create_buf(incoming_buffer, len);
	cli = xmalloc(sizeof(slurm_addr_t));
	if (slurm_unpack_slurm_addr_no_alloc(cli, buffer) == SLURM_ERROR)
		fatal("slurmstepd: problem with unpack of cli address");
	free_buf(buffer);

	/* receive self from slurmd */
	safe_read(sock, &len, sizeof(int));
	if (len > 0) {
		/* receive packed self from main slurmd */
		incoming_buffer = xmalloc(sizeof(char) * len);
		safe_read(sock, incoming_buffer, len);
		buffer = create_buf(incoming_buffer, len);
		self = xmalloc(sizeof(slurm_addr_t));
		if (slurm_unpack_slurm_addr_no_alloc(self, buffer)
		    == SLURM_ERROR) {
			fatal("slurmstepd: problem with unpack of "
			      "self address");
		}
		free_buf(buffer);
	}

	/* Receive GRES information from slurmd */
	gres_plugin_recv_stepd(sock);
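	/*
	 * Wire format note: each variable-length field above arrives as
	 * an int length followed by that many packed bytes. A sketch of
	 * the matching sender side in _send_slurmstepd_init() (assumed
	 * shape, for illustration only):
	 *
	 *	len = get_buf_offset(buffer);
	 *	safe_write(fd, &len, sizeof(int));
	 *	safe_write(fd, get_buf_data(buffer), len);
	 */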
	/*
	 * Check for cpu frequency set capabilities on this node
	 */
	cpu_freq_init(conf);

	/* Receive cpu_frequency info from slurmd */
	cpu_freq_recv_info(sock);

	/* get the protocol version of the srun */
	safe_read(sock, &proto, sizeof(int));

	/* receive req from slurmd */
	safe_read(sock, &len, sizeof(int));
	incoming_buffer = xmalloc(sizeof(char) * len);
	safe_read(sock, incoming_buffer, len);
	buffer = create_buf(incoming_buffer, len);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);
	msg->protocol_version = (uint16_t) proto;

	switch (step_type) {
	case LAUNCH_BATCH_JOB:
		msg->msg_type = REQUEST_BATCH_JOB_LAUNCH;
		break;
	case LAUNCH_TASKS:
		msg->msg_type = REQUEST_LAUNCH_TASKS;
		break;
	default:
		fatal("%s: Unrecognized launch RPC (%d)",
		      __func__, step_type);
		break;
	}
	if (unpack_msg(msg, buffer) == SLURM_ERROR)
		fatal("slurmstepd: we didn't unpack the request correctly");
	free_buf(buffer);

	/* receive cached group ids array for the relevant uid */
	safe_read(sock, &ngids, sizeof(int));
	if (ngids > 0) {
		int i;
		uint32_t tmp32;

		gids = (gid_t *) xmalloc(sizeof(gid_t) * ngids);
		for (i = 0; i < ngids; i++) {
			safe_read(sock, &tmp32, sizeof(uint32_t));
			gids[i] = (gid_t) tmp32;
			debug2("got gid %d", gids[i]);
		}
	}

	*_cli = cli;
	*_self = self;
	*_msg = msg;
	*_ngids = ngids;
	*_gids = gids;

	return 1;

rwfail:
	fatal("Error reading initialization data from slurmd");
	exit(1);
}
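/*
 * Illustrative caller (a sketch of how slurmstepd's main() typically
 * invokes this over the pipe inherited from slurmd on stdin; the
 * defensive error check is an assumption, not verbatim source --
 * as written, the function returns 1 or calls fatal()):
 *
 *	slurm_addr_t *cli, *self;
 *	slurm_msg_t *msg;
 *	int ngids;
 *	gid_t *gids;
 *
 *	if (_init_from_slurmd(STDIN_FILENO, argv, &cli, &self,
 *			      &msg, &ngids, &gids) < 0)
 *		fatal("Unable to read initialization info from slurmd");
 */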