extern int task_cgroup_devices_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	uint16_t cpunum;

	/* initialize cpuinfo internal data */
	if (xcpuinfo_init() != XCPUINFO_SUCCESS)
		return SLURM_ERROR;

	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	/* initialize allowed_devices_filename */
	cgroup_allowed_devices_file[0] = '\0';

	if (get_procs(&cpunum) != 0) {
		error("task/cgroup: unable to get the number of CPUs");
		goto error;
	}

	(void) gres_plugin_node_config_load(cpunum, conf->node_name, NULL);

	strcpy(cgroup_allowed_devices_file,
	       slurm_cgroup_conf->allowed_devices_file);

	if (xcgroup_ns_create(slurm_cgroup_conf, &devices_ns, "", "devices")
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create devices namespace");
		goto error;
	}

	return SLURM_SUCCESS;

error:
	xcgroup_ns_destroy(&devices_ns);
	xcpuinfo_fini();
	return SLURM_ERROR;
}
static int _slurmd_init(void)
{
	struct rlimit rlim;
	slurm_ctl_conf_t *cf;
	struct stat stat_buf;
	uint32_t cpu_cnt;

	/*
	 * Process command-line arguments first, since one option may be
	 * an alternate location for the slurm config file.
	 */
	_process_cmdline(*conf->argc, *conf->argv);

	/*
	 * Build the node table as slurmctld does.
	 * This is required by the topology stack.
	 * Node table setup must precede _read_config() so that the
	 * proper hostname is set.
	 */
	slurm_conf_init(conf->conffile);
	init_node_conf();
	/* slurm_select_init() must be called before
	 * build_all_nodeline_info() so that the latter is called with the
	 * proper argument. */
	if (slurm_select_init(1) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	build_all_nodeline_info(true);
	build_all_frontend_info(true);

	/*
	 * Read the global slurm config file, overriding necessary values
	 * from defaults and the command line.
	 */
	_read_config();

	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
	if ((gres_plugin_init() != SLURM_SUCCESS) ||
	    (gres_plugin_node_config_load(cpu_cnt) != SLURM_SUCCESS))
		return SLURM_FAILURE;

	if (slurm_topo_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;

	/*
	 * Get and set slurmd topology information.
	 * Build the node hash table first to speed up the topo build.
	 */
	rehash_node();
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * Check for cpu frequency set capabilities on this node
	 */
	cpu_freq_init(conf);

	_print_conf();

	if (slurm_proctrack_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurmd_task_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (spank_slurmd_init() < 0)
		return SLURM_FAILURE;

	if (getrlimit(RLIMIT_CPU, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CPU, &rlim);
		if (rlim.rlim_max != RLIM_INFINITY) {
			error("Slurmd process CPU time limit is %d seconds",
			      (int) rlim.rlim_max);
		}
	}

	if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_NOFILE, &rlim);
	}
#ifndef NDEBUG
	if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CORE, &rlim);
	}
#endif /* !NDEBUG */

	/*
	 * Create a context for verifying slurm job credentials
	 */
	if (!(conf->vctx = slurm_cred_verifier_ctx_create(conf->pubkey)))
		return SLURM_FAILURE;
	if (!strcmp(conf->select_type, "select/serial")) {
		/* Only cache credentials for 5 seconds with select/serial
		 * for shorter cache searches and higher throughput */
		slurm_cred_ctx_set(conf->vctx, SLURM_CRED_OPT_EXPIRY_WINDOW, 5);
	}

	/*
	 * Create slurmd spool directory if necessary.
	 */
	if (_set_slurmd_spooldir() < 0) {
		error("Unable to initialize slurmd spooldir");
		return SLURM_FAILURE;
	}

	if (conf->cleanstart) {
		/*
		 * Need to kill any running slurmd's here
		 */
		_kill_old_slurmd();

		stepd_cleanup_sockets(conf->spooldir, conf->node_name);
		_stepd_cleanup_batch_dirs(conf->spooldir, conf->node_name);
	}

	if (conf->daemonize) {
		bool success = false;

		if (conf->logfile && (conf->logfile[0] == '/')) {
			char *slash_ptr, *work_dir;
			work_dir = xstrdup(conf->logfile);
			slash_ptr = strrchr(work_dir, '/');
			if (slash_ptr == work_dir)
				work_dir[1] = '\0';
			else
				slash_ptr[0] = '\0';
			if ((access(work_dir, W_OK) != 0) ||
			    (chdir(work_dir) < 0)) {
				error("Unable to chdir to %s", work_dir);
			} else
				success = true;
			xfree(work_dir);
		}

		if (!success) {
			if ((access(conf->spooldir, W_OK) != 0) ||
			    (chdir(conf->spooldir) < 0)) {
				error("Unable to chdir to %s",
				      conf->spooldir);
			} else
				success = true;
		}

		if (!success) {
			if ((access("/var/tmp", W_OK) != 0) ||
			    (chdir("/var/tmp") < 0)) {
				error("chdir(/var/tmp): %m");
				return SLURM_FAILURE;
			} else
				info("chdir to /var/tmp");
		}
	}

	/*
	 * Cache the group access list
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	if ((devnull = open_cloexec("/dev/null", O_RDWR)) < 0) {
		error("Unable to open /dev/null: %m");
		return SLURM_FAILURE;
	}

	/* make sure we have slurmstepd installed */
	if (stat(conf->stepd_loc, &stat_buf))
		fatal("Unable to find slurmstepd file at %s",
		      conf->stepd_loc);
	if (!S_ISREG(stat_buf.st_mode))
		fatal("slurmstepd not a file at %s", conf->stepd_loc);

	return SLURM_SUCCESS;
}
static void _reconfigure(void)
{
	List steps;
	ListIterator i;
	slurm_ctl_conf_t *cf;
	step_loc_t *stepd;
	bool did_change;

	_reconfig = 0;
	slurm_conf_reinit(conf->conffile);
	_read_config();

	/*
	 * Rebuild topology information and refresh slurmd topo infos
	 */
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * In case the administrator changed the cpu frequency set
	 * capabilities on this node, rebuild the cpu frequency table
	 * information
	 */
	cpu_freq_init(conf);

	_print_conf();

	/*
	 * Make best effort at changing to new public key
	 */
	slurm_cred_ctx_key_update(conf->vctx, conf->pubkey);

	/*
	 * Reinitialize the groups cache
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	/* send reconfig to each stepd so they can refresh their log
	 * file handle */
	steps = stepd_available(conf->spooldir, conf->node_name);
	i = list_iterator_create(steps);
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;
		if (stepd_reconfig(fd) != SLURM_SUCCESS)
			debug("Reconfig jobid=%u.%u failed: %m",
			      stepd->jobid, stepd->stepid);
		close(fd);
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	gres_plugin_reconfig(&did_change);
	(void) switch_g_reconfig();
	container_g_reconfig();
	if (did_change) {
		uint32_t cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
		(void) gres_plugin_node_config_load(cpu_cnt);
		send_registration_msg(SLURM_SUCCESS, false);
	}

	/* reconfigure energy */
	acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL);

	/*
	 * XXX: reopen slurmd port?
	 */
}
/*
 * main - test7.17.prog: exercise GRES plugin logic normally executed by the
 *	  slurmd and slurmctld daemons
 * test7.17.prog <TRES_PER_NODE> <CONFIG_DIR_HEAD> <CONFIG_SUB_DIR> <CPU_COUNT>
 */
int main(int argc, char *argv[])
{
	log_options_t opts = LOG_OPTS_STDERR_ONLY;
	int rc;
	uint32_t cpu_count, cpu_alloc, job_id = 12345;
	char *node_name, *reason_down = NULL;
	char *orig_config, *new_config = NULL, *tres_per_node = NULL;
	Buf buffer;
	List job_gres_list = NULL, node_gres_list = NULL;
	bitstr_t *cpu_bitmap;
	char config_dir[10000], test[1000];
	char slurm_conf[1000];
	uint32_t num_tasks = 1;
	uint32_t min_nodes = 1;
	uint32_t max_nodes = 1;
	uint16_t ntasks_per_node = NO_VAL16;
	uint16_t ntasks_per_socket = NO_VAL16;
	uint16_t sockets_per_node = NO_VAL16;
	uint16_t cpus_per_task = NO_VAL16;
	int core_count, sock_count;

	/* Set up slurm.conf and gres.conf test paths:
	 * config_dir = <CONFIG_DIR_HEAD>/test7.17_configs<CONFIG_SUB_DIR>
	 * slurm_conf = config_dir/slurm.conf */
	strcpy(config_dir, argv[2]);
	strcat(config_dir, "/test7.17_configs");
	strcat(config_dir, argv[3]);
	strcpy(test, config_dir);
	strcat(test, "/slurm.conf");
	strcpy(slurm_conf, test);

	/* Enable detailed logging for now */
	opts.stderr_level = LOG_LEVEL_DEBUG;
	log_init(argv[0], opts, SYSLOG_FACILITY_USER, NULL);

	/*
	 * Logic normally executed by slurmd daemon
	 */
	setenv("SLURM_CONF", slurm_conf, 1);
	rc = gres_plugin_init();
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_init");
		exit(1);
	}

	setenv("SLURM_CONFIG_DIR", config_dir, 1);

	cpu_count = strtol(argv[4], NULL, 10);
	node_name = "test_node";
	rc = gres_plugin_node_config_load(cpu_count, node_name, NULL, NULL,
					  NULL);
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_node_config_load");
		exit(1);
	}

	buffer = init_buf(1024);
	rc = gres_plugin_node_config_pack(buffer);
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_node_config_pack");
		exit(1);
	}

	/*
	 * Logic normally executed by slurmctld daemon
	 */
	orig_config = "gpu:8";
	rc = gres_plugin_init_node_config(node_name, orig_config,
					  &node_gres_list);
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_init_node_config");
		exit(1);
	}

	set_buf_offset(buffer, 0);
	rc = gres_plugin_node_config_unpack(buffer, node_name);
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_node_config_unpack");
		exit(1);
	}

	core_count = cpu_count;
	sock_count = 1;
	rc = gres_plugin_node_config_validate(node_name, orig_config,
					      &new_config, &node_gres_list,
					      cpu_count, core_count,
					      sock_count, 0, &reason_down);
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_node_config_validate");
		exit(1);
	}

	if (argc > 2)
		tres_per_node = xstrdup(argv[1]);

	rc = gres_plugin_job_state_validate(NULL,	/* cpus_per_tres */
					    NULL,	/* tres_freq */
					    NULL,	/* tres_per_job */
					    tres_per_node,
					    NULL,	/* tres_per_socket */
					    NULL,	/* tres_per_task */
					    NULL,	/* mem_per_tres */
					    &num_tasks,
					    &min_nodes,
					    &max_nodes,
					    &ntasks_per_node,
					    &ntasks_per_socket,
					    &sockets_per_node,
					    &cpus_per_task,
					    &job_gres_list);
	if (rc != SLURM_SUCCESS) {
		slurm_seterrno(rc);
		slurm_perror("failure: gres_plugin_job_state_validate");
		exit(1);
	}

	gres_plugin_node_state_log(node_gres_list, node_name);
	gres_plugin_job_state_log(job_gres_list, job_id);

	cpu_bitmap = bit_alloc(cpu_count);
	bit_nset(cpu_bitmap, 0, cpu_count - 1);
	cpu_alloc = gres_plugin_job_test(job_gres_list, node_gres_list, true,
					 cpu_bitmap, 0, cpu_count - 1,
					 job_id, node_name);
	if (cpu_alloc == NO_VAL)
		printf("cpu_alloc=ALL\n");
	else
		printf("cpu_alloc=%u\n", cpu_alloc);

	rc = gres_plugin_fini();
	if (rc != SLURM_SUCCESS) {
		slurm_perror("failure: gres_plugin_fini");
		exit(1);
	}

	printf("Test %s ran to completion\n\n", argv[3]);
	exit(0);
}
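/*
 * For reference, a hedged sketch of how this helper might be invoked; the
 * GRES string, directory, and CPU count below are illustrative assumptions,
 * not values taken from the test suite. Only the argument order is fixed by
 * the usage line above:
 *
 *   ./test7.17.prog gpu:2 /home/user/slurm/testsuite/expect /1 8
 *
 * argv[1] = TRES_PER_NODE   (GRES request, e.g. "gpu:2" -- assumed value)
 * argv[2] = CONFIG_DIR_HEAD (directory containing test7.17_configs/)
 * argv[3] = CONFIG_SUB_DIR  (subdirectory holding the test slurm.conf,
 *                            appended verbatim, so it should start with '/')
 * argv[4] = CPU_COUNT       (CPU count reported for the test node)
 */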