/* * Test if OMPI_MCA_btl_openib_if_include should be set to global device ID or a * device ID that always starts at zero (based upon what the application can see). * RET true if TaskPlugin=task/cgroup AND ConstrainDevices=yes (in cgroup.conf). */ static bool _use_local_device_index(void) { slurm_cgroup_conf_t slurm_cgroup_conf; char *task_plugin = slurm_get_task_plugin(); bool use_cgroup = false, use_local_index = false; if (!task_plugin) return use_local_index; if (strstr(task_plugin, "cgroup")) use_cgroup = true; xfree(task_plugin); if (!use_cgroup) return use_local_index; /* Read and parse cgroup.conf */ bzero(&slurm_cgroup_conf, sizeof(slurm_cgroup_conf_t)); if (read_slurm_cgroup_conf(&slurm_cgroup_conf) != SLURM_SUCCESS) return use_local_index; if (slurm_cgroup_conf.constrain_devices) use_local_index = true; free_slurm_cgroup_conf(&slurm_cgroup_conf); return use_local_index; }
/* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ extern int init (void) { /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* enable subsystems based on conf */ if (slurm_cgroup_conf.constrain_cores) { use_cpuset = true; task_cgroup_cpuset_init(&slurm_cgroup_conf); debug("%s: now constraining jobs allocated cores", plugin_type); } if (slurm_cgroup_conf.constrain_ram_space || slurm_cgroup_conf.constrain_swap_space) { use_memory = true; task_cgroup_memory_init(&slurm_cgroup_conf); debug("%s: now constraining jobs allocated memory", plugin_type); } if (slurm_cgroup_conf.constrain_devices) { use_devices = true; task_cgroup_devices_init(&slurm_cgroup_conf); debug("%s: now constraining jobs allocated devices", plugin_type); } verbose("%s: loaded", plugin_type); return SLURM_SUCCESS; }
/*
 * Allocate the slurm_cgroup_conf structure (a pointer declared outside this
 * function) and populate it from the cgroup configuration.
 *
 * RET SLURM_SUCCESS if the configuration was read, SLURM_FAILURE otherwise.
 *
 * NOTE: on failure the freshly allocated structure is left in
 * slurm_cgroup_conf; releasing it is the caller's responsibility.
 */
static int _load_cgroup_config(void)
{
	slurm_cgroup_conf = xmalloc(sizeof(slurm_cgroup_conf_t));

	/* start from an all-zero structure before parsing */
	bzero(slurm_cgroup_conf, sizeof(slurm_cgroup_conf_t));

	if (read_slurm_cgroup_conf(slurm_cgroup_conf) != SLURM_SUCCESS) {
		info("read_slurm_cgroup_conf failed");
		return SLURM_FAILURE;
	}

	return SLURM_SUCCESS;
}
/* * init() is called when the plugin is loaded, before any other functions * are called. Put global initialization here. */ extern int init (void) { /* If running on the slurmctld don't do any of this since it isn't needed. */ if (_run_in_daemon()) { jag_common_init(0); /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* initialize cpuinfo internal data */ if (xcpuinfo_init() != XCPUINFO_SUCCESS) { free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* enable cpuacct cgroup subsystem */ if (jobacct_gather_cgroup_cpuacct_init(&slurm_cgroup_conf) != SLURM_SUCCESS) { xcpuinfo_fini(); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* enable memory cgroup subsystem */ if (jobacct_gather_cgroup_memory_init(&slurm_cgroup_conf) != SLURM_SUCCESS) { xcpuinfo_fini(); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* FIXME: Enable when kernel support ready. * * Enable blkio subsystem. */ /* if (jobacct_gather_cgroup_blkio_init(&slurm_cgroup_conf) */ /* != SLURM_SUCCESS) { */ /* xcpuinfo_fini(); */ /* free_slurm_cgroup_conf(&slurm_cgroup_conf); */ /* return SLURM_ERROR; */ /* } */ } verbose("%s loaded", plugin_name); return SLURM_SUCCESS; }
/*
 * Report whether job confinement to cores through cgroups is configured.
 *
 * RET true if the cgroup configuration has constrain_cores set AND the
 *     configured task plugin string contains "cgroup"; false otherwise,
 *     including when the cgroup configuration cannot be read or no task
 *     plugin name is available.
 */
extern bool check_corespec_cgroup_job_confinement(void)
{
	char *task_plugin_type = NULL;
	bool status = false;

	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return false;

	task_plugin_type = slurm_get_task_plugin();

	/*
	 * slurm_get_task_plugin() may hand back NULL; strstr() on a NULL
	 * haystack is undefined behavior, so test the pointer first.
	 */
	if (slurm_cgroup_conf.constrain_cores &&
	    task_plugin_type &&
	    strstr(task_plugin_type, "cgroup"))
		status = true;

	xfree(task_plugin_type);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);

	return status;
}
/*
 * init() is called when the plugin is loaded, before any other functions
 * are called. Put global initialization here.
 *
 * RET SLURM_SUCCESS once the cgroup configuration, the cpuinfo data and the
 *     cgroup internals are all initialized; SLURM_ERROR otherwise, with
 *     any step already completed undone first.
 */
extern int init (void)
{
	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize cpuinfo internal data */
	if (xcpuinfo_init() != XCPUINFO_SUCCESS)
		goto undo_conf;

	/* initialize cgroup internal data */
	if (_slurm_cgroup_init() != SLURM_SUCCESS)
		goto undo_cpuinfo;

	return SLURM_SUCCESS;

undo_cpuinfo:
	xcpuinfo_fini();
undo_conf:
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return SLURM_ERROR;
}
extern void attach_system_cgroup_pid(pid_t pid) { char* slurm_cgpath; if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return; slurm_cgpath = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend); #ifdef MULTIPLE_SLURMD if ( conf->node_name != NULL ) xstrsubstitute(slurm_cgpath,"%n", conf->node_name); else { xfree(slurm_cgpath); slurm_cgpath = (char*) xstrdup("/slurm"); } #endif xstrcat(slurm_cgpath,"/system"); if (xcgroup_ns_load(&slurm_cgroup_conf, &cpuset_ns, "cpuset") == XCGROUP_SUCCESS) { if (xcgroup_load(&cpuset_ns, &system_cpuset_cg, slurm_cgpath) == XCGROUP_SUCCESS) if (attach_system_cpuset_pid(pid) != SLURM_SUCCESS) debug2("system cgroup: unable to attach pid to " "system cpuset cgroup"); } if (xcgroup_ns_load(&slurm_cgroup_conf, &memory_ns, "memory") == XCGROUP_SUCCESS) { if (xcgroup_load(&memory_ns, &system_memory_cg, slurm_cgpath) == XCGROUP_SUCCESS) if (attach_system_memory_pid(pid) != SLURM_SUCCESS) debug2("system cgroup: unable to attach pid to " "system memory cgroup"); } xfree(slurm_cgpath); free_slurm_cgroup_conf(&slurm_cgroup_conf); return; }
/*
 * Set up the system cpuset cgroup.
 *
 * Steps, in order: read the cgroup configuration, create the cpuset
 * namespace, create the slurm root cgroup inside it, make sure that cgroup
 * has its "cpus" parameter populated, then create, instantiate and
 * initialize the "<slurm cgroup path>/system" cgroup.
 *
 * RET SLURM_SUCCESS on success, SLURM_ERROR on any failure. Each failure
 *     path releases what had been acquired up to that point.
 */
extern int init_system_cpuset_cgroup(void)
{
	int rc;
	int fstatus = SLURM_ERROR;
	char* cpus = NULL;
	size_t cpus_size;
	char* slurm_cgpath;
	xcgroup_t slurm_cg;

	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize cpuset cgroup namespace */
	if (xcgroup_ns_create(&slurm_cgroup_conf, &cpuset_ns, "", "cpuset")
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to create cpuset namespace");
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = _system_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/*
	 * Read the "<prefix>cpus" parameter. If the read fails and the
	 * "cpuset." prefix has not been tried yet, switch to that prefix
	 * (the switch is recorded in cpuset_prefix_set, so it happens at
	 * most once) and retry.
	 */
again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size);
	/*
	 * NOTE(review): cpus_size == 1 presumably denotes an empty value
	 * (e.g. only a newline) - confirm against xcgroup_get_param().
	 * cpus_size is only examined when the read succeeded.
	 */
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			xcgroup_ns_destroy(&cpuset_ns);
			free_slurm_cgroup_conf(&slurm_cgroup_conf);
			xfree(cpus);
			return SLURM_ERROR;
		}
	}
	/* the slurm root cgroup handle and the value read are done with */
	xcgroup_destroy(&slurm_cg);
	xfree(cpus);

	/* build system cgroup relative path */
	snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath);
	xfree(slurm_cgpath);

	/* create system cgroup in the cpuset ns */
	if (xcgroup_create(&cpuset_ns, &system_cpuset_cg, system_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (_xcgroup_cpuset_init(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}

	/* the namespace and system cgroup are kept; only the conf is freed */
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	debug("system cgroup: system cpuset cgroup initialized");
	return SLURM_SUCCESS;

	/*
	 * Shared failure exit for the system cgroup creation steps.
	 * NOTE(review): no matching lock call is visible in this function;
	 * confirm that xcgroup_unlock() is safe on an unlocked cgroup.
	 */
error:
	xcgroup_unlock(&system_cpuset_cg);
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_ns_destroy(&cpuset_ns);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return fstatus;
}
extern int init_system_memory_cgroup(void) { int fstatus = SLURM_ERROR; char* slurm_cgpath; /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* initialize memory cgroup namespace */ if (xcgroup_ns_create(&slurm_cgroup_conf, &memory_ns, "", "memory") != XCGROUP_SUCCESS) { error("system cgroup: unable to create memory namespace"); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } constrain_kmem_space = slurm_cgroup_conf.constrain_kmem_space; constrain_ram_space = slurm_cgroup_conf.constrain_ram_space; constrain_swap_space = slurm_cgroup_conf.constrain_swap_space; /* * as the swap space threshold will be configured with a * mem+swp parameter value, if RAM space is not monitored, * set allowed RAM space to 100% of the job requested memory. * It will help to construct the mem+swp value that will be * used for both mem and mem+swp limit during memcg creation. */ if ( constrain_ram_space ) allowed_ram_space = slurm_cgroup_conf.allowed_ram_space; else allowed_ram_space = 100.0; allowed_swap_space = slurm_cgroup_conf.allowed_swap_space; if ((totalram = (uint64_t) conf->real_memory_size) == 0) error ("system cgroup: Unable to get RealMemory size"); max_kmem = _percent_in_bytes(totalram, slurm_cgroup_conf.max_kmem_percent); max_ram = _percent_in_bytes(totalram, slurm_cgroup_conf.max_ram_percent); max_swap = _percent_in_bytes(totalram, slurm_cgroup_conf.max_swap_percent); max_swap += max_ram; min_ram_space = slurm_cgroup_conf.min_ram_space * 1024 * 1024; debug ("system cgroup: memory: total:%luM allowed:%.4g%%(%s), " "swap:%.4g%%(%s), max:%.4g%%(%luM) " "max+swap:%.4g%%(%luM) min:%luM " "kmem:%.4g%%(%luM %s) min:%luM", (unsigned long) totalram, allowed_ram_space, constrain_ram_space?"enforced":"permissive", allowed_swap_space, constrain_swap_space?"enforced":"permissive", slurm_cgroup_conf.max_ram_percent, (unsigned long) (max_ram/(1024*1024)), slurm_cgroup_conf.max_swap_percent, (unsigned long) 
(max_swap/(1024*1024)), (unsigned long) slurm_cgroup_conf.min_ram_space, slurm_cgroup_conf.max_kmem_percent, (unsigned long)(max_kmem/(1024*1024)), constrain_kmem_space?"enforced":"permissive", (unsigned long) slurm_cgroup_conf.min_kmem_space); /* * Warning: OOM Killer must be disabled for slurmstepd * or it would be destroyed if the application use * more memory than permitted * * If an env value is already set for slurmstepd * OOM killer behavior, keep it, otherwise set the * -1000 value, wich means do not let OOM killer kill it * * FYI, setting "export SLURMSTEPD_OOM_ADJ=-1000" * in /etc/sysconfig/slurm would be the same */ setenv("SLURMSTEPD_OOM_ADJ", "-1000", 0); /* create slurm root cg in this cg namespace */ slurm_cgpath = _system_cgroup_create_slurm_cg(&memory_ns); if ( slurm_cgpath == NULL ) { xcgroup_ns_destroy(&memory_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* build system cgroup relative path */ snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath); xfree(slurm_cgpath); /* create system cgroup in the cpuset ns */ if (xcgroup_create(&memory_ns, &system_memory_cg, system_cgroup_path, getuid(), getgid()) != XCGROUP_SUCCESS) { goto error; } if (xcgroup_instantiate(&system_memory_cg) != XCGROUP_SUCCESS) { goto error; } if ( xcgroup_set_param(&system_memory_cg, "memory.use_hierarchy", "1") != XCGROUP_SUCCESS ) { error("system cgroup: unable to ask for hierarchical accounting" "of system memcg '%s'", system_memory_cg.path); goto error; } free_slurm_cgroup_conf(&slurm_cgroup_conf); debug("system cgroup: system memory cgroup initialized"); return SLURM_SUCCESS; error: xcgroup_unlock(&system_memory_cg); xcgroup_destroy(&system_memory_cg); xcgroup_ns_destroy(&memory_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return fstatus; }