/*
 * fini_system_cgroup - release the system cgroup state held by this file.
 *
 * Destroys the system cpuset and memory xcgroup descriptors and then the
 * cpuset and memory namespaces they were created in.
 */
extern void fini_system_cgroup(void)
{
	/* cgroup descriptors first ... */
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_destroy(&system_memory_cg);

	/* ... then the namespaces */
	xcgroup_ns_destroy(&cpuset_ns);
	xcgroup_ns_destroy(&memory_ns);
}
int _slurm_cgroup_destroy(void) { xcgroup_lock(&freezer_cg); if (jobstep_cgroup_path[0] != '\0') { if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) { error("_slurm_cgroup_destroy: problem deleting step " "cgroup path %s: %m", step_freezer_cg.path); xcgroup_unlock(&freezer_cg); return SLURM_ERROR; } xcgroup_destroy(&step_freezer_cg); } if (job_cgroup_path[0] != '\0') { xcgroup_delete(&job_freezer_cg); xcgroup_destroy(&job_freezer_cg); } if (user_cgroup_path[0] != '\0') { xcgroup_delete(&user_freezer_cg); xcgroup_destroy(&user_freezer_cg); } if (slurm_freezer_init) { xcgroup_destroy(&slurm_freezer_cg); } xcgroup_unlock(&freezer_cg); xcgroup_destroy(&freezer_cg); xcgroup_ns_destroy(&freezer_ns); return SLURM_SUCCESS; }
/*
 * xcgroup_ns_create - set up a cgroup namespace descriptor for a subsystem
 *
 * Fills cgns with the mount point ("<cgroup_mountpoint>/<subsys>"), mount
 * arguments, subsystem name and release agent path
 * ("<cgroup_release_agent>/release_<subsys>"). If the namespace is not
 * available and conf->cgroup_automount is set, a mount is attempted.
 * On any failure cgns is destroyed before returning.
 *
 * returned values:
 *  - XCGROUP_ERROR
 *  - XCGROUP_SUCCESS
 */
int xcgroup_ns_create(slurm_cgroup_conf_t *conf,
		      xcgroup_ns_t *cgns, char *mnt_args, char *subsys)
{
	cgns->mnt_point = xstrdup_printf("%s/%s",
					 conf->cgroup_mountpoint, subsys);
	cgns->mnt_args = xstrdup(mnt_args);
	cgns->subsystems = xstrdup(subsys);
	cgns->notify_prog = xstrdup_printf("%s/release_%s",
					   conf->cgroup_release_agent, subsys);

	/* nothing more to do when the namespace is already usable */
	if (xcgroup_ns_is_available(cgns))
		return XCGROUP_SUCCESS;

	if (!conf->cgroup_automount) {
		error("cgroup namespace '%s' not mounted. aborting", subsys);
		xcgroup_ns_destroy(cgns);
		return XCGROUP_ERROR;
	}

	if (xcgroup_ns_mount(cgns)) {
		error("unable to mount %s cgroup namespace: %s",
		      subsys, slurm_strerror(errno));
		xcgroup_ns_destroy(cgns);
		return XCGROUP_ERROR;
	}

	info("cgroup namespace '%s' is now mounted", subsys);
	return XCGROUP_SUCCESS;
}
int _slurm_cgroup_destroy(void) { if (slurm_freezer_init) xcgroup_lock(&slurm_freezer_cg); if (jobstep_cgroup_path[0] != '\0') { if ( xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS ) { if (slurm_freezer_init) xcgroup_unlock(&slurm_freezer_cg); return SLURM_ERROR; } xcgroup_destroy(&step_freezer_cg); } if (job_cgroup_path[0] != '\0') { xcgroup_delete(&job_freezer_cg); xcgroup_destroy(&job_freezer_cg); } if (user_cgroup_path[0] != '\0') { xcgroup_delete(&user_freezer_cg); xcgroup_destroy(&user_freezer_cg); } if (slurm_freezer_init) { xcgroup_unlock(&slurm_freezer_cg); xcgroup_destroy(&slurm_freezer_cg); } xcgroup_ns_destroy(&freezer_ns); return SLURM_SUCCESS; }
int _slurm_cgroup_init(void) { /* initialize user/job/jobstep cgroup relative paths * and release agent path */ user_cgroup_path[0]='\0'; job_cgroup_path[0]='\0'; jobstep_cgroup_path[0]='\0'; /* initialize freezer cgroup namespace */ if (xcgroup_ns_create(&slurm_cgroup_conf, &freezer_ns, "", "freezer") != XCGROUP_SUCCESS) { error("unable to create freezer cgroup namespace"); return SLURM_ERROR; } /* initialize the root freezer cg */ if (xcgroup_create(&freezer_ns, &freezer_cg, "", 0, 0) != XCGROUP_SUCCESS) { error("proctrack/cgroup unable to create root freezer xcgroup"); xcgroup_ns_destroy(&freezer_ns); return SLURM_ERROR; } return SLURM_SUCCESS; }
/*
 * jobacct_gather_cgroup_cpuacct_fini - release the cpuacct cgroup state.
 *
 * Does nothing unless the user, job and jobstep relative paths are all
 * set. Otherwise slurmstepd is written back into the root cpuacct cgroup's
 * "tasks" file, the xcgroup descriptors are destroyed, the paths are
 * cleared and the namespace is destroyed.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int jobacct_gather_cgroup_cpuacct_fini(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t root_cg;

	if (!(user_cgroup_path[0] && job_cgroup_path[0] &&
	      jobstep_cgroup_path[0]))
		return SLURM_SUCCESS;

	/*
	 * Put slurmstepd back in the root cpuacct cg.
	 * The release_agent is then called asynchronously for the step
	 * cgroup and takes care of the actual cleanup.
	 */
	if (xcgroup_create(&cpuacct_ns, &root_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&root_cg, "tasks", getpid());
		xcgroup_destroy(&root_cg);
	}

	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_memory_fini - remove the step/job/user memory cgroups.
 *
 * Does nothing unless the user, job and jobstep relative paths are all
 * set. Deletion failures are only logged; the function always returns
 * SLURM_SUCCESS.
 */
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/*
	 * Lock the root memcg and try to remove the different memcgs.
	 * The reason why we are locking here is that if a concurrent
	 * step is in the process of being executed, it could try to
	 * create the step memcg just after we remove the job memcg,
	 * resulting in a failure.
	 * First, delete the step memcg as all the tasks have now exited.
	 * Then, try to remove the job memcg.
	 * If it fails, it is due to the fact that it is still in use by
	 * another running step.
	 * After that, try to remove the user memcg. If it fails, it is due
	 * to jobs that are still running for the same user on the node or
	 * because of tasks attached directly to the user cg by another
	 * component (PAM). The user memcg was created with the
	 * notify_on_release=1 flag (default) so it will be removed
	 * automatically after that.
	 * For now, do not try to detect if only externally attached tasks
	 * are present to see if they can be moved to an orphan memcg.
	 * That could be done in the future, if it is necessary.
	 */
	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
			/*
			 * xcgroup_delete() results are compared against
			 * XCGROUP_SUCCESS, like the other xcgroup calls here.
			 */
			if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "memcg : %m");
			if (xcgroup_delete(&job_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job memcg : %m");
			if (xcgroup_delete(&user_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user memcg : %m");
			xcgroup_unlock(&memory_cg);
		} else {
			error("task/cgroup: unable to lock root memcg : %m");
		}
		xcgroup_destroy(&memory_cg);
	} else {
		error("task/cgroup: unable to create root memcg : %m");
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_cpuset_init - set up the cpuset cgroup namespace.
 *
 * Initializes the cpuinfo data, clears the user/job/jobstep relative
 * paths, builds the release agent path and creates the cpuset namespace.
 * If the namespace is not available it is mounted when
 * slurm_cgroup_conf->cgroup_automount is set; otherwise this is an error.
 * On failure the cpuinfo data is finalized again.
 *
 * returned values:
 *  - SLURM_ERROR
 *  - SLURM_SUCCESS
 */
extern int task_cgroup_cpuset_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	char release_agent_path[PATH_MAX];
	int path_len;

	/* cpuinfo internal data */
	if (xcpuinfo_init() != XCPUINFO_SUCCESS)
		return SLURM_ERROR;

	/* user/job/jobstep cgroup relative paths start out empty */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	/* release agent path for the cpuset namespace */
	release_agent_path[0] = '\0';
	path_len = snprintf(release_agent_path, PATH_MAX, "%s/release_cpuset",
			    slurm_cgroup_conf->cgroup_release_agent);
	if (path_len >= PATH_MAX) {
		error("task/cgroup: unable to build cpuset release agent path");
		goto error;
	}

	if (xcgroup_ns_create(slurm_cgroup_conf, &cpuset_ns, "/cpuset", "",
			      "cpuset", release_agent_path)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create cpuset namespace");
		goto error;
	}

	/* done if the cpuset namespace is already available */
	if (xcgroup_ns_is_available(&cpuset_ns))
		return SLURM_SUCCESS;

	if (!slurm_cgroup_conf->cgroup_automount) {
		error("task/cgroup: cpuset namespace not mounted. aborting");
		goto clean;
	}

	if (xcgroup_ns_mount(&cpuset_ns)) {
		error("task/cgroup: unable to mount cpuset namespace");
		goto clean;
	}

	info("task/cgroup: cpuset namespace is now mounted");
	return SLURM_SUCCESS;

clean:
	xcgroup_ns_destroy(&cpuset_ns);

error:
	xcpuinfo_fini();
	return SLURM_ERROR;
}
/*
 * jobacct_gather_cgroup_memory_init - set up the memory cgroup namespace.
 *
 * Clears the user/job/jobstep relative paths, builds the release agent
 * path and creates the memory namespace. If the namespace is not
 * available it is mounted when slurm_cgroup_conf->cgroup_automount is
 * set; otherwise this is an error and the namespace is destroyed.
 *
 * returned values:
 *  - SLURM_ERROR
 *  - SLURM_SUCCESS
 */
extern int jobacct_gather_cgroup_memory_init(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	char release_agent_path[PATH_MAX];
	int path_len;

	/* user/job/jobstep cgroup relative paths start out empty */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	/* release agent path for the memory namespace */
	release_agent_path[0] = '\0';
	path_len = snprintf(release_agent_path, PATH_MAX, "%s/release_memory",
			    slurm_cgroup_conf->cgroup_release_agent);
	if (path_len >= PATH_MAX) {
		error("jobacct_gather/cgroup: unable to build memory release agent path");
		return SLURM_ERROR;
	}

	if (xcgroup_ns_create(slurm_cgroup_conf, &memory_ns, "/memory", "",
			      "memory", release_agent_path)
	    != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create memory namespace");
		return SLURM_ERROR;
	}

	/* done if the memory namespace is already available */
	if (xcgroup_ns_is_available(&memory_ns))
		return SLURM_SUCCESS;

	if (!slurm_cgroup_conf->cgroup_automount) {
		error("jobacct_gather/cgroup: memory namespace not mounted. aborting");
		xcgroup_ns_destroy(&memory_ns);
		return SLURM_ERROR;
	}

	if (xcgroup_ns_mount(&memory_ns)) {
		error("jobacct_gather/cgroup: unable to mount memory namespace");
		xcgroup_ns_destroy(&memory_ns);
		return SLURM_ERROR;
	}

	info("jobacct_gather/cgroup: memory namespace is now mounted");
	return SLURM_SUCCESS;
}
/*
 * task_cgroup_devices_fini - remove the step/job/user devices cgroups.
 *
 * The root devices cgroup is created and locked, slurmstepd is moved into
 * it, and the step, job and user cgroups are deleted in that order.
 * Failures are only logged. Afterwards the xcgroup descriptors whose
 * relative path is set are destroyed, the paths and the allowed devices
 * file name are cleared, and the namespace and cpuinfo data are released.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t devices_cg;

	/* Similarly to task_cgroup_{memory,cpuset}_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started. */
	if (xcgroup_create(&devices_ns, &devices_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&devices_cg) == XCGROUP_SUCCESS) {
			/* First move slurmstepd to the root devices cg
			 * so we can remove the step/job/user devices
			 * cg's. */
			xcgroup_move_process(&devices_cg, getpid());

			/*
			 * xcgroup_delete() results are compared against
			 * XCGROUP_SUCCESS, like the other xcgroup calls here.
			 */
			if (xcgroup_delete(&step_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "devices : %m");
			if (xcgroup_delete(&job_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job devices : %m");
			if (xcgroup_delete(&user_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user devices : %m");
			xcgroup_unlock(&devices_cg);
		} else {
			error("task/cgroup: unable to lock root devices : %m");
		}
		xcgroup_destroy(&devices_cg);
	} else {
		error("task/cgroup: unable to create root devices : %m");
	}

	if (user_cgroup_path[0] != '\0')
		xcgroup_destroy(&user_devices_cg);
	if (job_cgroup_path[0] != '\0')
		xcgroup_destroy(&job_devices_cg);
	if (jobstep_cgroup_path[0] != '\0')
		xcgroup_destroy(&step_devices_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	cgroup_allowed_devices_file[0] = '\0';

	xcgroup_ns_destroy(&devices_ns);

	xcpuinfo_fini();
	return SLURM_SUCCESS;
}
/*
 * task_cgroup_cpuset_fini - release the cpuset cgroup state.
 *
 * Destroys each of the user, job and step cpuset xcgroup descriptors
 * whose relative path is set, clears the paths and destroys the cpuset
 * namespace.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	if (*user_cgroup_path)
		xcgroup_destroy(&user_cpuset_cg);

	if (*job_cgroup_path)
		xcgroup_destroy(&job_cpuset_cg);

	if (*jobstep_cgroup_path)
		xcgroup_destroy(&step_cpuset_cg);

	/* forget the relative paths */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}
int _slurm_cgroup_destroy(void) { xcgroup_lock(&freezer_cg); /* * First move slurmstepd process to the root cgroup, otherwise * the rmdir(2) triggered by the calls below will always fail, * because slurmstepd is still in the cgroup! */ _move_current_to_root_cgroup(&freezer_ns); if (jobstep_cgroup_path[0] != '\0') { if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) { debug("_slurm_cgroup_destroy: problem deleting step cgroup path %s: %m", step_freezer_cg.path); xcgroup_unlock(&freezer_cg); return SLURM_ERROR; } xcgroup_destroy(&step_freezer_cg); } if (job_cgroup_path[0] != '\0') { xcgroup_delete(&job_freezer_cg); xcgroup_destroy(&job_freezer_cg); } if (user_cgroup_path[0] != '\0') { xcgroup_delete(&user_freezer_cg); xcgroup_destroy(&user_freezer_cg); } if (slurm_freezer_init) { xcgroup_destroy(&slurm_freezer_cg); } xcgroup_unlock(&freezer_cg); xcgroup_destroy(&freezer_cg); xcgroup_ns_destroy(&freezer_ns); return SLURM_SUCCESS; }
/*
 * task_cgroup_devices_fini - release the devices cgroup state.
 *
 * Destroys each of the user, job and step devices xcgroup descriptors
 * whose relative path is set, clears the paths and the allowed devices
 * file name, destroys the devices namespace and finalizes the cpuinfo
 * data.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	if (*user_cgroup_path)
		xcgroup_destroy(&user_devices_cg);

	if (*job_cgroup_path)
		xcgroup_destroy(&job_devices_cg);

	if (*jobstep_cgroup_path)
		xcgroup_destroy(&step_devices_cg);

	/* forget the relative paths and the allowed devices file */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';
	cgroup_allowed_devices_file[0] = '\0';

	xcgroup_ns_destroy(&devices_ns);
	xcpuinfo_fini();

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_memory_fini - release the memory cgroup state of a step.
 *
 * Does nothing unless the user, job and jobstep relative paths are all
 * set. Otherwise slurmstepd is moved to the root memory cgroup, the step
 * memcg is deleted, all descriptors are destroyed, the paths are cleared
 * and the namespace is destroyed.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t root_cg;

	if (!(user_cgroup_path[0] && job_cgroup_path[0] &&
	      jobstep_cgroup_path[0]))
		return SLURM_SUCCESS;

	/*
	 * Put slurmstepd back in the root memory cg, then remove[*] the
	 * step cgroup so that its allocated pages move to its parent.
	 *
	 * [*] rmdir(2) on an empty cgroup moves all resident charged pages
	 *     to the parent (i.e. the job cgroup). (force_empty would only
	 *     flush clean pages.) Resident pagecache pages therefore stay
	 *     associated with the job. The job epilog is expected to
	 *     optionally force_empty the job cgroup (to flush pagecache)
	 *     and then rmdir(2) it, or to wait for the release
	 *     notification from the kernel.
	 */
	if (xcgroup_create(&memory_ns, &root_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		xcgroup_move_process(&root_cg, getpid());
		xcgroup_destroy(&root_cg);

		if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
			error("cgroup: rmdir step memcg failed: %m");
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
/*
 * jobacct_gather_cgroup_memory_fini - release the memory cgroup state.
 *
 * Does nothing unless the user, job and jobstep relative paths are all
 * set. Otherwise slurmstepd is written back into the root memory cgroup's
 * "tasks" file, "memory.force_empty" is set on the step cgroup, all
 * descriptors are destroyed, the paths are cleared and the namespace is
 * destroyed.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int jobacct_gather_cgroup_memory_fini(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t root_cg;

	if (!(user_cgroup_path[0] && job_cgroup_path[0] &&
	      jobstep_cgroup_path[0]))
		return SLURM_SUCCESS;

	/*
	 * Put slurmstepd back in the root memory cg and force empty the
	 * step cgroup so that its allocated pages move to its parent.
	 * The release_agent is then called asynchronously for the step
	 * cgroup and takes care of the actual cleanup.
	 * Ideally the memcg implementation would apply this force_empty
	 * mechanism itself when the last task managed by a cgroup ends;
	 * handling that cleanup correctly from here is too difficult and
	 * near impossible with current memcg.
	 */
	if (xcgroup_create(&memory_ns, &root_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&root_cg, "tasks", getpid());
		xcgroup_destroy(&root_cg);
		xcgroup_set_param(&step_memory_cg, "memory.force_empty", "1");
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_devices_init - set up the devices cgroup namespace.
 *
 * Initializes the cpuinfo data, clears the user/job/jobstep relative
 * paths, loads the gres node configuration, records the allowed devices
 * file name from the configuration and creates the "devices" namespace.
 * On failure the namespace and cpuinfo data are released again.
 *
 * returned values:
 *  - SLURM_ERROR
 *  - SLURM_SUCCESS
 */
extern int task_cgroup_devices_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	uint16_t cpunum;

	/* initialize cpuinfo internal data */
	if (xcpuinfo_init() != XCPUINFO_SUCCESS)
		return SLURM_ERROR;

	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	/* initialize allowed_devices_filename */
	cgroup_allowed_devices_file[0] = '\0';

	if (get_procs(&cpunum) != 0) {
		error("task/cgroup: unable to get a number of CPU");
		goto error;
	}

	(void) gres_plugin_node_config_load(cpunum, conf->node_name, NULL);

	/*
	 * Refuse a configured path that does not fit instead of letting
	 * strcpy() write past the end of the destination buffer.
	 * NOTE(review): sizeof assumes cgroup_allowed_devices_file is a
	 * char array (it is written to here without any allocation) --
	 * confirm against its declaration.
	 */
	if (strlen(slurm_cgroup_conf->allowed_devices_file) >=
	    sizeof(cgroup_allowed_devices_file)) {
		error("task/cgroup: allowed devices file path is too long");
		goto error;
	}
	strcpy(cgroup_allowed_devices_file,
	       slurm_cgroup_conf->allowed_devices_file);

	if (xcgroup_ns_create(slurm_cgroup_conf, &devices_ns, "", "devices")
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create devices namespace");
		goto error;
	}

	return SLURM_SUCCESS;

error:
	xcgroup_ns_destroy(&devices_ns);
	xcpuinfo_fini();
	return SLURM_ERROR;
}
extern int init_system_memory_cgroup(void) { int fstatus = SLURM_ERROR; char* slurm_cgpath; /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* initialize memory cgroup namespace */ if (xcgroup_ns_create(&slurm_cgroup_conf, &memory_ns, "", "memory") != XCGROUP_SUCCESS) { error("system cgroup: unable to create memory namespace"); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } constrain_kmem_space = slurm_cgroup_conf.constrain_kmem_space; constrain_ram_space = slurm_cgroup_conf.constrain_ram_space; constrain_swap_space = slurm_cgroup_conf.constrain_swap_space; /* * as the swap space threshold will be configured with a * mem+swp parameter value, if RAM space is not monitored, * set allowed RAM space to 100% of the job requested memory. * It will help to construct the mem+swp value that will be * used for both mem and mem+swp limit during memcg creation. */ if ( constrain_ram_space ) allowed_ram_space = slurm_cgroup_conf.allowed_ram_space; else allowed_ram_space = 100.0; allowed_swap_space = slurm_cgroup_conf.allowed_swap_space; if ((totalram = (uint64_t) conf->real_memory_size) == 0) error ("system cgroup: Unable to get RealMemory size"); max_kmem = _percent_in_bytes(totalram, slurm_cgroup_conf.max_kmem_percent); max_ram = _percent_in_bytes(totalram, slurm_cgroup_conf.max_ram_percent); max_swap = _percent_in_bytes(totalram, slurm_cgroup_conf.max_swap_percent); max_swap += max_ram; min_ram_space = slurm_cgroup_conf.min_ram_space * 1024 * 1024; debug ("system cgroup: memory: total:%luM allowed:%.4g%%(%s), " "swap:%.4g%%(%s), max:%.4g%%(%luM) " "max+swap:%.4g%%(%luM) min:%luM " "kmem:%.4g%%(%luM %s) min:%luM", (unsigned long) totalram, allowed_ram_space, constrain_ram_space?"enforced":"permissive", allowed_swap_space, constrain_swap_space?"enforced":"permissive", slurm_cgroup_conf.max_ram_percent, (unsigned long) (max_ram/(1024*1024)), slurm_cgroup_conf.max_swap_percent, (unsigned long) 
(max_swap/(1024*1024)), (unsigned long) slurm_cgroup_conf.min_ram_space, slurm_cgroup_conf.max_kmem_percent, (unsigned long)(max_kmem/(1024*1024)), constrain_kmem_space?"enforced":"permissive", (unsigned long) slurm_cgroup_conf.min_kmem_space); /* * Warning: OOM Killer must be disabled for slurmstepd * or it would be destroyed if the application use * more memory than permitted * * If an env value is already set for slurmstepd * OOM killer behavior, keep it, otherwise set the * -1000 value, wich means do not let OOM killer kill it * * FYI, setting "export SLURMSTEPD_OOM_ADJ=-1000" * in /etc/sysconfig/slurm would be the same */ setenv("SLURMSTEPD_OOM_ADJ", "-1000", 0); /* create slurm root cg in this cg namespace */ slurm_cgpath = _system_cgroup_create_slurm_cg(&memory_ns); if ( slurm_cgpath == NULL ) { xcgroup_ns_destroy(&memory_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* build system cgroup relative path */ snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath); xfree(slurm_cgpath); /* create system cgroup in the cpuset ns */ if (xcgroup_create(&memory_ns, &system_memory_cg, system_cgroup_path, getuid(), getgid()) != XCGROUP_SUCCESS) { goto error; } if (xcgroup_instantiate(&system_memory_cg) != XCGROUP_SUCCESS) { goto error; } if ( xcgroup_set_param(&system_memory_cg, "memory.use_hierarchy", "1") != XCGROUP_SUCCESS ) { error("system cgroup: unable to ask for hierarchical accounting" "of system memcg '%s'", system_memory_cg.path); goto error; } free_slurm_cgroup_conf(&slurm_cgroup_conf); debug("system cgroup: system memory cgroup initialized"); return SLURM_SUCCESS; error: xcgroup_unlock(&system_memory_cg); xcgroup_destroy(&system_memory_cg); xcgroup_ns_destroy(&memory_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return fstatus; }
/*
 * jobacct_gather_cgroup_cpuacct_fini - remove the cpuacct cgroups of a step.
 *
 * Does nothing unless the user, job, jobstep and task relative paths are
 * all set. Otherwise slurmstepd is written back into the root cpuacct
 * cgroup's "tasks" file, the root cgroup is locked, and the task, step,
 * job and user cgroups are deleted in that order. Deletion failures are
 * only logged. Finally all descriptors are destroyed, the paths are
 * cleared and the namespace is destroyed.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuacct_cg;
	bool root_ok = false;
	bool lock_ok = false;
	int cc;

	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0' ||
	    task_cgroup_path[0] == 0)
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root cpuacct cg.
	 * The release_agent will asynchronously be called for the step
	 * cgroup. It will do the necessary cleanup.
	 *
	 * cpuacct_cg is only locked, printed and destroyed when
	 * xcgroup_create() succeeded, because it is an uninitialized local
	 * otherwise.
	 */
	if (xcgroup_create(&cpuacct_ns, &cpuacct_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		root_ok = true;
		xcgroup_set_uint32_param(&cpuacct_cg, "tasks", getpid());

		/* Lock the root of the cgroup and remove the subdirectories
		 * related to this job.
		 */
		if (xcgroup_lock(&cpuacct_cg) == XCGROUP_SUCCESS) {
			lock_ok = true;
		} else {
			error("%s: failed to flock() %s %m",
			      __func__, cpuacct_cg.path);
		}
	} else {
		error("%s: unable to create root cpuacct xcgroup", __func__);
	}

	/* Clean up starting from the leaves way up, the
	 * reverse order in which the cgroups were created.
	 */
	for (cc = 0; cc <= max_task_id; cc++) {
		xcgroup_t cgroup;
		char buf[PATH_MAX];
		int len;

		/* only .path is set below; keep the other fields defined */
		memset(&cgroup, 0, sizeof(cgroup));

		/* rmdir all tasks this running slurmstepd
		 * was responsible for.
		 */
		len = snprintf(buf, sizeof(buf), "%s%s/task_%d",
			       cpuacct_ns.mnt_point,
			       jobstep_cgroup_path,
			       cc);
		if (len < 0 || len >= (int) sizeof(buf)) {
			debug2("%s: cgroup path of task %d is too long",
			       __func__, cc);
			continue;
		}
		cgroup.path = buf;

		if (strstr(buf, "step_extern"))
			kill_extern_procs(cgroup.path);

		if (xcgroup_delete(&cgroup) != XCGROUP_SUCCESS) {
			debug2("%s: failed to delete %s %m", __func__, buf);
		}
	}

	if (xcgroup_delete(&step_cpuacct_cg) != XCGROUP_SUCCESS) {
		/* report the step cgroup's own path, not the root's */
		debug2("%s: failed to delete %s %m", __func__,
		       step_cpuacct_cg.path);
	}

	if (xcgroup_delete(&job_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       job_cpuacct_cg.path);
	}

	if (xcgroup_delete(&user_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       user_cpuacct_cg.path);
	}

	if (lock_ok)
		xcgroup_unlock(&cpuacct_cg);

	xcgroup_destroy(&task_cpuacct_cg);
	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);
	if (root_ok)
		xcgroup_destroy(&cpuacct_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';
	task_cgroup_path[0] = 0;

	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_cpuset_fini - remove the step/job/user cpuset cgroups.
 *
 * The root cpuset cgroup is created and locked, slurmstepd is moved into
 * it, and once its pid has left the step cgroup (or MAX_MOVE_WAIT checks
 * have found it still there) the step, job and user cgroups are deleted
 * in that order. Failures are only logged. Afterwards the descriptors
 * whose relative path is set are destroyed, the paths are cleared and the
 * namespace is destroyed.
 *
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuset_cg;

	/* Similarly to task_cgroup_memory_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started. */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&cpuset_cg) == XCGROUP_SUCCESS) {
			int i = 0, npids = 0, cnt = 0;
			pid_t *pids = NULL;

			/* First move slurmstepd to the root cpuset cg
			 * so we can remove the step/job/user cpuset
			 * cg's. */
			xcgroup_move_process(&cpuset_cg, getpid());

			/* There is a delay in the cgroup system when moving the
			 * pid from one cgroup to another. This is usually
			 * short, but we need to wait to make sure the pid is
			 * out of the step cgroup or we will incur an error
			 * leaving the cgroup unable to be removed.
			 */
			do {
				/*
				 * Reset the outputs so that a failed lookup
				 * cannot leave a stale count paired with a
				 * freed (NULL) pid list.
				 */
				pids = NULL;
				npids = 0;
				xcgroup_get_pids(&step_cpuset_cg,
						 &pids, &npids);
				for (i = 0; i < npids; i++) {
					if (pids[i] == getpid()) {
						cnt++;
						break;
					}
				}
				xfree(pids);
			} while ((i < npids) && (cnt < MAX_MOVE_WAIT));

			if (cnt < MAX_MOVE_WAIT)
				debug3("Took %d checks before stepd pid was removed from the step cgroup.",
				       cnt);
			else
				error("Pid %d is still in the step cgroup. It might be left uncleaned after the job.",
				      getpid());

			/*
			 * xcgroup_delete() results are compared against
			 * XCGROUP_SUCCESS, like the other xcgroup calls here.
			 */
			if (xcgroup_delete(&step_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "cpuset : %m");
			if (xcgroup_delete(&job_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job cpuset : %m");
			if (xcgroup_delete(&user_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user cpuset : %m");
			xcgroup_unlock(&cpuset_cg);
		} else {
			error("task/cgroup: unable to lock root cpuset : %m");
		}
		xcgroup_destroy(&cpuset_cg);
	} else {
		error("task/cgroup: unable to create root cpuset : %m");
	}

	if (user_cgroup_path[0] != '\0')
		xcgroup_destroy(&user_cpuset_cg);
	if (job_cgroup_path[0] != '\0')
		xcgroup_destroy(&job_cpuset_cg);
	if (jobstep_cgroup_path[0] != '\0')
		xcgroup_destroy(&step_cpuset_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}
extern int init_system_cpuset_cgroup(void) { int rc; int fstatus = SLURM_ERROR; char* cpus = NULL; size_t cpus_size; char* slurm_cgpath; xcgroup_t slurm_cg; /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* initialize cpuset cgroup namespace */ if (xcgroup_ns_create(&slurm_cgroup_conf, &cpuset_ns, "", "cpuset") != XCGROUP_SUCCESS) { error("system cgroup: unable to create cpuset namespace"); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* create slurm root cg in this cg namespace */ slurm_cgpath = _system_cgroup_create_slurm_cg(&cpuset_ns); if ( slurm_cgpath == NULL ) { xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* check that this cgroup has cpus allowed or initialize them */ if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath) != XCGROUP_SUCCESS) { error("system cgroup: unable to load slurm cpuset xcgroup"); xfree(slurm_cgpath); xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } again: snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix); rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size); if (rc != XCGROUP_SUCCESS || cpus_size == 1) { if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) { cpuset_prefix_set = 1; cpuset_prefix = "cpuset."; goto again; } /* initialize the cpusets as it was nonexistent */ if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) { xfree(slurm_cgpath); xcgroup_destroy(&slurm_cg); xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); xfree(cpus); return SLURM_ERROR; } } xcgroup_destroy(&slurm_cg); xfree(cpus); /* build system cgroup relative path */ snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath); xfree(slurm_cgpath); /* create system cgroup in the cpuset ns */ if (xcgroup_create(&cpuset_ns, &system_cpuset_cg, system_cgroup_path, getuid(),getgid()) != XCGROUP_SUCCESS) { goto error; } if 
(xcgroup_instantiate(&system_cpuset_cg) != XCGROUP_SUCCESS) { goto error; } if (_xcgroup_cpuset_init(&system_cpuset_cg) != XCGROUP_SUCCESS) { goto error; } free_slurm_cgroup_conf(&slurm_cgroup_conf); debug("system cgroup: system cpuset cgroup initialized"); return SLURM_SUCCESS; error: xcgroup_unlock(&system_cpuset_cg); xcgroup_destroy(&system_cpuset_cg); xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return fstatus; }