/*
 * Look for the cgroup in a specific cgroup namespace that owns
 * a particular pid
 *
 * returned values:
 *  - XCGROUP_ERROR
 *  - XCGROUP_SUCCESS
 */
int xcgroup_ns_find_by_pid(xcgroup_ns_t* cgns, xcgroup_t* cg, pid_t pid)
{
	int fstatus = SLURM_ERROR;
	char file_path[PATH_MAX];
	char* buf;
	size_t fsize;
	char* p;
	char* e;
	char* entry;
	char* subsys;

	/* build pid cgroup meta filepath */
	if (snprintf(file_path, PATH_MAX, "/proc/%u/cgroup",
		     pid) >= PATH_MAX) {
		debug2("unable to build cgroup meta filepath for pid=%u : %m",
		       pid);
		return XCGROUP_ERROR;
	}

	/*
	 * read file content
	 * multiple lines of the form:
	 * num_mask:subsystems:relative_path
	 */
	fstatus = _file_read_content(file_path, &buf, &fsize);
	if (fstatus == XCGROUP_SUCCESS) {
		fstatus = XCGROUP_ERROR;
		p = buf;
		while ((e = index(p, '\n')) != NULL) {
			*e = '\0';
			/* get subsystems entry */
			subsys = index(p, ':');
			p = e + 1;
			if (subsys == NULL)
				continue;
			subsys++;
			/* get relative path entry */
			entry = index(subsys, ':');
			if (entry == NULL)
				continue;
			*entry = '\0';
			/* check subsystem versus ns one */
			if (xstrcmp(cgns->subsystems, subsys) != 0) {
				debug("skipping cgroup subsys %s(%s)",
				      subsys, cgns->subsystems);
				continue;
			}
			entry++;
			fstatus = xcgroup_load(cgns, cg, entry);
			break;
		}
		xfree(buf);
	}

	return fstatus;
}
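For context, a hedged sketch of how a caller might use xcgroup_ns_find_by_pid(): the loop above parses /proc/<pid>/cgroup lines such as "4:cpuset:/slurm/uid_1000/job_53/step_0" and loads the matching cgroup for the namespace's subsystem. The helper below is illustrative only (it is not part of this code); it relies only on the xcgroup_ns_find_by_pid(), xcgroup_add_pids() and xcgroup_destroy() calls already used elsewhere in this listing.

/*
 * Illustrative sketch (not part of Slurm): move an arbitrary pid into
 * whatever cgroup currently owns a reference pid in a given namespace.
 */
static int _example_move_pid(xcgroup_ns_t *ns, pid_t ref_pid, pid_t new_pid)
{
	xcgroup_t cg;

	/* locate the cgroup owning ref_pid in this namespace */
	if (xcgroup_ns_find_by_pid(ns, &cg, ref_pid) != XCGROUP_SUCCESS)
		return XCGROUP_ERROR;

	/* attach new_pid to the same cgroup, then release the handle */
	if (xcgroup_add_pids(&cg, &new_pid, 1) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cg);
		return XCGROUP_ERROR;
	}
	xcgroup_destroy(&cg);
	return XCGROUP_SUCCESS;
}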
extern void attach_system_cgroup_pid(pid_t pid)
{
	char* slurm_cgpath;

	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return;

	slurm_cgpath = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend);
#ifdef MULTIPLE_SLURMD
	if (conf->node_name != NULL)
		xstrsubstitute(slurm_cgpath, "%n", conf->node_name);
	else {
		xfree(slurm_cgpath);
		slurm_cgpath = (char*) xstrdup("/slurm");
	}
#endif
	xstrcat(slurm_cgpath, "/system");

	if (xcgroup_ns_load(&slurm_cgroup_conf, &cpuset_ns, "cpuset")
	    == XCGROUP_SUCCESS) {
		if (xcgroup_load(&cpuset_ns, &system_cpuset_cg, slurm_cgpath)
		    == XCGROUP_SUCCESS)
			if (attach_system_cpuset_pid(pid) != SLURM_SUCCESS)
				debug2("system cgroup: unable to attach pid to "
				       "system cpuset cgroup");
	}

	if (xcgroup_ns_load(&slurm_cgroup_conf, &memory_ns, "memory")
	    == XCGROUP_SUCCESS) {
		if (xcgroup_load(&memory_ns, &system_memory_cg, slurm_cgpath)
		    == XCGROUP_SUCCESS)
			if (attach_system_memory_pid(pid) != SLURM_SUCCESS)
				debug2("system cgroup: unable to attach pid to "
				       "system memory cgroup");
	}

	xfree(slurm_cgpath);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);

	return;
}
/* when cgroups are configured with cpuset, at least
 * cpuset.cpus and cpuset.mems must be set or the cgroup
 * will not be available at all.
 * we duplicate the ancestor configuration in the init step */
static int _xcgroup_cpuset_init(xcgroup_t* cg)
{
	int fstatus, i;
	char* cpuset_metafiles[] = {
		"cpuset.cpus",
		"cpuset.mems"
	};
	char* cpuset_meta;
	char* cpuset_conf;
	size_t csize;
	xcgroup_t acg;
	char* acg_name;
	char* p;

	fstatus = XCGROUP_ERROR;

	/* load ancestor cg */
	acg_name = (char*) xstrdup(cg->name);
	p = rindex(acg_name, '/');
	if (p == NULL) {
		debug2("task/cgroup: unable to get ancestor path for "
		       "cpuset cg '%s' : %m", cg->path);
		xfree(acg_name);
		return fstatus;
	} else
		*p = '\0';
	if (xcgroup_load(cg->ns, &acg, acg_name) != XCGROUP_SUCCESS) {
		debug2("task/cgroup: unable to load ancestor for "
		       "cpuset cg '%s' : %m", cg->path);
		xfree(acg_name);
		return fstatus;
	}
	xfree(acg_name);

	/* inherits ancestor params */
	for (i = 0; i < 2; i++) {
		cpuset_meta = cpuset_metafiles[i];
		if (xcgroup_get_param(&acg, cpuset_meta,
				      &cpuset_conf, &csize)
		    != XCGROUP_SUCCESS) {
			debug2("task/cgroup: assuming no cpuset cg "
			       "support for '%s'", acg.path);
			xcgroup_destroy(&acg);
			return fstatus;
		}
		if (csize > 0)
			cpuset_conf[csize-1] = '\0';
		if (xcgroup_set_param(cg, cpuset_meta, cpuset_conf)
		    != XCGROUP_SUCCESS) {
			debug2("task/cgroup: unable to write %s configuration "
			       "(%s) for cpuset cg '%s'", cpuset_meta,
			       cpuset_conf, cg->path);
			xcgroup_destroy(&acg);
			xfree(cpuset_conf);
			return fstatus;
		}
		xfree(cpuset_conf);
	}

	xcgroup_destroy(&acg);
	return XCGROUP_SUCCESS;
}
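A minimal illustration of what this inheritance does (the paths and values below are hypothetical, shown only to make the copy concrete): the immediate ancestor's cpuset.cpus and cpuset.mems are read and written verbatim into the freshly created child, so the child becomes usable; the actual allocation is narrowed later with xcgroup_set_param().

/*
 * Hypothetical before/after, relative to the cpuset mount point.
 * Ancestor of "/slurm/uid_1000/job_42" is "/slurm/uid_1000":
 *
 *   <cpuset mount>/slurm/uid_1000/cpuset.cpus         = "0-15"
 *   <cpuset mount>/slurm/uid_1000/cpuset.mems         = "0-1"
 *
 * After _xcgroup_cpuset_init() on the job cgroup:
 *
 *   <cpuset mount>/slurm/uid_1000/job_42/cpuset.cpus  = "0-15"
 *   <cpuset mount>/slurm/uid_1000/job_42/cpuset.mems  = "0-1"
 */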
extern int task_cgroup_cpuset_create(slurmd_job_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath;
	xcgroup_t slurm_cg;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if (slurm_cgpath == NULL) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
	rc = xcgroup_get_param(&slurm_cg, "cpuset.cpus", &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as if they were nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path if not set (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (stepid == NO_VAL) {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.batch cpuset cg relative path: %m",
				      jobid);
				return SLURM_ERROR;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_%u", job_cgroup_path, stepid)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.%u cpuset cg relative path: %m",
				      jobid, stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a
	 * release agent that would remove an existing cgroup hierarchy
	 * while we are setting it up. As soon as the step cgroup is
	 * created, we can release the lock.
	 * Indeed, consecutive slurm steps could result in the cg being
	 * removed between the next EEXIST instantiation and the first
	 * addition of a task. The release_agent will have to lock the
	 * root cpuset cgroup to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &user_cpuset_cg,
			   user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistent and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, "cpuset.cpus",
			       &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as if they were nonexistent */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) != XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1] = '\0';
		xstrcat(user_alloc_cores, ",");
		xstrcat(user_alloc_cores, cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, "cpuset.cpus", user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &job_cpuset_cg,
			   job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg, "cpuset.cpus", job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exist)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns, &step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg, "cpuset.cpus", step_alloc_cores);

	/* attach the slurmstepd to the step cpuset cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

error:
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
extern int init_system_cpuset_cgroup(void)
{
	int rc;
	int fstatus = SLURM_ERROR;
	char* cpus = NULL;
	size_t cpus_size;
	char* slurm_cgpath;
	xcgroup_t slurm_cg;

	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize cpuset cgroup namespace */
	if (xcgroup_ns_create(&slurm_cgroup_conf, &cpuset_ns, "", "cpuset")
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to create cpuset namespace");
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = _system_cgroup_create_slurm_cg(&cpuset_ns);
	if (slurm_cgpath == NULL) {
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as if they were nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			xcgroup_ns_destroy(&cpuset_ns);
			free_slurm_cgroup_conf(&slurm_cgroup_conf);
			xfree(cpus);
			return SLURM_ERROR;
		}
	}
	xcgroup_destroy(&slurm_cg);
	xfree(cpus);

	/* build system cgroup relative path */
	snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath);
	xfree(slurm_cgpath);

	/* create system cgroup in the cpuset ns */
	if (xcgroup_create(&cpuset_ns, &system_cpuset_cg, system_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (_xcgroup_cpuset_init(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}

	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	debug("system cgroup: system cpuset cgroup initialized");
	return SLURM_SUCCESS;

error:
	xcgroup_unlock(&system_cpuset_cg);
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_ns_destroy(&cpuset_ns);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return fstatus;
}
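The cpuset_prefix retry above (and in the helpers that follow) appears to handle cpuset hierarchies whose control files are exposed without the "cpuset." prefix, as happens when the controller is mounted with the legacy noprefix option: the code first probes the unprefixed name ("cpus") and falls back to "cpuset.cpus". A minimal sketch of that probing pattern, with a hypothetical helper name, might look like this:

/*
 * Sketch only (not part of Slurm): determine which file naming the
 * mounted cpuset hierarchy actually exposes.
 */
static const char *_probe_cpuset_prefix(xcgroup_t *cg)
{
	char *val = NULL;
	size_t size = 0;

	/* unprefixed name first, as seen with noprefix-style mounts */
	if (xcgroup_get_param(cg, "cpus", &val, &size) == XCGROUP_SUCCESS) {
		xfree(val);
		return "";
	}

	/* otherwise assume the standard "cpuset."-prefixed files */
	xfree(val);
	return "cpuset.";
}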
/* when cgroups are configured with cpuset, at least
 * cpuset.cpus and cpuset.mems must be set or the cgroup
 * will not be available at all.
 * we duplicate the ancestor configuration in the init step */
static int _xcgroup_cpuset_init(xcgroup_t* cg)
{
	int fstatus, i;
	char* cpuset_metafiles[] = {
		"cpus",
		"mems"
	};
	char* cpuset_conf = NULL;
	size_t csize = 0;
	xcgroup_t acg;
	char* acg_name = NULL;
	char* p;

	fstatus = XCGROUP_ERROR;

	/* load ancestor cg */
	acg_name = (char*) xstrdup(cg->name);
	p = xstrrchr(acg_name, '/');
	if (p == NULL) {
		debug2("system cgroup: unable to get ancestor path for "
		       "cpuset cg '%s' : %m", cg->path);
		xfree(acg_name);
		return fstatus;
	} else
		*p = '\0';
	if (xcgroup_load(cg->ns, &acg, acg_name) != XCGROUP_SUCCESS) {
		debug2("system cgroup: unable to load ancestor for "
		       "cpuset cg '%s' : %m", cg->path);
		xfree(acg_name);
		return fstatus;
	}
	xfree(acg_name);

	/* inherits ancestor params */
	for (i = 0; i < 2; i++) {
	again:
		snprintf(cpuset_meta, sizeof(cpuset_meta), "%s%s",
			 cpuset_prefix, cpuset_metafiles[i]);
		if (xcgroup_get_param(&acg, cpuset_meta,
				      &cpuset_conf, &csize)
		    != XCGROUP_SUCCESS) {
			if (!cpuset_prefix_set) {
				cpuset_prefix_set = 1;
				cpuset_prefix = "cpuset.";
				goto again;
			}

			debug("system cgroup: assuming no cpuset cg "
			      "support for '%s'", acg.path);
			xcgroup_destroy(&acg);
			return fstatus;
		}
		if (csize > 0)
			cpuset_conf[csize-1] = '\0';
		if (xcgroup_set_param(cg, cpuset_meta, cpuset_conf)
		    != XCGROUP_SUCCESS) {
			debug("system cgroup: unable to write %s configuration "
			      "(%s) for cpuset cg '%s'", cpuset_meta,
			      cpuset_conf, cg->path);
			xcgroup_destroy(&acg);
			xfree(cpuset_conf);
			return fstatus;
		}
		xfree(cpuset_conf);
	}

	xcgroup_destroy(&acg);
	return XCGROUP_SUCCESS;
}
extern int task_cgroup_cpuset_create(stepd_step_rec_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;
	char cpuset_meta[PATH_MAX];

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath;
	xcgroup_t slurm_cg;

#ifdef HAVE_NATIVE_CRAY
	char expected_usage[32];
#endif

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if (slurm_cgpath == NULL) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}

again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as if they were nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("task/cgroup: unable to build uid %u cgroup "
			      "relative path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path if not set (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;
		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "cpuset cg relative path: %m", jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a
	 * release agent that would remove an existing cgroup hierarchy
	 * while we are setting it up. As soon as the step cgroup is
	 * created, we can release the lock.
	 * Indeed, consecutive slurm steps could result in the cg being
	 * removed between the next EEXIST instantiation and the first
	 * addition of a task. The release_agent will have to lock the
	 * root cpuset cgroup to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &user_cpuset_cg,
			   user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistent and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, cpuset_meta,
			       &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as if they were nonexistent */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) != XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1] = '\0';
		xstrcat(user_alloc_cores, ",");
		xstrcat(user_alloc_cores, cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, cpuset_meta, user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &job_cpuset_cg,
			   job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg, cpuset_meta, job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exist)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns, &step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg, cpuset_meta, step_alloc_cores);

	/*
	 * on Cray systems, set the expected usage in bytes.
	 * This is used by the Cray OOM killer
	 */
#ifdef HAVE_NATIVE_CRAY
	snprintf(expected_usage, sizeof(expected_usage), "%"PRIu64,
		 (uint64_t)job->step_mem * 1024 * 1024);
	xcgroup_set_param(&step_cpuset_cg, "expected_usage_in_bytes",
			  expected_usage);
#endif

	/* attach the slurmstepd to the step cpuset cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

	/* validate the requested cpu frequency and set it */
	cpu_freq_cgroup_validate(job, step_alloc_cores);

error:
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
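For the HAVE_NATIVE_CRAY branch above, job->step_mem appears to be a per-step memory limit expressed in megabytes (hence the 1024 * 1024 scaling before it is written to expected_usage_in_bytes). A worked example with a hypothetical 2048 MB step limit:

/* hypothetical 2048 MB step limit -> expected_usage_in_bytes = "2147483648" */
uint64_t expected_bytes = (uint64_t)2048 * 1024 * 1024;	/* 2147483648 bytes */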