/*
 * _slurm_cgroup_destroy - remove the step/job/user freezer cgroups
 * created for this job step and release the associated handles.
 *
 * Cleanup proceeds leaves-up (step, then job, then user) so each rmdir
 * targets an empty directory. The whole sequence runs under the slurm
 * freezer cgroup lock (when it was initialized) to avoid racing with a
 * concurrent step creating the same job/user hierarchy.
 *
 * Returns SLURM_ERROR if the step freezer cgroup cannot be deleted
 * (e.g. tasks are still attached); job/user deletions are best-effort
 * since other steps of the same job/user may still be using them.
 */
int _slurm_cgroup_destroy(void)
{
	/* serialize with other steps touching the freezer hierarchy */
	if (slurm_freezer_init)
		xcgroup_lock(&slurm_freezer_cg);

	if (jobstep_cgroup_path[0] != '\0') {
		if ( xcgroup_delete(&step_freezer_cg)
		     != XCGROUP_SUCCESS ) {
			/* step cg busy: drop the lock and report failure */
			if (slurm_freezer_init)
				xcgroup_unlock(&slurm_freezer_cg);
			return SLURM_ERROR;
		}
		xcgroup_destroy(&step_freezer_cg);
	}

	/* best-effort: the job cg may still host other running steps */
	if (job_cgroup_path[0] != '\0') {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	/* best-effort: the user cg may still host other jobs */
	if (user_cgroup_path[0] != '\0') {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	if (slurm_freezer_init) {
		xcgroup_unlock(&slurm_freezer_cg);
		xcgroup_destroy(&slurm_freezer_cg);
	}

	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;
}
/*
 * _slurm_cgroup_destroy - remove the step/job/user freezer cgroups
 * and tear down the freezer namespace, under the root freezer lock.
 *
 * The step cgroup must be removable (no remaining tasks) or SLURM_ERROR
 * is returned; job and user cgroup removal is best-effort because other
 * steps/jobs may still be using them.
 *
 * NOTE(review): on the error path the root freezer_cg is unlocked but
 * not destroyed — presumably its handle is reused or cleaned elsewhere;
 * confirm against the caller.
 */
int _slurm_cgroup_destroy(void)
{
	xcgroup_lock(&freezer_cg);

	if (jobstep_cgroup_path[0] != '\0') {
		if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) {
			error("_slurm_cgroup_destroy: problem deleting step "
			      "cgroup path %s: %m", step_freezer_cg.path);
			xcgroup_unlock(&freezer_cg);
			return SLURM_ERROR;
		}
		xcgroup_destroy(&step_freezer_cg);
	}

	/* best-effort: other steps of this job may still exist */
	if (job_cgroup_path[0] != '\0') {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	/* best-effort: other jobs of this user may still exist */
	if (user_cgroup_path[0] != '\0') {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	if (slurm_freezer_init) {
		xcgroup_destroy(&slurm_freezer_cg);
	}

	xcgroup_unlock(&freezer_cg);
	xcgroup_destroy(&freezer_cg);
	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_memory_fini - remove the step/job/user memory cgroups
 * created for this step, under the root memcg lock.
 *
 * Returns SLURM_SUCCESS; individual removal failures are only logged
 * because job/user memcgs may legitimately still be in use.
 */
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	/* nothing was created for this step: nothing to clean */
	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/*
	 * Lock the root memcg and try to remove the different memcgs.
	 * The reason why we are locking here is that if a concurrent
	 * step is in the process of being executed, he could try to
	 * create the step memcg just after we remove the job memcg,
	 * resulting in a failure.
	 * First, delete step memcg as all the tasks have now exited.
	 * Then, try to remove the job memcg.
	 * If it fails, it is due to the fact that it is still in use by an
	 * other running step.
	 * After that, try to remove the user memcg. If it fails, it is due
	 * to jobs that are still running for the same user on the node or
	 * because of tasks attached directly to the user cg by an other
	 * component (PAM). The user memcg was created with the
	 * notify_on_release=1 flag (default) so it will be removed
	 * automatically after that.
	 * For now, do not try to detect if only externally attached tasks
	 * are present to see if they can be moved to an orphan memcg.
	 * That could be done in the future, if it is necessary.
	 */
	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
			/*
			 * BUGFIX: compare against XCGROUP_SUCCESS like the
			 * job/user deletions below; the original tested the
			 * xcgroup_delete() result against SLURM_SUCCESS.
			 */
			if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "memcg : %m");
			if (xcgroup_delete(&job_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job memcg : %m");
			if (xcgroup_delete(&user_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user memcg : %m");
			xcgroup_unlock(&memory_cg);
		} else
			error("task/cgroup: unable to lock root memcg : %m");
		xcgroup_destroy(&memory_cg);
	} else
		error("task/cgroup: unable to create root memcg : %m");

	/* release local handles and reset the cached relative paths */
	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_devices_fini - remove the step/job/user devices cgroups,
 * after first moving slurmstepd out of them (rmdir fails on a cgroup
 * that still contains tasks).
 *
 * Returns SLURM_SUCCESS; individual removal failures are only logged
 * since job/user devices cgroups may still be used by other steps/jobs.
 */
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t devices_cg;

	/* Similarly to task_cgroup_{memory,cpuset}_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started.  */
	if (xcgroup_create(&devices_ns, &devices_cg,"",0,0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&devices_cg) == XCGROUP_SUCCESS) {
			/* First move slurmstepd to the root devices cg
			 * so we can remove the step/job/user devices
			 * cg's.  */
			xcgroup_move_process(&devices_cg, getpid());
			/*
			 * BUGFIX: compare against XCGROUP_SUCCESS like the
			 * job/user deletions below; the original tested the
			 * xcgroup_delete() result against SLURM_SUCCESS.
			 */
			if (xcgroup_delete(&step_devices_cg)
			    != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "devices : %m");
			if (xcgroup_delete(&job_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job devices : %m");
			if (xcgroup_delete(&user_devices_cg)
			    != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user devices : %m");
			xcgroup_unlock(&devices_cg);
		} else
			error("task/cgroup: unable to lock root devices : %m");
		xcgroup_destroy(&devices_cg);
	} else
		error("task/cgroup: unable to create root devices : %m");

	/* release local handles and reset the cached relative paths */
	if ( user_cgroup_path[0] != '\0' )
		xcgroup_destroy(&user_devices_cg);
	if ( job_cgroup_path[0] != '\0' )
		xcgroup_destroy(&job_devices_cg);
	if ( jobstep_cgroup_path[0] != '\0' )
		xcgroup_destroy(&step_devices_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	cgroup_allowed_devices_file[0] = '\0';

	xcgroup_ns_destroy(&devices_ns);

	xcpuinfo_fini();
	return SLURM_SUCCESS;
}
/*
 * _slurm_cgroup_destroy - remove the step/job/user freezer cgroups,
 * after moving this slurmstepd to the root cgroup, under the root
 * freezer lock.
 *
 * Returns SLURM_ERROR if the step freezer cgroup cannot be removed;
 * job/user cgroup removal is best-effort (they may still be in use by
 * other steps/jobs).
 */
int _slurm_cgroup_destroy(void)
{
	xcgroup_lock(&freezer_cg);

	/*
	 * First move slurmstepd process to the root cgroup, otherwise
	 * the rmdir(2) triggered by the calls below will always fail,
	 * because slurmstepd is still in the cgroup!
	 */
	_move_current_to_root_cgroup(&freezer_ns);

	if (jobstep_cgroup_path[0] != '\0') {
		if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) {
			debug("_slurm_cgroup_destroy: problem deleting step cgroup path %s: %m",
			      step_freezer_cg.path);
			xcgroup_unlock(&freezer_cg);
			return SLURM_ERROR;
		}
		xcgroup_destroy(&step_freezer_cg);
	}

	/* best-effort: other steps of this job may still exist */
	if (job_cgroup_path[0] != '\0') {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	/* best-effort: other jobs of this user may still exist */
	if (user_cgroup_path[0] != '\0') {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	if (slurm_freezer_init) {
		xcgroup_destroy(&slurm_freezer_cg);
	}

	xcgroup_unlock(&freezer_cg);
	xcgroup_destroy(&freezer_cg);
	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_memory_fini - detach slurmstepd from the step memcg and
 * remove the step memory cgroup, charging its pages back to the parent.
 *
 * Returns SLURM_SUCCESS (removal failures are only logged).
 */
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	/* nothing was created for this step: nothing to clean */
	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root memory cg and remove[*]
	 * the step cgroup to move its allocated pages to its parent.
	 *
	 * [*] Calling rmdir(2) on an empty cgroup moves all resident charged
	 *  pages to the parent (i.e. the job cgroup). (If force_empty were
	 *  used instead, only clean pages would be flushed). This keeps
	 *  resident pagecache pages associated with the job. It is expected
	 *  that the job epilog will then optionally force_empty the
	 *  job cgroup (to flush pagecache), and then rmdir(2) the cgroup
	 *  or wait for release notification from kernel.
	 */
	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
		xcgroup_move_process(&memory_cg, getpid());
		xcgroup_destroy(&memory_cg);
		if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
			error ("cgroup: rmdir step memcg failed: %m");
	}

	/* release local handles and reset the cached relative paths */
	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_cpuset_create - build the cpuset cgroup hierarchy
 * (slurm root / user / job / step) for a starting step and attach the
 * slurmstepd process to the step cpuset cgroup.
 *
 * The root cpuset cgroup is kept locked for the whole setup to avoid
 * racing with the release_agent (see comment below).
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
extern int task_cgroup_cpuset_create(slurmd_job_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	uid_t gid = job->gid;	/* NOTE(review): gid_t would be more natural */
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath ;
	xcgroup_t slurm_cg;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
	rc = xcgroup_get_param(&slurm_cg,"cpuset.cpus",&cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			     user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (stepid == NO_VAL) {
			/* batch step: no numeric step id */
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.batch cpuset cg relative path: %m",
				      jobid);
				return SLURM_ERROR;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_%u", job_cgroup_path, stepid)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.%u cpuset cg relative path: %m",
				      jobid, stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instanciation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	/*
	 * BUGFIX: log the converted machine (physical) core lists. The
	 * original re-printed job->job_alloc_cores/job->step_alloc_cores,
	 * i.e. the abstract lists, so the two debug pairs were identical.
	 */
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
			   user_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistant and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg,"cpuset.cpus",&cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) != XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	/* append any pre-existing cpus of the user cg to the job cores */
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1]='\0';
		xstrcat(user_alloc_cores,",");
		xstrcat(user_alloc_cores,cpus);
	}
	xcgroup_set_param(&user_cpuset_cg,"cpuset.cpus",user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
			   job_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg,"cpuset.cpus",job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid,gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg,"cpuset.cpus",step_alloc_cores);

	/* attach the slurmstepd to the step cpuset cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

error:
	/* the root lock is only needed until the step cg holds a task */
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
/*
 * task_cgroup_cpuset_create - build the cpuset cgroup hierarchy
 * (slurm root / user / job / step) for a starting step and attach the
 * slurmstepd process to the step cpuset cgroup.
 *
 * The cpuset control file prefix is probed at runtime: some kernels
 * expose "cpus", others "cpuset.cpus" (see the again: retry below).
 * The root cpuset cgroup stays locked for the whole setup to avoid
 * racing with the release_agent.
 *
 * Returns SLURM_SUCCESS or SLURM_ERROR.
 */
extern int task_cgroup_cpuset_create(stepd_step_rec_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	uid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;
	/* holds the probed control file name, e.g. "cpuset.cpus" */
	char cpuset_meta[PATH_MAX];

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath;
	xcgroup_t slurm_cg;

#ifdef HAVE_NATIVE_CRAY
	char expected_usage[32];
#endif

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}

again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* first failure: retry once with the "cpuset." prefix */
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("task/cgroup: unable to build uid %u cgroup "
			      "relative path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			     user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;
		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "cpuset cg relative path: %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instanciation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
			   user_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistant and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, cpuset_meta, &cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) != XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	/* append any pre-existing cpus of the user cg to the job cores */
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1]='\0';
		xstrcat(user_alloc_cores,",");
		xstrcat(user_alloc_cores,cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, cpuset_meta, user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
			   job_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg, cpuset_meta, job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid,gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg, cpuset_meta, step_alloc_cores);

	/*
	 * on Cray systems, set the expected usage in bytes.
	 * This is used by the Cray OOM killer
	 */
#ifdef HAVE_NATIVE_CRAY
	snprintf(expected_usage, sizeof(expected_usage), "%"PRIu64,
		 (uint64_t)job->step_mem * 1024 * 1024);
	xcgroup_set_param(&step_cpuset_cg, "expected_usage_in_bytes",
			  expected_usage);
#endif

	/* attach the slurmstepd to the step cpuset cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

	/* validate the requested cpu frequency and set it */
	cpu_freq_cgroup_validate(job, step_alloc_cores);

error:
	/* the root lock is only needed until the step cg holds a task */
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
/*
 * jobacct_gather_cgroup_cpuacct_fini - remove the task/step/job/user
 * cpuacct cgroups created for this step, leaves-up, under the root
 * cpuacct cgroup lock.
 *
 * Returns SLURM_SUCCESS; individual removal failures are only logged.
 */
extern int
jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuacct_cg;
	bool root_ok;	/* true iff cpuacct_cg was successfully created */
	bool lock_ok;
	int cc;

	/* nothing was created for this step: nothing to clean */
	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0' ||
	    task_cgroup_path[0] == 0)
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root cpuacct cg.
	 * The release_agent will asynchroneously be called for the step
	 * cgroup. It will do the necessary cleanup.
	 */
	root_ok = false;
	if (xcgroup_create(&cpuacct_ns, &cpuacct_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		root_ok = true;
		xcgroup_set_uint32_param(&cpuacct_cg, "tasks", getpid());
	}

	/*
	 * Lock the root of the cgroup and remove the subdirectories
	 * related to this job.
	 *
	 * BUGFIX: only flock/unlock/destroy cpuacct_cg when the create
	 * above succeeded; the original used the uninitialized handle
	 * when xcgroup_create() failed (undefined behavior).
	 */
	lock_ok = false;
	if (root_ok) {
		lock_ok = true;
		if (xcgroup_lock(&cpuacct_cg) != XCGROUP_SUCCESS) {
			error("%s: failed to flock() %s %m",
			      __func__, cpuacct_cg.path);
			lock_ok = false;
		}
	}

	/* Clean up starting from the leaves way up, the
	 * reverse order in which the cgroups were created.
	 */
	for (cc = 0; cc <= max_task_id; cc++) {
		xcgroup_t cgroup;
		char buf[PATH_MAX];

		/* rmdir all tasks this running slurmstepd
		 * was responsible for.
		 * Use a fully zeroed handle with only the path set, and a
		 * bounded snprintf (the original used sprintf).
		 */
		memset(&cgroup, 0, sizeof(cgroup));
		snprintf(buf, sizeof(buf), "%s%s/task_%d",
			 cpuacct_ns.mnt_point, jobstep_cgroup_path, cc);
		cgroup.path = buf;

		/* the extern step may still have adopted processes */
		if (strstr(buf, "step_extern"))
			kill_extern_procs(cgroup.path);

		if (xcgroup_delete(&cgroup) != XCGROUP_SUCCESS) {
			debug2("%s: failed to delete %s %m", __func__, buf);
		}
	}

	/* BUGFIX: report step_cpuacct_cg.path (was cpuacct_cg.path) */
	if (xcgroup_delete(&step_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       step_cpuacct_cg.path);
	}

	if (xcgroup_delete(&job_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       job_cpuacct_cg.path);
	}

	if (xcgroup_delete(&user_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       user_cpuacct_cg.path);
	}

	if (lock_ok == true)
		xcgroup_unlock(&cpuacct_cg);

	xcgroup_destroy(&task_cpuacct_cg);
	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);
	if (root_ok)
		xcgroup_destroy(&cpuacct_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';
	task_cgroup_path[0] = 0;

	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
/*
 * task_cgroup_cpuset_fini - move slurmstepd to the root cpuset cgroup,
 * wait for the kernel to reflect the move, then remove the
 * step/job/user cpuset cgroups under the root cpuset lock.
 *
 * Returns SLURM_SUCCESS; removal failures are only logged since the
 * job/user cpusets may still be in use by other steps/jobs.
 */
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuset_cg;

	/* Similarly to task_cgroup_memory_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started.  */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg,"",0,0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&cpuset_cg) == XCGROUP_SUCCESS) {
			int i = 0, npids = 0, cnt = 0;
			pid_t* pids = NULL;
			/* First move slurmstepd to the root cpuset cg
			 * so we can remove the step/job/user cpuset
			 * cg's.  */
			xcgroup_move_process(&cpuset_cg, getpid());

			/* There is a delay in the cgroup system when moving the
			 * pid from one cgroup to another.  This is usually
			 * short, but we need to wait to make sure the pid is
			 * out of the step cgroup or we will occur an error
			 * leaving the cgroup unable to be removed.
			 */
			do {
				xcgroup_get_pids(&step_cpuset_cg,
						 &pids, &npids);
				/* break with i < npids iff we are still
				 * listed in the step cgroup */
				for (i = 0 ; i<npids ; i++)
					if (pids[i] == getpid()) {
						cnt++;
						break;
					}
				xfree(pids);
			} while ((i < npids) && (cnt < MAX_MOVE_WAIT));

			if (cnt < MAX_MOVE_WAIT)
				debug3("Took %d checks before stepd pid was removed from the step cgroup.",
				       cnt);
			else
				error("Pid %d is still in the step cgroup.  It might be left uncleaned after the job.",
				      getpid());

			/*
			 * BUGFIX: compare against XCGROUP_SUCCESS like the
			 * job/user deletions below; the original tested the
			 * xcgroup_delete() result against SLURM_SUCCESS.
			 */
			if (xcgroup_delete(&step_cpuset_cg)
			    != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "cpuset : %m");
			if (xcgroup_delete(&job_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job cpuset : %m");
			if (xcgroup_delete(&user_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user cpuset : %m");
			xcgroup_unlock(&cpuset_cg);
		} else
			error("task/cgroup: unable to lock root cpuset : %m");
		xcgroup_destroy(&cpuset_cg);
	} else
		error("task/cgroup: unable to create root cpuset : %m");

	/* release local handles and reset the cached relative paths */
	if (user_cgroup_path[0] != '\0')
		xcgroup_destroy(&user_cpuset_cg);
	if (job_cgroup_path[0] != '\0')
		xcgroup_destroy(&job_cpuset_cg);
	if (jobstep_cgroup_path[0] != '\0')
		xcgroup_destroy(&step_cpuset_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}