/*
 * Create the slurm root cgroup inside a cgroup namespace.
 *
 * The relative path comes from slurm_cgroup_conf.cgroup_prepend. When built
 * with MULTIPLE_SLURMD, "%n" in that path is substituted with
 * conf->node_name, or the path falls back to "/slurm" when no node name is
 * set.
 *
 * We do it here as we do not have access to the conf structure in libslurm
 * (src/common/xcgroup.c).
 *
 * IN ns - cgroup namespace in which the slurm cgroup is created
 * RET relative path of the slurm cgroup (to be released with xfree() by the
 *     caller), or NULL on failure
 */
extern char* task_cgroup_create_slurm_cg (xcgroup_ns_t* ns)
{
	xcgroup_t slurm_cg;
	char *pre = (char *) xstrdup(slurm_cgroup_conf.cgroup_prepend);

#ifdef MULTIPLE_SLURMD
	if (conf->node_name != NULL) {
		xstrsubstitute(pre, "%n", conf->node_name);
	} else {
		xfree(pre);
		pre = (char *) xstrdup("/slurm");
	}
#endif

	/*
	 * create slurm cgroup in the ns (it could already exist)
	 * disable notify_on_release to avoid the removal/creation
	 * of this cgroup for each last/first running job on the node
	 */
	if (xcgroup_create(ns, &slurm_cg, pre, getuid(), getgid())
	    != XCGROUP_SUCCESS) {
		xfree(pre);
		return NULL;
	}
	slurm_cg.notify = 0;

	if (xcgroup_instanciate(&slurm_cg) != XCGROUP_SUCCESS) {
		error("unable to build slurm cgroup for ns %s: %m",
		      ns->subsystems);
		xcgroup_destroy(&slurm_cg);
		xfree(pre);
		return NULL;
	}

	/* no "%m" here: errno carries no meaning on the success path */
	debug3("slurm cgroup %s successfully created for ns %s",
	       pre, ns->subsystems);

	/* only the path is handed back; the local descriptor is released */
	xcgroup_destroy(&slurm_cg);

	return pre;
}
/*
 * Check that a cgroup namespace is ready to be used.
 *
 * The check builds a descriptor for the "/" cgroup of the namespace and
 * tries to read its "release_agent" parameter.
 *
 * IN cgns - cgroup namespace to probe
 * RET 1 when the parameter could be read (namespace usable),
 *     0 when the descriptor could not be created or the read failed.
 *     These are the literal values returned by the code, not the
 *     XCGROUP_SUCCESS / XCGROUP_ERROR constants.
 */
int xcgroup_ns_is_available(xcgroup_ns_t* cgns)
{
	xcgroup_t root_cg;
	char *agent;
	size_t agent_len;
	int available;

	if (xcgroup_create(cgns, &root_cg, "/", 0, 0) == XCGROUP_ERROR)
		return 0;

	if (xcgroup_get_param(&root_cg, "release_agent", &agent, &agent_len)
	    == XCGROUP_SUCCESS) {
		/* only the ability to read matters, not the value itself */
		xfree(agent);
		available = 1;
	} else {
		available = 0;
	}

	xcgroup_destroy(&root_cg);

	return available;
}
/*
 * Build the user/job/step cgroups of a step in the devices namespace, apply
 * the device allow/deny rules to the job and step cgroups and attach the
 * calling process to the step cgroup.
 *
 * The relative paths (user_cgroup_path, job_cgroup_path,
 * jobstep_cgroup_path) and the user/job/step cgroup descriptors are defined
 * outside this function; the paths are only built when still empty and the
 * descriptors are left initialized on success.
 *
 * IN job - step record; jobid, stepid, uid, gid, node_name, job_gres_list
 *          and step_gres_list are read
 * RET SLURM_SUCCESS if the process was added to the step devices cgroup,
 *     SLURM_ERROR otherwise
 */
extern int task_cgroup_devices_create(stepd_step_rec_t *job)
{
	int f, k, rc, gres_conf_lines, allow_lines;
	int fstatus = SLURM_ERROR;
	char **gres_name = NULL;
	char **gres_cgroup = NULL, **dev_path = NULL;
	char *allowed_devices[PATH_MAX], *allowed_dev_major[PATH_MAX];
	int *gres_job_bit_alloc = NULL;
	int *gres_step_bit_alloc = NULL;
	int *gres_count = NULL;
	xcgroup_t devices_cg;
	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	List job_gres_list = job->job_gres_list;
	List step_gres_list = job->step_gres_list;
	char *slurm_cgpath;
	pid_t pid;

	/* create slurm root cgroup in this cgroup namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&devices_ns);
	if (slurm_cgpath == NULL)
		return SLURM_ERROR;

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative path : %m",
			      uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u devices "
			      "cgroup relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;

		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "devices cgroup relative path : %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create devices root cgroup and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cgroup being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root devices cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&devices_ns, &devices_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root devices cgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&devices_cg);
		error("task/cgroup: unable to lock root devices cgroup");
		return SLURM_ERROR;
	}

	info("task/cgroup: manage devices jor job '%u'", jobid);

	/*
	 * collect info concerning the gres.conf file
	 * the GRES devices paths and the GRES names
	 */
	gres_conf_lines = gres_plugin_node_config_devices_path(&dev_path,
							       &gres_name,
							       job->node_name);

	/*
	 * create the entry for cgroup devices subsystem with major minor
	 */
	gres_cgroup = xmalloc(sizeof(char *) * gres_conf_lines);
	_calc_device_major(dev_path, gres_cgroup, gres_conf_lines);

	/*
	 * create the entry with major minor for the default allowed devices
	 * read from the file
	 */
	allow_lines = read_allowed_devices_file(allowed_devices);
	_calc_device_major(allowed_devices, allowed_dev_major, allow_lines);

	/*
	 * calculate the number of gres.conf records for each gres name:
	 * gres_count[f] is the length of the f-th run of consecutive
	 * identical names.
	 *
	 * One extra slot is allocated because gres_count[0] is stored
	 * unconditionally; with gres_conf_lines == 0 that store would
	 * otherwise land outside the allocation.
	 */
	gres_count = xmalloc(sizeof(int) * (gres_conf_lines + 1));
	f = 0;
	gres_count[f] = 1;
	for (k = 0; k + 1 < gres_conf_lines; k++) {
		if (xstrcmp(gres_name[k], gres_name[k + 1]) == 0) {
			gres_count[f]++;
		} else {
			f++;
			gres_count[f] = 1;
		}
	}

	/*
	 * create user cgroup in the devices ns (it could already exist)
	 */
	if (xcgroup_create(&devices_ns, &user_devices_cg, user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto cleanup;
	}
	if (xcgroup_instantiate(&user_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		goto cleanup;
	}

	/* TODO
	 * check that user's devices cgroup is consistent and allow the
	 * appropriate devices
	 */

	/*
	 * create job cgroup in the devices ns (it could already exist)
	 */
	if (xcgroup_create(&devices_ns, &job_devices_cg, job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		goto cleanup;
	}
	if (xcgroup_instantiate(&job_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		goto cleanup;
	}

	/* fetch information concerning the gres devices allocation for the job */
	gres_job_bit_alloc = xmalloc(sizeof(int) * (gres_conf_lines + 10));
	gres_plugin_job_state_file(job_gres_list, gres_job_bit_alloc,
				   gres_count);

	/*
	 * with the current cgroup devices subsystem design (whitelist only
	 * supported) we need to allow all different devices that are supposed
	 * to be allowed by default.
	 */
	for (k = 0; k < allow_lines; k++) {
		info("Default access allowed to device %s",
		     allowed_dev_major[k]);
		xcgroup_set_param(&job_devices_cg, "devices.allow",
				  allowed_dev_major[k]);
	}

	/*
	 * allow or deny access to devices according to job GRES permissions
	 */
	for (k = 0; k < gres_conf_lines; k++) {
		if (gres_job_bit_alloc[k] == 1) {
			info("Allowing access to device %s", gres_cgroup[k]);
			xcgroup_set_param(&job_devices_cg, "devices.allow",
					  gres_cgroup[k]);
		} else {
			info("Not allowing access to device %s",
			     gres_cgroup[k]);
			xcgroup_set_param(&job_devices_cg, "devices.deny",
					  gres_cgroup[k]);
		}
	}

	/*
	 * create step cgroup in the devices ns (it should not exist)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&devices_ns, &step_devices_cg, jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		goto cleanup;
	}
	if (xcgroup_instantiate(&step_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		xcgroup_destroy(&step_devices_cg);
		goto cleanup;
	}

	/* batch and extern steps get no step-level device rules */
	if ((stepid != SLURM_BATCH_SCRIPT) && (stepid != SLURM_EXTERN_CONT)) {
		/* fetch information about step GRES devices allocation */
		gres_step_bit_alloc = xmalloc(sizeof(int) *
					      (gres_conf_lines + 10));
		gres_plugin_step_state_file(step_gres_list,
					    gres_step_bit_alloc, gres_count);

		/*
		 * with the current cgroup devices subsystem design (whitelist
		 * only supported) we need to allow all different devices that
		 * are supposed to be allowed by default.
		 */
		for (k = 0; k < allow_lines; k++) {
			info("Default access allowed to device %s",
			     allowed_dev_major[k]);
			xcgroup_set_param(&step_devices_cg, "devices.allow",
					  allowed_dev_major[k]);
		}

		/*
		 * allow or deny access to devices according to GRES
		 * permissions for the step
		 */
		for (k = 0; k < gres_conf_lines; k++) {
			if (gres_step_bit_alloc[k] == 1) {
				info("Allowing access to device %s for step",
				     gres_cgroup[k]);
				xcgroup_set_param(&step_devices_cg,
						  "devices.allow",
						  gres_cgroup[k]);
			} else {
				info("Not allowing access to device %s for step",
				     gres_cgroup[k]);
				xcgroup_set_param(&step_devices_cg,
						  "devices.deny",
						  gres_cgroup[k]);
			}
		}
	}

	/* attach the slurmstepd (the calling process) to the step devices cgroup */
	pid = getpid();
	rc = xcgroup_add_pids(&step_devices_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to devices cg '%s'",
		      step_devices_cg.path);
		fstatus = SLURM_ERROR;
	} else {
		fstatus = SLURM_SUCCESS;
	}

cleanup:
	/* reached on success as well as on failure once the root is locked */
	xcgroup_unlock(&devices_cg);
	xcgroup_destroy(&devices_cg);

	/*
	 * NOTE(review): only the arrays themselves are released here, as in
	 * the original; whether their elements (and the strings stored in
	 * allowed_devices / allowed_dev_major) are owned separately is not
	 * visible from this function - confirm.
	 */
	xfree(gres_step_bit_alloc);
	xfree(gres_job_bit_alloc);
	xfree(gres_count);
	xfree(gres_name);
	xfree(dev_path);
	xfree(gres_cgroup);

	return fstatus;
}
/*
 * Build the user/job/step/task cgroups of a task in the memory namespace
 * and add the given pid to the task cgroup.
 *
 * The relative paths (user_cgroup_path, job_cgroup_path,
 * jobstep_cgroup_path, task_cgroup_path) and the user/job/step/task cgroup
 * descriptors are defined outside this function. The first three paths are
 * only built when still empty; the task path is rebuilt on every call.
 *
 * IN pid        - process to add to the task memory cgroup
 * IN jobacct_id - provides the job record (uid, gid, jobid, stepid are
 *                 read) and the task id
 * RET SLURM_SUCCESS if the pid was added to the task cgroup,
 *     SLURM_ERROR otherwise
 */
extern int jobacct_gather_cgroup_memory_attach_task(
	pid_t pid, jobacct_id_t *jobacct_id)
{
	xcgroup_t memory_cg;
	slurmd_job_t *job;
	uid_t uid;
	gid_t gid;
	uint32_t jobid;
	uint32_t stepid;
	uint32_t taskid;
	int fstatus = SLURM_ERROR;
	int rc;
	char *slurm_cgpath;

	job = jobacct_id->job;
	uid = job->uid;
	gid = job->gid;
	jobid = job->jobid;
	stepid = job->stepid;
	taskid = jobacct_id->taskid;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = jobacct_cgroup_create_slurm_cg(&memory_ns);
	if (!slurm_cgpath)
		return SLURM_ERROR;

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	/* the prefix is not needed past this point; release it on every path */
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (may not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build job %u "
			      "memory cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path if not set (may not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
			     job_cgroup_path, stepid) >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build job step "
			      "%u memory cg relative path : %m", stepid);
			return SLURM_ERROR;
		}
	}

	/* build task cgroup relative path */
	if (snprintf(task_cgroup_path, PATH_MAX, "%s/task_%u",
		     jobstep_cgroup_path, taskid) >= PATH_MAX) {
		error("jobacct_gather/cgroup: unable to build task %u "
		      "memory cg relative path : %m", taskid);
		return SLURM_ERROR;
	}

	/*
	 * create memory root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root memory cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create root memory "
		      "xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&memory_cg);
		error("jobacct_gather/cgroup: unable to lock root memory cg");
		return SLURM_ERROR;
	}

	/*
	 * Create user cgroup in the memory ns (it could already exist)
	 * NOTE(review): the previous comment mentioned asking for
	 * hierarchical accounting from the user container, but no such
	 * parameter is set in this function - confirm where it is done.
	 */
	if (xcgroup_create(&memory_ns, &user_memory_cg, user_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create user %u memory "
		      "cgroup", uid);
		goto cleanup;
	}
	if (xcgroup_instanciate(&user_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		error("jobacct_gather/cgroup: unable to instanciate user %u "
		      "memory cgroup", uid);
		goto cleanup;
	}

	/*
	 * Create job cgroup in the memory ns (it could already exist)
	 */
	if (xcgroup_create(&memory_ns, &job_memory_cg, job_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		error("jobacct_gather/cgroup: unable to create job %u memory "
		      "cgroup", jobid);
		goto cleanup;
	}
	if (xcgroup_instanciate(&job_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		error("jobacct_gather/cgroup: unable to instanciate job %u "
		      "memory cgroup", jobid);
		goto cleanup;
	}

	/*
	 * Create step cgroup in the memory ns (it could already exist)
	 */
	if (xcgroup_create(&memory_ns, &step_memory_cg, jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as they can exist for other
		 * steps, but release cgroup structures */
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
		      "memory cgroup", jobid, stepid);
		goto cleanup;
	}
	if (xcgroup_instanciate(&step_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		xcgroup_destroy(&step_memory_cg);
		error("jobacct_gather/cgroup: unable to instantiate jobstep "
		      "%u.%u memory cgroup", jobid, stepid);
		goto cleanup;
	}

	/*
	 * Create task cgroup in the memory ns
	 */
	if (xcgroup_create(&memory_ns, &task_memory_cg, task_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job/step cgroup as they can exist for
		 * other tasks, but release every structure set up so far
		 * (the step one included) */
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		xcgroup_destroy(&step_memory_cg);
		error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
		      "task %u memory cgroup", jobid, stepid, taskid);
		goto cleanup;
	}
	if (xcgroup_instanciate(&task_memory_cg) != XCGROUP_SUCCESS) {
		/* same pattern as the levels above: release the structure
		 * that failed to instantiate along with its ancestors */
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		xcgroup_destroy(&step_memory_cg);
		xcgroup_destroy(&task_memory_cg);
		error("jobacct_gather/cgroup: unable to instantiate jobstep "
		      "%u.%u task %u memory cgroup", jobid, stepid, taskid);
		goto cleanup;
	}

	/*
	 * Attach the given pid to the task memory cgroup
	 */
	rc = xcgroup_add_pids(&task_memory_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to add slurmstepd to "
		      "memory cg '%s'", task_memory_cg.path);
		fstatus = SLURM_ERROR;
	} else {
		fstatus = SLURM_SUCCESS;
	}

cleanup:
	/* reached on success as well as on failure once the root is locked */
	xcgroup_unlock(&memory_cg);
	xcgroup_destroy(&memory_cg);

	return fstatus;
}
/*
 * Build the slurm/user/job/step cgroups of a step in the freezer namespace.
 *
 * The relative paths (user_cgroup_path, job_cgroup_path,
 * jobstep_cgroup_path), the freezer cgroup descriptors and
 * slurm_freezer_init are defined outside this function. On success the
 * descriptors are left initialized and slurm_freezer_init is set to true.
 *
 * IN job - step record; jobid and stepid are read
 * IN id  - not used by this function
 * IN uid - user id used to name the user cgroup
 * IN gid - not used by this function
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
int _slurm_cgroup_create(stepd_step_rec_t *job, uint64_t id, uid_t uid, gid_t gid)
{
	/* we do it here as we do not have access to the conf structure */
	/* in libslurm (src/common/xcgroup.c) */
	char *pre = (char *) xstrdup(slurm_cgroup_conf.cgroup_prepend);

#ifdef MULTIPLE_SLURMD
	if (conf->node_name != NULL) {
		xstrsubstitute(pre, "%n", conf->node_name);
	} else {
		xfree(pre);
		pre = (char *) xstrdup("/slurm");
	}
#endif

	if (xcgroup_create(&freezer_ns, &slurm_freezer_cg, pre,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xfree(pre);
		return SLURM_ERROR;
	}

	/*
	 * While creating the cgroup hierarchy of the step, lock the root
	 * cgroup directory. The same lock is held during removal of the
	 * hierarchies of other jobs/steps. This helps to avoid the race
	 * condition with concurrent creation/removal of the intermediate
	 * shared directories that could result in the failure of the
	 * hierarchy setup.
	 *
	 * NOTE(review): the result of xcgroup_lock() is not checked, as in
	 * the original - confirm whether a failed lock should abort here.
	 */
	xcgroup_lock(&freezer_cg);

	/* create slurm cgroup in the freezer ns (it could already exist) */
	if (xcgroup_instanciate(&slurm_freezer_cg) != XCGROUP_SUCCESS) {
		xfree(pre);
		goto bail;
	}

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     pre, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(pre);
			goto bail;
		}
	}
	xfree(pre);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, job->jobid) >= PATH_MAX) {
			error("unable to build job %u cgroup relative "
			      "path : %m", job->jobid);
			goto bail;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (job->stepid == NO_VAL) {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("proctrack/cgroup unable to build job step"
				      " %u.batch freezer cg relative path: %m",
				      job->jobid);
				goto bail;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_%u", job_cgroup_path,
				     job->stepid) >= PATH_MAX) {
				error("proctrack/cgroup unable to build job step"
				      " %u.%u freezer cg relative path: %m",
				      job->jobid, job->stepid);
				goto bail;
			}
		}
	}

	/*
	 * In the failure paths below only the descriptors created so far are
	 * released; slurm_freezer_cg is released once, at the bail label.
	 */

	/* create user cgroup in the freezer ns (it could already exist) */
	if (xcgroup_create(&freezer_ns, &user_freezer_cg, user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto bail;
	}

	/* create job cgroup in the freezer ns (it could already exist) */
	if (xcgroup_create(&freezer_ns, &job_freezer_cg, job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		goto bail;
	}

	/* create step cgroup in the freezer ns (it should not exist) */
	if (xcgroup_create(&freezer_ns, &step_freezer_cg, jobstep_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
		goto bail;
	}

	if ((xcgroup_instanciate(&user_freezer_cg) != XCGROUP_SUCCESS) ||
	    (xcgroup_instanciate(&job_freezer_cg) != XCGROUP_SUCCESS) ||
	    (xcgroup_instanciate(&step_freezer_cg) != XCGROUP_SUCCESS)) {
		xcgroup_destroy(&user_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
		xcgroup_destroy(&step_freezer_cg);
		goto bail;
	}

	/* inhibit release agent for the step cgroup thus letting
	 * slurmstepd being able to add new pids to the container
	 * when the job ends (TaskEpilog,...) */
	xcgroup_set_param(&step_freezer_cg, "notify_on_release", "0");
	slurm_freezer_init = true;

	xcgroup_unlock(&freezer_cg);
	return SLURM_SUCCESS;

bail:
	xcgroup_destroy(&slurm_freezer_cg);
	xcgroup_unlock(&freezer_cg);
	xcgroup_destroy(&freezer_cg);
	return SLURM_ERROR;
}
/*
 * Build the user/job/step cgroups of a step in the cpuset namespace, set
 * their "cpuset.cpus" from the job/step allocated cores and attach the
 * calling process to the step cgroup.
 *
 * The relative paths (user_cgroup_path, job_cgroup_path,
 * jobstep_cgroup_path) and the user/job/step cgroup descriptors are defined
 * outside this function; the paths are only built when still empty and the
 * descriptors are left initialized on success.
 *
 * IN job - job record; jobid, stepid, uid, gid, job_alloc_cores and
 *          step_alloc_cores are read
 * RET SLURM_SUCCESS if the process was added to the step cpuset cgroup,
 *     SLURM_ERROR otherwise
 */
extern int task_cgroup_cpuset_create(slurmd_job_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;
	xcgroup_t cpuset_cg;
	xcgroup_t slurm_cg;
	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char *user_alloc_cores = NULL;
	char *job_alloc_cores = NULL;
	char *step_alloc_cores = NULL;
	char *cpus = NULL;
	size_t cpus_size;
	char *slurm_cgpath;
	pid_t pid;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if (slurm_cgpath == NULL)
		return SLURM_ERROR;

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
	rc = xcgroup_get_param(&slurm_cg, "cpuset.cpus", &cpus, &cpus_size);
	/* a size of 1 is handled as "no cpus set"
	 * (presumably a lone newline - TODO confirm) */
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(cpus);
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);
	/* the slurm cg descriptor was only needed for the check above */
	xcgroup_destroy(&slurm_cg);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (stepid == NO_VAL) {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.batch cpuset cg relative path: %m",
				      jobid);
				return SLURM_ERROR;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_%u", job_cgroup_path, stepid)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.%u cpuset cg relative path: %m",
				      jobid, stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores, &job_alloc_cores)
	    != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto cleanup;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores, &step_alloc_cores)
	    != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto cleanup;
	}
	/* report the converted lists, not the abstract ones logged above */
	debug("task/cgroup: job physical cores are '%s'", job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'", step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &user_cpuset_cg, user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto cleanup;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto cleanup;
	}

	/*
	 * check that user's cpuset cgroup is consistent and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, "cpuset.cpus",
			       &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) != XCGROUP_SUCCESS) {
			xfree(cpus);
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto cleanup;
		}
	}
	/* the user cpuset becomes the job cores plus what it already had */
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		/* overwrite the last byte of the value read (presumably a
		 * trailing newline - TODO confirm) before appending it */
		cpus[cpus_size - 1] = '\0';
		xstrcat(user_alloc_cores, ",");
		xstrcat(user_alloc_cores, cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, "cpuset.cpus", user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &job_cpuset_cg, job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto cleanup;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto cleanup;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto cleanup;
	}
	xcgroup_set_param(&job_cpuset_cg, "cpuset.cpus", job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exist)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns, &step_cpuset_cg, jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto cleanup;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto cleanup;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto cleanup;
	}
	xcgroup_set_param(&step_cpuset_cg, "cpuset.cpus", step_alloc_cores);

	/* attach the slurmstepd (the calling process) to the step cpuset cgroup */
	pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else {
		fstatus = SLURM_SUCCESS;
	}

cleanup:
	/* reached on success as well as on failure once the root is locked */
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
extern int init_system_cpuset_cgroup(void) { int rc; int fstatus = SLURM_ERROR; char* cpus = NULL; size_t cpus_size; char* slurm_cgpath; xcgroup_t slurm_cg; /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* initialize cpuset cgroup namespace */ if (xcgroup_ns_create(&slurm_cgroup_conf, &cpuset_ns, "", "cpuset") != XCGROUP_SUCCESS) { error("system cgroup: unable to create cpuset namespace"); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* create slurm root cg in this cg namespace */ slurm_cgpath = _system_cgroup_create_slurm_cg(&cpuset_ns); if ( slurm_cgpath == NULL ) { xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* check that this cgroup has cpus allowed or initialize them */ if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath) != XCGROUP_SUCCESS) { error("system cgroup: unable to load slurm cpuset xcgroup"); xfree(slurm_cgpath); xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } again: snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix); rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size); if (rc != XCGROUP_SUCCESS || cpus_size == 1) { if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) { cpuset_prefix_set = 1; cpuset_prefix = "cpuset."; goto again; } /* initialize the cpusets as it was nonexistent */ if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) { xfree(slurm_cgpath); xcgroup_destroy(&slurm_cg); xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); xfree(cpus); return SLURM_ERROR; } } xcgroup_destroy(&slurm_cg); xfree(cpus); /* build system cgroup relative path */ snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath); xfree(slurm_cgpath); /* create system cgroup in the cpuset ns */ if (xcgroup_create(&cpuset_ns, &system_cpuset_cg, system_cgroup_path, getuid(),getgid()) != XCGROUP_SUCCESS) { goto error; } if 
(xcgroup_instantiate(&system_cpuset_cg) != XCGROUP_SUCCESS) { goto error; } if (_xcgroup_cpuset_init(&system_cpuset_cg) != XCGROUP_SUCCESS) { goto error; } free_slurm_cgroup_conf(&slurm_cgroup_conf); debug("system cgroup: system cpuset cgroup initialized"); return SLURM_SUCCESS; error: xcgroup_unlock(&system_cpuset_cg); xcgroup_destroy(&system_cpuset_cg); xcgroup_ns_destroy(&cpuset_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return fstatus; }
extern int init_system_memory_cgroup(void) { int fstatus = SLURM_ERROR; char* slurm_cgpath; /* read cgroup configuration */ if (read_slurm_cgroup_conf(&slurm_cgroup_conf)) return SLURM_ERROR; /* initialize memory cgroup namespace */ if (xcgroup_ns_create(&slurm_cgroup_conf, &memory_ns, "", "memory") != XCGROUP_SUCCESS) { error("system cgroup: unable to create memory namespace"); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } constrain_kmem_space = slurm_cgroup_conf.constrain_kmem_space; constrain_ram_space = slurm_cgroup_conf.constrain_ram_space; constrain_swap_space = slurm_cgroup_conf.constrain_swap_space; /* * as the swap space threshold will be configured with a * mem+swp parameter value, if RAM space is not monitored, * set allowed RAM space to 100% of the job requested memory. * It will help to construct the mem+swp value that will be * used for both mem and mem+swp limit during memcg creation. */ if ( constrain_ram_space ) allowed_ram_space = slurm_cgroup_conf.allowed_ram_space; else allowed_ram_space = 100.0; allowed_swap_space = slurm_cgroup_conf.allowed_swap_space; if ((totalram = (uint64_t) conf->real_memory_size) == 0) error ("system cgroup: Unable to get RealMemory size"); max_kmem = _percent_in_bytes(totalram, slurm_cgroup_conf.max_kmem_percent); max_ram = _percent_in_bytes(totalram, slurm_cgroup_conf.max_ram_percent); max_swap = _percent_in_bytes(totalram, slurm_cgroup_conf.max_swap_percent); max_swap += max_ram; min_ram_space = slurm_cgroup_conf.min_ram_space * 1024 * 1024; debug ("system cgroup: memory: total:%luM allowed:%.4g%%(%s), " "swap:%.4g%%(%s), max:%.4g%%(%luM) " "max+swap:%.4g%%(%luM) min:%luM " "kmem:%.4g%%(%luM %s) min:%luM", (unsigned long) totalram, allowed_ram_space, constrain_ram_space?"enforced":"permissive", allowed_swap_space, constrain_swap_space?"enforced":"permissive", slurm_cgroup_conf.max_ram_percent, (unsigned long) (max_ram/(1024*1024)), slurm_cgroup_conf.max_swap_percent, (unsigned long) 
(max_swap/(1024*1024)), (unsigned long) slurm_cgroup_conf.min_ram_space, slurm_cgroup_conf.max_kmem_percent, (unsigned long)(max_kmem/(1024*1024)), constrain_kmem_space?"enforced":"permissive", (unsigned long) slurm_cgroup_conf.min_kmem_space); /* * Warning: OOM Killer must be disabled for slurmstepd * or it would be destroyed if the application use * more memory than permitted * * If an env value is already set for slurmstepd * OOM killer behavior, keep it, otherwise set the * -1000 value, wich means do not let OOM killer kill it * * FYI, setting "export SLURMSTEPD_OOM_ADJ=-1000" * in /etc/sysconfig/slurm would be the same */ setenv("SLURMSTEPD_OOM_ADJ", "-1000", 0); /* create slurm root cg in this cg namespace */ slurm_cgpath = _system_cgroup_create_slurm_cg(&memory_ns); if ( slurm_cgpath == NULL ) { xcgroup_ns_destroy(&memory_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return SLURM_ERROR; } /* build system cgroup relative path */ snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath); xfree(slurm_cgpath); /* create system cgroup in the cpuset ns */ if (xcgroup_create(&memory_ns, &system_memory_cg, system_cgroup_path, getuid(), getgid()) != XCGROUP_SUCCESS) { goto error; } if (xcgroup_instantiate(&system_memory_cg) != XCGROUP_SUCCESS) { goto error; } if ( xcgroup_set_param(&system_memory_cg, "memory.use_hierarchy", "1") != XCGROUP_SUCCESS ) { error("system cgroup: unable to ask for hierarchical accounting" "of system memcg '%s'", system_memory_cg.path); goto error; } free_slurm_cgroup_conf(&slurm_cgroup_conf); debug("system cgroup: system memory cgroup initialized"); return SLURM_SUCCESS; error: xcgroup_unlock(&system_memory_cg); xcgroup_destroy(&system_memory_cg); xcgroup_ns_destroy(&memory_ns); free_slurm_cgroup_conf(&slurm_cgroup_conf); return fstatus; }
/*
 * Build the slurm/user/job/step cpuset cgroup hierarchy for a job step and
 * move the calling process (slurmstepd) into the step cgroup.
 *
 * The user, job and step cgroup handles are stored in the file-level
 * user_cpuset_cg / job_cpuset_cg / step_cpuset_cg objects, and the relative
 * paths in user_cgroup_path / job_cgroup_path / jobstep_cgroup_path.
 *
 * IN job - step record providing jobid, stepid, uid, gid and the abstract
 *          core lists (job_alloc_cores / step_alloc_cores)
 * RET SLURM_SUCCESS once the process was added to the step cpuset cgroup,
 *     SLURM_ERROR on any failure
 */
extern int task_cgroup_cpuset_create(stepd_step_rec_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char *user_alloc_cores = NULL;
	char *job_alloc_cores = NULL;
	char *step_alloc_cores = NULL;
	char cpuset_meta[PATH_MAX];

	char *cpus = NULL;
	size_t cpus_size;

	char *slurm_cgpath;
	xcgroup_t slurm_cg;
	pid_t pid;
#ifdef HAVE_NATIVE_CRAY
	char expected_usage[32];
#endif

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if (slurm_cgpath == NULL)
		return SLURM_ERROR;

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/*
		 * The parameter could not be read with the current prefix:
		 * retry once with the "cpuset." prefixed file name.
		 */
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(cpus);
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	/*
	 * The slurm cgroup handle is not needed past this point; release it
	 * here so that no return path below can leak it.
	 */
	xcgroup_destroy(&slurm_cg);
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("task/cgroup: unable to build uid %u cgroup "
			      "relative path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;

		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "cpuset cg relative path: %m", jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &user_cpuset_cg,
			   user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistent and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, cpuset_meta, &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) != XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	/* user cores = job cores + whatever the user cgroup already had */
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		/* overwrite the last byte read (presumably a newline) */
		cpus[cpus_size - 1] = '\0';
		xstrcat(user_alloc_cores, ",");
		xstrcat(user_alloc_cores, cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, cpuset_meta, user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns, &job_cpuset_cg,
			   job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg, cpuset_meta, job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exist)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns, &step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg, cpuset_meta, step_alloc_cores);

	/*
	 * on Cray systems, set the expected usage in bytes.
	 * This is used by the Cray OOM killer
	 */
#ifdef HAVE_NATIVE_CRAY
	snprintf(expected_usage, sizeof(expected_usage), "%"PRIu64,
		 (uint64_t)job->step_mem * 1024 * 1024);
	xcgroup_set_param(&step_cpuset_cg, "expected_usage_in_bytes",
			  expected_usage);
#endif

	/* attach the slurmstepd to the step cpuset cgroup */
	pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

	/* validate the requested cpu frequency and set it */
	cpu_freq_cgroup_validate(job, step_alloc_cores);

error:
	/* common exit: drop the root lock and free the core list strings */
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
/*
 * Tear down the cpuacct cgroup hierarchy created for this step.
 *
 * Moves the calling process back to the root cpuacct cgroup, then removes
 * the task, step, job and user cgroups (leaves first), releases the cgroup
 * handles, clears the cached relative paths and destroys the cpuacct
 * namespace.  Nothing is done when any of the cached paths is empty.
 *
 * IN slurm_cgroup_conf - not used by this function
 * RET SLURM_SUCCESS always; removal failures are only logged
 */
extern int
jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuacct_cg;
	bool root_ok;
	bool lock_ok = false;
	int cc;

	if (user_cgroup_path[0] == '\0'
	    || job_cgroup_path[0] == '\0'
	    || jobstep_cgroup_path[0] == '\0'
	    || task_cgroup_path[0] == 0)
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root cpuacct cg.
	 * The release_agent will asynchronously be called for the step
	 * cgroup. It will do the necessary cleanup.
	 *
	 * The root handle is only locked, logged, unlocked and destroyed
	 * when it was actually created; otherwise it is uninitialized.
	 */
	root_ok = (xcgroup_create(&cpuacct_ns, &cpuacct_cg, "", 0, 0)
		   == XCGROUP_SUCCESS);
	if (root_ok) {
		xcgroup_set_uint32_param(&cpuacct_cg, "tasks", getpid());

		/* Lock the root of the cgroup and remove the subdirectories
		 * related to this job.
		 */
		if (xcgroup_lock(&cpuacct_cg) == XCGROUP_SUCCESS) {
			lock_ok = true;
		} else {
			error("%s: failed to flock() %s %m",
			      __func__, cpuacct_cg.path);
		}
	} else {
		error("%s: unable to create root cpuacct xcgroup", __func__);
	}

	/* Clean up starting from the leaves way up, the
	 * reverse order in which the cgroups were created.
	 * This is best effort: it proceeds even without the root lock.
	 */
	for (cc = 0; cc <= max_task_id; cc++) {
		xcgroup_t cgroup;
		char buf[PATH_MAX];
		int len;

		/* rmdir all tasks this running slurmstepd
		 * was responsible for.
		 */
		len = snprintf(buf, sizeof(buf), "%s%s/task_%d",
			       cpuacct_ns.mnt_point, jobstep_cgroup_path, cc);
		if (len < 0 || (size_t) len >= sizeof(buf)) {
			debug2("%s: task %d cgroup path too long, skipped",
			       __func__, cc);
			continue;
		}

		/* only the path is filled in; zero the remaining fields */
		memset(&cgroup, 0, sizeof(cgroup));
		cgroup.path = buf;

		if (strstr(buf, "step_extern"))
			kill_extern_procs(cgroup.path);

		if (xcgroup_delete(&cgroup) != XCGROUP_SUCCESS) {
			debug2("%s: failed to delete %s %m", __func__, buf);
		}
	}

	if (xcgroup_delete(&step_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       step_cpuacct_cg.path);
	}

	if (xcgroup_delete(&job_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       job_cpuacct_cg.path);
	}

	if (xcgroup_delete(&user_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       user_cpuacct_cg.path);
	}

	if (lock_ok)
		xcgroup_unlock(&cpuacct_cg);

	xcgroup_destroy(&task_cpuacct_cg);
	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);
	if (root_ok)
		xcgroup_destroy(&cpuacct_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';
	task_cgroup_path[0] = 0;

	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
/*
 * Create the user/job/step/task cpuacct cgroups for a task and add `pid`
 * to the task cgroup.
 *
 * The cgroup handles are stored in the file-level user_cpuacct_cg,
 * job_cpuacct_cg, step_cpuacct_cg and task_cpuacct_cg objects; max_task_id
 * is raised to the highest task id seen so far.
 *
 * IN pid        - process to add to the task cpuacct cgroup
 * IN jobacct_id - provides the step record (uid, gid, jobid, stepid) and
 *                 the task id
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
extern int
jobacct_gather_cgroup_cpuacct_attach_task(pid_t pid, jobacct_id_t *jobacct_id)
{
	xcgroup_t cpuacct_cg;
	stepd_step_rec_t *job;
	uid_t uid;
	gid_t gid;
	uint32_t jobid;
	uint32_t stepid;
	uint32_t taskid;
	int fstatus = SLURM_SUCCESS;
	int rc;
	char *slurm_cgpath;

	job = jobacct_id->job;
	uid = job->uid;
	gid = job->gid;
	jobid = job->jobid;
	stepid = job->stepid;
	taskid = jobacct_id->taskid;

	if (taskid >= max_task_id)
		max_task_id = taskid;

	debug("%s: jobid %u stepid %u taskid %u max_task_id %u",
	      __func__, jobid, stepid, taskid, max_task_id);

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = jobacct_cgroup_create_slurm_cg(&cpuacct_ns);
	if (!slurm_cgpath) {
		return SLURM_ERROR;
	}

	/* build user cgroup relative path if not set (may not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build uid %u "
			      "cgroup relative path", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	/* the slurm cgroup path is only needed to build the user path */
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (may not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build job %u "
			      "cpuacct cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path if not set (may not be) */
	if (*jobstep_cgroup_path == '\0') {
		int len;

		if (stepid == SLURM_BATCH_SCRIPT) {
			len = snprintf(jobstep_cgroup_path, PATH_MAX,
				       "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			len = snprintf(jobstep_cgroup_path, PATH_MAX,
				       "%s/step_extern", job_cgroup_path);
		} else {
			len = snprintf(jobstep_cgroup_path, PATH_MAX,
				       "%s/step_%u", job_cgroup_path, stepid);
		}
		if (len >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build job step "
			      " %u.%u cpuacct cg relative path: %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/* build task cgroup relative path */
	if (snprintf(task_cgroup_path, PATH_MAX, "%s/task_%u",
		     jobstep_cgroup_path, taskid) >= PATH_MAX) {
		error("jobacct_gather/cgroup: unable to build task %u "
		      "cpuacct cg relative path : %m", taskid);
		return SLURM_ERROR;
	}

	/*
	 * create cpuacct root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root cpuacct cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuacct_ns, &cpuacct_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create root cpuacct "
		      "xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuacct_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuacct_cg);
		error("jobacct_gather/cgroup: unable to lock root cpuacct cg");
		return SLURM_ERROR;
	}

	/*
	 * Create user cgroup in the cpuacct ns (it could already exist)
	 */
	if (xcgroup_create(&cpuacct_ns, &user_cpuacct_cg,
			   user_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create user %u cpuacct "
		      "cgroup", uid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&user_cpuacct_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to instanciate user %u "
		      "cpuacct cgroup", uid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Create job cgroup in the cpuacct ns (it could already exist)
	 */
	if (xcgroup_create(&cpuacct_ns, &job_cpuacct_cg,
			   job_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to create job %u cpuacct "
		      "cgroup", jobid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&job_cpuacct_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuacct_cg);
		xcgroup_destroy(&job_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to instanciate job %u "
		      "cpuacct cgroup", jobid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Create step cgroup in the cpuacct ns (it could already exist)
	 */
	if (xcgroup_create(&cpuacct_ns, &step_cpuacct_cg,
			   jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as they can exist for other
		 * steps, but release cgroup structures */
		xcgroup_destroy(&user_cpuacct_cg);
		xcgroup_destroy(&job_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
		      "cpuacct cgroup", jobid, stepid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&step_cpuacct_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuacct_cg);
		xcgroup_destroy(&job_cpuacct_cg);
		xcgroup_destroy(&step_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to instantiate jobstep "
		      "%u.%u cpuacct cgroup", jobid, stepid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Create task cgroup in the cpuacct ns
	 */
	if (xcgroup_create(&cpuacct_ns, &task_cpuacct_cg,
			   task_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job/step cgroups as they can exist for
		 * other steps or tasks, but release cgroup structures */
		xcgroup_destroy(&user_cpuacct_cg);
		xcgroup_destroy(&job_cpuacct_cg);
		xcgroup_destroy(&step_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
		      "task %u cpuacct cgroup", jobid, stepid, taskid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&task_cpuacct_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuacct_cg);
		xcgroup_destroy(&job_cpuacct_cg);
		xcgroup_destroy(&step_cpuacct_cg);
		xcgroup_destroy(&task_cpuacct_cg);
		error("jobacct_gather/cgroup: unable to instantiate jobstep "
		      "%u.%u task %u cpuacct cgroup", jobid, stepid, taskid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Attach the given pid to the task cpuacct cgroup
	 */
	rc = xcgroup_add_pids(&task_cpuacct_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to add slurmstepd to "
		      "cpuacct cg '%s'", task_cpuacct_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

error:
	/* common exit: release the root lock and handle */
	xcgroup_unlock(&cpuacct_cg);
	xcgroup_destroy(&cpuacct_cg);
	return fstatus;
}
/*
 * Initialize the memory cgroup support of the task/cgroup plugin.
 *
 * Clears the cached user/job/step relative paths, creates the memory
 * namespace, tries to enable memory.use_hierarchy on the root cgroup and
 * derives the file-level RAM/swap limits from the configuration.
 *
 * IN slurm_cgroup_conf - cgroup configuration (constrain_*, allowed_*,
 *                        max_*_percent, min_ram_space are read)
 * RET SLURM_SUCCESS, or SLURM_ERROR when the memory namespace cannot be
 *     created
 */
extern int task_cgroup_memory_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	/* initialize memory cgroup namespace */
	if (xcgroup_ns_create(slurm_cgroup_conf, &memory_ns, "", "memory")
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create memory namespace");
		return SLURM_ERROR;
	}

	/*
	 * Enable memory.use_hierarchy in the root of the cgroup.
	 * This stays best effort, but the handle is only used and released
	 * when it was actually created.
	 */
	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		xcgroup_set_param(&memory_cg, "memory.use_hierarchy", "1");
		xcgroup_destroy(&memory_cg);
	} else {
		error("task/cgroup: unable to create root memory xcgroup");
	}

	constrain_ram_space = slurm_cgroup_conf->constrain_ram_space;
	constrain_swap_space = slurm_cgroup_conf->constrain_swap_space;

	/*
	 * as the swap space threshold will be configured with a
	 * mem+swp parameter value, if RAM space is not monitored,
	 * set allowed RAM space to 100% of the job requested memory.
	 * It will help to construct the mem+swp value that will be
	 * used for both mem and mem+swp limit during memcg creation.
	 */
	if (constrain_ram_space)
		allowed_ram_space = slurm_cgroup_conf->allowed_ram_space;
	else
		allowed_ram_space = 100.0;

	allowed_swap_space = slurm_cgroup_conf->allowed_swap_space;

	if ((totalram = (uint64_t) conf->real_memory_size) == 0)
		error ("task/cgroup: Unable to get RealMemory size");

	max_ram = percent_in_bytes(totalram, slurm_cgroup_conf->max_ram_percent);
	max_swap = percent_in_bytes(totalram, slurm_cgroup_conf->max_swap_percent);
	max_swap += max_ram;
	/*
	 * Widen before the multiply so the MB -> bytes conversion cannot
	 * wrap if the configuration field is a 32-bit integer.
	 * NOTE(review): field width is not visible here - confirm.
	 */
	min_ram_space = (uint64_t) slurm_cgroup_conf->min_ram_space
			* 1024 * 1024;

	debug ("task/cgroup/memory: total:%luM allowed:%.4g%%(%s), "
	       "swap:%.4g%%(%s), max:%.4g%%(%luM) max+swap:%.4g%%(%luM) min:%uM",
	       (unsigned long) totalram,
	       allowed_ram_space,
	       constrain_ram_space ? "enforced" : "permissive",
	       allowed_swap_space,
	       constrain_swap_space ? "enforced" : "permissive",
	       slurm_cgroup_conf->max_ram_percent,
	       (unsigned long) (max_ram / (1024 * 1024)),
	       slurm_cgroup_conf->max_swap_percent,
	       (unsigned long) (max_swap / (1024 * 1024)),
	       (unsigned) slurm_cgroup_conf->min_ram_space);

	/*
	 * Warning: OOM Killer must be disabled for slurmstepd
	 * or it would be destroyed if the application use
	 * more memory than permitted
	 *
	 * If an env value is already set for slurmstepd
	 * OOM killer behavior, keep it, otherwise set the
	 * -1000 value, which means do not let OOM killer kill it
	 *
	 * FYI, setting "export SLURMSTEPD_OOM_ADJ=-1000"
	 * in /etc/sysconfig/slurm would be the same
	 */
	setenv("SLURMSTEPD_OOM_ADJ", "-1000", 0);

	return SLURM_SUCCESS;
}
/*
 * Create the user/job/step memory cgroups for a job step.
 *
 * The cgroup handles are stored in the file-level user_memory_cg,
 * job_memory_cg and step_memory_cg objects; job and step limits are set
 * through memcg_initialize() from job->job_mem and job->step_mem.
 *
 * IN job - step record (jobid, stepid, uid, gid, job_mem, step_mem)
 * RET SLURM_SUCCESS when the whole hierarchy was set up, SLURM_ERROR
 *     otherwise
 */
extern int task_cgroup_memory_create(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;

	xcgroup_t memory_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char *slurm_cgpath;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&memory_ns);
	if (slurm_cgpath == NULL) {
		return SLURM_ERROR;
	}

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u memory "
			      "cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;

		if (stepid == NO_VAL) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
			if (cc >= PATH_MAX) {
				error("task/cgroup: unable to build "
				      "step batch memory cg path : %m");
				/* a truncated path must not be used */
				return SLURM_ERROR;
			}
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
			if (cc >= PATH_MAX) {
				error("task/cgroup: unable to build job step %u memory "
				      "cg relative path : %m", stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create memory root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root memory cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root memory xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&memory_cg);
		error("task/cgroup: unable to lock root memory cg");
		return SLURM_ERROR;
	}

	/*
	 * Create user cgroup in the memory ns (it could already exist)
	 * Ask for hierarchical memory accounting starting from the user
	 * container in order to track the memory consumption up to the
	 * user.
	 * We do not set any limits at this level for now. It could be
	 * interesting to do it in the future but memcg cleanup mech
	 * are not working well so it will be really difficult to manage
	 * addition/removal of memory amounts at this level. (kernel 2.6.34)
	 */
	if (xcgroup_create(&memory_ns, &user_memory_cg,
			   user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		goto error;
	}
	if (xcgroup_set_param(&user_memory_cg, "memory.use_hierarchy", "1")
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to ask for hierarchical accounting"
		      "of user memcg '%s'", user_memory_cg.path);
		xcgroup_destroy (&user_memory_cg);
		goto error;
	}

	/*
	 * Create job cgroup in the memory ns (it could already exist)
	 * and set the associated memory limits.
	 * Disable notify_on_release for this memcg, it will be
	 * manually removed by the plugin at the end of the step.
	 */
	if (memcg_initialize (&memory_ns, &job_memory_cg, job_cgroup_path,
			      job->job_mem, getuid(), getgid(), 0) < 0) {
		xcgroup_destroy (&user_memory_cg);
		goto error;
	}

	/*
	 * Create step cgroup in the memory ns (it should not exist)
	 * and set the associated memory limits.
	 * Disable notify_on_release for the step memcg, it will be
	 * manually removed by the plugin at the end of the step.
	 */
	if (memcg_initialize (&memory_ns, &step_memory_cg, jobstep_cgroup_path,
			      job->step_mem, uid, gid, 0) < 0) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		goto error;
	}

	/* every level was created: report success to the caller */
	fstatus = SLURM_SUCCESS;

error:
	/* common exit: release the root lock and handle */
	xcgroup_unlock(&memory_cg);
	xcgroup_destroy(&memory_cg);

	return fstatus;
}
/*
 * Mount a cgroup namespace.
 *
 * The mount point directory is created first (together with its missing
 * parent directories when the mount point is an absolute path), with
 * group/other write permission masked out.  The cgroup filesystem is then
 * mounted using the namespace subsystems, plus the extra mount arguments
 * when some are set.  After a successful mount, the release agent is
 * written to the root cgroup when the namespace has a notify program.
 *
 * returned values:
 *  - XCGROUP_ERROR
 *  - XCGROUP_SUCCESS
 *
 * If an error occurs, errno will be set.
 */
int xcgroup_ns_mount(xcgroup_ns_t* cgns)
{
	const mode_t dir_mask = S_IWGRP | S_IWOTH;
	mode_t saved_mask;
	int rc;
	char combined[1024];
	char *mount_opts;
	xcgroup_t root_cg;

	saved_mask = umask(dir_mask);

	rc = mkdir(cgns->mnt_point, 0755);
	if (rc && errno != EEXIST) {
		char *partial;
		char *sep;

		/* parents can only be built for an absolute path */
		if (cgns->mnt_point[0] != '/') {
			debug("unable to create cgroup ns directory '%s'"
			      " : do not start with '/'", cgns->mnt_point);
			umask(saved_mask);
			return XCGROUP_ERROR;
		}

		/*
		 * Walk a private copy of the path, cutting it at each '/'
		 * after the leading one to create every parent in turn.
		 */
		partial = xstrdup(cgns->mnt_point);
		for (sep = index(partial + 1, '/');
		     sep != NULL;
		     sep = index(sep + 1, '/')) {
			*sep = '\0';
			rc = mkdir(partial, 0755);
			if (rc && errno != EEXIST) {
				debug("unable to create cgroup ns required "
				      "directory '%s'", partial);
				xfree(partial);
				umask(saved_mask);
				return XCGROUP_ERROR;
			}
			*sep = '/';
		}
		xfree(partial);

		/* parents are in place: retry the mount point itself */
		rc = mkdir(cgns->mnt_point, 0755);
	}

	if (rc && errno != EEXIST) {
		debug("unable to create cgroup ns directory '%s'"
		      " : %m", cgns->mnt_point);
		umask(saved_mask);
		return XCGROUP_ERROR;
	}
	umask(saved_mask);

	/* mount options: the subsystems, plus extra arguments when present */
	if (!cgns->mnt_args || cgns->mnt_args[0] == '\0') {
		mount_opts = cgns->subsystems;
	} else {
		if (snprintf(combined, sizeof(combined), "%s,%s",
			     cgns->subsystems, cgns->mnt_args)
		    >= sizeof(combined)) {
			debug2("unable to build cgroup options string");
			return XCGROUP_ERROR;
		}
		mount_opts = combined;
	}

#if defined(__FreeBSD__)
	rc = mount("cgroup", cgns->mnt_point,
		   MS_NOSUID|MS_NOEXEC|MS_NODEV, mount_opts);
#else
	rc = mount("cgroup", cgns->mnt_point, "cgroup",
		   MS_NOSUID|MS_NOEXEC|MS_NODEV, mount_opts);
#endif
	if (rc)
		return XCGROUP_ERROR;

	/*
	 * FIXME: this only gets set when we aren't mounted at all.
	 * Since we never umount this may only be loaded at startup
	 * the first time.
	 */
	/* set the release_agent if necessary; failing to get the root
	 * cgroup handle is not reported as an error */
	if (cgns->notify_prog &&
	    xcgroup_create(cgns, &root_cg, "/", 0, 0) != XCGROUP_ERROR) {
		xcgroup_set_param(&root_cg, "release_agent",
				  cgns->notify_prog);
		xcgroup_destroy(&root_cg);
	}

	return XCGROUP_SUCCESS;
}
/*
 * Create the slurm/user/job/step freezer cgroup hierarchy for a job step.
 *
 * The user, job and step handles are stored in the file-level
 * user_freezer_cg, job_freezer_cg and step_freezer_cg objects, and the
 * relative paths in user_cgroup_path / job_cgroup_path / jobstep_cgroup_path.
 *
 * IN job - step record (jobid and stepid are read)
 * IN id  - not used by this function
 * IN uid - user id used to name the user cgroup
 * IN gid - not used by this function
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
int _slurm_cgroup_create(stepd_step_rec_t *job, uint64_t id, uid_t uid, gid_t gid)
{
	/* we do it here as we do not have access to the conf structure */
	/* in libslurm (src/common/xcgroup.c) */
	xcgroup_t slurm_cg;
	char *pre = (char *) xstrdup(slurm_cgroup_conf.cgroup_prepend);
#ifdef MULTIPLE_SLURMD
	if (conf->node_name != NULL)
		xstrsubstitute(pre, "%n", conf->node_name);
	else {
		xfree(pre);
		pre = (char *) xstrdup("/slurm");
	}
#endif

	/* create slurm cgroup in the freezer ns (it could already exist) */
	if (xcgroup_create(&freezer_ns, &slurm_cg, pre,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xfree(pre);
		return SLURM_ERROR;
	}
	if (xcgroup_instanciate(&slurm_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&slurm_cg);
		xfree(pre);
		return SLURM_ERROR;
	}
	/* the handle was only needed to make sure the directory exists */
	xcgroup_destroy(&slurm_cg);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", pre, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(pre);
			return SLURM_ERROR;
		}
	}
	xfree(pre);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, job->jobid) >= PATH_MAX) {
			error("unable to build job %u cgroup relative "
			      "path : %m", job->jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (job->stepid == NO_VAL) {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("proctrack/cgroup unable to build job step"
				      " %u.batch freezer cg relative path: %m",
				      job->jobid);
				return SLURM_ERROR;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
				     job_cgroup_path, job->stepid) >= PATH_MAX) {
				error("proctrack/cgroup unable to build job step"
				      " %u.%u freezer cg relative path: %m",
				      job->jobid, job->stepid);
				return SLURM_ERROR;
			}
		}
	}

	/* create user cgroup in the freezer ns (it could already exist) */
	if (xcgroup_create(&freezer_ns, &user_freezer_cg,
			   user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		return SLURM_ERROR;
	}
	if (xcgroup_instanciate(&user_freezer_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		return SLURM_ERROR;
	}

	/* create job cgroup in the freezer ns (it could already exist) */
	if (xcgroup_create(&freezer_ns, &job_freezer_cg,
			   job_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		return SLURM_ERROR;
	}
	if (xcgroup_instanciate(&job_freezer_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
		return SLURM_ERROR;
	}

	/* create step cgroup in the freezer ns (it should not exist) */
	if (xcgroup_create(&freezer_ns, &step_freezer_cg,
			   jobstep_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
		return SLURM_ERROR;
	}
	if (xcgroup_instanciate(&step_freezer_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
		xcgroup_destroy(&step_freezer_cg);
		return SLURM_ERROR;
	}

	/* inhibit release agent for the step cgroup thus letting
	 * slurmstepd being able to add new pids to the container
	 * when the job ends (TaskEpilog,...) */
	xcgroup_set_param(&step_freezer_cg, "notify_on_release", "0");

	return SLURM_SUCCESS;
}
/*
 * Tear down the cpuset cgroups created for this step.
 *
 * Under the root cpuset cgroup lock, moves the calling process back to the
 * root cgroup, waits for it to leave the step cgroup, then tries to remove
 * the step, job and user cgroups.  The handles, the cached relative paths
 * and the cpuset namespace are released in every case.
 *
 * IN slurm_cgroup_conf - not used by this function
 * RET SLURM_SUCCESS always; failures are only logged
 */
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuset_cg;

	/* Similarly to task_cgroup_memory_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started.  */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&cpuset_cg) == XCGROUP_SUCCESS) {
			const pid_t self = getpid();
			int i, npids, cnt = 0;
			int still_inside;
			pid_t *pids;

			/* First move slurmstepd to the root cpuset cg
			 * so we can remove the step/job/user cpuset
			 * cg's.  */
			xcgroup_move_process(&cpuset_cg, self);

			/* There is a delay in the cgroup system when moving the
			 * pid from one cgroup to another.  This is usually
			 * short, but we need to wait to make sure the pid is
			 * out of the step cgroup or we will occur an error
			 * leaving the cgroup unable to be removed.
			 */
			do {
				/*
				 * Start every pass from a clean state so that
				 * a failed read cannot leave a stale count
				 * paired with an already freed array.
				 */
				pids = NULL;
				npids = 0;
				still_inside = 0;

				if (xcgroup_get_pids(&step_cpuset_cg, &pids,
						     &npids)
				    == XCGROUP_SUCCESS && pids != NULL) {
					for (i = 0; i < npids; i++) {
						if (pids[i] == self) {
							still_inside = 1;
							cnt++;
							break;
						}
					}
				}
				xfree(pids);
			} while (still_inside && (cnt < MAX_MOVE_WAIT));

			if (cnt < MAX_MOVE_WAIT)
				debug3("Took %d checks before stepd pid was removed from the step cgroup.",
				       cnt);
			else
				error("Pid %d is still in the step cgroup. It might be left uncleaned after the job.",
				      self);

			if (xcgroup_delete(&step_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "cpuset : %m");
			if (xcgroup_delete(&job_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job cpuset : %m");
			if (xcgroup_delete(&user_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user cpuset : %m");
			xcgroup_unlock(&cpuset_cg);
		} else
			error("task/cgroup: unable to lock root cpuset : %m");
		xcgroup_destroy(&cpuset_cg);
	} else
		error("task/cgroup: unable to create root cpuset : %m");

	/* a handle is only released when its relative path was built */
	if (user_cgroup_path[0] != '\0')
		xcgroup_destroy(&user_cpuset_cg);
	if (job_cgroup_path[0] != '\0')
		xcgroup_destroy(&job_cpuset_cg);
	if (jobstep_cgroup_path[0] != '\0')
		xcgroup_destroy(&step_cpuset_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}