extern int task_cgroup_devices_create(stepd_step_rec_t *job)
{
	int f, k, rc, gres_conf_lines, allow_lines;
	int fstatus = SLURM_ERROR;
	char **gres_name = NULL;
	char **gres_cgroup = NULL, **dev_path = NULL;
	char *allowed_devices[PATH_MAX], *allowed_dev_major[PATH_MAX];
	int *gres_job_bit_alloc = NULL;
	int *gres_step_bit_alloc = NULL;
	int *gres_count = NULL;
	xcgroup_t devices_cg;
	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	uid_t gid = job->gid;

	List job_gres_list = job->job_gres_list;
	List step_gres_list = job->step_gres_list;

	char* slurm_cgpath ;

	/* create slurm root cgroup in this cgroup namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&devices_ns);
	if (slurm_cgpath == NULL)
		return SLURM_ERROR;

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative path : %m",
			      uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u devices "
			      "cgroup relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;
		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_%u",
				     job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "devices cgroup relative path : %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create devices root cgroup and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cgroup being removed
	 * between the next EEXIST instanciation and the first addition of
	 * a task. The release_agent will have to lock the root devices cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&devices_ns, &devices_cg, "", 0, 0) !=
	    XCGROUP_SUCCESS ) {
		error("task/cgroup: unable to create root devices cgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&devices_cg);
		error("task/cgroup: unable to lock root devices cgroup");
		return SLURM_ERROR;
	}

	info("task/cgroup: manage devices jor job '%u'", jobid);

	 /*
	  * collect info concerning the gres.conf file
	  * the GRES devices paths and the GRES names
	  */
	gres_conf_lines = gres_plugin_node_config_devices_path(&dev_path,
							       &gres_name,
							       job->node_name);

	/*
	 * create the entry for cgroup devices subsystem with major minor
	 */
	gres_cgroup = xmalloc(sizeof(char *) * gres_conf_lines);
	_calc_device_major(dev_path, gres_cgroup, gres_conf_lines);

	/*
         * create the entry with major minor for the default allowed devices
         * read from the file
         */
	allow_lines = read_allowed_devices_file(allowed_devices);
	_calc_device_major(allowed_devices, allowed_dev_major, allow_lines);

	/*
	 * calculate the number of gres.conf records for each gres name
	 */
	gres_count = xmalloc(sizeof(int) * gres_conf_lines);
	f = 0;
	gres_count[f] = 1;
	for (k = 0; k < gres_conf_lines; k++) {
		if ((k+1 < gres_conf_lines) &&
		    (xstrcmp(gres_name[k], gres_name[k+1]) == 0))
			gres_count[f]++;
		if ((k+1 < gres_conf_lines) &&
		    (xstrcmp(gres_name[k], gres_name[k+1]) != 0)) {
			f++;
			gres_count[f] = 1;
		}
	}

	/*
	 * create user cgroup in the devices ns (it could already exist)
	 */
	if (xcgroup_create(&devices_ns, &user_devices_cg, user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&user_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		goto error;
	}

	/* TODO
	 * check that user's devices cgroup is consistant and allow the
	 * appropriate devices
	 */


	/*
	 * create job cgroup in the devices ns (it could already exist)
	 */
	if (xcgroup_create(&devices_ns, &job_devices_cg, job_cgroup_path,
			    getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		goto error;
	}
	if (xcgroup_instantiate(&job_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		goto error;
	}

	/* fetch information concerning the gres devices allocation for the job */
	gres_job_bit_alloc = xmalloc(sizeof (int) * (gres_conf_lines + 10));
	gres_plugin_job_state_file(job_gres_list, gres_job_bit_alloc,
				   gres_count);

	/*
	 * with the current cgroup devices subsystem design (whitelist only
	 * supported) we need to allow all different devices that are supposed
	 * to be allowed by* default.
	 */
	for (k = 0; k < allow_lines; k++) {
		info("Default access allowed to device %s",
		     allowed_dev_major[k]);
		xcgroup_set_param(&job_devices_cg, "devices.allow",
				  allowed_dev_major[k]);
	}

	/*
         * allow or deny access to devices according to job GRES permissions
         */
	for (k = 0; k < gres_conf_lines; k++) {
		if (gres_job_bit_alloc[k] == 1) {
			info("Allowing access to device %s", gres_cgroup[k]);
			xcgroup_set_param(&job_devices_cg, "devices.allow",
                                          gres_cgroup[k]);
		} else {
			info("Not allowing access to device %s", gres_cgroup[k]);
			xcgroup_set_param(&job_devices_cg, "devices.deny",
					  gres_cgroup[k]);
		}
	}

	/*
	 * create step cgroup in the devices ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&devices_ns, &step_devices_cg, jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS ) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		goto error;
	}
	if ( xcgroup_instantiate(&step_devices_cg) != XCGROUP_SUCCESS ) {
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		xcgroup_destroy(&step_devices_cg);
		goto error;
	}


	if ((job->stepid != SLURM_BATCH_SCRIPT) &&
	    (job->stepid != SLURM_EXTERN_CONT)) {

		/* fetch information about step GRES devices allocation */
		gres_step_bit_alloc = xmalloc(sizeof (int) * (gres_conf_lines + 10));
		gres_plugin_step_state_file(step_gres_list, gres_step_bit_alloc,
					    gres_count);

		/*
		 * with the current cgroup devices subsystem design (whitelist
		 * only supported) we need to allow all different devices that
		 * are supposed to be allowed by default.
		 */
		for (k = 0; k < allow_lines; k++) {
			info("Default access allowed to device %s",
			     allowed_dev_major[k]);
			xcgroup_set_param(&step_devices_cg, "devices.allow",
					  allowed_dev_major[k]);
		}

		/*
		 * allow or deny access to devices according to GRES permissions
		 * for the step
		 */
		for (k = 0; k < gres_conf_lines; k++) {
			if (gres_step_bit_alloc[k] == 1) {
				info("Allowing access to device %s for step",
				     gres_cgroup[k]);
				xcgroup_set_param(&step_devices_cg,
						 "devices.allow",
						  gres_cgroup[k]);
			} else {
				info("Not allowing access to device %s for step",
				     gres_cgroup[k]);
				xcgroup_set_param(&step_devices_cg,
						 "devices.deny",
						  gres_cgroup[k]);
			}
		}
	}
	/* attach the slurmstepd to the step devices cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_devices_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to devices cg '%s'",
		      step_devices_cg.path);
		fstatus = SLURM_ERROR;
	} else {
		fstatus = SLURM_SUCCESS;
	}

error:
	xcgroup_unlock(&devices_cg);
	xcgroup_destroy(&devices_cg);
	xfree(gres_step_bit_alloc);
	xfree(gres_job_bit_alloc);
	xfree(gres_name);
	xfree(dev_path);
	xfree(gres_cgroup);

	return fstatus;
}
Ejemplo n.º 2
0
extern int task_cgroup_cpuset_create(stepd_step_rec_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	uid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;
	char cpuset_meta[PATH_MAX];

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath;
	xcgroup_t slurm_cg;

#ifdef HAVE_NATIVE_CRAY
	char expected_usage[32];
#endif

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&slurm_cg) !=
		    XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("task/cgroup: unable to build uid %u cgroup "
			      "relative path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			     user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;
		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "cpuset cg relative path: %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instanciation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
			   user_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistant and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, cpuset_meta, &cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) !=
		    XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1]='\0';
		xstrcat(user_alloc_cores,",");
		xstrcat(user_alloc_cores,cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, cpuset_meta, user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
			   job_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg, cpuset_meta, job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid,gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg, cpuset_meta, step_alloc_cores);

	/*
	 * on Cray systems, set the expected usage in bytes.
	 * This is used by the Cray OOM killer
	 */
#ifdef HAVE_NATIVE_CRAY
	snprintf(expected_usage, sizeof(expected_usage), "%"PRIu64,
		 (uint64_t)job->step_mem * 1024 * 1024);
	xcgroup_set_param(&step_cpuset_cg, "expected_usage_in_bytes",
			  expected_usage);
#endif

	/* attach the slurmstepd to the step cpuset cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

	/* validate the requested cpu frequency and set it */
	cpu_freq_cgroup_validate(job, step_alloc_cores);

error:
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
Ejemplo n.º 3
0
extern int task_cgroup_cpuset_create(slurmd_job_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	uid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath ;
	xcgroup_t slurm_cg;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
	rc = xcgroup_get_param(&slurm_cg,"cpuset.cpus",&cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&slurm_cg) !=
		    XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			      user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (stepid == NO_VAL) {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.batch cpuset cg relative path: %m",
				      jobid);
				return SLURM_ERROR;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
				     job_cgroup_path, stepid) >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.%u cpuset cg relative path: %m",
				      jobid, stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instanciation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				 &job_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				 &step_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      job->step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
			    user_cgroup_path,
			    getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistant and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg,"cpuset.cpus",&cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was inexistant */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) !=
		     XCGROUP_SUCCESS) {
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1]='\0';
		xstrcat(user_alloc_cores,",");
		xstrcat(user_alloc_cores,cpus);
	}
	xcgroup_set_param(&user_cpuset_cg,"cpuset.cpus",user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
			    job_cgroup_path,
			    getuid(),getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg,"cpuset.cpus",job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
			    jobstep_cgroup_path,
			    uid,gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg,"cpuset.cpus",step_alloc_cores);

	/* attach the slurmstepd to the step cpuset cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

error:
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
Ejemplo n.º 4
0
extern int task_cgroup_memory_create(stepd_step_rec_t *job)
{
	int fstatus = SLURM_ERROR;
	xcgroup_t memory_cg;
	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char *slurm_cgpath;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&memory_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			      user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u memory "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;

		if (stepid == NO_VAL) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
			if (cc >= PATH_MAX) {
				error("task/cgroup: unable to build "
				      "step batch memory cg path : %m");

			}

		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
				     job_cgroup_path,stepid) >= PATH_MAX) {
				error("task/cgroup: unable to build job step %u memory "
				      "cg relative path : %m",stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create memory root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instanciation and the first addition of
	 * a task. The release_agent will have to lock the root memory cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root memory xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&memory_cg);
		error("task/cgroup: unable to lock root memory cg");
		return SLURM_ERROR;
	}

	/*
	 * Create user cgroup in the memory ns (it could already exist)
	 * Ask for hierarchical memory accounting starting from the user
	 * container in order to track the memory consumption up to the
	 * user.
	 * We do not set any limits at this level for now. It could be
	 * interesting to do it in the future but memcg cleanup mech
	 * are not working well so it will be really difficult to manage
	 * addition/removal of memory amounts at this level. (kernel 2.6.34)
	 */
	if (xcgroup_create(&memory_ns,&user_memory_cg,
			    user_cgroup_path,
			    getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		goto error;
	}
	if ( xcgroup_set_param(&user_memory_cg,"memory.use_hierarchy","1")
	     != XCGROUP_SUCCESS ) {
		error("task/cgroup: unable to ask for hierarchical accounting"
		      "of user memcg '%s'",user_memory_cg.path);
		xcgroup_destroy (&user_memory_cg);
		goto error;
	}

	/*
	 * Create job cgroup in the memory ns (it could already exist)
	 * and set the associated memory limits.
	 * Disable notify_on_release for this memcg, it will be
	 * manually removed by the plugin at the end of the step.
	 */
	if (memcg_initialize (&memory_ns, &job_memory_cg, job_cgroup_path,
	                      job->job_mem, getuid(), getgid(), 0) < 0) {
		xcgroup_destroy (&user_memory_cg);
		goto error;
	}

	/*
	 * Create step cgroup in the memory ns (it should not exists)
	 * and set the associated memory limits.
	 * Disable notify_on_release for the step memcg, it will be
	 * manually removed by the plugin at the end of the step.
	 */
	if (memcg_initialize (&memory_ns, &step_memory_cg, jobstep_cgroup_path,
	                      job->step_mem, uid, gid, 0) < 0) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		goto error;
	}

error:
	xcgroup_unlock(&memory_cg);
	xcgroup_destroy(&memory_cg);

	return fstatus;
}