Пример #1
0
/*
 * Tear down the freezer cgroups used by this step.
 *
 * The root freezer cgroup is locked for the duration of the teardown.
 * Failing to delete the step cgroup aborts the teardown (the lock is
 * released first); deletion of the job and user cgroups is attempted
 * without checking the result.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the step cgroup could not
 * be deleted.
 */
int _slurm_cgroup_destroy(void)
{
    xcgroup_lock(&freezer_cg);

    /* step level: the only deletion whose failure is reported */
    if (*jobstep_cgroup_path) {
        int step_rc = xcgroup_delete(&step_freezer_cg);

        if (step_rc != XCGROUP_SUCCESS) {
            error("_slurm_cgroup_destroy: problem deleting step "
                  "cgroup path %s: %m", step_freezer_cg.path);
            xcgroup_unlock(&freezer_cg);
            return SLURM_ERROR;
        }
        xcgroup_destroy(&step_freezer_cg);
    }

    /* job level: best effort */
    if (*job_cgroup_path) {
        xcgroup_delete(&job_freezer_cg);
        xcgroup_destroy(&job_freezer_cg);
    }

    /* user level: best effort */
    if (*user_cgroup_path) {
        xcgroup_delete(&user_freezer_cg);
        xcgroup_destroy(&user_freezer_cg);
    }

    /* the slurm-level cgroup is never deleted here, only its handle */
    if (slurm_freezer_init)
        xcgroup_destroy(&slurm_freezer_cg);

    xcgroup_unlock(&freezer_cg);
    xcgroup_destroy(&freezer_cg);
    xcgroup_ns_destroy(&freezer_ns);

    return SLURM_SUCCESS;
}
Пример #2
0
/*
 * Tear down the freezer cgroups used by this step.
 *
 * When the slurm freezer cgroup was initialized it is locked while the
 * step, job and user cgroups are removed.  Only a failure to delete the
 * step cgroup is reported; job and user deletions are best effort.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the step cgroup could not
 * be deleted.
 */
int _slurm_cgroup_destroy(void)
{
	if (slurm_freezer_init)
		xcgroup_lock(&slurm_freezer_cg);

	if (*jobstep_cgroup_path) {
		if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS)
			goto step_delete_failed;
		xcgroup_destroy(&step_freezer_cg);
	}

	if (*job_cgroup_path) {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	if (*user_cgroup_path) {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	if (slurm_freezer_init) {
		xcgroup_unlock(&slurm_freezer_cg);
		xcgroup_destroy(&slurm_freezer_cg);
	}

	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;

step_delete_failed:
	/* drop the lock taken at entry before reporting the failure */
	if (slurm_freezer_init)
		xcgroup_unlock(&slurm_freezer_cg);
	return SLURM_ERROR;
}
Пример #3
0
/*
 * Finalize the cpuacct cgroup state of the jobacct_gather plugin.
 *
 * Nothing is done unless the user, job and step cgroup paths are all
 * set.  Otherwise the calling process is written to the "tasks" file of
 * the root cpuacct cgroup, the user/job/step handles are destroyed, the
 * three paths are reset and the cpuacct namespace is destroyed.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int jobacct_gather_cgroup_cpuacct_fini(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t root_cg;

	if (!*user_cgroup_path || !*job_cgroup_path || !*jobstep_cgroup_path)
		return SLURM_SUCCESS;

	/*
	 * Put slurmstepd back into the root cpuacct cgroup.  The step
	 * cgroup itself is cleaned up asynchronously by the release_agent.
	 */
	if (xcgroup_create(&cpuacct_ns, &root_cg, "", 0, 0) ==
	    XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&root_cg, "tasks", getpid());
		xcgroup_destroy(&root_cg);
	}

	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);

	*user_cgroup_path = '\0';
	*job_cgroup_path = '\0';
	*jobstep_cgroup_path = '\0';
	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
Пример #4
0
/*
 * Finalize the system cgroup state: destroy the system cpuset and memory
 * cgroup handles, then the cpuset and memory namespaces.
 */
extern void fini_system_cgroup(void)
{
	/* cgroup handles first, namespaces afterwards */
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_destroy(&system_memory_cg);
	xcgroup_ns_destroy(&cpuset_ns);
	xcgroup_ns_destroy(&memory_ns);
}
Пример #5
0
/*
 * Create (or reuse) the slurm-level cgroup inside the namespace `ns`.
 *
 * The relative path comes from slurm_cgroup_conf.cgroup_prepend; with
 * MULTIPLE_SLURMD, "%n" is substituted with the node name, or "/slurm"
 * is used when no node name is set.
 *
 * Returns the xstrdup'ed relative path in every case, including when the
 * cgroup could not be created or instantiated; the caller owns it.
 */
extern char* jobacct_cgroup_create_slurm_cg(xcgroup_ns_t* ns)
{
	/*
	 * This lives here rather than in libslurm (src/common/xcgroup.c)
	 * because the conf structure is not reachable from there.
	 */
	xcgroup_t slurm_cg;
	char* prefix = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend);
#ifdef MULTIPLE_SLURMD
	if (conf->node_name == NULL) {
		xfree(prefix);
		prefix = (char*) xstrdup("/slurm");
	} else {
		xstrsubstitute(prefix,"%n", conf->node_name);
	}
#endif

	/* the slurm cgroup may already exist in this namespace */
	if (xcgroup_create(ns, &slurm_cg, prefix, getuid(), getgid())
	    != XCGROUP_SUCCESS)
		return prefix;

	if (xcgroup_instanciate(&slurm_cg) == XCGROUP_SUCCESS) {
		debug3("slurm cgroup %s successfully created for ns %s: %m",
		       prefix, ns->subsystems);
	} else {
		error("unable to build slurm cgroup for ns %s: %m",
		      ns->subsystems);
	}

	/* the handle is only needed for the instantiation above */
	xcgroup_destroy(&slurm_cg);
	return prefix;
}
Пример #6
0
/*
 * Finalize the memory cgroup state of the task/cgroup plugin.
 *
 * Nothing is done unless the user, job and step cgroup paths are all
 * set.  Otherwise the step, job and user memcgs are removed (best
 * effort) under the root memcg lock, their handles are destroyed, the
 * paths are reset and the memory namespace is destroyed.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	if (user_cgroup_path[0] == '\0' ||
	     job_cgroup_path[0] == '\0' ||
	     jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;
	/*
	 * Lock the root memcg and try to remove the different memcgs.
	 * The reason why we are locking here is that if a concurrent
	 * step is in the process of being executed, it could try to
	 * create the step memcg just after we remove the job memcg,
	 * resulting in a failure.
	 * First, delete the step memcg as all the tasks have now exited.
	 * Then, try to remove the job memcg.
	 * If it fails, it is due to the fact that it is still in use by
	 * another running step.
	 * After that, try to remove the user memcg. If it fails, it is due
	 * to jobs that are still running for the same user on the node or
	 * because of tasks attached directly to the user cg by another
	 * component (PAM). The user memcg was created with the
	 * notify_on_release=1 flag (default) so it will be removed
	 * automatically after that.
	 * For now, do not try to detect if only externally attached tasks
	 * are present to see if they can be moved to an orphan memcg.
	 * That could be done in the future, if it is necessary.
	 */
	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
			/*
			 * xcgroup_delete() results are compared against
			 * XCGROUP_SUCCESS, like every other xcgroup call.
			 */
			if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "memcg : %m");
			if (xcgroup_delete(&job_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job memcg : %m");
			if (xcgroup_delete(&user_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user memcg : %m");
			xcgroup_unlock(&memory_cg);
		} else
			error("task/cgroup: unable to lock root memcg : %m");
		xcgroup_destroy(&memory_cg);
	} else
		error("task/cgroup: unable to create root memcg : %m");

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
/*
 * Finalize the devices cgroup state of the task/cgroup plugin.
 *
 * Under the root devices cgroup lock, the calling process is moved to
 * the root devices cgroup and removal of the step, job and user devices
 * cgroups is attempted (best effort).  The handles whose paths are set
 * are then destroyed, the paths and the allowed-devices file name are
 * reset, the devices namespace is destroyed and xcpuinfo is finalized.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t devices_cg;

	/* Similarly to task_cgroup_{memory,cpuset}_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started.  */
	if (xcgroup_create(&devices_ns, &devices_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&devices_cg) == XCGROUP_SUCCESS) {
			/* First move slurmstepd to the root devices cg
			 * so we can remove the step/job/user devices
			 * cg's.  */
			xcgroup_move_process(&devices_cg, getpid());
			/*
			 * xcgroup_delete() results are compared against
			 * XCGROUP_SUCCESS, like every other xcgroup call.
			 */
			if (xcgroup_delete(&step_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "devices : %m");
			if (xcgroup_delete(&job_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job devices : %m");
			if (xcgroup_delete(&user_devices_cg)
			    != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user devices : %m");
			xcgroup_unlock(&devices_cg);
		} else
			error("task/cgroup: unable to lock root devices : %m");
		xcgroup_destroy(&devices_cg);
	} else
		error("task/cgroup: unable to create root devices : %m");

	if ( user_cgroup_path[0] != '\0' )
		xcgroup_destroy(&user_devices_cg);
	if ( job_cgroup_path[0] != '\0' )
		xcgroup_destroy(&job_devices_cg);
	if ( jobstep_cgroup_path[0] != '\0' )
		xcgroup_destroy(&step_devices_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	cgroup_allowed_devices_file[0] = '\0';

	xcgroup_ns_destroy(&devices_ns);

	xcpuinfo_fini();
	return SLURM_SUCCESS;
}
Пример #8
0
/*
 * Create and instantiate the memory cgroup `path` in namespace `ns`,
 * owned by uid/gid, and apply the memory limits derived from mem_limit.
 *
 * The RAM limit is mem_limit_in_bytes(mem_limit) and the RAM+swap limit
 * is swap_limit_in_bytes(mem_limit); hierarchical accounting is turned
 * on.  The results of the parameter writes are not checked.
 *
 * mem_limit is logged as "alloc=...MB" (presumably megabytes - confirm
 * against the callers).
 *
 * Returns 0 on success, -1 if the cgroup could not be created or
 * instantiated (in the latter case the handle is destroyed).
 */
static int memcg_initialize (xcgroup_ns_t *ns, xcgroup_t *cg,
		char *path, uint64_t mem_limit, uid_t uid, gid_t gid)
{
	uint64_t mlb = mem_limit_in_bytes (mem_limit);
	uint64_t mls = swap_limit_in_bytes  (mem_limit);

	if (xcgroup_create (ns, cg, path, uid, gid) != XCGROUP_SUCCESS)
		return -1;

	if (xcgroup_instanciate (cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy (cg);
		return -1;
	}

	xcgroup_set_param (cg, "memory.use_hierarchy","1");
	xcgroup_set_uint64_param (cg, "memory.limit_in_bytes", mlb);
	xcgroup_set_uint64_param (cg, "memory.memsw.limit_in_bytes", mls);

	/*
	 * Convert to MB in 64 bits first and cast the quotient: casting the
	 * byte count itself would truncate it where unsigned long is 32 bits.
	 */
	info ("task/cgroup: %s: alloc=%luMB mem.limit=%luMB memsw.limit=%luMB",
		path,
		(unsigned long) mem_limit,
		(unsigned long) (mlb / (1024 * 1024)),
		(unsigned long) (mls / (1024 * 1024)));

	return 0;
}
Пример #9
0
/*
 * Finalize the cpuset cgroup state of the task/cgroup plugin.
 *
 * Destroys the user, job and step cpuset handles whose paths are set,
 * resets the three paths and destroys the cpuset namespace.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	/* a handle is only destroyed when its path was built */
	if (*user_cgroup_path) {
		xcgroup_destroy(&user_cpuset_cg);
	}
	if (*job_cgroup_path) {
		xcgroup_destroy(&job_cpuset_cg);
	}
	if (*jobstep_cgroup_path) {
		xcgroup_destroy(&step_cpuset_cg);
	}

	/* mark all three relative paths as unset */
	*user_cgroup_path = '\0';
	*job_cgroup_path = '\0';
	*jobstep_cgroup_path = '\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}
Пример #10
0
/*
 * Create (or reuse) the slurm-level cgroup inside the namespace `ns`,
 * with notify_on_release disabled.
 *
 * The relative path comes from slurm_cgroup_conf.cgroup_prepend; with
 * MULTIPLE_SLURMD, "%n" is substituted with the node name, or "/slurm"
 * is used when no node name is set.
 *
 * Returns the xstrdup'ed relative path (owned by the caller) on success,
 * or NULL when the cgroup could not be created or instantiated.
 */
static char* _system_cgroup_create_slurm_cg (xcgroup_ns_t* ns)
{
	/* we do it here as we do not have access to the conf structure */
	/* in libslurm (src/common/xcgroup.c) */
	xcgroup_t slurm_cg;
	char* pre = (char*) xstrdup(slurm_cgroup_conf.cgroup_prepend);

#ifdef MULTIPLE_SLURMD
	if ( conf->node_name != NULL )
		xstrsubstitute(pre, "%n", conf->node_name);
	else {
		xfree(pre);
		pre = (char*) xstrdup("/slurm");
	}
#endif

	/* create slurm cgroup in the ns
	 * disable notify_on_release to avoid the removal/creation
	 * of this cgroup for each last/first running job on the node */
	if (xcgroup_create(ns, &slurm_cg, pre,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xfree(pre);
		/*
		 * Return NULL explicitly rather than the just-freed variable,
		 * so the failure result does not depend on xfree() resetting
		 * its argument.
		 */
		return NULL;
	}
	slurm_cg.notify = 0;
	if (xcgroup_instantiate(&slurm_cg) != XCGROUP_SUCCESS) {
		error("system cgroup: unable to build slurm cgroup for "
		      "ns %s: %m",
		      ns->subsystems);
		xcgroup_destroy(&slurm_cg);
		xfree(pre);
		return NULL;
	}
	else {
		debug3("system cgroup: slurm cgroup %s successfully created "
		       "for ns %s: %m",
		       pre, ns->subsystems);
		xcgroup_destroy(&slurm_cg);
	}

	return pre;
}
Пример #11
0
/*
 * Tear down the freezer cgroups used by this step.
 *
 * With the root freezer cgroup locked, the calling process is first
 * moved to the root cgroup, then the step, job and user cgroups are
 * removed.  Only a failure to delete the step cgroup is reported; the
 * job and user deletions are best effort.
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the step cgroup could not
 * be deleted.
 */
int _slurm_cgroup_destroy(void)
{
	xcgroup_lock(&freezer_cg);

	/*
	 * slurmstepd has to leave the step cgroup before anything else:
	 * while it is still a member, the rmdir(2) behind the deletions
	 * below would always fail.
	 */
	_move_current_to_root_cgroup(&freezer_ns);

	if (*jobstep_cgroup_path) {
		int step_rc = xcgroup_delete(&step_freezer_cg);

		if (step_rc != XCGROUP_SUCCESS) {
			debug("_slurm_cgroup_destroy: problem deleting step cgroup path %s: %m",
			      step_freezer_cg.path);
			xcgroup_unlock(&freezer_cg);
			return SLURM_ERROR;
		}
		xcgroup_destroy(&step_freezer_cg);
	}

	if (*job_cgroup_path) {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	if (*user_cgroup_path) {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	/* the slurm-level cgroup is never deleted here, only its handle */
	if (slurm_freezer_init)
		xcgroup_destroy(&slurm_freezer_cg);

	xcgroup_unlock(&freezer_cg);
	xcgroup_destroy(&freezer_cg);
	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;
}
Пример #12
0
/*
 * Finalize the devices cgroup state of the task/cgroup plugin.
 *
 * Destroys the user, job and step devices handles whose paths are set,
 * resets the three paths and the allowed-devices file name, destroys
 * the devices namespace and finalizes xcpuinfo.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	/* a handle is only destroyed when its path was built */
	if (*user_cgroup_path) {
		xcgroup_destroy(&user_devices_cg);
	}
	if (*job_cgroup_path) {
		xcgroup_destroy(&job_devices_cg);
	}
	if (*jobstep_cgroup_path) {
		xcgroup_destroy(&step_devices_cg);
	}

	/* mark the relative paths and the devices file name as unset */
	*user_cgroup_path = '\0';
	*job_cgroup_path = '\0';
	*jobstep_cgroup_path = '\0';
	*cgroup_allowed_devices_file = '\0';

	xcgroup_ns_destroy(&devices_ns);
	xcpuinfo_fini();

	return SLURM_SUCCESS;
}
Пример #13
0
/*
 * Finalize the memory cgroup state of the task/cgroup plugin.
 *
 * Nothing is done unless the user, job and step cgroup paths are all
 * set.  Otherwise the calling process is moved to the root memcg, the
 * step memcg is removed, the user/job/step handles are destroyed, the
 * paths are reset and the memory namespace is destroyed.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t root_cg;

	if (!*user_cgroup_path || !*job_cgroup_path || !*jobstep_cgroup_path)
		return SLURM_SUCCESS;

	/*
	 * slurmstepd goes back to the root memory cg, and the step cgroup
	 * is then removed with rmdir(2) rather than force_empty:
	 *
	 *  - rmdir(2) on an empty cgroup moves all of its resident charged
	 *    pages to the parent, i.e. the job cgroup, so resident
	 *    pagecache stays associated with the job;
	 *  - force_empty would only flush clean pages.
	 *
	 * The job epilog is then expected to optionally force_empty the job
	 * cgroup (to flush pagecache) and rmdir(2) it, or to wait for the
	 * kernel release notification.
	 */
	if (xcgroup_create(&memory_ns, &root_cg, "", 0, 0) == XCGROUP_SUCCESS) {
		xcgroup_move_process(&root_cg, getpid());
		xcgroup_destroy(&root_cg);
		if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS) {
			error ("cgroup: rmdir step memcg failed: %m");
		}
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	*user_cgroup_path = '\0';
	*job_cgroup_path = '\0';
	*jobstep_cgroup_path = '\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
Пример #14
0
/*
 * Move the calling process into the root cgroup of namespace `ns`.
 *
 * Returns SLURM_ERROR when a handle on the root cgroup cannot be
 * created, otherwise whatever xcgroup_move_process() returns.
 */
static int _move_current_to_root_cgroup(xcgroup_ns_t *ns)
{
	xcgroup_t root_cg;
	int status = SLURM_ERROR;

	if (xcgroup_create(ns, &root_cg, "", 0, 0) == XCGROUP_SUCCESS) {
		status = xcgroup_move_process(&root_cg, getpid());
		xcgroup_destroy(&root_cg);
	}

	return status;
}
Пример #15
0
/*
 * Finalize the memory cgroup state of the jobacct_gather plugin.
 *
 * Nothing is done unless the user, job and step cgroup paths are all
 * set.  Otherwise the calling process is written to the "tasks" file of
 * the root memcg, "memory.force_empty" is written to the step memcg,
 * the user/job/step handles are destroyed, the paths are reset and the
 * memory namespace is destroyed.
 *
 * slurm_cgroup_conf is not used by this function.
 * Always returns SLURM_SUCCESS.
 */
extern int jobacct_gather_cgroup_memory_fini(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t root_cg;

	if (!*user_cgroup_path || !*job_cgroup_path || !*jobstep_cgroup_path)
		return SLURM_SUCCESS;

	/*
	 * Put slurmstepd back into the root memory cg, then force-empty the
	 * step cgroup so that its allocated pages move to its parent.  The
	 * release_agent is called asynchronously for the step cgroup and
	 * performs the actual cleanup.
	 * Ideally memcg itself would do this force_empty when the last task
	 * of a cgroup ends; handling that cleanup correctly from here is
	 * close to impossible with the current memcg.
	 */
	if (xcgroup_create(&memory_ns, &root_cg, "", 0, 0) ==
	    XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&root_cg, "tasks", getpid());
		xcgroup_destroy(&root_cg);
		xcgroup_set_param(&step_memory_cg, "memory.force_empty", "1");
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	*user_cgroup_path = '\0';
	*job_cgroup_path = '\0';
	*jobstep_cgroup_path = '\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
Пример #16
0
/*
 * Create and instantiate the memory cgroup `path` in namespace `ns`,
 * owned by uid/gid, and apply the memory limits derived from mem_limit.
 *
 * `notify` is stored in cg->notify before the cgroup is instantiated.
 * The RAM limit is mem_limit_in_bytes(mem_limit), or the RAM+swap limit
 * swap_limit_in_bytes(mem_limit) when RAM is not constrained; the same
 * value is also written as the kernel memory limit.  The RAM+swap limit
 * is only written when swap is constrained.  The results of the
 * parameter writes are not checked.
 *
 * mem_limit is logged as "alloc=...MB" (presumably megabytes - confirm
 * against the callers).
 *
 * Returns 0 on success, -1 if the cgroup could not be created or
 * instantiated (in the latter case the handle is destroyed).
 */
static int memcg_initialize (xcgroup_ns_t *ns, xcgroup_t *cg,
			     char *path, uint64_t mem_limit, uid_t uid,
			     gid_t gid, uint32_t notify)
{
	uint64_t mlb = mem_limit_in_bytes (mem_limit);
	uint64_t mls = swap_limit_in_bytes  (mem_limit);

	if (xcgroup_create (ns, cg, path, uid, gid) != XCGROUP_SUCCESS)
		return -1;

	cg->notify = notify;

	if (xcgroup_instantiate (cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy (cg);
		return -1;
	}

	xcgroup_set_param (cg, "memory.use_hierarchy", "1");

	/* when RAM space has not to be constrained and we are here, it
	 * means that only Swap space has to be constrained. Thus set
	 * RAM space limit to the mem+swap limit too */
	if ( ! constrain_ram_space )
		mlb = mls;
	xcgroup_set_uint64_param (cg, "memory.limit_in_bytes", mlb);

	/*
	 * Also constrain kernel memory (if available).
	 * See https://lwn.net/Articles/516529/
	 */
	xcgroup_set_uint64_param (cg, "memory.kmem.limit_in_bytes", mlb);

	/*
	 * In the log lines below, the byte counts are converted to MB in
	 * 64 bits and the quotient is cast: casting the byte count itself
	 * would truncate it where unsigned long is 32 bits.
	 */

	/* this limit has to be set only if ConstrainSwapSpace is set to yes */
	if ( constrain_swap_space ) {
		xcgroup_set_uint64_param (cg, "memory.memsw.limit_in_bytes",
					  mls);
		info ("task/cgroup: %s: alloc=%luMB mem.limit=%luMB "
		      "memsw.limit=%luMB", path,
		      (unsigned long) mem_limit,
		      (unsigned long) (mlb / (1024 * 1024)),
		      (unsigned long) (mls / (1024 * 1024)));
	} else {
		info ("task/cgroup: %s: alloc=%luMB mem.limit=%luMB "
		      "memsw.limit=unlimited", path,
		      (unsigned long) mem_limit,
		      (unsigned long) (mlb / (1024 * 1024)));
	}

	return 0;
}
Пример #17
0
/*
 * Report, through error(), whether the step or job memory cgroup ever
 * hit one of its limits.
 *
 * Under the root memcg lock, the "memory.memsw.failcnt" counter and then
 * (only if that one is zero) the "memory.failcnt" counter are checked
 * for the step memcg and for the job memcg.
 *
 * `job` is not used by this function.
 * Always returns SLURM_SUCCESS, including when the root memcg cannot be
 * created or locked.
 */
extern int task_cgroup_memory_check_oom(stepd_step_rec_t *job)
{
	xcgroup_t root_cg;

	if (xcgroup_create(&memory_ns, &root_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: "
		      "unable to create root memcg : %m");
		return SLURM_SUCCESS;
	}

	if (xcgroup_lock(&root_cg) != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: "
		      "task_cgroup_memory_check_oom: unable to lock "
		      "root memcg : %m");
		xcgroup_destroy(&root_cg);
		return SLURM_SUCCESS;
	}

	/*
	 * For some reason the job cgroup limit is hit for a step and vice
	 * versa; the two cannot be told apart, so both are treated alike.
	 *
	 * memory.memsw.failcnt counts how often memory plus swap reached
	 * memory.memsw.limit_in_bytes; memory.failcnt counts how often
	 * memory reached memory.limit_in_bytes.  The second counter is only
	 * read when the first one is zero.
	 */
	if (failcnt_non_zero(&step_memory_cg, "memory.memsw.failcnt") ||
	    failcnt_non_zero(&step_memory_cg, "memory.failcnt")) {
		error("Exceeded step memory limit at some point.");
	}

	if (failcnt_non_zero(&job_memory_cg, "memory.memsw.failcnt") ||
	    failcnt_non_zero(&job_memory_cg, "memory.failcnt")) {
		error("Exceeded job memory limit at some point.");
	}

	xcgroup_unlock(&root_cg);
	xcgroup_destroy(&root_cg);

	return SLURM_SUCCESS;
}
Пример #18
0
/*
 * Report, through error(), whether the step or job memory cgroup ever
 * hit one of its limits.
 *
 * Under the root memcg lock, the "memory.memsw.failcnt" counter and then
 * (only if that one is zero) the "memory.failcnt" counter are checked
 * for the step memcg and for the job memcg; each case logs its own
 * message.
 *
 * `job` is not used by this function.
 * Always returns SLURM_SUCCESS, including when the root memcg cannot be
 * created or locked.
 */
extern int task_cgroup_memory_check_oom(stepd_step_rec_t *job)
{
	xcgroup_t root_cg;

	if (xcgroup_create(&memory_ns, &root_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: "
		      "unable to create root memcg : %m");
		return SLURM_SUCCESS;
	}

	if (xcgroup_lock(&root_cg) != XCGROUP_SUCCESS) {
		error("task/cgroup task_cgroup_memory_check_oom: "
		      "task_cgroup_memory_check_oom: unable to lock "
		      "root memcg : %m");
		xcgroup_destroy(&root_cg);
		return SLURM_SUCCESS;
	}

	/*
	 * For some reason the job cgroup limit is hit for a step and vice
	 * versa; the two cannot be told apart, so both are treated alike.
	 */
	if (failcnt_non_zero(&step_memory_cg, "memory.memsw.failcnt")) {
		error("Exceeded step memory limit at some "
		      "point. oom-killer likely killed a "
		      "process.");
	} else if (failcnt_non_zero(&step_memory_cg, "memory.failcnt")) {
		error("Exceeded step memory limit at some "
		      "point. Step may have been partially "
		      "swapped out to disk.");
	}

	if (failcnt_non_zero(&job_memory_cg, "memory.memsw.failcnt")) {
		error("Exceeded job memory limit at some "
		      "point. oom-killer likely killed a "
		      "process.");
	} else if (failcnt_non_zero(&job_memory_cg, "memory.failcnt")) {
		error("Exceeded job memory limit at some "
		      "point. Job may have been partially "
		      "swapped out to disk.");
	}

	xcgroup_unlock(&root_cg);
	xcgroup_destroy(&root_cg);

	return SLURM_SUCCESS;
}
Пример #19
0
/*
 * Tell whether `pid` belongs to this step's freezer cgroup.
 *
 * The freezer cgroup of `pid` is looked up in freezer_ns and its path is
 * compared with the path of step_freezer_cg.
 *
 * Returns true when the paths are equal, false when they differ or when
 * the cgroup of `pid` cannot be found.
 */
bool
_slurm_cgroup_has_pid(pid_t pid)
{
    xcgroup_t cg;
    bool in_step;

    /*
     * Compare the status code directly: it is an int, and storing it in
     * a bool first would collapse every non-zero code to 1 before the
     * comparison with XCGROUP_SUCCESS.
     */
    if (xcgroup_ns_find_by_pid(&freezer_ns, &cg, pid) != XCGROUP_SUCCESS)
        return false;

    in_step = (strcmp(cg.path, step_freezer_cg.path) == 0);

    xcgroup_destroy(&cg);
    return in_step;
}
Пример #20
0
/*
 * check that a cgroup namespace is ready to be used
 *
 * The namespace is considered available when a handle on its root cgroup
 * ("/") can be created and the "release_agent" parameter can be read
 * from it.
 *
 * returned values (a boolean, NOT an XCGROUP_* status code):
 *  - 0 : not available
 *  - 1 : ready to be used
 */
int xcgroup_ns_is_available(xcgroup_ns_t* cgns)
{
	int fstatus = 0;
	char* value;
	size_t s;
	xcgroup_t cg;

	/* any result other than success means the namespace is unusable */
	if (xcgroup_create(cgns, &cg, "/", 0, 0) != XCGROUP_SUCCESS)
		return 0;

	if (xcgroup_get_param(&cg, "release_agent",
			      &value, &s) != XCGROUP_SUCCESS)
		fstatus = 0;
	else {
		/* only readability matters, the content is discarded */
		xfree(value);
		fstatus = 1;
	}

	xcgroup_destroy(&cg);

	return fstatus;
}
Пример #21
0
/*
 * Initialize the system memory cgroup.
 *
 * Reads the cgroup configuration, creates the memory namespace, copies
 * the memory-related settings into the file-level variables, makes sure
 * SLURMSTEPD_OOM_ADJ is set, creates the slurm cgroup and then the
 * "<slurm cgroup>/system" memory cgroup with hierarchical accounting
 * enabled.
 *
 * Returns SLURM_SUCCESS on success.  On failure SLURM_ERROR is returned
 * after releasing the configuration and whatever was created so far.
 */
extern int init_system_memory_cgroup(void)
{
	int fstatus = SLURM_ERROR;
	char* slurm_cgpath;

	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize memory cgroup namespace */
	if (xcgroup_ns_create(&slurm_cgroup_conf, &memory_ns, "", "memory")
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to create memory namespace");
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	constrain_kmem_space = slurm_cgroup_conf.constrain_kmem_space;
	constrain_ram_space = slurm_cgroup_conf.constrain_ram_space;
	constrain_swap_space = slurm_cgroup_conf.constrain_swap_space;

	/*
	 * as the swap space threshold will be configured with a
	 * mem+swp parameter value, if RAM space is not monitored,
	 * set allowed RAM space to 100% of the job requested memory.
	 * It will help to construct the mem+swp value that will be
	 * used for both mem and mem+swp limit during memcg creation.
	 */
	if ( constrain_ram_space )
		allowed_ram_space = slurm_cgroup_conf.allowed_ram_space;
	else
		allowed_ram_space = 100.0;

	allowed_swap_space = slurm_cgroup_conf.allowed_swap_space;

	/* a zero RealMemory is reported but does not abort initialization */
	if ((totalram = (uint64_t) conf->real_memory_size) == 0)
		error ("system cgroup: Unable to get RealMemory size");

	max_kmem = _percent_in_bytes(totalram, slurm_cgroup_conf.max_kmem_percent);
	max_ram = _percent_in_bytes(totalram, slurm_cgroup_conf.max_ram_percent);
	max_swap = _percent_in_bytes(totalram, slurm_cgroup_conf.max_swap_percent);
	/* max_swap is expressed as a RAM+swap total */
	max_swap += max_ram;
	min_ram_space = slurm_cgroup_conf.min_ram_space * 1024 * 1024;

	debug ("system cgroup: memory: total:%luM allowed:%.4g%%(%s), "
	       "swap:%.4g%%(%s), max:%.4g%%(%luM) "
	       "max+swap:%.4g%%(%luM) min:%luM "
	       "kmem:%.4g%%(%luM %s) min:%luM",
	       (unsigned long) totalram,
	       allowed_ram_space,
	       constrain_ram_space?"enforced":"permissive",

	       allowed_swap_space,
	       constrain_swap_space?"enforced":"permissive",
	       slurm_cgroup_conf.max_ram_percent,
	       (unsigned long) (max_ram/(1024*1024)),

	       slurm_cgroup_conf.max_swap_percent,
	       (unsigned long) (max_swap/(1024*1024)),
	       (unsigned long) slurm_cgroup_conf.min_ram_space,

	       slurm_cgroup_conf.max_kmem_percent,
	       (unsigned long)(max_kmem/(1024*1024)),
	       constrain_kmem_space?"enforced":"permissive",
	       (unsigned long) slurm_cgroup_conf.min_kmem_space);

	/*
	 *  Warning: OOM Killer must be disabled for slurmstepd
	 *  or it would be destroyed if the application uses
	 *  more memory than permitted
	 *
	 *  If an env value is already set for slurmstepd
	 *  OOM killer behavior, keep it, otherwise set the
	 *  -1000 value, which means do not let OOM killer kill it
	 *
	 *  FYI, setting "export SLURMSTEPD_OOM_ADJ=-1000"
	 *  in /etc/sysconfig/slurm would be the same
	 */
	 setenv("SLURMSTEPD_OOM_ADJ", "-1000", 0);

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = _system_cgroup_create_slurm_cg(&memory_ns);
	if ( slurm_cgpath == NULL ) {
		xcgroup_ns_destroy(&memory_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* build system cgroup relative path */
	snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath);
	xfree(slurm_cgpath);

	/* create system cgroup in the memory ns */
	if (xcgroup_create(&memory_ns, &system_memory_cg,
			   system_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&system_memory_cg) != XCGROUP_SUCCESS) {
		goto error;
	}

	if ( xcgroup_set_param(&system_memory_cg, "memory.use_hierarchy", "1")
	     != XCGROUP_SUCCESS ) {
		error("system cgroup: unable to ask for hierarchical accounting "
		      "of system memcg '%s'", system_memory_cg.path);
		goto error;
	}

	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	debug("system cgroup: system memory cgroup initialized");
	return SLURM_SUCCESS;

error:
	/*
	 * system_memory_cg is never locked in this function, so there is
	 * nothing to unlock here; only release what was set up.
	 */
	xcgroup_destroy(&system_memory_cg);
	xcgroup_ns_destroy(&memory_ns);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return fstatus;
}
/*
 * Create the user, job and step devices cgroups for the step described
 * by `job`, apply the device allow/deny rules and attach the calling
 * process (slurmstepd) to the step devices cgroup.
 *
 * The relative cgroup paths are built when not already set.  The root
 * devices cgroup is locked while the hierarchy is set up.  Devices read
 * from the allowed-devices file are always allowed; devices listed in
 * gres.conf are allowed or denied according to the job allocation and,
 * for steps other than the batch script and the extern container,
 * according to the step allocation.
 *
 * Returns SLURM_SUCCESS when slurmstepd was attached to the step devices
 * cgroup, SLURM_ERROR otherwise.
 */
extern int task_cgroup_devices_create(stepd_step_rec_t *job)
{
	int f, k, rc, gres_conf_lines, allow_lines;
	int fstatus = SLURM_ERROR;
	char **gres_name = NULL;
	char **gres_cgroup = NULL, **dev_path = NULL;
	char *allowed_devices[PATH_MAX], *allowed_dev_major[PATH_MAX];
	int *gres_job_bit_alloc = NULL;
	int *gres_step_bit_alloc = NULL;
	int *gres_count = NULL;
	xcgroup_t devices_cg;
	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;

	List job_gres_list = job->job_gres_list;
	List step_gres_list = job->step_gres_list;

	char* slurm_cgpath ;

	/* create slurm root cgroup in this cgroup namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&devices_ns);
	if (slurm_cgpath == NULL)
		return SLURM_ERROR;

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX, "%s/uid_%u",
			     slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative path : %m",
			      uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u devices "
			      "cgroup relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;
		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_%u",
				     job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "devices cgroup relative path : %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create devices root cgroup and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cgroup being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root devices cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&devices_ns, &devices_cg, "", 0, 0) !=
	    XCGROUP_SUCCESS ) {
		error("task/cgroup: unable to create root devices cgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&devices_cg);
		error("task/cgroup: unable to lock root devices cgroup");
		return SLURM_ERROR;
	}

	info("task/cgroup: manage devices jor job '%u'", jobid);

	 /*
	  * collect info concerning the gres.conf file
	  * the GRES devices paths and the GRES names
	  */
	gres_conf_lines = gres_plugin_node_config_devices_path(&dev_path,
							       &gres_name,
							       job->node_name);

	/*
	 * create the entry for cgroup devices subsystem with major minor
	 */
	gres_cgroup = xmalloc(sizeof(char *) * gres_conf_lines);
	_calc_device_major(dev_path, gres_cgroup, gres_conf_lines);

	/*
	 * create the entry with major minor for the default allowed devices
	 * read from the file
	 */
	allow_lines = read_allowed_devices_file(allowed_devices);
	_calc_device_major(allowed_devices, allowed_dev_major, allow_lines);

	/*
	 * calculate the number of gres.conf records for each gres name
	 *
	 * NOTE(review): gres_count[0] is written even when gres_conf_lines
	 * is 0, i.e. into a zero-sized allocation - confirm that this case
	 * cannot happen or guard it.
	 */
	gres_count = xmalloc(sizeof(int) * gres_conf_lines);
	f = 0;
	gres_count[f] = 1;
	for (k = 0; k < gres_conf_lines; k++) {
		if ((k+1 < gres_conf_lines) &&
		    (xstrcmp(gres_name[k], gres_name[k+1]) == 0))
			gres_count[f]++;
		if ((k+1 < gres_conf_lines) &&
		    (xstrcmp(gres_name[k], gres_name[k+1]) != 0)) {
			f++;
			gres_count[f] = 1;
		}
	}

	/*
	 * create user cgroup in the devices ns (it could already exist)
	 */
	if (xcgroup_create(&devices_ns, &user_devices_cg, user_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&user_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		goto error;
	}

	/* TODO
	 * check that user's devices cgroup is consistent and allow the
	 * appropriate devices
	 */


	/*
	 * create job cgroup in the devices ns (it could already exist)
	 */
	if (xcgroup_create(&devices_ns, &job_devices_cg, job_cgroup_path,
			    getuid(), getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		goto error;
	}
	if (xcgroup_instantiate(&job_devices_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		goto error;
	}

	/* fetch information concerning the gres devices allocation for the job */
	gres_job_bit_alloc = xmalloc(sizeof (int) * (gres_conf_lines + 10));
	gres_plugin_job_state_file(job_gres_list, gres_job_bit_alloc,
				   gres_count);

	/*
	 * with the current cgroup devices subsystem design (whitelist only
	 * supported) we need to allow all different devices that are supposed
	 * to be allowed by default.
	 */
	for (k = 0; k < allow_lines; k++) {
		info("Default access allowed to device %s",
		     allowed_dev_major[k]);
		xcgroup_set_param(&job_devices_cg, "devices.allow",
				  allowed_dev_major[k]);
	}

	/*
	 * allow or deny access to devices according to job GRES permissions
	 */
	for (k = 0; k < gres_conf_lines; k++) {
		if (gres_job_bit_alloc[k] == 1) {
			info("Allowing access to device %s", gres_cgroup[k]);
			xcgroup_set_param(&job_devices_cg, "devices.allow",
					  gres_cgroup[k]);
		} else {
			info("Not allowing access to device %s", gres_cgroup[k]);
			xcgroup_set_param(&job_devices_cg, "devices.deny",
					  gres_cgroup[k]);
		}
	}

	/*
	 * create step cgroup in the devices ns (it should not exist)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&devices_ns, &step_devices_cg, jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS ) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		goto error;
	}
	if ( xcgroup_instantiate(&step_devices_cg) != XCGROUP_SUCCESS ) {
		xcgroup_destroy(&user_devices_cg);
		xcgroup_destroy(&job_devices_cg);
		xcgroup_destroy(&step_devices_cg);
		goto error;
	}


	if ((job->stepid != SLURM_BATCH_SCRIPT) &&
	    (job->stepid != SLURM_EXTERN_CONT)) {

		/* fetch information about step GRES devices allocation */
		gres_step_bit_alloc = xmalloc(sizeof (int) * (gres_conf_lines + 10));
		gres_plugin_step_state_file(step_gres_list, gres_step_bit_alloc,
					    gres_count);

		/*
		 * with the current cgroup devices subsystem design (whitelist
		 * only supported) we need to allow all different devices that
		 * are supposed to be allowed by default.
		 */
		for (k = 0; k < allow_lines; k++) {
			info("Default access allowed to device %s",
			     allowed_dev_major[k]);
			xcgroup_set_param(&step_devices_cg, "devices.allow",
					  allowed_dev_major[k]);
		}

		/*
		 * allow or deny access to devices according to GRES permissions
		 * for the step
		 */
		for (k = 0; k < gres_conf_lines; k++) {
			if (gres_step_bit_alloc[k] == 1) {
				info("Allowing access to device %s for step",
				     gres_cgroup[k]);
				xcgroup_set_param(&step_devices_cg,
						 "devices.allow",
						  gres_cgroup[k]);
			} else {
				info("Not allowing access to device %s for step",
				     gres_cgroup[k]);
				xcgroup_set_param(&step_devices_cg,
						 "devices.deny",
						  gres_cgroup[k]);
			}
		}
	}
	/* attach the slurmstepd to the step devices cgroup */
	pid_t pid = getpid();
	rc = xcgroup_add_pids(&step_devices_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to devices cg '%s'",
		      step_devices_cg.path);
		fstatus = SLURM_ERROR;
	} else {
		fstatus = SLURM_SUCCESS;
	}

error:
	/* common exit: reached on success as well as on failure */
	xcgroup_unlock(&devices_cg);
	xcgroup_destroy(&devices_cg);
	xfree(gres_step_bit_alloc);
	xfree(gres_job_bit_alloc);
	/* gres_count is allocated above and owned by this function */
	xfree(gres_count);
	xfree(gres_name);
	xfree(dev_path);
	xfree(gres_cgroup);

	return fstatus;
}
Пример #23
0
/*
 * Attach a process to its per-task memory cgroup, creating the
 * user/job/step/task cgroups in the memory namespace on the way.
 *
 * pid        - process id to add to the task memory cgroup
 * jobacct_id - supplies the job record (uid, gid, jobid, stepid) and the
 *              task id used to build the cgroup relative paths
 *
 * Returns SLURM_SUCCESS when the pid was added to the task cgroup and
 * SLURM_ERROR on any failure. On failure, the xcgroup structures created
 * so far by this call are released with xcgroup_destroy(); the cgroup
 * directories themselves are not deleted.
 */
extern int jobacct_gather_cgroup_memory_attach_task(
	pid_t pid, jobacct_id_t *jobacct_id)
{
	xcgroup_t memory_cg;
	slurmd_job_t *job;
	uid_t uid;
	gid_t gid;
	uint32_t jobid;
	uint32_t stepid;
	uint32_t taskid;
	int fstatus = SLURM_SUCCESS;
	int rc;
	char* slurm_cgpath;

	job = jobacct_id->job;
	uid = job->uid;
	gid = job->gid;
	jobid = job->jobid;
	stepid = job->stepid;
	taskid = jobacct_id->taskid;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = jobacct_cgroup_create_slurm_cg(&memory_ns);
	if (!slurm_cgpath) {
		return SLURM_ERROR;
	}

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	/* the slurm root path is only needed to derive the user path;
	 * release it here so none of the later exits leak it */
	xfree(slurm_cgpath);

	/* build job cgroup relative path if not set (may not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
			     user_cgroup_path, jobid) >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build job %u "
			      "memory cg relative path : %m", jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path if not set (may not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
			     job_cgroup_path, stepid) >= PATH_MAX) {
			error("jobacct_gather/cgroup: unable to build job step "
			      "%u memory cg relative path : %m", stepid);
			return SLURM_ERROR;
		}
	}

	/* build task cgroup relative path */
	if (snprintf(task_cgroup_path, PATH_MAX, "%s/task_%u",
		     jobstep_cgroup_path, taskid) >= PATH_MAX) {
		error("jobacct_gather/cgroup: unable to build task %u "
		      "memory cg relative path : %m", taskid);
		return SLURM_ERROR;
	}

	/*
	 * create memory root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root memory cgroup
	 * to avoid this scenario.
	 */

	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create root memory "
		      "xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&memory_cg);
		error("jobacct_gather/cgroup: unable to lock root memory cg");
		return SLURM_ERROR;
	}

	/*
	 * Create user cgroup in the memory ns (it could already exist)
	 * Ask for hierarchical memory accounting starting from the user
	 * container in order to track the memory consumption up to the
	 * user.
	 */
	if (xcgroup_create(&memory_ns, &user_memory_cg,
			   user_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create user %u memory "
		      "cgroup", uid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&user_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		error("jobacct_gather/cgroup: unable to instanciate user %u "
		      "memory cgroup", uid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Create job cgroup in the memory ns (it could already exist)
	 */
	if (xcgroup_create(&memory_ns, &job_memory_cg,
			   job_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		error("jobacct_gather/cgroup: unable to create job %u memory "
		      "cgroup", jobid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&job_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		error("jobacct_gather/cgroup: unable to instanciate job %u "
		      "memory cgroup", jobid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Create step cgroup in the memory ns (it could already exist)
	 */
	if (xcgroup_create(&memory_ns, &step_memory_cg,
			   jobstep_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as they can exist for other
		 * steps, but release cgroup structures */
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
		      "memory cgroup", jobid, stepid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&step_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		xcgroup_destroy(&step_memory_cg);
		error("jobacct_gather/cgroup: unable to instantiate jobstep "
		      "%u.%u memory cgroup", jobid, stepid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Create task cgroup in the memory ns
	 */
	if (xcgroup_create(&memory_ns, &task_memory_cg,
			   task_cgroup_path,
			   uid, gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job/step cgroups as they can exist for
		 * other steps or tasks, but release every cgroup structure
		 * created so far, the step one included */
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		xcgroup_destroy(&step_memory_cg);
		error("jobacct_gather/cgroup: unable to create jobstep %u.%u "
		      "task %u memory cgroup", jobid, stepid, taskid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	if (xcgroup_instanciate(&task_memory_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_memory_cg);
		xcgroup_destroy(&job_memory_cg);
		xcgroup_destroy(&step_memory_cg);
		/* the task structure was created above: release it too */
		xcgroup_destroy(&task_memory_cg);
		error("jobacct_gather/cgroup: unable to instantiate jobstep "
		      "%u.%u task %u memory cgroup", jobid, stepid, taskid);
		fstatus = SLURM_ERROR;
		goto error;
	}

	/*
	 * Attach the given pid to the task memory cgroup
	 */
	rc = xcgroup_add_pids(&task_memory_cg, &pid, 1);
	if (rc != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to add slurmstepd to "
		      "memory cg '%s'", task_memory_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

error:
	/* always drop the root lock and the local root cgroup structure */
	xcgroup_unlock(&memory_cg);
	xcgroup_destroy(&memory_cg);
	return fstatus;
}
/*
 * Tear down the cpuacct cgroup hierarchy set up for this step.
 *
 * Moves the calling process back to the root cpuacct cgroup, then removes
 * the task, step, job and user cgroups (leaves first), releases the
 * associated xcgroup structures, clears the cached relative paths and
 * destroys the cpuacct namespace.
 *
 * slurm_cgroup_conf - not referenced by this function
 *
 * Always returns SLURM_SUCCESS; deletion failures are only logged.
 */
extern int
jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuacct_cg;
	bool root_ok = false;	/* cpuacct_cg was successfully created */
	bool lock_ok = false;	/* root cgroup lock is held */
	int cc;

	/* nothing to do if the hierarchy was never (fully) set up */
	if (user_cgroup_path[0] == '\0'
	    || job_cgroup_path[0] == '\0'
	    || jobstep_cgroup_path[0] == '\0'
	    || task_cgroup_path[0] == 0)
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root cpuacct cg.
	 * The release_agent will asynchronously be called for the step
	 * cgroup. It will do the necessary cleanup.
	 */
	if (xcgroup_create(&cpuacct_ns,
			   &cpuacct_cg, "", 0, 0) == XCGROUP_SUCCESS) {
		root_ok = true;
		xcgroup_set_uint32_param(&cpuacct_cg, "tasks", getpid());
	} else {
		error("%s: unable to create root cpuacct xcgroup", __func__);
	}

	/* Lock the root of the cgroup and remove the subdirectories
	 * related to this job. The lock is only attempted when the root
	 * structure was created: otherwise cpuacct_cg is uninitialized.
	 */
	if (root_ok) {
		if (xcgroup_lock(&cpuacct_cg) == XCGROUP_SUCCESS) {
			lock_ok = true;
		} else {
			error("%s: failed to flock() %s %m", __func__,
			      cpuacct_cg.path);
		}
	}

	/* Clean up starting from the leaves way up, the
	 * reverse order in which the cgroups were created.
	 */
	for (cc = 0; cc <= max_task_id; cc++) {
		xcgroup_t cgroup;
		char buf[PATH_MAX];
		int len;

		/* rmdir all tasks this running slurmstepd
		 * was responsible for.
		 */
		len = snprintf(buf, sizeof(buf), "%s%s/task_%d",
			       cpuacct_ns.mnt_point, jobstep_cgroup_path, cc);
		if (len < 0 || (size_t)len >= sizeof(buf)) {
			/* never act on a truncated path: it could name
			 * a different directory */
			debug2("%s: task %d cgroup path too long, skipping",
			       __func__, cc);
			continue;
		}
		/* only .path is filled in; presumably xcgroup_delete()
		 * needs nothing else -- TODO confirm */
		cgroup.path = buf;

		if (strstr(buf, "step_extern"))
			kill_extern_procs(cgroup.path);

		if (xcgroup_delete(&cgroup) != XCGROUP_SUCCESS) {
			debug2("%s: failed to delete %s %m", __func__, buf);
		}
	}

	if (xcgroup_delete(&step_cpuacct_cg) != XCGROUP_SUCCESS) {
		/* report the step path, not the root one */
		debug2("%s: failed to delete %s %m", __func__,
		       step_cpuacct_cg.path);
	}

	if (xcgroup_delete(&job_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       job_cpuacct_cg.path);
	}

	if (xcgroup_delete(&user_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       user_cpuacct_cg.path);
	}

	if (lock_ok)
		xcgroup_unlock(&cpuacct_cg);

	xcgroup_destroy(&task_cpuacct_cg);
	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);
	if (root_ok)
		xcgroup_destroy(&cpuacct_cg);

	/* mark the hierarchy as gone so a second call is a no-op */
	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';
	task_cgroup_path[0] = 0;

	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
Пример #25
0
/*
 * Build the freezer cgroup hierarchy (slurm root, user, job, step) for a
 * job step and instantiate it.
 *
 * job - step record; jobid and stepid are used to build the relative paths
 * id  - not referenced by this function
 * uid - user id used to build the "uid_<uid>" path component
 * gid - not referenced by this function
 *
 * The global freezer_cg is locked while the hierarchy is being created.
 * Returns SLURM_SUCCESS on success and SLURM_ERROR otherwise.
 */
int _slurm_cgroup_create(stepd_step_rec_t *job, uint64_t id, uid_t uid, gid_t gid)
{
    /* we do it here as we do not have access to the conf structure */
    /* in libslurm (src/common/xcgroup.c) */
    char *pre = (char *)xstrdup(slurm_cgroup_conf.cgroup_prepend);
#ifdef MULTIPLE_SLURMD
    if ( conf->node_name != NULL )
        xstrsubstitute(pre,"%n", conf->node_name);
    else {
        xfree(pre);
        pre = (char*) xstrdup("/slurm");
    }
#endif

    if (xcgroup_create(&freezer_ns, &slurm_freezer_cg, pre,
                       getuid(), getgid()) != XCGROUP_SUCCESS) {
        /* nothing else was acquired yet: only the prefix to release */
        xfree(pre);
        return SLURM_ERROR;
    }

    /*
     * While creating the cgroup hierarchy of the step, lock the root
     * cgroup directory. The same lock is held during removal of the
     * hierarchies of other jobs/steps. This helps to avoid the race
     * condition with concurrent creation/removal of the intermediate
     * shared directories that could result in the failure of the
     * hierarchy setup
     */
    xcgroup_lock(&freezer_cg);

    /* create slurm cgroup in the freezer ns (it could already exist) */
    if (xcgroup_instanciate(&slurm_freezer_cg) != XCGROUP_SUCCESS) {
        xfree(pre);
        goto bail;
    }

    /* build user cgroup relative path if not set (should not be) */
    if (*user_cgroup_path == '\0') {
        if (snprintf(user_cgroup_path, PATH_MAX,
                     "%s/uid_%u", pre, uid) >= PATH_MAX) {
            error("unable to build uid %u cgroup relative "
                  "path : %m", uid);
            xfree(pre);
            goto bail;
        }
    }
    /* the prefix is not needed past this point */
    xfree(pre);

    /* build job cgroup relative path if no set (should not be) */
    if (*job_cgroup_path == '\0') {
        if (snprintf(job_cgroup_path, PATH_MAX, "%s/job_%u",
                     user_cgroup_path, job->jobid) >= PATH_MAX) {
            error("unable to build job %u cgroup relative "
                  "path : %m", job->jobid);
            goto bail;
        }
    }

    /* build job step cgroup relative path (should not be) */
    if (*jobstep_cgroup_path == '\0') {
        if (job->stepid == NO_VAL) {
            if (snprintf(jobstep_cgroup_path, PATH_MAX,
                         "%s/step_batch", job_cgroup_path)
                    >= PATH_MAX) {
                error("proctrack/cgroup unable to build job step"
                      " %u.batch freezer cg relative path: %m",
                      job->jobid);
                goto bail;
            }
        } else {
            if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
                         job_cgroup_path, job->stepid) >= PATH_MAX) {
                error("proctrack/cgroup unable to build job step"
                      " %u.%u freezer cg relative path: %m",
                      job->jobid, job->stepid);
                goto bail;
            }
        }
    }

    /*
     * From here on, slurm_freezer_cg is released exactly once, by the
     * bail label; the failure branches only release what they created.
     */

    /* create user cgroup in the freezer ns (it could already exist) */
    if (xcgroup_create(&freezer_ns, &user_freezer_cg,
                       user_cgroup_path,
                       getuid(), getgid()) != XCGROUP_SUCCESS) {
        goto bail;
    }

    /* create job cgroup in the freezer ns (it could already exist) */
    if (xcgroup_create(&freezer_ns, &job_freezer_cg,
                       job_cgroup_path,
                       getuid(), getgid()) != XCGROUP_SUCCESS) {
        xcgroup_destroy(&user_freezer_cg);
        goto bail;
    }

    /* create step cgroup in the freezer ns (it should not exists) */
    if (xcgroup_create(&freezer_ns, &step_freezer_cg,
                       jobstep_cgroup_path,
                       getuid(), getgid()) != XCGROUP_SUCCESS) {
        xcgroup_destroy(&user_freezer_cg);
        xcgroup_destroy(&job_freezer_cg);
        goto bail;
    }

    if ((xcgroup_instanciate(&user_freezer_cg) != XCGROUP_SUCCESS) ||
            (xcgroup_instanciate(&job_freezer_cg)  != XCGROUP_SUCCESS) ||
            (xcgroup_instanciate(&step_freezer_cg) != XCGROUP_SUCCESS)) {
        xcgroup_destroy(&user_freezer_cg);
        xcgroup_destroy(&job_freezer_cg);
        xcgroup_destroy(&step_freezer_cg);
        goto bail;
    }

    /* inhibit release agent for the step cgroup thus letting
     * slurmstepd being able to add new pids to the container
     * when the job ends (TaskEpilog,...) */
    xcgroup_set_param(&step_freezer_cg,"notify_on_release","0");
    slurm_freezer_init = true;

    xcgroup_unlock(&freezer_cg);
    return SLURM_SUCCESS;

bail:
    /* common failure exit: release the slurm root structure, drop the
     * root lock and release the root cgroup structure */
    xcgroup_destroy(&slurm_freezer_cg);
    xcgroup_unlock(&freezer_cg);
    xcgroup_destroy(&freezer_cg);
    return SLURM_ERROR;
}
Пример #26
0
/* when cgroups are configured with cpuset, at least
 * cpuset.cpus and cpuset.mems must be set or the cgroup
 * will not be available at all.
 * we duplicate the ancestor configuration in the init step
 *
 * cg - cpuset cgroup to initialize; its ancestor is found by stripping
 *      the last '/'-separated component of cg->name
 *
 * Returns XCGROUP_SUCCESS when both parameters were copied from the
 * ancestor, XCGROUP_ERROR otherwise. */
static int _xcgroup_cpuset_init(xcgroup_t* cg)
{
	int fstatus,i;

	char* cpuset_metafiles[] = {
		"cpuset.cpus",
		"cpuset.mems"
	};
	char* cpuset_meta;
	char* cpuset_conf;
	size_t csize;

	xcgroup_t acg;
	char* acg_name;
	char* p;

	fstatus = XCGROUP_ERROR;

	/* load ancestor cg */
	acg_name = (char*) xstrdup(cg->name);
	p = rindex(acg_name,'/');
	if (p == NULL) {
		debug2("task/cgroup: unable to get ancestor path for "
		       "cpuset cg '%s' : %m",cg->path);
		xfree(acg_name);
		return fstatus;
	} else
		*p = '\0';
	if (xcgroup_load(cg->ns,&acg,acg_name) != XCGROUP_SUCCESS) {
		debug2("task/cgroup: unable to load ancestor for "
		       "cpuset cg '%s' : %m",cg->path);
		xfree(acg_name);
		return fstatus;
	}
	/* the ancestor name is only needed to load acg */
	xfree(acg_name);

	/* inherits ancestor params */
	for (i = 0 ; i < 2 ; i++) {
		cpuset_meta = cpuset_metafiles[i];
		if (xcgroup_get_param(&acg,cpuset_meta,
				       &cpuset_conf,&csize)
		     != XCGROUP_SUCCESS) {
			debug2("task/cgroup: assuming no cpuset cg "
			       "support for '%s'",acg.path);
			xcgroup_destroy(&acg);
			return fstatus;
		}
		/* overwrite the last byte read (presumably a trailing
		 * newline) with a terminator */
		if (csize > 0)
			cpuset_conf[csize-1]='\0';
		if (xcgroup_set_param(cg,cpuset_meta,cpuset_conf)
		     != XCGROUP_SUCCESS) {
			debug2("task/cgroup: unable to write %s configuration "
			       "(%s) for cpuset cg '%s'",cpuset_meta,
			       cpuset_conf,cg->path);
			xcgroup_destroy(&acg);
			xfree(cpuset_conf);
			return fstatus;
		}
		xfree(cpuset_conf);
	}

	xcgroup_destroy(&acg);
	return XCGROUP_SUCCESS;
}
Пример #27
0
/*
 * Create the user/job/step cpuset cgroups for a job step, restrict them
 * to the allocated cores and attach the calling process (slurmstepd) to
 * the step cgroup.
 *
 * job - step record; supplies jobid, stepid, uid, gid and the abstract
 *       job/step core lists
 *
 * Returns SLURM_SUCCESS when the slurmstepd was added to the step cpuset
 * cgroup, SLURM_ERROR otherwise.
 */
extern int task_cgroup_cpuset_create(slurmd_job_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath ;
	xcgroup_t slurm_cg;

	pid_t pid;

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
	rc = xcgroup_get_param(&slurm_cg,"cpuset.cpus",&cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) !=
		    XCGROUP_SUCCESS) {
			xfree(cpus);
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);
	/* slurm_cg was only needed for the check above */
	xcgroup_destroy(&slurm_cg);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("unable to build uid %u cgroup relative "
			      "path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			      user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		if (stepid == NO_VAL) {
			if (snprintf(jobstep_cgroup_path, PATH_MAX,
				     "%s/step_batch", job_cgroup_path)
			    >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.batch cpuset cg relative path: %m",
				      jobid);
				return SLURM_ERROR;
			}
		} else {
			if (snprintf(jobstep_cgroup_path, PATH_MAX, "%s/step_%u",
				     job_cgroup_path, stepid) >= PATH_MAX) {
				error("task/cgroup: unable to build job step"
				      " %u.%u cpuset cg relative path: %m",
				      jobid, stepid);
				return SLURM_ERROR;
			}
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				 &job_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				 &step_alloc_cores) != XCPUINFO_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	/* log the converted lists, not the abstract inputs */
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
			    user_cgroup_path,
			    getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistent and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg,"cpuset.cpus",&cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) !=
		     XCGROUP_SUCCESS) {
			xfree(cpus);
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	/* user cores = this job's cores plus whatever was already set */
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1]='\0';
		xstrcat(user_alloc_cores,",");
		xstrcat(user_alloc_cores,cpus);
	}
	xcgroup_set_param(&user_cpuset_cg,"cpuset.cpus",user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
			    job_cgroup_path,
			    getuid(),getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg,"cpuset.cpus",job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
			    jobstep_cgroup_path,
			    uid,gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg,"cpuset.cpus",step_alloc_cores);

	/* attach the slurmstepd to the step cpuset cgroup */
	pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

error:
	/* common exit: drop the root lock and free the local core lists */
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
Пример #28
0
/*
 * Create the user/job/step cpuset cgroups for a job step, restrict them
 * to the allocated cores and attach the calling process (slurmstepd) to
 * the step cgroup.
 *
 * The parameter file name is "<cpuset_prefix>cpus"; when the first read
 * fails and no prefix has been chosen yet, the prefix is switched to
 * "cpuset." and the read is retried once.
 *
 * job - step record; supplies jobid, stepid, uid, gid and the abstract
 *       job/step core lists
 *
 * Returns SLURM_SUCCESS when the slurmstepd was added to the step cpuset
 * cgroup, SLURM_ERROR otherwise.
 */
extern int task_cgroup_cpuset_create(stepd_step_rec_t *job)
{
	int rc;
	int fstatus = SLURM_ERROR;

	xcgroup_t cpuset_cg;

	uint32_t jobid = job->jobid;
	uint32_t stepid = job->stepid;
	uid_t uid = job->uid;
	gid_t gid = job->gid;
	char* user_alloc_cores = NULL;
	char* job_alloc_cores = NULL;
	char* step_alloc_cores = NULL;
	char cpuset_meta[PATH_MAX];

	char* cpus = NULL;
	size_t cpus_size;

	char* slurm_cgpath;
	xcgroup_t slurm_cg;

	pid_t pid;

#ifdef HAVE_NATIVE_CRAY
	char expected_usage[32];
#endif

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = task_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns,&slurm_cg,slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		return SLURM_ERROR;
	}
again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* first failure with no prefix chosen yet: retry once
		 * with the "cpuset." prefix */
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) !=
		    XCGROUP_SUCCESS) {
			xfree(cpus);
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			return SLURM_ERROR;
		}
	}
	xfree(cpus);
	/* slurm_cg was only needed for the check above */
	xcgroup_destroy(&slurm_cg);

	/* build user cgroup relative path if not set (should not be) */
	if (*user_cgroup_path == '\0') {
		if (snprintf(user_cgroup_path, PATH_MAX,
			     "%s/uid_%u", slurm_cgpath, uid) >= PATH_MAX) {
			error("task/cgroup: unable to build uid %u cgroup "
			      "relative path : %m", uid);
			xfree(slurm_cgpath);
			return SLURM_ERROR;
		}
	}
	xfree(slurm_cgpath);

	/* build job cgroup relative path if no set (should not be) */
	if (*job_cgroup_path == '\0') {
		if (snprintf(job_cgroup_path,PATH_MAX,"%s/job_%u",
			     user_cgroup_path,jobid) >= PATH_MAX) {
			error("task/cgroup: unable to build job %u cpuset "
			      "cg relative path : %m",jobid);
			return SLURM_ERROR;
		}
	}

	/* build job step cgroup relative path (should not be) */
	if (*jobstep_cgroup_path == '\0') {
		int cc;
		if (stepid == SLURM_BATCH_SCRIPT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_batch", job_cgroup_path);
		} else if (stepid == SLURM_EXTERN_CONT) {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_extern", job_cgroup_path);
		} else {
			cc = snprintf(jobstep_cgroup_path, PATH_MAX,
				      "%s/step_%u", job_cgroup_path, stepid);
		}
		if (cc >= PATH_MAX) {
			error("task/cgroup: unable to build job step %u.%u "
			      "cpuset cg relative path: %m",
			      jobid, stepid);
			return SLURM_ERROR;
		}
	}

	/*
	 * create cpuset root cg and lock it
	 *
	 * we will keep the lock until the end to avoid the effect of a release
	 * agent that would remove an existing cgroup hierarchy while we are
	 * setting it up. As soon as the step cgroup is created, we can release
	 * the lock.
	 * Indeed, consecutive slurm steps could result in cg being removed
	 * between the next EEXIST instantiation and the first addition of
	 * a task. The release_agent will have to lock the root cpuset cgroup
	 * to avoid this scenario.
	 */
	if (xcgroup_create(&cpuset_ns,&cpuset_cg,"",0,0) != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create root cpuset xcgroup");
		return SLURM_ERROR;
	}
	if (xcgroup_lock(&cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cpuset_cg);
		error("task/cgroup: unable to lock root cpuset cg");
		return SLURM_ERROR;
	}

	/*
	 * build job and job steps allocated cores lists
	 */
	debug("task/cgroup: job abstract cores are '%s'",
	      job->job_alloc_cores);
	debug("task/cgroup: step abstract cores are '%s'",
	      job->step_alloc_cores);
	if (xcpuinfo_abs_to_mac(job->job_alloc_cores,
				&job_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build job physical cores");
		goto error;
	}
	if (xcpuinfo_abs_to_mac(job->step_alloc_cores,
				&step_alloc_cores) != SLURM_SUCCESS) {
		error("task/cgroup: unable to build step physical cores");
		goto error;
	}
	debug("task/cgroup: job physical cores are '%s'",
	      job_alloc_cores);
	debug("task/cgroup: step physical cores are '%s'",
	      step_alloc_cores);

	/*
	 * create user cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&user_cpuset_cg,
			   user_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instanciate(&user_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}

	/*
	 * check that user's cpuset cgroup is consistent and add the job cores
	 */
	rc = xcgroup_get_param(&user_cpuset_cg, cpuset_meta, &cpus,&cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&user_cpuset_cg) !=
		    XCGROUP_SUCCESS) {
			xfree(cpus);
			xcgroup_delete(&user_cpuset_cg);
			xcgroup_destroy(&user_cpuset_cg);
			goto error;
		}
	}
	/* user cores = this job's cores plus whatever was already set */
	user_alloc_cores = xstrdup(job_alloc_cores);
	if (cpus != NULL && cpus_size > 1) {
		cpus[cpus_size-1]='\0';
		xstrcat(user_alloc_cores,",");
		xstrcat(user_alloc_cores,cpus);
	}
	xcgroup_set_param(&user_cpuset_cg, cpuset_meta, user_alloc_cores);
	xfree(cpus);

	/*
	 * create job cgroup in the cpuset ns (it could already exist)
	 */
	if (xcgroup_create(&cpuset_ns,&job_cpuset_cg,
			   job_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&job_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&job_cpuset_cg, cpuset_meta, job_alloc_cores);

	/*
	 * create step cgroup in the cpuset ns (it should not exists)
	 * use job's user uid/gid to enable tasks cgroups creation by
	 * the user inside the step cgroup owned by root
	 */
	if (xcgroup_create(&cpuset_ns,&step_cpuset_cg,
			   jobstep_cgroup_path,
			   uid,gid) != XCGROUP_SUCCESS) {
		/* do not delete user/job cgroup as */
		/* they can exist for other steps */
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		goto error;
	}
	if (xcgroup_instanciate(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	if (_xcgroup_cpuset_init(&step_cpuset_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&user_cpuset_cg);
		xcgroup_destroy(&job_cpuset_cg);
		xcgroup_delete(&step_cpuset_cg);
		xcgroup_destroy(&step_cpuset_cg);
		goto error;
	}
	xcgroup_set_param(&step_cpuset_cg, cpuset_meta, step_alloc_cores);

	/*
	 * on Cray systems, set the expected usage in bytes.
	 * This is used by the Cray OOM killer
	 */
#ifdef HAVE_NATIVE_CRAY
	snprintf(expected_usage, sizeof(expected_usage), "%"PRIu64,
		 (uint64_t)job->step_mem * 1024 * 1024);
	xcgroup_set_param(&step_cpuset_cg, "expected_usage_in_bytes",
			  expected_usage);
#endif

	/* attach the slurmstepd to the step cpuset cgroup */
	pid = getpid();
	rc = xcgroup_add_pids(&step_cpuset_cg,&pid,1);
	if (rc != XCGROUP_SUCCESS) {
		error("task/cgroup: unable to add slurmstepd to cpuset cg '%s'",
		      step_cpuset_cg.path);
		fstatus = SLURM_ERROR;
	} else
		fstatus = SLURM_SUCCESS;

	/* validate the requested cpu frequency and set it */
	cpu_freq_cgroup_validate(job, step_alloc_cores);

error:
	/* common exit: drop the root lock and free the local core lists */
	xcgroup_unlock(&cpuset_cg);
	xcgroup_destroy(&cpuset_cg);

	xfree(user_alloc_cores);
	xfree(job_alloc_cores);
	xfree(step_alloc_cores);

	return fstatus;
}
Пример #29
0
/*
 * Initialize the "system" cpuset cgroup.
 *
 * Reads the cgroup configuration, creates the cpuset namespace, makes
 * sure the slurm root cpuset cgroup has its cpus parameter set, then
 * creates, instantiates and initializes "<slurm root>/system".
 *
 * The parameter file name is "<cpuset_prefix>cpus"; when the first read
 * fails and no prefix has been chosen yet, the prefix is switched to
 * "cpuset." and the read is retried once.
 *
 * Returns SLURM_SUCCESS on success. On failure returns SLURM_ERROR after
 * releasing the namespace and the configuration.
 */
extern int init_system_cpuset_cgroup(void)
{
	int rc;
	int fstatus = SLURM_ERROR;
	char* cpus = NULL;
	size_t cpus_size;
	char* slurm_cgpath;
	xcgroup_t slurm_cg;

	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize cpuset cgroup namespace */
	if (xcgroup_ns_create(&slurm_cgroup_conf, &cpuset_ns, "", "cpuset")
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to create cpuset namespace");
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = _system_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		/* first failure with no prefix chosen yet: retry once
		 * with the "cpuset." prefix */
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpusets as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			xcgroup_ns_destroy(&cpuset_ns);
			free_slurm_cgroup_conf(&slurm_cgroup_conf);
			xfree(cpus);
			return SLURM_ERROR;
		}
	}
	xcgroup_destroy(&slurm_cg);
	xfree(cpus);

	/* build system cgroup relative path; refuse a truncated path
	 * rather than create a cgroup under the wrong name */
	if (snprintf(system_cgroup_path, PATH_MAX, "%s/system",
		     slurm_cgpath) >= PATH_MAX) {
		error("system cgroup: unable to build system cgroup "
		      "relative path");
		xfree(slurm_cgpath);
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}
	xfree(slurm_cgpath);

	/* create system cgroup in the cpuset ns */
	if (xcgroup_create(&cpuset_ns, &system_cpuset_cg, system_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (_xcgroup_cpuset_init(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}

	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	debug("system cgroup: system cpuset cgroup initialized");
	return SLURM_SUCCESS;

error:
	/* no lock is taken on system_cpuset_cg in this function, so
	 * there is nothing to unlock here; only release resources */
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_ns_destroy(&cpuset_ns);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return fstatus;
}
Пример #30
0
/*
 * Seed a cpuset cgroup with its ancestor's settings.
 *
 * A cpuset cgroup cannot be used until both its cpus and mems
 * parameters are set, so each of the two is read from the ancestor
 * cgroup and written to cg. The parameter file name is
 * "<cpuset_prefix><cpus|mems>"; when a read fails and no prefix has been
 * chosen yet, the prefix is switched to "cpuset." and the read retried.
 *
 * cg - cpuset cgroup to initialize; the ancestor is found by stripping
 *      the last '/'-separated component of cg->name
 *
 * Returns XCGROUP_SUCCESS when both parameters were copied,
 * XCGROUP_ERROR otherwise.
 */
static int _xcgroup_cpuset_init(xcgroup_t* cg)
{
	char *suffixes[] = { "cpus", "mems" };
	char *value = NULL;
	size_t value_len = 0;
	xcgroup_t parent;
	char *parent_name = NULL;
	char *last_slash;
	int idx;

	/* derive the ancestor name by cutting cg->name at its last '/' */
	parent_name = (char*) xstrdup(cg->name);
	last_slash = xstrrchr(parent_name, '/');
	if (last_slash == NULL) {
		debug2("system cgroup: unable to get ancestor path for "
		       "cpuset cg '%s' : %m", cg->path);
		xfree(parent_name);
		return XCGROUP_ERROR;
	}
	*last_slash = '\0';

	if (xcgroup_load(cg->ns, &parent, parent_name) != XCGROUP_SUCCESS) {
		debug2("system cgroup: unable to load ancestor for "
		       "cpuset cg '%s' : %m", cg->path);
		xfree(parent_name);
		return XCGROUP_ERROR;
	}
	xfree(parent_name);

	/* copy each parameter from the ancestor to cg */
	for (idx = 0; idx < 2; idx++) {
		/* read the ancestor value, retrying after a prefix switch */
		for (;;) {
			snprintf(cpuset_meta, sizeof(cpuset_meta), "%s%s",
				 cpuset_prefix, suffixes[idx]);
			if (xcgroup_get_param(&parent, cpuset_meta,
					      &value, &value_len)
			    == XCGROUP_SUCCESS)
				break;

			if (cpuset_prefix_set) {
				/* prefix already settled: give up */
				debug("system cgroup: assuming no cpuset cg "
				       "support for '%s'",parent.path);
				xcgroup_destroy(&parent);
				return XCGROUP_ERROR;
			}
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
		}

		/* overwrite the last byte read with a terminator */
		if (value_len > 0)
			value[value_len - 1] = '\0';

		if (xcgroup_set_param(cg, cpuset_meta, value)
		    != XCGROUP_SUCCESS) {
			debug("system cgroup: unable to write %s configuration "
			       "(%s) for cpuset cg '%s'",cpuset_meta,
			       value, cg->path);
			xcgroup_destroy(&parent);
			xfree(value);
			return XCGROUP_ERROR;
		}
		xfree(value);
	}

	xcgroup_destroy(&parent);
	return XCGROUP_SUCCESS;
}