Example no. 1
0
extern void fini_system_cgroup(void)
{
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_destroy(&system_memory_cg);
	xcgroup_ns_destroy(&cpuset_ns);
	xcgroup_ns_destroy(&memory_ns);
}
Example no. 2
0
int _slurm_cgroup_destroy(void)
{
    xcgroup_lock(&freezer_cg);

    if (jobstep_cgroup_path[0] != '\0') {
        if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) {
            error("_slurm_cgroup_destroy: problem deleting step "
                  "cgroup path %s: %m", step_freezer_cg.path);
            xcgroup_unlock(&freezer_cg);
            return SLURM_ERROR;
        }
        xcgroup_destroy(&step_freezer_cg);
    }

    if (job_cgroup_path[0] != '\0') {
        xcgroup_delete(&job_freezer_cg);
        xcgroup_destroy(&job_freezer_cg);
    }

    if (user_cgroup_path[0] != '\0') {
        xcgroup_delete(&user_freezer_cg);
        xcgroup_destroy(&user_freezer_cg);
    }

    if (slurm_freezer_init) {
        xcgroup_destroy(&slurm_freezer_cg);
    }

    xcgroup_unlock(&freezer_cg);
    xcgroup_destroy(&freezer_cg);
    xcgroup_ns_destroy(&freezer_ns);

    return SLURM_SUCCESS;
}
Example no. 3
0
/*
 * create a cgroup namespace for tasks containment
 *
 * returned values:
 *  - XCGROUP_ERROR
 *  - XCGROUP_SUCCESS
 */
int xcgroup_ns_create(slurm_cgroup_conf_t *conf,
		      xcgroup_ns_t *cgns, char *mnt_args, char *subsys) {

	cgns->mnt_point = xstrdup_printf("%s/%s",
					 conf->cgroup_mountpoint, subsys);
	cgns->mnt_args = xstrdup(mnt_args);
	cgns->subsystems = xstrdup(subsys);
	cgns->notify_prog = xstrdup_printf("%s/release_%s",
					   conf->cgroup_release_agent, subsys);

	/* check that the requested cgroup namespace is available */
	if (!xcgroup_ns_is_available(cgns)) {
		if (conf->cgroup_automount) {
			if (xcgroup_ns_mount(cgns)) {
				error("unable to mount %s cgroup "
				      "namespace: %s",
				      subsys, slurm_strerror(errno));
				goto clean;
			}
			info("cgroup namespace '%s' is now mounted", subsys);
		} else {
			error("cgroup namespace '%s' not mounted. aborting",
			      subsys);
			goto clean;
		}
	}

	return XCGROUP_SUCCESS;
clean:
	xcgroup_ns_destroy(cgns);
	return XCGROUP_ERROR;
}
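Nearly every teardown example below pairs a namespace created this way with a call to xcgroup_ns_destroy(). As a point of reference, here is a minimal, hypothetical sketch of that counterpart, assuming it only has to release the strings duplicated above; the real implementation may do more.

/* Hypothetical sketch, not the real xcgroup_ns_destroy(): release the
 * fields that xcgroup_ns_create() above populated with xstrdup() and
 * xstrdup_printf(). */
void xcgroup_ns_destroy_sketch(xcgroup_ns_t *cgns)
{
	xfree(cgns->mnt_point);
	xfree(cgns->mnt_args);
	xfree(cgns->subsystems);
	xfree(cgns->notify_prog);
}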
Example no. 4
0
int _slurm_cgroup_destroy(void)
{
	if (slurm_freezer_init)
		xcgroup_lock(&slurm_freezer_cg);

	if (jobstep_cgroup_path[0] != '\0') {
		if ( xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS ) {
			if (slurm_freezer_init)
				xcgroup_unlock(&slurm_freezer_cg);
			return SLURM_ERROR;
		}
		xcgroup_destroy(&step_freezer_cg);
	}

	if (job_cgroup_path[0] != '\0') {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	if (user_cgroup_path[0] != '\0') {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	if (slurm_freezer_init) {
		xcgroup_unlock(&slurm_freezer_cg);
		xcgroup_destroy(&slurm_freezer_cg);
	}

	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;
}
Example no. 5
0
int _slurm_cgroup_init(void)
{
    /* initialize user/job/jobstep cgroup relative paths */
    user_cgroup_path[0]='\0';
    job_cgroup_path[0]='\0';
    jobstep_cgroup_path[0]='\0';

    /* initialize freezer cgroup namespace */
    if (xcgroup_ns_create(&slurm_cgroup_conf, &freezer_ns, "", "freezer")
            != XCGROUP_SUCCESS) {
        error("unable to create freezer cgroup namespace");
        return SLURM_ERROR;
    }

    /* initialize the root freezer cg */
    if (xcgroup_create(&freezer_ns, &freezer_cg, "", 0, 0)
            != XCGROUP_SUCCESS) {
        error("proctrack/cgroup unable to create root freezer xcgroup");
        xcgroup_ns_destroy(&freezer_ns);
        return SLURM_ERROR;
    }

    return SLURM_SUCCESS;
}
Example no. 6
0
extern int jobacct_gather_cgroup_cpuacct_fini(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuacct_cg;

	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root cpuacct cg.
	 * The release_agent will asynchronously be called for the step
	 * cgroup. It will do the necessary cleanup.
	 */
	if (xcgroup_create(&cpuacct_ns, &cpuacct_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&cpuacct_cg, "tasks", getpid());
		xcgroup_destroy(&cpuacct_cg);
	}

	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';
	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
Example no. 7
0
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	if (user_cgroup_path[0] == '\0' ||
	     job_cgroup_path[0] == '\0' ||
	     jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;
	/*
	 * Lock the root memcg and try to remove the different memcgs.
	 * We lock here because a concurrent step being started could try
	 * to create the step memcg just after we remove the job memcg,
	 * resulting in a failure.
	 * First, delete the step memcg, as all its tasks have now exited.
	 * Then, try to remove the job memcg. If that fails, it is because
	 * it is still in use by another running step.
	 * After that, try to remove the user memcg. If that fails, it is
	 * because other jobs are still running for the same user on the
	 * node, or because tasks were attached directly to the user cg by
	 * another component (e.g. PAM). The user memcg was created with
	 * the notify_on_release=1 flag (default), so it will be removed
	 * automatically afterwards.
	 * For now, do not try to detect whether only externally attached
	 * tasks remain so that they could be moved to an orphan memcg.
	 * That could be done in the future, if necessary.
	 */
	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&memory_cg) == XCGROUP_SUCCESS) {
			if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "memcg : %m");
			if (xcgroup_delete(&job_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job memcg : %m");
			if (xcgroup_delete(&user_memory_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user memcg : %m");
			xcgroup_unlock(&memory_cg);
		} else
			error("task/cgroup: unable to lock root memcg : %m");
		xcgroup_destroy(&memory_cg);
	} else
		error("task/cgroup: unable to create root memcg : %m");

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
Example no. 8
0
extern int task_cgroup_cpuset_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	char release_agent_path[PATH_MAX];

	/* initialize cpuinfo internal data */
	if (xcpuinfo_init() != XCPUINFO_SUCCESS) {
		return SLURM_ERROR;
	}

	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	/* initialize cpuset cgroup namespace */
	release_agent_path[0]='\0';
	if (snprintf(release_agent_path,PATH_MAX,"%s/release_cpuset",
		      slurm_cgroup_conf->cgroup_release_agent) >= PATH_MAX) {
		error("task/cgroup: unable to build cpuset release agent path");
		goto error;
	}
	if (xcgroup_ns_create(slurm_cgroup_conf, &cpuset_ns, "/cpuset", "",
			       "cpuset",release_agent_path) !=
	     XCGROUP_SUCCESS) {
		error("task/cgroup: unable to create cpuset namespace");
		goto error;
	}

	/* check that cpuset cgroup namespace is available */
	if (! xcgroup_ns_is_available(&cpuset_ns)) {
		if (slurm_cgroup_conf->cgroup_automount) {
			if (xcgroup_ns_mount(&cpuset_ns)) {
				error("task/cgroup: unable to mount cpuset "
				      "namespace");
				goto clean;
			}
			info("task/cgroup: cpuset namespace is now mounted");
		} else {
			error("task/cgroup: cpuset namespace not mounted. "
			      "aborting");
			goto clean;
		}
	}

	return SLURM_SUCCESS;

clean:
	xcgroup_ns_destroy(&cpuset_ns);

error:
	xcpuinfo_fini();
	return SLURM_ERROR;
}
Example no. 9
extern int jobacct_gather_cgroup_memory_init(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	char release_agent_path[PATH_MAX];

	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	/* initialize memory cgroup namespace */
	release_agent_path[0]='\0';
	if (snprintf(release_agent_path, PATH_MAX, "%s/release_memory",
		     slurm_cgroup_conf->cgroup_release_agent) >= PATH_MAX) {
		error("jobacct_gather/cgroup: unable to build memory release "
		      "agent path");
		goto error;
	}
	if (xcgroup_ns_create(slurm_cgroup_conf, &memory_ns, "/memory", "",
			      "memory", release_agent_path) !=
	    XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create memory "
		      "namespace");
		goto error;
	}

	/* check that memory cgroup namespace is available */
	if (!xcgroup_ns_is_available(&memory_ns)) {
		if (slurm_cgroup_conf->cgroup_automount) {
			if (xcgroup_ns_mount(&memory_ns)) {
				error("jobacct_gather/cgroup: unable to mount "
				      "memory namespace");
				goto clean;
			}
			info("jobacct_gather/cgroup: memory namespace is now "
			     "mounted");
		} else {
			error("jobacct_gather/cgroup: memory namespace not "
			      "mounted. aborting");
			goto clean;
		}
	}
	return SLURM_SUCCESS;

clean:
	xcgroup_ns_destroy(&memory_ns);

error:
	return SLURM_ERROR;
}
Example no. 10
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t devices_cg;

	/* Similarly to task_cgroup_{memory,cpuset}_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started.  */
	if (xcgroup_create(&devices_ns, &devices_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&devices_cg) == XCGROUP_SUCCESS) {
			/* First move slurmstepd to the root devices cg
			 * so we can remove the step/job/user devices
			 * cg's.  */
			xcgroup_move_process(&devices_cg, getpid());
			if (xcgroup_delete(&step_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "devices : %m");
			if (xcgroup_delete(&job_devices_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job devices : %m");
			if (xcgroup_delete(&user_devices_cg)
			    != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user devices : %m");
			xcgroup_unlock(&devices_cg);
		} else
			error("task/cgroup: unable to lock root devices : %m");
		xcgroup_destroy(&devices_cg);
	} else
		error("task/cgroup: unable to create root devices : %m");

	if ( user_cgroup_path[0] != '\0' )
		xcgroup_destroy(&user_devices_cg);
	if ( job_cgroup_path[0] != '\0' )
		xcgroup_destroy(&job_devices_cg);
	if ( jobstep_cgroup_path[0] != '\0' )
		xcgroup_destroy(&step_devices_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	cgroup_allowed_devices_file[0] = '\0';

	xcgroup_ns_destroy(&devices_ns);

	xcpuinfo_fini();
	return SLURM_SUCCESS;
}
Example no. 11
0
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{

	if (user_cgroup_path[0] != '\0')
		xcgroup_destroy(&user_cpuset_cg);
	if (job_cgroup_path[0] != '\0')
		xcgroup_destroy(&job_cpuset_cg);
	if (jobstep_cgroup_path[0] != '\0')
		xcgroup_destroy(&step_cpuset_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}
Example no. 12
0
int _slurm_cgroup_destroy(void)
{
	xcgroup_lock(&freezer_cg);

	/*
	 *  First move slurmstepd process to the root cgroup, otherwise
	 *   the rmdir(2) triggered by the calls below will always fail,
	 *   because slurmstepd is still in the cgroup!
	 */
	_move_current_to_root_cgroup(&freezer_ns);

	if (jobstep_cgroup_path[0] != '\0') {
		if (xcgroup_delete(&step_freezer_cg) != XCGROUP_SUCCESS) {
			debug("_slurm_cgroup_destroy: problem deleting step cgroup path %s: %m",
			      step_freezer_cg.path);
			xcgroup_unlock(&freezer_cg);
			return SLURM_ERROR;
		}
		xcgroup_destroy(&step_freezer_cg);
	}

	if (job_cgroup_path[0] != '\0') {
		xcgroup_delete(&job_freezer_cg);
		xcgroup_destroy(&job_freezer_cg);
	}

	if (user_cgroup_path[0] != '\0') {
		xcgroup_delete(&user_freezer_cg);
		xcgroup_destroy(&user_freezer_cg);
	}

	if (slurm_freezer_init) {
		xcgroup_destroy(&slurm_freezer_cg);
	}

	xcgroup_unlock(&freezer_cg);
	xcgroup_destroy(&freezer_cg);
	xcgroup_ns_destroy(&freezer_ns);

	return SLURM_SUCCESS;
}
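Example no. 12 relies on a _move_current_to_root_cgroup() helper that is not shown. Below is a hedged sketch of what such a helper could look like, built only from the xcgroup calls already used in Examples no. 6, 15 and 18; the real helper may differ.

/* Hypothetical sketch: detach the calling process (slurmstepd) from the
 * step/job/user cgroups by attaching it to the root cgroup of the given
 * namespace, so the rmdir(2) calls in _slurm_cgroup_destroy() can succeed. */
static int _move_current_to_root_cgroup(xcgroup_ns_t *ns)
{
	xcgroup_t cg;

	/* "" addresses the root cgroup of the namespace */
	if (xcgroup_create(ns, &cg, "", 0, 0) != XCGROUP_SUCCESS)
		return SLURM_ERROR;

	/* writing our pid to the root "tasks" file moves us there */
	if (xcgroup_set_uint32_param(&cg, "tasks", getpid())
	    != XCGROUP_SUCCESS) {
		xcgroup_destroy(&cg);
		return SLURM_ERROR;
	}

	xcgroup_destroy(&cg);
	return SLURM_SUCCESS;
}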
Example no. 13
0
extern int task_cgroup_devices_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{

	if ( user_cgroup_path[0] != '\0' )
		xcgroup_destroy(&user_devices_cg);
	if ( job_cgroup_path[0] != '\0' )
		xcgroup_destroy(&job_devices_cg);
	if ( jobstep_cgroup_path[0] != '\0' )
		xcgroup_destroy(&step_devices_cg);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';

	cgroup_allowed_devices_file[0] = '\0';

	xcgroup_ns_destroy(&devices_ns);

	xcpuinfo_fini();
	return SLURM_SUCCESS;
}
Example no. 14
0
extern int task_cgroup_memory_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	if (user_cgroup_path[0] == '\0' ||
	     job_cgroup_path[0] == '\0' ||
	     jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root memory cg and remove[*]
	 * the step cgroup to move its allocated pages to its parent.
	 *
	 * [*] Calling rmdir(2) on an empty cgroup moves all resident charged
	 *  pages to the parent (i.e. the job cgroup). (If force_empty were
	 *  used instead, only clean pages would be flushed). This keeps
	 *  resident pagecache pages associated with the job. It is expected
	 *  that the job epilog will then optionally force_empty the
	 *  job cgroup (to flush pagecache), and then rmdir(2) the cgroup
	 *  or wait for release notification from kernel.
	 */
	if (xcgroup_create(&memory_ns,&memory_cg,"",0,0) == XCGROUP_SUCCESS) {
		xcgroup_move_process(&memory_cg, getpid());
		xcgroup_destroy(&memory_cg);
		if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS)
			error ("cgroup: rmdir step memcg failed: %m");
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
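The comment in Example no. 14 expects the job epilog to optionally force_empty the job cgroup and then remove it. A hedged sketch of that epilog-side sequence, reusing only the xcgroup_set_param()/xcgroup_delete() calls shown in these examples (the helper name is illustrative, not part of the real code):

/* Hypothetical sketch of the epilog-side cleanup described above. */
static void _job_memcg_epilog_cleanup_sketch(xcgroup_t *job_memcg)
{
	/* flush clean pagecache pages charged to the job memcg */
	if (xcgroup_set_param(job_memcg, "memory.force_empty", "1")
	    != XCGROUP_SUCCESS)
		debug2("sketch: memory.force_empty failed: %m");

	/* rmdir(2) the now-empty job memcg; remaining charged pages
	 * migrate to its parent */
	if (xcgroup_delete(job_memcg) != XCGROUP_SUCCESS)
		debug2("sketch: unable to remove job memcg: %m");
}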
Example no. 15
0
extern int jobacct_gather_cgroup_memory_fini(
	slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t memory_cg;

	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root memory cg and force empty
	 * the step cgroup to move its allocated pages to its parent.
	 * The release_agent will asynchronously be called for the step
	 * cgroup and will do the necessary cleanup.
	 * Ideally this force_empty mechanism would be performed directly
	 * by the memcg implementation when the last task managed by a
	 * cgroup exits, but handling that cleanup correctly with the
	 * current memcg is nearly impossible.
	 */
	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    == XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&memory_cg, "tasks", getpid());
		xcgroup_destroy(&memory_cg);
		xcgroup_set_param(&step_memory_cg, "memory.force_empty", "1");
	}

	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}
Example no. 16
extern int task_cgroup_devices_init(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	uint16_t cpunum;

	/* initialize cpuinfo internal data */
	if ( xcpuinfo_init() != XCPUINFO_SUCCESS )
		return SLURM_ERROR;

	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';
	/* initialize allowed_devices_filename */
	cgroup_allowed_devices_file[0] = '\0';

	if ( get_procs(&cpunum) != 0 ) {
		error("task/cgroup: unable to get a number of CPU");
		goto error;
	}

	(void) gres_plugin_node_config_load(cpunum, conf->node_name, NULL);

	strcpy(cgroup_allowed_devices_file,
	       slurm_cgroup_conf->allowed_devices_file);
	if (xcgroup_ns_create(slurm_cgroup_conf, &devices_ns, "", "devices")
	    != XCGROUP_SUCCESS ) {
		error("task/cgroup: unable to create devices namespace");
		goto error;
	}

	return SLURM_SUCCESS;

error:
	xcgroup_ns_destroy(&devices_ns);
	xcpuinfo_fini();
	return SLURM_ERROR;
}
Example no. 17
0
extern int init_system_memory_cgroup(void)
{
	int fstatus = SLURM_ERROR;
	char* slurm_cgpath;

	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize memory cgroup namespace */
	if (xcgroup_ns_create(&slurm_cgroup_conf, &memory_ns, "", "memory")
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to create memory namespace");
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	constrain_kmem_space = slurm_cgroup_conf.constrain_kmem_space;
	constrain_ram_space = slurm_cgroup_conf.constrain_ram_space;
	constrain_swap_space = slurm_cgroup_conf.constrain_swap_space;

	/*
	 * as the swap space threshold will be configured with a
	 * mem+swp parameter value, if RAM space is not monitored,
	 * set allowed RAM space to 100% of the job requested memory.
	 * It will help to construct the mem+swp value that will be
	 * used for both mem and mem+swp limit during memcg creation.
	 */
	if ( constrain_ram_space )
		allowed_ram_space = slurm_cgroup_conf.allowed_ram_space;
	else
		allowed_ram_space = 100.0;

	allowed_swap_space = slurm_cgroup_conf.allowed_swap_space;

	if ((totalram = (uint64_t) conf->real_memory_size) == 0)
		error ("system cgroup: Unable to get RealMemory size");

	max_kmem = _percent_in_bytes(totalram, slurm_cgroup_conf.max_kmem_percent);
	max_ram = _percent_in_bytes(totalram, slurm_cgroup_conf.max_ram_percent);
	max_swap = _percent_in_bytes(totalram, slurm_cgroup_conf.max_swap_percent);
	max_swap += max_ram;
	min_ram_space = slurm_cgroup_conf.min_ram_space * 1024 * 1024;

	debug ("system cgroup: memory: total:%luM allowed:%.4g%%(%s), "
	       "swap:%.4g%%(%s), max:%.4g%%(%luM) "
	       "max+swap:%.4g%%(%luM) min:%luM "
	       "kmem:%.4g%%(%luM %s) min:%luM",
	       (unsigned long) totalram,
	       allowed_ram_space,
	       constrain_ram_space?"enforced":"permissive",

	       allowed_swap_space,
	       constrain_swap_space?"enforced":"permissive",
	       slurm_cgroup_conf.max_ram_percent,
	       (unsigned long) (max_ram/(1024*1024)),

	       slurm_cgroup_conf.max_swap_percent,
	       (unsigned long) (max_swap/(1024*1024)),
	       (unsigned long) slurm_cgroup_conf.min_ram_space,

	       slurm_cgroup_conf.max_kmem_percent,
	       (unsigned long)(max_kmem/(1024*1024)),
	       constrain_kmem_space?"enforced":"permissive",
	       (unsigned long) slurm_cgroup_conf.min_kmem_space);

	/*
	 * Warning: the OOM killer must be disabled for slurmstepd,
	 * or it would be killed if the application uses more memory
	 * than permitted.
	 *
	 * If an env value is already set for the slurmstepd OOM killer
	 * behavior, keep it; otherwise set the value to -1000, which
	 * means the OOM killer will not kill it.
	 *
	 * FYI, setting "export SLURMSTEPD_OOM_ADJ=-1000"
	 * in /etc/sysconfig/slurm has the same effect.
	 */
	setenv("SLURMSTEPD_OOM_ADJ", "-1000", 0);

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = _system_cgroup_create_slurm_cg(&memory_ns);
	if ( slurm_cgpath == NULL ) {
		xcgroup_ns_destroy(&memory_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* build system cgroup relative path */
	snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath);
	xfree(slurm_cgpath);

	/* create system cgroup in the memory ns */
	if (xcgroup_create(&memory_ns, &system_memory_cg,
			   system_cgroup_path,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&system_memory_cg) != XCGROUP_SUCCESS) {
		goto error;
	}

	if ( xcgroup_set_param(&system_memory_cg, "memory.use_hierarchy", "1")
	     != XCGROUP_SUCCESS ) {
		error("system cgroup: unable to ask for hierarchical accounting"
		      "of system memcg '%s'", system_memory_cg.path);
		goto error;
	}

	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	debug("system cgroup: system memory cgroup initialized");
	return SLURM_SUCCESS;

error:
	xcgroup_unlock(&system_memory_cg);
	xcgroup_destroy(&system_memory_cg);
	xcgroup_ns_destroy(&memory_ns);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return fstatus;
}
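Example no. 17 converts the configured percentages into byte limits through _percent_in_bytes(), which is not shown. Given that totalram is printed in megabytes ("total:%luM"), a plausible sketch is the following; this is an assumption, not the project's actual helper.

/* Hypothetical sketch: convert a percentage of a size given in MB
 * into an absolute number of bytes. */
static uint64_t _percent_in_bytes(uint64_t mb, float percent)
{
	return (uint64_t)((mb * 1024 * 1024) * (percent / 100.0));
}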
Example no. 18
extern int
jobacct_gather_cgroup_cpuacct_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuacct_cg;
	bool lock_ok;
	int cc;

	if (user_cgroup_path[0] == '\0'
	    || job_cgroup_path[0] == '\0'
	    || jobstep_cgroup_path[0] == '\0'
	    || task_cgroup_path[0] == 0)
		return SLURM_SUCCESS;

	/*
	 * Move the slurmstepd back to the root cpuacct cg.
	 * The release_agent will asynchronously be called for the step
	 * cgroup. It will do the necessary cleanup.
	 */
	if (xcgroup_create(&cpuacct_ns,
			   &cpuacct_cg, "", 0, 0) == XCGROUP_SUCCESS) {
		xcgroup_set_uint32_param(&cpuacct_cg, "tasks", getpid());
	}

	/* Lock the root of the cgroup and remove the subdirectories
	 * related to this job.
	 */
	lock_ok = true;
	if (xcgroup_lock(&cpuacct_cg) != XCGROUP_SUCCESS) {
		error("%s: failed to flock() %s %m", __func__, cpuacct_cg.path);
		lock_ok = false;
	}

	/* Clean up starting from the leaves and working up, in the
	 * reverse order in which the cgroups were created.
	 */
	for (cc = 0; cc <= max_task_id; cc++) {
		xcgroup_t cgroup;
		char buf[PATH_MAX];

		/* rmdir all tasks this running slurmstepd
		 * was responsible for.
		 */
		sprintf(buf, "%s%s/task_%d",
			cpuacct_ns.mnt_point, jobstep_cgroup_path, cc);
		cgroup.path = buf;

		if (strstr(buf, "step_extern"))
			kill_extern_procs(cgroup.path);

		if (xcgroup_delete(&cgroup) != XCGROUP_SUCCESS) {
			debug2("%s: failed to delete %s %m", __func__, buf);
		}
	}

	if (xcgroup_delete(&step_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       step_cpuacct_cg.path);
	}

	if (xcgroup_delete(&job_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       job_cpuacct_cg.path);
	}

	if (xcgroup_delete(&user_cpuacct_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m", __func__,
		       user_cpuacct_cg.path);
	}

	if (lock_ok == true)
		xcgroup_unlock(&cpuacct_cg);

	xcgroup_destroy(&task_cpuacct_cg);
	xcgroup_destroy(&user_cpuacct_cg);
	xcgroup_destroy(&job_cpuacct_cg);
	xcgroup_destroy(&step_cpuacct_cg);
	xcgroup_destroy(&cpuacct_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';
	task_cgroup_path[0] = 0;

	xcgroup_ns_destroy(&cpuacct_ns);

	return SLURM_SUCCESS;
}
Example no. 19
0
extern int task_cgroup_cpuset_fini(slurm_cgroup_conf_t *slurm_cgroup_conf)
{
	xcgroup_t cpuset_cg;

	/* Similarly to task_cgroup_memory_fini(), we must lock the
	 * root cgroup so we don't race with another job step that is
	 * being started.  */
	if (xcgroup_create(&cpuset_ns, &cpuset_cg, "", 0, 0) == XCGROUP_SUCCESS) {
		if (xcgroup_lock(&cpuset_cg) == XCGROUP_SUCCESS) {
			int i = 0, npids = 0, cnt = 0;
			pid_t* pids = NULL;
			/* First move slurmstepd to the root cpuset cg
			 * so we can remove the step/job/user cpuset
			 * cg's.  */
			xcgroup_move_process(&cpuset_cg, getpid());

			/* There is a delay in the cgroup system when moving
			 * the pid from one cgroup to another.  This is
			 * usually short, but we need to wait to make sure the
			 * pid is out of the step cgroup, or we will hit an
			 * error that leaves the cgroup unable to be removed.
			 */
			do {
				xcgroup_get_pids(&step_cpuset_cg,
						 &pids, &npids);
				for (i = 0 ; i<npids ; i++)
					if (pids[i] == getpid()) {
						cnt++;
						break;
					}
				xfree(pids);
			} while ((i < npids) && (cnt < MAX_MOVE_WAIT));

			if (cnt < MAX_MOVE_WAIT)
				debug3("Took %d checks before stepd pid was removed from the step cgroup.",
				       cnt);
			else
				error("Pid %d is still in the step cgroup.  It might be left uncleaned after the job.",
				      getpid());

			if (xcgroup_delete(&step_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: unable to remove step "
				       "cpuset : %m");
			if (xcgroup_delete(&job_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "job cpuset : %m");
			if (xcgroup_delete(&user_cpuset_cg) != XCGROUP_SUCCESS)
				debug2("task/cgroup: not removing "
				       "user cpuset : %m");
			xcgroup_unlock(&cpuset_cg);
		} else
			error("task/cgroup: unable to lock root cpuset : %m");
		xcgroup_destroy(&cpuset_cg);
	} else
		error("task/cgroup: unable to create root cpuset : %m");

	if (user_cgroup_path[0] != '\0')
		xcgroup_destroy(&user_cpuset_cg);
	if (job_cgroup_path[0] != '\0')
		xcgroup_destroy(&job_cpuset_cg);
	if (jobstep_cgroup_path[0] != '\0')
		xcgroup_destroy(&step_cpuset_cg);

	user_cgroup_path[0]='\0';
	job_cgroup_path[0]='\0';
	jobstep_cgroup_path[0]='\0';

	xcgroup_ns_destroy(&cpuset_ns);

	return SLURM_SUCCESS;
}
Example no. 20
0
extern int init_system_cpuset_cgroup(void)
{
	int rc;
	int fstatus = SLURM_ERROR;
	char* cpus = NULL;
	size_t cpus_size;
	char* slurm_cgpath;
	xcgroup_t slurm_cg;

	/* read cgroup configuration */
	if (read_slurm_cgroup_conf(&slurm_cgroup_conf))
		return SLURM_ERROR;

	/* initialize cpuset cgroup namespace */
	if (xcgroup_ns_create(&slurm_cgroup_conf, &cpuset_ns, "", "cpuset")
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to create cpuset namespace");
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* create slurm root cg in this cg namespace */
	slurm_cgpath = _system_cgroup_create_slurm_cg(&cpuset_ns);
	if ( slurm_cgpath == NULL ) {
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

	/* check that this cgroup has cpus allowed or initialize them */
	if (xcgroup_load(&cpuset_ns, &slurm_cg, slurm_cgpath)
	    != XCGROUP_SUCCESS) {
		error("system cgroup: unable to load slurm cpuset xcgroup");
		xfree(slurm_cgpath);
		xcgroup_ns_destroy(&cpuset_ns);
		free_slurm_cgroup_conf(&slurm_cgroup_conf);
		return SLURM_ERROR;
	}

again:
	snprintf(cpuset_meta, sizeof(cpuset_meta), "%scpus", cpuset_prefix);
	rc = xcgroup_get_param(&slurm_cg, cpuset_meta, &cpus, &cpus_size);
	if (rc != XCGROUP_SUCCESS || cpus_size == 1) {
		if (!cpuset_prefix_set && (rc != XCGROUP_SUCCESS)) {
			cpuset_prefix_set = 1;
			cpuset_prefix = "cpuset.";
			goto again;
		}

		/* initialize the cpuset as it was nonexistent */
		if (_xcgroup_cpuset_init(&slurm_cg) != XCGROUP_SUCCESS) {
			xfree(slurm_cgpath);
			xcgroup_destroy(&slurm_cg);
			xcgroup_ns_destroy(&cpuset_ns);
			free_slurm_cgroup_conf(&slurm_cgroup_conf);
			xfree(cpus);
			return SLURM_ERROR;
		}
	}
	xcgroup_destroy(&slurm_cg);
	xfree(cpus);

	/* build system cgroup relative path */
	snprintf(system_cgroup_path, PATH_MAX, "%s/system", slurm_cgpath);
	xfree(slurm_cgpath);

	/* create system cgroup in the cpuset ns */
	if (xcgroup_create(&cpuset_ns, &system_cpuset_cg, system_cgroup_path,
			   getuid(),getgid()) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (xcgroup_instantiate(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}
	if (_xcgroup_cpuset_init(&system_cpuset_cg) != XCGROUP_SUCCESS) {
		goto error;
	}

	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	debug("system cgroup: system cpuset cgroup initialized");
	return SLURM_SUCCESS;

error:
	xcgroup_unlock(&system_cpuset_cg);
	xcgroup_destroy(&system_cpuset_cg);
	xcgroup_ns_destroy(&cpuset_ns);
	free_slurm_cgroup_conf(&slurm_cgroup_conf);
	return fstatus;
}
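Examples no. 17 and 20 both call _system_cgroup_create_slurm_cg() to obtain the relative path of a slurm top-level cgroup. A hedged sketch of such a helper, using only the xcgroup_create()/xcgroup_instantiate() calls already shown in these examples; the real helper may build the path differently.

/* Hypothetical sketch: create a "/slurm" cgroup at the top of the
 * namespace and return its relative path (caller must xfree() it),
 * so "<slurm>/system" can be created under it. */
static char *_system_cgroup_create_slurm_cg(xcgroup_ns_t *ns)
{
	xcgroup_t slurm_cg;
	char *slurm_cgpath = xstrdup("/slurm");

	if (xcgroup_create(ns, &slurm_cg, slurm_cgpath,
			   getuid(), getgid()) != XCGROUP_SUCCESS) {
		xfree(slurm_cgpath);
		return NULL;
	}
	if (xcgroup_instantiate(&slurm_cg) != XCGROUP_SUCCESS) {
		xcgroup_destroy(&slurm_cg);
		xfree(slurm_cgpath);
		return NULL;
	}
	xcgroup_destroy(&slurm_cg);
	return slurm_cgpath;
}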