Exemplo n.º 1
0
/*
 * Block until every process inside the container has exited.
 *
 * A SLURM_SUCCESS return means the container has already been
 * destroyed as a side effect; a subsequent slurm_container_destroy
 * call is not just unnecessary, it triggers undefined behavior.
 *
 * Return SLURM_SUCCESS or SLURM_ERROR.
 */
extern int slurm_container_wait(uint64_t cont_id)
{
	if (slurm_proctrack_init() >= 0)
		return ops.wait(cont_id);

	return SLURM_ERROR;
}
Exemplo n.º 2
0
/*
 * Get container ID for given process ID
 *
 * Returns zero if no container was found for the given pid, or if the
 * proctrack plugin could not be initialized.
 */
extern uint64_t slurm_container_find(pid_t pid)
{
	/* The return type is unsigned: the old "return SLURM_ERROR"
	 * (-1) wrapped to UINT64_MAX instead of the documented
	 * "no container found" value of zero. */
	if (slurm_proctrack_init() < 0)
		return 0;

	return (*(ops.find_cont)) (pid);
}
Exemplo n.º 3
0
/*
 * Return "true" if the container "cont_id" contains the process with
 * ID "pid".  Returns false if the proctrack plugin could not be
 * initialized.
 */
extern bool slurm_container_has_pid(uint64_t cont_id, pid_t pid)
{
	/* The old "return SLURM_ERROR" (-1) converted to true for this
	 * bool return type, falsely reporting the pid as present when
	 * plugin initialization failed. */
	if (slurm_proctrack_init() < 0)
		return false;

	return (*(ops.has_pid)) (cont_id, pid);
}
Exemplo n.º 4
0
/*
 * Add a process to the specified container
 * job IN - slurmd_job_t structure
 * pid IN      - process ID to be added to the container
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *                    or in slurm_container_create()
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_add(slurmd_job_t * job, pid_t pid)
{
	if (slurm_proctrack_init() >= 0)
		return ops.add(job, pid);

	return SLURM_ERROR;
}
Exemplo n.º 5
0
/*
 * Signal all processes within a container
 * cont_id IN - container ID as returned by slurm_container_create()
 * signal IN  - signal to send; zero performs error checking only and
 *              delivers no signal
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_signal(uint64_t cont_id, int signal)
{
	if (slurm_proctrack_init() >= 0)
		return ops.signal(cont_id, signal);

	return SLURM_ERROR;
}
Exemplo n.º 6
0
/*
 * Dispatch a signal delivery request for every process in the given
 * container to the loaded proctrack plugin.
 * cont_id IN - container identifier
 * signal IN  - signal number to deliver
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_signal(uint64_t cont_id, int signal)
{
	if (slurm_proctrack_init() >= 0)
		return ops.signal(cont_id, signal);

	return SLURM_ERROR;
}
Exemplo n.º 7
0
/*
 * Create a container
 * job IN - slurmd_job_t structure
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *                    or in slurm_container_add()
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_create(slurmd_job_t * job)
{
	/* The old "return 0" reported SLURM_SUCCESS when plugin
	 * initialization failed, hiding the error from callers; every
	 * sibling wrapper returns SLURM_ERROR in this case. */
	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	return (*(ops.create)) (job);
}
Exemplo n.º 8
0
/*
 * Destroy a container; any processes still inside it are not affected.
 * cont_id IN - container ID as returned by proctrack_g_create()
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_destroy(uint64_t cont_id)
{
	if (slurm_proctrack_init() >= 0)
		return ops.destroy(cont_id);

	return SLURM_ERROR;
}
Exemplo n.º 9
0
/*
 * Add a process to the specified container
 * job IN - stepd_step_rec_t structure
 * pid IN      - process ID to be added to the container
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *                    or in proctrack_g_create()
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_add(stepd_step_rec_t * job, pid_t pid)
{
	if (slurm_proctrack_init() >= 0)
		return ops.add(job, pid);

	return SLURM_ERROR;
}
Exemplo n.º 10
0
/*
 * Create a container
 * job IN - stepd_step_rec_t structure
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *                    or in proctrack_g_add()
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_create(stepd_step_rec_t * job)
{
	/* The old "return 0" reported SLURM_SUCCESS when plugin
	 * initialization failed, hiding the error from callers; every
	 * sibling wrapper returns SLURM_ERROR in this case. */
	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	return (*(ops.create)) (job);
}
Exemplo n.º 11
0
/*
 * Destroy a container; any processes still inside it are not affected.
 * cont_id IN - container ID as returned by slurm_container_create()
 *
 * Returns a SLURM errno.
 */
extern int slurm_container_destroy(uint64_t cont_id)
{
	if (slurm_proctrack_init() >= 0)
		return g_proctrack_context->ops.destroy(cont_id);

	return SLURM_ERROR;
}
Exemplo n.º 12
0
/*
 * Get all process IDs within a container.
 *
 * IN cont_id - Container ID.
 * OUT pids - pointer to an xmalloc'ed array of process ids of length
 *	"npids"; the caller must release it with xfree().
 * OUT npids - number of entries stored in the returned "pids" array.
 *
 * Return SLURM_SUCCESS if the container exists (npids may be zero and
 *   pids NULL); SLURM_ERROR if the container does not exist or the
 *   plugin does not implement the call.
 */
extern int proctrack_g_get_pids(uint64_t cont_id, pid_t **pids, int *npids)
{
	if (slurm_proctrack_init() >= 0)
		return ops.get_pids(cont_id, pids, npids);

	return SLURM_ERROR;
}
Exemplo n.º 13
0
/*
 * Get all process IDs within a container.
 *
 * IN cont_id - Container ID.
 * OUT pids - pointer to an xmalloc'ed array of process ids of length
 *	"npids"; the caller must release it with xfree().
 * OUT npids - number of entries stored in the returned "pids" array.
 *
 * Return SLURM_SUCCESS if the container exists (npids may be zero and
 *   pids NULL); SLURM_ERROR if the container does not exist or the
 *   plugin does not implement the call.
 */
extern int slurm_container_get_pids(uint64_t cont_id, pid_t ** pids, int *npids)
{
	if (slurm_proctrack_init() >= 0)
		return g_proctrack_context->ops.get_pids(cont_id, pids, npids);

	return SLURM_ERROR;
}
Exemplo n.º 14
0
/*
 * Signal all processes within a container
 * cont_id IN - container ID as returned by proctrack_g_create()
 * signal IN  - signal to send, if zero then perform error checking
 *              but do not send signal
 *
 * For SIGKILL, delivery is deferred while any process in the container
 * is writing a core dump (killing it mid-dump would truncate the core);
 * in that case a background thread is spawned to deliver the signal
 * later and SLURM_SUCCESS is returned immediately.
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_signal(uint64_t cont_id, int signal)
{
	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	if (signal == SIGKILL) {
		pid_t *pids = NULL;
		int i, j, npids = 0, hung_pids = 0;
		char *stat_fname = NULL;
		if (proctrack_g_get_pids(cont_id, &pids, &npids) ==
		    SLURM_SUCCESS) {
			/* NOTE: proctrack_g_get_pids() is not supported
			 * by the proctrack/pgid plugin */
			/* Poll up to twice (2 second pause between
			 * passes) for core dumps to finish. */
			for (j = 0; j < 2; j++) {
				if (j)
					sleep(2);
				hung_pids = 0;
				for (i = 0; i < npids; i++) {
					if (!pids[i])
						continue;
					xstrfmtcat(stat_fname, "/proc/%d/stat",
						   (int) pids[i]);
					if (_test_core_dumping(stat_fname)) {
						debug("Process %d continuing "
						      "core dump",
						      (int) pids[i]);
						hung_pids++;
					} else {
						/* Don't test this PID again */
						pids[i] = 0;
					}
					xfree(stat_fname);
				}
				if (hung_pids == 0)
					break;
			}
			xfree(pids);
			if (hung_pids) {
				/* Fixed typo in log message: "Defering"
				 * -> "Deferring". */
				info("Deferring sending signal, processes in "
				     "job are currently core dumping");
				_spawn_signal_thread(cont_id, signal);
				return SLURM_SUCCESS;
			}
		}
	}

	return (*(ops.signal)) (cont_id, signal);
}
Exemplo n.º 15
0
/*
 * Add a process to the specified container
 * job IN - stepd_step_rec_t structure
 * pid IN      - process ID to be added to the container
 * job->cont_id OUT - Plugin must fill in job->cont_id either here
 *                    or in proctrack_g_create()
 *
 * Returns a Slurm errno.
 */
extern int proctrack_g_add(stepd_step_rec_t * job, pid_t pid)
{
	int retries = 0, max_retry = 3, rc;

	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	/* Sometimes a plugin is transient in adding a pid, so lets
	 * try a few times before we call it quits.
	 * NOTE: the previous "i++ > max_retry" test was off by one and
	 * performed max_retry + 2 attempts; pre-increment gives the
	 * intended initial attempt plus max_retry retries.
	 */
	while ((rc = (*(ops.add)) (job, pid)) != SLURM_SUCCESS) {
		if (++retries > max_retry)
			break;
		debug("%s: %u.%u couldn't add pid %u, sleeping and trying again",
		      __func__, job->jobid, job->stepid, pid);
		sleep(1);
	}

	return rc;
}
Exemplo n.º 16
0
/*
 * _slurmd_init - one-time initialization of the slurmd daemon.
 *
 * Parses the command line, builds the node tables required by the
 * topology stack, reads the slurm configuration, loads the plugin
 * stacks (select, gres, topology, proctrack, task, auth, spank),
 * raises process resource limits, creates the credential-verifier
 * context and the spool directory, optionally cleans up state left by
 * a previous slurmd, changes the working directory when daemonizing,
 * caches the group access list, and opens /dev/null for later use.
 *
 * Returns SLURM_SUCCESS or SLURM_FAILURE.  A missing or non-regular
 * slurmstepd executable is reported via fatal() and does not return.
 */
static int
_slurmd_init(void)
{
	struct rlimit rlim;
	slurm_ctl_conf_t *cf;
	struct stat stat_buf;
	uint32_t cpu_cnt;

	/*
	 * Process commandline arguments first, since one option may be
	 * an alternate location for the slurm config file.
	 */
	_process_cmdline(*conf->argc, *conf->argv);

	/*
	 * Build nodes table like in slurmctld
	 * This is required by the topology stack
	 * Node tables setup must precede _read_config() so that the
	 * proper hostname is set.
	 */
	slurm_conf_init(conf->conffile);
	init_node_conf();
	/* slurm_select_init() must be called before
	 * build_all_nodeline_info() to be called with proper argument. */
	if (slurm_select_init(1) != SLURM_SUCCESS )
		return SLURM_FAILURE;
	build_all_nodeline_info(true);
	build_all_frontend_info(true);

	/*
	 * Read global slurm config file, override necessary values from
	 * defaults and command line.
	 */
	_read_config();

	/* Use whichever CPU count is larger for the gres node config
	 * load below. */
	cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);

	if ((gres_plugin_init() != SLURM_SUCCESS) ||
	    (gres_plugin_node_config_load(cpu_cnt) != SLURM_SUCCESS))
		return SLURM_FAILURE;
	if (slurm_topo_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;

	/*
	 * Get and set slurmd topology information
	 * Build node hash table first to speed up the topo build
	 */
	rehash_node();
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * Check for cpu frequency set capabilities on this node
	 */
	cpu_freq_init(conf);

	_print_conf();

	if (slurm_proctrack_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurmd_task_init() != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (slurm_auth_init(NULL) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (spank_slurmd_init() < 0)
		return SLURM_FAILURE;

	/* Raise the soft CPU-time limit to the hard maximum; warn when
	 * even the hard limit is finite, since slurmd is long-lived. */
	if (getrlimit(RLIMIT_CPU, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CPU, &rlim);
		if (rlim.rlim_max != RLIM_INFINITY) {
			error("Slurmd process CPU time limit is %d seconds",
			      (int) rlim.rlim_max);
		}
	}

	/* Raise the open-file limit to the hard maximum. */
	if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_NOFILE, &rlim);
	}
	/* In debug builds, allow slurmd itself to dump core. */
#ifndef NDEBUG
	if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
		rlim.rlim_cur = rlim.rlim_max;
		setrlimit(RLIMIT_CORE, &rlim);
	}
#endif /* !NDEBUG */

	/*
	 * Create a context for verifying slurm job credentials
	 */
	if (!(conf->vctx = slurm_cred_verifier_ctx_create(conf->pubkey)))
		return SLURM_FAILURE;
	if (!strcmp(conf->select_type, "select/serial")) {
		/* Only cache credential for 5 seconds with select/serial
		 * for shorter cache searches and higher throughput */
		slurm_cred_ctx_set(conf->vctx, SLURM_CRED_OPT_EXPIRY_WINDOW, 5);
	}

	/*
	 * Create slurmd spool directory if necessary.
	 */
	if (_set_slurmd_spooldir() < 0) {
		error("Unable to initialize slurmd spooldir");
		return SLURM_FAILURE;
	}

	if (conf->cleanstart) {
		/*
		 * Need to kill any running slurmd's here
		 */
		_kill_old_slurmd();

		stepd_cleanup_sockets(conf->spooldir, conf->node_name);
		_stepd_cleanup_batch_dirs(conf->spooldir, conf->node_name);
	}

	/* When daemonizing, chdir to a writable directory so core files
	 * and relative paths land somewhere sane.  Preference order:
	 * log file directory, spool directory, /var/tmp. */
	if (conf->daemonize) {
		bool success = false;

		if (conf->logfile && (conf->logfile[0] == '/')) {
			char *slash_ptr, *work_dir;
			work_dir = xstrdup(conf->logfile);
			slash_ptr = strrchr(work_dir, '/');
			if (slash_ptr == work_dir)
				work_dir[1] = '\0';
			else
				slash_ptr[0] = '\0';
			if ((access(work_dir, W_OK) != 0) ||
			    (chdir(work_dir) < 0)) {
				error("Unable to chdir to %s", work_dir);
			} else
				success = true;
			xfree(work_dir);
		}

		if (!success) {
			if ((access(conf->spooldir, W_OK) != 0) ||
			    (chdir(conf->spooldir) < 0)) {
				error("Unable to chdir to %s", conf->spooldir);
			} else
				success = true;
		}

		if (!success) {
			if ((access("/var/tmp", W_OK) != 0) ||
			    (chdir("/var/tmp") < 0)) {
				error("chdir(/var/tmp): %m");
				return SLURM_FAILURE;
			} else
				info("chdir to /var/tmp");
		}
	}

	/*
	 * Cache the group access list
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	/* Keep a /dev/null descriptor around (close-on-exec) for later
	 * redirections. */
	if ((devnull = open_cloexec("/dev/null", O_RDWR)) < 0) {
		error("Unable to open /dev/null: %m");
		return SLURM_FAILURE;
	}

	/* make sure we have slurmstepd installed */
	if (stat(conf->stepd_loc, &stat_buf))
		fatal("Unable to find slurmstepd file at %s", conf->stepd_loc);
	if (!S_ISREG(stat_buf.st_mode))
		fatal("slurmstepd not a file at %s", conf->stepd_loc);

	return SLURM_SUCCESS;
}