Code example #1
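Lists the PIDs of every step of one job on the local node: it walks the list returned by stepd_available() and calls _list_pids_one_step() for each entry whose jobid matches, reporting an error when the job has no steps on this node.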
static void
_list_pids_all_steps(const char *node_name, uint32_t jobid)
{
	List steps;
	ListIterator itr;
	step_loc_t *stepd;
	int count = 0;

	steps = stepd_available(NULL, node_name);
	if (!steps || list_count(steps) == 0) {
		fprintf(stderr, "Job %u does not exist on this node.\n", jobid);
		if (steps)
			list_destroy(steps);
		exit_code = 1;
		return;
	}

	itr = list_iterator_create(steps);
	while ((stepd = list_next(itr))) {
		if (jobid == stepd->jobid) {
			_list_pids_one_step(stepd->nodename, stepd->jobid,
					    stepd->stepid);
			count++;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(steps);

	if (count == 0) {
		fprintf(stderr, "Job %u does not exist on this node.\n",
			jobid);
		exit_code = 1;
	}
}
Code example #2
File: info_job.c Project: artpol84/slurm
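A simpler variant of the previous example: it lists the PIDs of every step of every job on the node, with no jobid filter.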
static void
_list_pids_all_jobs(const char *node_name)
{
	List steps;
	ListIterator itr;
	step_loc_t *stepd;

	steps = stepd_available(NULL, node_name);
	if (!steps || list_count(steps) == 0) {
		fprintf(stderr, "No job steps exist on this node.\n");
		FREE_NULL_LIST(steps);
		exit_code = 1;
		return;
	}

	itr = list_iterator_create(steps);
	while ((stepd = list_next(itr))) {
		_list_pids_one_step(stepd->nodename, stepd->jobid,
				    stepd->stepid);
	}
	list_iterator_destroy(itr);
	FREE_NULL_LIST(steps);
}
Code example #3
File: slurmd.c Project: mrhaoji/slurm
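From the slurmd daemon: builds the node registration message sent to slurmctld. stepd_available() supplies the list of step daemons; each one is probed with stepd_connect() and stepd_state() so that stale sockets are skipped and only live jobs and steps are reported.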
static void
_fill_registration_msg(slurm_node_registration_status_msg_t *msg)
{
	List steps;
	ListIterator i;
	step_loc_t *stepd;
	int  n;
	char *arch, *os;
	struct utsname buf;
	static bool first_msg = true;
	static time_t slurmd_start_time = 0;
	Buf gres_info;

	msg->node_name   = xstrdup (conf->node_name);
	msg->cpus	 = conf->cpus;
	msg->boards	 = conf->boards;
	msg->sockets	 = conf->sockets;
	msg->cores	 = conf->cores;
	msg->threads	 = conf->threads;
	msg->real_memory = conf->real_memory_size;
	msg->tmp_disk    = conf->tmp_disk_space;
	msg->hash_val    = slurm_get_hash_val();
	get_cpu_load(&msg->cpu_load);

	gres_info = init_buf(1024);
	if (gres_plugin_node_config_pack(gres_info) != SLURM_SUCCESS) {
		error("error packing gres configuration");
		free_buf(gres_info);	/* don't leak the buffer on failure */
	} else
		msg->gres_info = gres_info;

	get_up_time(&conf->up_time);
	msg->up_time     = conf->up_time;
	if (slurmd_start_time == 0)
		slurmd_start_time = time(NULL);
	msg->slurmd_start_time = slurmd_start_time;

	if (first_msg) {
		first_msg = false;
		info("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		     "Memory=%u TmpDisk=%u Uptime=%u",
		     msg->cpus, msg->boards, msg->sockets, msg->cores,
		     msg->threads, msg->real_memory, msg->tmp_disk,
		     msg->up_time);
	} else {
		debug3("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		       "Memory=%u TmpDisk=%u Uptime=%u",
		       msg->cpus, msg->boards, msg->sockets, msg->cores,
		       msg->threads, msg->real_memory, msg->tmp_disk,
		       msg->up_time);
	}
	uname(&buf);
	if ((arch = getenv("SLURM_ARCH")))
		msg->arch = xstrdup(arch);
	else
		msg->arch = xstrdup(buf.machine);
	if ((os = getenv("SLURM_OS")))
		msg->os   = xstrdup(os);
	else
		msg->os = xstrdup(buf.sysname);

	if (msg->startup) {
		if (switch_g_alloc_node_info(&msg->switch_nodeinfo))
			error("switch_g_alloc_node_info: %m");
		if (switch_g_build_node_info(msg->switch_nodeinfo))
			error("switch_g_build_node_info: %m");
	}

	steps = stepd_available(conf->spooldir, conf->node_name);
	msg->job_count = list_count(steps);
	msg->job_id    = xmalloc(msg->job_count * sizeof(*msg->job_id));
	/* Note: Running batch jobs will have step_id == NO_VAL */
	msg->step_id   = xmalloc(msg->job_count * sizeof(*msg->step_id));

	i = list_iterator_create(steps);
	n = 0;
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1) {
			--(msg->job_count);
			continue;
		}
		if (stepd_state(fd) == SLURMSTEPD_NOT_RUNNING) {
			debug("stale domain socket for stepd %u.%u ",
			      stepd->jobid, stepd->stepid);
			--(msg->job_count);
			close(fd);
			continue;
		}

		close(fd);
		if (stepd->stepid == NO_VAL)
			debug("found apparently running job %u", stepd->jobid);
		else
			debug("found apparently running step %u.%u",
			      stepd->jobid, stepd->stepid);
		msg->job_id[n]  = stepd->jobid;
		msg->step_id[n] = stepd->stepid;
		n++;
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	if (!msg->energy)
		msg->energy = acct_gather_energy_alloc();
	acct_gather_energy_g_get_data(ENERGY_DATA_STRUCT, msg->energy);

	msg->timestamp = time(NULL);

	return;
}
Code example #4
File: slurmd.c Project: mrhaoji/slurm
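Also from slurmd: the reconfigure handler. After rereading the configuration it uses stepd_available() to find every running step daemon and asks each one, via stepd_reconfig(), to refresh its own settings (for example, its log file handle).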
static void
_reconfigure(void)
{
	List steps;
	ListIterator i;
	slurm_ctl_conf_t *cf;
	step_loc_t *stepd;
	bool did_change;

	_reconfig = 0;
	slurm_conf_reinit(conf->conffile);
	_read_config();

	/*
	 * Rebuild topology information and refresh slurmd topo infos
	 */
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * In case the administrator changed the cpu frequency set capabilities
	 * on this node, rebuild the cpu frequency table information
	 */
	cpu_freq_init(conf);

	_print_conf();

	/*
	 * Make best effort at changing to new public key
	 */
	slurm_cred_ctx_key_update(conf->vctx, conf->pubkey);

	/*
	 * Reinitialize the groups cache
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	/* send reconfig to each stepd so they can refresh their log
	 * file handle
	 */

	steps = stepd_available(conf->spooldir, conf->node_name);
	i = list_iterator_create(steps);
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;
		if (stepd_reconfig(fd) != SLURM_SUCCESS)
			debug("Reconfig jobid=%u.%u failed: %m",
			      stepd->jobid, stepd->stepid);
		close(fd);
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	gres_plugin_reconfig(&did_change);
	(void) switch_g_reconfig();
	container_g_reconfig();
	if (did_change) {
		uint32_t cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
		(void) gres_plugin_node_config_load(cpu_cnt);
		send_registration_msg(SLURM_SUCCESS, false);
	}

	/* reconfigure energy */
	acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL);

	/*
	 * XXX: reopen slurmd port?
	 */
}
Code example #5
File: pam_slurm_adopt.c Project: A1ve5/slurm
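From the pam_slurm_adopt PAM module: decides whether an incoming connection (typically ssh) can be adopted into one of the user's jobs; stepd_available() provides the candidate steps. The comment below describes the adoption order.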
/* Parse arguments, etc then get my socket address/port information. Attempt to
 * adopt this process into a job in the following order:
 * 	1) If the user has only one job on the node, pick that one
 * 	2) Send RPC to source IP of socket. If there is a slurmd at the IP
 * 		address, ask it which job I belong to. On success, pick that one
 *	3) Pick a job semi-randomly (default) or skip the adoption (if
 *		configured)
 */
PAM_EXTERN int pam_sm_acct_mgmt(pam_handle_t *pamh, int flags
				__attribute__((unused)), int argc, const char **argv)
{
	int retval = PAM_IGNORE, rc, slurmrc, bufsize, user_jobs;
	char *user_name;
	List steps = NULL;
	step_loc_t *stepd = NULL;
	struct passwd pwd, *pwd_result;
	char *buf = NULL;

	_init_opts();
	_parse_opts(pamh, argc, argv);
	_log_init(opts.log_level);

	switch (opts.action_generic_failure) {
	case CALLERID_ACTION_DENY:
		rc = PAM_PERM_DENIED;
		break;
	case CALLERID_ACTION_ALLOW:
		rc = PAM_SUCCESS;
		break;
	case CALLERID_ACTION_IGNORE:
		rc = PAM_IGNORE;
		break;
		/* Newer gcc versions warn if enum cases are missing */
	default:
		error("The code is broken!!!!");
		/* Fail closed: without this, rc would be read uninitialized
		 * on the early-return path below. */
		rc = PAM_PERM_DENIED;
	}

	retval = pam_get_item(pamh, PAM_USER, (void *) &user_name);
	if (user_name == NULL || retval != PAM_SUCCESS)  {
		pam_syslog(pamh, LOG_ERR, "No username in PAM_USER? Fail!");
		return PAM_SESSION_ERR;
	}

	/* Check for an unsafe config that might lock out root. This is a very
	 * basic check that shouldn't be 100% relied on */
	if (!opts.ignore_root &&
	    (opts.action_unknown == CALLERID_ACTION_DENY ||
	     opts.action_no_jobs != CALLERID_ACTION_ALLOW ||
	     opts.action_adopt_failure != CALLERID_ACTION_ALLOW ||
	     opts.action_generic_failure != CALLERID_ACTION_ALLOW
		    )) {
		/* Let's get verbose */
		info("===============================");
		info("Danger!!!");
		info("A crazy admin set ignore_root=0 and some unsafe actions");
		info("You might lock out root!");
		info("If this is desirable, modify the source code");
		info("Setting ignore_root=1 and continuing");
		opts.ignore_root = 1;
	}

	/* Ignoring root is probably best but the admin can allow it */
	if (!strcmp(user_name, "root")) {
		if (opts.ignore_root) {
			info("Ignoring root user");
			return PAM_IGNORE;
		} else {
			/* This administrator is crazy */
			info("Danger!!! This is a connection attempt by root and ignore_root=0 is set! Hope for the best!");
		}
	}

	/* Calculate buffer size for getpwnam_r */
	bufsize = sysconf(_SC_GETPW_R_SIZE_MAX);
	if (bufsize == -1)
		bufsize = 16384; /* take a large guess */

	buf = xmalloc(bufsize);
	retval = getpwnam_r(user_name, &pwd, buf, bufsize, &pwd_result);
	if (pwd_result == NULL) {
		if (retval == 0) {
			error("getpwnam_r could not locate %s", user_name);
		} else {
			errno = retval;
			error("getpwnam_r: %m");
		}

		xfree(buf);
		return PAM_SESSION_ERR;
	}

	if (_load_cgroup_config() != SLURM_SUCCESS) {
		xfree(buf);	/* free the getpwnam_r buffer on this early return */
		return rc;
	}

	/* Check if there are any steps on the node from any user. A failure here
	 * likely means failures everywhere so exit on failure or if no local jobs
	 * exist. */
	steps = stepd_available(NULL, opts.node_name);
	if (!steps) {
		error("Error obtaining local step information.");
		goto cleanup;
	}

	/* Check to see if this user has only one job on the node. If so, choose
	 * that job and adopt this process into it (unless configured not to) */
	user_jobs = _user_job_count(steps, pwd.pw_uid, &stepd);
	if (user_jobs == 0) {
		if (opts.action_no_jobs == CALLERID_ACTION_DENY) {
			send_user_msg(pamh,
				      "Access denied by "
				      PAM_MODULE_NAME
				      ": you have no active jobs on this node");
			rc = PAM_PERM_DENIED;
		} else {
			debug("uid %u owns no jobs but action_no_jobs=ignore",
			      pwd.pw_uid);
			rc = PAM_IGNORE;
		}
		goto cleanup;
	} else if (user_jobs == 1) {
		if (opts.single_job_skip_rpc) {
			info("Connection by user %s: user has only one job %u",
			     user_name,
			     stepd->jobid);
			slurmrc = _adopt_process(getpid(), stepd);
			/* If adoption into the only job fails, it is time to
			 * exit. Return code is based on the
			 * action_adopt_failure setting */
			if (slurmrc == SLURM_SUCCESS ||
			    (opts.action_adopt_failure ==
			     CALLERID_ACTION_ALLOW))
				rc = PAM_SUCCESS;
			else
				rc = PAM_PERM_DENIED;
			goto cleanup;
		}
	} else {
		debug("uid %u has %d jobs", pwd.pw_uid, user_jobs);
	}

	/* Single job check turned up nothing (or we skipped it). Make RPC call
	 * to slurmd at source IP. If it can tell us the job, the function calls
	 * _adopt_process */
	rc = _try_rpc(&pwd);
	if (rc == PAM_SUCCESS)
		goto cleanup;

	/* The source of the connection either didn't reply or couldn't
	 * determine the job ID at the source. Proceed to action_unknown */
	rc = _action_unknown(pamh, &pwd, steps);

cleanup:
	FREE_NULL_LIST(steps);
	xfree(buf);
	xfree(slurm_cgroup_conf);
	xfree(opts.node_name);
	return rc;
}
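All five examples share the same skeleton: call stepd_available() to enumerate the step daemons on a node, iterate the resulting List of step_loc_t entries, optionally open an RPC connection to each daemon with stepd_connect(), and free the list when done. Below is a minimal sketch of that pattern; _visit_local_steps is a hypothetical name, and the Slurm-internal list and stepd_api declarations are assumed to be in scope.

static void
_visit_local_steps(const char *spooldir, const char *node_name)
{
	List steps;
	ListIterator itr;
	step_loc_t *stepd;

	/* A NULL spooldir selects the default spool directory,
	 * as in examples #1 and #2. */
	steps = stepd_available(spooldir, node_name);
	if (!steps)
		return;

	itr = list_iterator_create(steps);
	while ((stepd = list_next(itr))) {
		int fd = stepd_connect(stepd->directory, stepd->nodename,
				       stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;	/* stepd already exited; skip it */
		/* ... issue stepd_* RPCs on fd here ... */
		close(fd);
	}
	list_iterator_destroy(itr);
	FREE_NULL_LIST(steps);	/* NULL-safe destroy, as in examples #2 and #5 */
}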