Example #1
0
/* Returns negative number on failure. Failures are likely to occur if a step
 * exits; this is not a problem. */
static uid_t _get_job_uid(step_loc_t *stepd)
{
	uid_t uid = -1;
	int fd;
	uint16_t protocol_version;

	fd = stepd_connect(stepd->directory, stepd->nodename,
			   stepd->jobid, stepd->stepid, &protocol_version);
	if (fd < 0) {
		/* It's normal for a step to exit */
		debug3("unable to connect to step %u.%u on %s: %m",
		       stepd->jobid, stepd->stepid, stepd->nodename);
		return -1;
	}

	uid = stepd_get_uid(fd, stepd->protocol_version);
	close(fd);

	/* The step may have exited. Not a big concern. */
	if ((int32_t)uid == -1)
		debug3("unable to determine uid of step %u.%u on %s",
		       stepd->jobid, stepd->stepid, stepd->nodename);

	return uid;
}
Example #2
0
/* Adopts a process into the given step. Returns SLURM_SUCCESS if
 * opts.action_adopt_failure == CALLERID_ACTION_ALLOW or if the process was
 * successfully adopted.
 */
static int _adopt_process(pid_t pid, step_loc_t *stepd)
{
	int fd;
	uint16_t protocol_version;
	int rc;

	if (!stepd)
		return -1;
	debug("_adopt_process: trying to get %u.%u to adopt %d",
	      stepd->jobid, stepd->stepid, pid);
	fd = stepd_connect(stepd->directory, stepd->nodename,
			   stepd->jobid, stepd->stepid, &protocol_version);
	if (fd < 0) {
		/* It's normal for a step to exit */
		debug3("unable to connect to step %u.%u on %s: %m",
		       stepd->jobid, stepd->stepid, stepd->nodename);
		return -1;
	}

	rc = stepd_add_extern_pid(fd, stepd->protocol_version, pid);
	close(fd);

	if (rc == PAM_SUCCESS)
		info("Process %d adopted into job %u", pid, stepd->jobid);
	else
		info("Process %d adoption FAILED for job %u",
		     pid, stepd->jobid);

	return rc;
}
Example #3
0
static void
_list_pids_one_step(const char *node_name, uint32_t jobid, uint32_t stepid)
{
	int fd;
	slurmstepd_task_info_t *task_info;
	uint32_t *pids;
	uint32_t count = 0;
	uint32_t tcount = 0;
	int i;

	fd = stepd_connect(NULL, node_name, jobid, stepid);
	if (fd == -1) {
		exit_code = 1;
		if (errno == ENOENT) {
			fprintf(stderr,
				"Job step %u.%u does not exist on this node.\n",
				jobid, stepid);
			exit_code = 1;
		} else {
			perror("Unable to connect to slurmstepd");
		}
		return;
	}

	stepd_task_info(fd, &task_info, &tcount);
	for (i = 0; i < (int)tcount; i++) {
		if (!task_info[i].exited) {
			if (stepid == NO_VAL)
				printf("%-8d %-8u %-6s %-7d %-8d\n",
				       task_info[i].pid,
				       jobid,
				       "batch",
				       task_info[i].id,
				       task_info[i].gtid);
			else
				printf("%-8d %-8u %-6u %-7d %-8d\n",
				       task_info[i].pid,
				       jobid,
				       stepid,
				       task_info[i].id,
				       task_info[i].gtid);
		}
	}

	stepd_list_pids(fd, &pids, &count);
	for (i = 0; i < count; i++) {
		if (!_in_task_array((pid_t)pids[i], task_info, tcount)) {
			if (stepid == NO_VAL)
				printf("%-8d %-8u %-6s %-7s %-8s\n",
				       pids[i], jobid, "batch", "-", "-");
			else
				printf("%-8d %-8u %-6u %-7s %-8s\n",
				       pids[i], jobid, stepid, "-", "-");
		}
	}


	if (count > 0)
		xfree(pids);
	if (tcount > 0)
		xfree(task_info);
	close(fd);
}
Example #4
0
static void
_reconfigure(void)
{
	List steps;
	ListIterator i;
	slurm_ctl_conf_t *cf;
	step_loc_t *stepd;
	bool did_change;

	_reconfig = 0;
	slurm_conf_reinit(conf->conffile);
	_read_config();

	/*
	 * Rebuild topology information and refresh slurmd topo infos
	 */
	slurm_topo_build_config();
	_set_topo_info();

	/*
	 * In case the administrator changed the cpu frequency set capabilities
	 * on this node, rebuild the cpu frequency table information
	 */
	cpu_freq_init(conf);

	_print_conf();

	/*
	 * Make best effort at changing to new public key
	 */
	slurm_cred_ctx_key_update(conf->vctx, conf->pubkey);

	/*
	 * Reinitialize the groups cache
	 */
	cf = slurm_conf_lock();
	if (cf->group_info & GROUP_CACHE)
		init_gids_cache(1);
	else
		init_gids_cache(0);
	slurm_conf_unlock();

	/* send reconfig to each stepd so they can refresh their log
	 * file handle
	 */

	steps = stepd_available(conf->spooldir, conf->node_name);
	i = list_iterator_create(steps);
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1)
			continue;
		if (stepd_reconfig(fd) != SLURM_SUCCESS)
			debug("Reconfig jobid=%u.%u failed: %m",
			      stepd->jobid, stepd->stepid);
		close(fd);
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	gres_plugin_reconfig(&did_change);
	(void) switch_g_reconfig();
	container_g_reconfig();
	if (did_change) {
		uint32_t cpu_cnt = MAX(conf->conf_cpus, conf->block_map_size);
		(void) gres_plugin_node_config_load(cpu_cnt);
		send_registration_msg(SLURM_SUCCESS, false);
	}

	/* reconfigure energy */
	acct_gather_energy_g_set_data(ENERGY_DATA_RECONFIG, NULL);

	/*
	 * XXX: reopen slurmd port?
	 */
}
Example #5
0
static void
_fill_registration_msg(slurm_node_registration_status_msg_t *msg)
{
	List steps;
	ListIterator i;
	step_loc_t *stepd;
	int  n;
	char *arch, *os;
	struct utsname buf;
	static bool first_msg = true;
	static time_t slurmd_start_time = 0;
	Buf gres_info;

	msg->node_name   = xstrdup (conf->node_name);
	msg->cpus	 = conf->cpus;
	msg->boards	 = conf->boards;
	msg->sockets	 = conf->sockets;
	msg->cores	 = conf->cores;
	msg->threads	 = conf->threads;
	msg->real_memory = conf->real_memory_size;
	msg->tmp_disk    = conf->tmp_disk_space;
	msg->hash_val    = slurm_get_hash_val();
	get_cpu_load(&msg->cpu_load);

	gres_info = init_buf(1024);
	if (gres_plugin_node_config_pack(gres_info) != SLURM_SUCCESS)
		error("error packing gres configuration");
	else
		msg->gres_info   = gres_info;

	get_up_time(&conf->up_time);
	msg->up_time     = conf->up_time;
	if (slurmd_start_time == 0)
		slurmd_start_time = time(NULL);
	msg->slurmd_start_time = slurmd_start_time;

	if (first_msg) {
		first_msg = false;
		info("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		     "Memory=%u TmpDisk=%u Uptime=%u",
		     msg->cpus, msg->boards, msg->sockets, msg->cores,
		     msg->threads, msg->real_memory, msg->tmp_disk,
		     msg->up_time);
	} else {
		debug3("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		       "Memory=%u TmpDisk=%u Uptime=%u",
		       msg->cpus, msg->boards, msg->sockets, msg->cores,
		       msg->threads, msg->real_memory, msg->tmp_disk,
		       msg->up_time);
	}
	uname(&buf);
	if ((arch = getenv("SLURM_ARCH")))
		msg->arch = xstrdup(arch);
	else
		msg->arch = xstrdup(buf.machine);
	if ((os = getenv("SLURM_OS")))
		msg->os   = xstrdup(os);
	else
		msg->os = xstrdup(buf.sysname);

	if (msg->startup) {
		if (switch_g_alloc_node_info(&msg->switch_nodeinfo))
			error("switch_g_alloc_node_info: %m");
		if (switch_g_build_node_info(msg->switch_nodeinfo))
			error("switch_g_build_node_info: %m");
	}

	steps = stepd_available(conf->spooldir, conf->node_name);
	msg->job_count = list_count(steps);
	msg->job_id    = xmalloc(msg->job_count * sizeof(*msg->job_id));
	/* Note: Running batch jobs will have step_id == NO_VAL */
	msg->step_id   = xmalloc(msg->job_count * sizeof(*msg->step_id));

	i = list_iterator_create(steps);
	n = 0;
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1) {
			--(msg->job_count);
			continue;
		}
		if (stepd_state(fd) == SLURMSTEPD_NOT_RUNNING) {
			debug("stale domain socket for stepd %u.%u ",
			      stepd->jobid, stepd->stepid);
			--(msg->job_count);
			close(fd);
			continue;
		}

		close(fd);
		if (stepd->stepid == NO_VAL)
			debug("found apparently running job %u", stepd->jobid);
		else
			debug("found apparently running step %u.%u",
			      stepd->jobid, stepd->stepid);
		msg->job_id[n]  = stepd->jobid;
		msg->step_id[n] = stepd->stepid;
		n++;
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	if (!msg->energy)
		msg->energy = acct_gather_energy_alloc();
	acct_gather_energy_g_get_data(ENERGY_DATA_STRUCT, msg->energy);

	msg->timestamp = time(NULL);

	return;
}
Example #6
0
/*
 * Send the termination signal to all of the unix domain socket files
 * for a given directory and nodename, and then unlink the files.
 * Returns SLURM_ERROR if any sockets could not be unlinked.
 */
int
stepd_cleanup_sockets(const char *directory, const char *nodename)
{
	DIR *dp;
	struct dirent *ent;
	regex_t re;
	struct stat stat_buf;
	int rc = SLURM_SUCCESS;

	_sockname_regex_init(&re, nodename);

	/*
	 * Make sure that "directory" exists and is a directory.
	 */
	if (stat(directory, &stat_buf) < 0) {
		error("Domain socket directory %s: %m", directory);
		goto done;
	} else if (!S_ISDIR(stat_buf.st_mode)) {
		error("%s is not a directory", directory);
		goto done;
	}

	if ((dp = opendir(directory)) == NULL) {
		error("Unable to open directory: %m");
		goto done;
	}

	while ((ent = readdir(dp)) != NULL) {
		uint32_t jobid, stepid;
		if (_sockname_regex(&re, ent->d_name, &jobid, &stepid) == 0) {
			char *path;
			int fd;

			path = NULL;
			xstrfmtcat(path, "%s/%s", directory, ent->d_name);
			verbose("Cleaning up stray job step %u.%u",
				jobid, stepid);

			/* signal the slurmstepd to terminate its step */
			fd = stepd_connect((char *) directory, (char *) nodename,
					jobid, stepid);
			if (fd == -1) {
				debug("Unable to connect to socket %s", path);
			} else {
				stepd_signal_container(fd, SIGKILL);
				close(fd);
			}

			/* make sure that the socket has been removed */
			if (unlink(path) == -1 && errno != ENOENT) {
				error("Unable to clean up stray socket %s: %m",
				      path);
				rc = SLURM_ERROR;
			}
			xfree(path);
		}
	}

	closedir(dp);
done:
	regfree(&re);
	return rc;
}