Esempio n. 1
0
/*
 * Collect one accounting record per process using getprocs().
 *
 * task_list   IN - not referenced in the visible part of this function
 * pgid_plugin IN - when false, only the pids reported for cont_id are
 *                  queried; when true, getprocs() is called repeatedly
 *                  until it stops returning exactly one entry
 * cont_id     IN - container ID handed to proctrack_g_get_pids()
 * callbacks   IN - not referenced in the visible part of this function
 *
 * NOTE(review): this excerpt looks truncated. `i`, `proc`, `prec_list` and
 * `pagesize` are used but not declared here, the `finished` label targeted
 * by the goto is absent, and no value is returned although the return type
 * is List. Confirm against the complete source before relying on it.
 * NOTE(review): getprocs() with pi_* fields is presumably the AIX process
 * table interface - confirm.
 */
static List _get_precs(List task_list, bool pgid_plugin, uint64_t cont_id,
		       jag_callbacks_t *callbacks)
{
	jag_prec_t *prec = NULL;
	int pid = 0;

	if (!pgid_plugin) {
		pid_t *pids = NULL;
		int npids = 0;
		/* get only the processes in the proctrack container */
		proctrack_g_get_pids(cont_id, &pids, &npids);
		if (!npids) {
			debug4("no pids in this container %"PRIu64"", cont_id);
			goto finished;
		}
		/* NOTE(review): pids is not released anywhere in the visible
		 * code - verify whether the missing tail frees it. */
		for (i = 0; i < npids; i++) {
			pid = pids[i];
			if (!getprocs(&proc, sizeof(proc), 0, 0, &pid, 1))
				continue; /* Assume the process went away */
			/* NOTE(review): allocation is sized with prec_t while
			 * the pointer is a jag_prec_t * - confirm the two
			 * types match. */
			prec = xmalloc(sizeof(prec_t));
			list_append(prec_list, prec);
			prec->pid = proc.pi_pid;
			prec->ppid = proc.pi_ppid;
			/* user/system CPU time: seconds plus microseconds
			 * scaled by 1e-6 */
			prec->usec = proc.pi_ru.ru_utime.tv_sec +
				proc.pi_ru.ru_utime.tv_usec * 1e-6;
			prec->ssec = proc.pi_ru.ru_stime.tv_sec +
				proc.pi_ru.ru_stime.tv_usec * 1e-6;
			prec->pages = proc.pi_majflt;
			prec->rss = (proc.pi_trss + proc.pi_drss) * pagesize;
			//prec->rss *= 1024;
			prec->vsize = (proc.pi_tsize / 1024);
			prec->vsize += (proc.pi_dvm * pagesize);
			//prec->vsize *= 1024;
			/*  debug("vsize = %f = (%d/1024)+(%d*%d)",   */
/*    		      prec->vsize, proc.pi_tsize, proc.pi_dvm, pagesize);  */
		}
	} else {
		/* pid starts at 0 and is passed by address on every call;
		 * the loop ends when getprocs() no longer returns 1.
		 * NOTE(review): presumably getprocs() advances pid itself -
		 * confirm. */
		while (getprocs(&proc, sizeof(proc), 0, 0, &pid, 1) == 1) {
			prec = xmalloc(sizeof(prec_t));
			list_append(prec_list, prec);
			prec->pid = proc.pi_pid;
			prec->ppid = proc.pi_ppid;
			prec->usec = proc.pi_ru.ru_utime.tv_sec +
				proc.pi_ru.ru_utime.tv_usec * 1e-6;
			prec->ssec = proc.pi_ru.ru_stime.tv_sec +
				proc.pi_ru.ru_stime.tv_usec * 1e-6;
			prec->pages = proc.pi_majflt;
			prec->rss = (proc.pi_trss + proc.pi_drss) * pagesize;
			//prec->rss *= 1024;
			prec->vsize = (proc.pi_tsize / 1024);
			prec->vsize += (proc.pi_dvm * pagesize);
			//prec->vsize *= 1024;
			/*  debug("vsize = %f = (%d/1024)+(%d*%d)",   */
/*    		      prec->vsize, proc.pi_tsize, proc.pi_dvm, pagesize);  */
		}
	}
}
Esempio n. 2
0
/*
 * Deliver a signal to every process in a container.
 * cont_id IN - container ID as returned by proctrack_g_create()
 * signal IN  - signal to send; zero means perform error checking only,
 *              without sending anything
 *
 * For SIGKILL the container's processes are first checked (at most twice,
 * two seconds apart) for an ongoing core dump. If any is still dumping
 * after the second pass, delivery is handed to a background thread and
 * SLURM_SUCCESS is returned immediately.
 *
 * Returns a SLURM errno.
 */
extern int proctrack_g_signal(uint64_t cont_id, int signal)
{
	pid_t *pid_list = NULL;
	int pid_cnt = 0, dumping = 0;
	int pass, idx;

	if (slurm_proctrack_init() < 0)
		return SLURM_ERROR;

	/* NOTE: proctrack_g_get_pids() is not supported
	 * by the proctrack/pgid plugin */
	if ((signal == SIGKILL) &&
	    (proctrack_g_get_pids(cont_id, &pid_list, &pid_cnt) ==
	     SLURM_SUCCESS)) {
		for (pass = 0; pass < 2; pass++) {
			/* Give dumping processes time before the re-check */
			if (pass > 0)
				sleep(2);

			dumping = 0;
			for (idx = 0; idx < pid_cnt; idx++) {
				char *path = NULL;

				/* Zeroed entries were cleared on an
				 * earlier pass */
				if (pid_list[idx] == 0)
					continue;

				xstrfmtcat(path, "/proc/%d/stat",
					   (int) pid_list[idx]);
				if (!_test_core_dumping(path)) {
					/* Don't test this PID again */
					pid_list[idx] = 0;
				} else {
					debug("Process %d continuing "
					      "core dump",
					      (int) pid_list[idx]);
					dumping++;
				}
				xfree(path);
			}

			if (!dumping)
				break;
		}
		xfree(pid_list);

		if (dumping) {
			info("Defering sending signal, processes in "
			     "job are currently core dumping");
			_spawn_signal_thread(cont_id, signal);
			return SLURM_SUCCESS;
		}
	}

	return (*(ops.signal)) (cont_id, signal);
}
Esempio n. 3
0
File: req.c Progetto: birc-aeh/slurm
/*
 * Write the list of pids in the step's container to fd.
 *
 * The stream is a 32-bit count followed by that many 32-bit pids.
 * We don't check the uid in this function, anyone may list the task info.
 *
 * fd  IN - descriptor the reply is written to
 * job IN - step record; jobid, stepid and cont_id are read
 *
 * Returns SLURM_SUCCESS, or SLURM_FAILURE if the rwfail path is taken
 * (safe_write() is expected to jump there on a failed write).
 */
static int
_handle_list_pids(int fd, stepd_step_rec_t *job)
{
	int i, rc = SLURM_FAILURE;
	pid_t *pids = NULL;
	int npids = 0;
	uint32_t count, pid;

	debug("_handle_list_pids for job %u.%u", job->jobid, job->stepid);
	proctrack_g_get_pids(job->cont_id, &pids, &npids);

	/* Send the count from an object that really is 32 bits wide */
	count = (uint32_t) npids;
	safe_write(fd, &count, sizeof(count));
	for (i = 0; i < npids; i++) {
		pid = (uint32_t)pids[i];
		safe_write(fd, &pid, sizeof(pid));
	}
	rc = SLURM_SUCCESS;

rwfail:
	/* Single cleanup path; freed unconditionally so an allocated but
	 * empty array is not leaked (xfree() of NULL is harmless). */
	xfree(pids);
	return rc;
}
Esempio n. 4
0
/*
 * Thread body that delivers a deferred signal to a container.
 *
 * Every five seconds the container's pids are re-read. Processes that are
 * not core dumping are signalled right away; the loop repeats while any
 * process is still dumping. Finally the signal is sent to the whole
 * container through the plugin.
 *
 * args IN - agent_arg_t holding cont_id and signal; freed here
 *
 * Always returns NULL.
 */
static void *_sig_agent(void *args)
{
	agent_arg_t *agent_arg_ptr = args;

	while (1) {
		pid_t *pids = NULL;
		int i, npids = 0, hung_pids = 0;
		char *stat_fname = NULL;

		if (proctrack_g_get_pids(agent_arg_ptr->cont_id, &pids,
					     &npids) == SLURM_SUCCESS) {
			for (i = 0; i < npids; i++) {
				xstrfmtcat(stat_fname, "/proc/%d/stat",
					   (int) pids[i]);
				if (_test_core_dumping(stat_fname)) {
					debug("Process %d continuing "
					      "core dump",
					      (int) pids[i]);
					hung_pids++;
				} else {
					/* Kill processes that we can now */
					kill(pids[i], agent_arg_ptr->signal);
				}
				xfree(stat_fname);
			}
			/* A fresh array is fetched on every pass, so release
			 * this one before looping or leaving. */
			xfree(pids);
		}
		if (hung_pids == 0)
			break;
		sleep(5);
	}

	(void) (*(ops.signal)) (agent_arg_ptr->cont_id, agent_arg_ptr->signal);
	xfree(args);
	return NULL;
}
Esempio n. 5
0
/*
 * Build the list of per-process accounting records from /proc.
 *
 * task_list   IN - task list; only its first entry is touched, and only
 *                  when the container has no pids (energy refresh)
 * pgid_plugin IN - false: inspect only the pids reported for cont_id;
 *                  true: inspect every numeric entry under /proc
 * cont_id     IN - container ID handed to proctrack_g_get_pids()
 * callbacks   IN - passed through to _handle_stats()
 *
 * Returns the newly created list; it may be empty.
 */
static List _get_precs(List task_list, bool pgid_plugin, uint64_t cont_id,
		       jag_callbacks_t *callbacks)
{
	List prec_list = list_create(destroy_jag_prec);
	char	proc_stat_file[256];	/* Allow ~20x extra length */
	char	proc_io_file[256];	/* Allow ~20x extra length */
	static	int	slash_proc_open = 0;
	int i;

	if (!pgid_plugin) {
		pid_t *pids = NULL;
		int npids = 0;
		/* get only the processes in the proctrack container */
		proctrack_g_get_pids(cont_id, &pids, &npids);
		if (!npids) {
			/* update consumed energy even if pids do not exist */
			ListIterator itr = list_iterator_create(task_list);
			struct jobacctinfo *jobacct = NULL;
			if ((jobacct = list_next(itr))) {
				acct_gather_energy_g_get_data(
					energy_profile,
					&jobacct->energy);
				debug2("getjoules_task energy = %u",
				       jobacct->energy.consumed_energy);
			}
			list_iterator_destroy(itr);

			debug4("no pids in this container %"PRIu64"", cont_id);
			/* Release the array even when it is reported empty */
			xfree(pids);
			goto finished;
		}
		for (i = 0; i < npids; i++) {
			snprintf(proc_stat_file, sizeof(proc_stat_file),
				 "/proc/%d/stat", (int) pids[i]);
			snprintf(proc_io_file, sizeof(proc_io_file),
				 "/proc/%d/io", (int) pids[i]);
			_handle_stats(prec_list, proc_stat_file, proc_io_file,
				      callbacks);
		}
		xfree(pids);
	} else {
		struct dirent *slash_proc_entry;

		/* The directory handle is kept open across calls and
		 * rewound on reuse. */
		if (slash_proc_open) {
			rewinddir(slash_proc);
		} else {
			slash_proc=opendir("/proc");
			if (slash_proc == NULL) {
				perror("opening /proc");
				goto finished;
			}
			slash_proc_open=1;
		}

		while ((slash_proc_entry = readdir(slash_proc))) {
			const char *name = slash_proc_entry->d_name;
			int len;

			/* Only all-digit names (which really should be
			 * pids) are of interest. */
			for (i = 0; name[i]; i++) {
				if ((name[i] < '0') || (name[i] > '9'))
					break;
			}
			if ((i == 0) || name[i])
				continue;

			/* Bounded formatting: skip an entry rather than
			 * write past the fixed-size buffers. */
			len = snprintf(proc_stat_file, sizeof(proc_stat_file),
				       "/proc/%s/stat", name);
			if ((len < 0) || (len >= (int) sizeof(proc_stat_file)))
				continue;
			len = snprintf(proc_io_file, sizeof(proc_io_file),
				       "/proc/%s/io", name);
			if ((len < 0) || (len >= (int) sizeof(proc_io_file)))
				continue;

			_handle_stats(prec_list, proc_stat_file, proc_io_file,
				      callbacks);
		}
	}

finished:

	return prec_list;
}
Esempio n. 6
0
File: req.c Progetto: A1ve5/slurm
/*
 * Wait for the given pid to end, then look for any children it might have
 * left behind and wait on them instead.
 *
 * Once the pid is gone, its accounting data is folded into the step, and
 * every pid still in the step's container whose parent pid is 1 is added
 * to the set of tracked external pids.
 *
 * args IN - extern_pid_t holding the step record and the pid; freed here
 *
 * Always returns NULL.
 */
static void *_wait_extern_pid(void *args)
{
	extern_pid_t *extern_pid = (extern_pid_t *)args;

	stepd_step_rec_t *job = extern_pid->job;
	pid_t pid = extern_pid->pid;

	jobacctinfo_t *jobacct = NULL;
	pid_t *pids = NULL;
	int npids = 0, i;
	char	proc_stat_file[256];	/* Allow ~20x extra length */
	FILE *stat_fp = NULL;
	int fd;
	char sbuf[256], *tmp, state;
	int num_read, ppid;

	xfree(extern_pid);

	_block_on_pid(pid);
	jobacct = jobacct_gather_remove_task(pid);
	if (jobacct) {
		job->jobacct->energy.consumed_energy = 0;
		jobacctinfo_aggregate(job->jobacct, jobacct);
		jobacctinfo_destroy(jobacct);
	}
	acct_gather_profile_g_task_end(pid);

	/* See if we have any children of init left and add them to track. */
	proctrack_g_get_pids(job->cont_id, &pids, &npids);
	for (i = 0; i < npids; i++) {
		snprintf(proc_stat_file, sizeof(proc_stat_file),
			 "/proc/%d/stat", (int) pids[i]);
		if (!(stat_fp = fopen(proc_stat_file, "r")))
			continue;  /* Assume the process went away */
		fd = fileno(stat_fp);
		fcntl(fd, F_SETFD, FD_CLOEXEC);

		num_read = read(fd, sbuf, (sizeof(sbuf) - 1));

		if (num_read <= 0)
			goto next_pid;

		sbuf[num_read] = '\0';

		/* Find the end of the command name. The last ')' is used
		 * because the name itself may contain parentheses. Skip
		 * the entry if the data is malformed or cut short. */
		tmp = strrchr(sbuf, ')');
		if (!tmp || (tmp[1] == '\0'))
			goto next_pid;

		/* Skip the ')' and the space after it; the next two
		 * fields are the state and the parent pid. */
		if (sscanf(tmp + 2, "%c %d ", &state, &ppid) != 2)
			goto next_pid;

		if (ppid == 1) {
			debug2("adding tracking of orphaned process %d",
			       pids[i]);
			_handle_add_extern_pid_internal(job, pids[i]);
		}
	next_pid:
		fclose(stat_fp);
	}
	/* The pid array is owned by this function once returned */
	xfree(pids);

	return NULL;
}
Esempio n. 7
0
/*
 * Thread body that delivers a deferred signal to a container.
 *
 * The container's pids are polled; while any of them is core dumping,
 * nothing is signalled and the check is retried after five seconds.
 * Once none is dumping, each pid is signalled individually and then the
 * whole container is signalled through the plugin.
 *
 * args IN - agent_arg_t holding cont_id and signal; freed here
 *
 * Always returns NULL.
 */
static void *_sig_agent(void *args)
{
	agent_arg_t *agent_arg_ptr = args;
	bool core_dumping = false;

	for (;;) {
		pid_t *pid_list = NULL;
		int pid_cnt = 0;
		int idx;

		/* Back off only after a pass that found a core dump */
		if (core_dumping)
			sleep(5);
		core_dumping = false;

		if (proctrack_g_get_pids(agent_arg_ptr->cont_id, &pid_list,
					 &pid_cnt) != SLURM_SUCCESS)
			break;

		/*
		 * Look for a process that is core dumping and stop at the
		 * first one found. If there is one, none of the processes
		 * are signalled on this pass.
		 *
		 * This works around an issue with OpenMP applications
		 * failing to write a full core file out - only one of the
		 * processes will be marked as core dumping, but killing
		 * any of them will terminate the application.
		 */
		for (idx = 0; (idx < pid_cnt) && !core_dumping; idx++) {
			char *path = NULL;

			xstrfmtcat(path, "/proc/%d/stat",
				   (int) pid_list[idx]);
			if (_test_core_dumping(path)) {
				debug("Process %d continuing core dump",
				      (int) pid_list[idx]);
				core_dumping = true;
			}
			xfree(path);
		}

		if (!core_dumping) {
			/* Kill processes */
			for (idx = 0; idx < pid_cnt; idx++)
				kill(pid_list[idx], agent_arg_ptr->signal);
		}
		xfree(pid_list);

		if (!core_dumping)
			break;
	}

	(void) (*(ops.signal)) (agent_arg_ptr->cont_id, agent_arg_ptr->signal);
	xfree(args);
	return NULL;
}