Exemple #1
0
static void _handle_stats(List prec_list, char *proc_stat_file,
			  char *proc_io_file, jag_callbacks_t *callbacks)
{
	static int no_share_data = -1;
	FILE *stat_fp = NULL;
	FILE *io_fp = NULL;
	int fd, fd2;
	jag_prec_t *prec = NULL;

	if (no_share_data == -1) {
		char *acct_params = slurm_get_jobacct_gather_params();
		if (acct_params && strstr(acct_params, "NoShare"))
			no_share_data = 1;
		else
			no_share_data = 0;
		xfree(acct_params);
	}

	if (!(stat_fp = fopen(proc_stat_file, "r")))
		return;  /* Assume the process went away */
	/*
	 * Close the file on exec() of user tasks.
	 *
	 * NOTE: If we fork() slurmstepd after the
	 * fopen() above and before the fcntl() below,
	 * then the user task may have this extra file
	 * open, which can cause problems for
	 * checkpoint/restart, but this should be a very rare
	 * problem in practice.
	 */
	fd = fileno(stat_fp);
	fcntl(fd, F_SETFD, FD_CLOEXEC);

	prec = xmalloc(sizeof(jag_prec_t));
	if (_get_process_data_line(fd, prec)) {
		if (no_share_data)
			_remove_share_data(proc_stat_file, prec);
		list_append(prec_list, prec);
		if ((io_fp = fopen(proc_io_file, "r"))) {
			fd2 = fileno(io_fp);
			fcntl(fd2, F_SETFD, FD_CLOEXEC);
			_get_process_io_data_line(fd2, prec);
			fclose(io_fp);
		}
		if (callbacks->prec_extra)
			(*(callbacks->prec_extra))(prec, my_pagesize);
	} else
		xfree(prec);
	fclose(stat_fp);

}
Exemple #2
0
static void _handle_stats(List prec_list, char *proc_stat_file, char *proc_io_file,
			  char *proc_smaps_file, jag_callbacks_t *callbacks)
{
	static int no_share_data = -1;
	static int use_pss = -1;
	FILE *stat_fp = NULL;
	FILE *io_fp = NULL;
	int fd, fd2;
	jag_prec_t *prec = NULL;

	if (no_share_data == -1) {
		char *acct_params = slurm_get_jobacct_gather_params();
		if (acct_params && xstrcasestr(acct_params, "NoShare"))
			no_share_data = 1;
		else
			no_share_data = 0;

		if (acct_params && xstrcasestr(acct_params, "UsePss"))
			use_pss = 1;
		else
			use_pss = 0;
		xfree(acct_params);
	}

	if (!(stat_fp = fopen(proc_stat_file, "r")))
		return;  /* Assume the process went away */
	/*
	 * Close the file on exec() of user tasks.
	 *
	 * NOTE: If we fork() slurmstepd after the
	 * fopen() above and before the fcntl() below,
	 * then the user task may have this extra file
	 * open, which can cause problems for
	 * checkpoint/restart, but this should be a very rare
	 * problem in practice.
	 */
	fd = fileno(stat_fp);
	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)
		error("%s: fcntl(%s): %m", __func__, proc_stat_file);

	prec = try_xmalloc(sizeof(jag_prec_t));
	if (prec == NULL) {	/* Avoid killing slurmstepd on malloc failure */
		fclose(stat_fp);
		return;
	}
	if (!_get_process_data_line(fd, prec)) {
		xfree(prec);
		fclose(stat_fp);
		return;
	}
	fclose(stat_fp);

	/* Remove shared data from rss */
	if (no_share_data)
		_remove_share_data(proc_stat_file, prec);

	/* Use PSS instead if RSS */
	if (use_pss) {
		if (_get_pss(proc_smaps_file, prec) == -1) {
			xfree(prec);
			return;
		}
	}

	list_append(prec_list, prec);

	if ((io_fp = fopen(proc_io_file, "r"))) {
		fd2 = fileno(io_fp);
		if (fcntl(fd2, F_SETFD, FD_CLOEXEC) == -1)
			error("%s: fcntl: %m", __func__);
		_get_process_io_data_line(fd2, prec);
		fclose(io_fp);
	}
	if (callbacks->prec_extra)
		(*(callbacks->prec_extra))(prec);
}
Exemple #3
0
extern void jag_common_poll_data(
	List task_list, bool pgid_plugin, uint64_t cont_id,
	jag_callbacks_t *callbacks, bool profile)
{
	/* Update the data */
	List prec_list = NULL;
	uint64_t total_job_mem = 0, total_job_vsize = 0;
	ListIterator itr;
	jag_prec_t *prec = NULL;
	struct jobacctinfo *jobacct = NULL;
	static int processing = 0;
	char sbuf[72];
	int energy_counted = 0;
	time_t ct;
	static int no_over_memory_kill = -1;

	xassert(callbacks);

	if (!pgid_plugin && (cont_id == (uint64_t)NO_VAL)) {
		debug("cont_id hasn't been set yet not running poll");
		return;
	}

	if (processing) {
		debug("already running, returning");
		return;
	}
	processing = 1;

	if (no_over_memory_kill == -1) {
		char *acct_params = slurm_get_jobacct_gather_params();
		if (acct_params && strstr(acct_params, "NoOverMemoryKill"))
			no_over_memory_kill = 1;
		else
			no_over_memory_kill = 0;
		xfree(acct_params);
	}

	if (!callbacks->get_precs)
		callbacks->get_precs = _get_precs;

	ct = time(NULL);
	prec_list = (*(callbacks->get_precs))(task_list, pgid_plugin, cont_id,
					      callbacks);

	if (!list_count(prec_list) || !task_list || !list_count(task_list))
		goto finished;	/* We have no business being here! */

	itr = list_iterator_create(task_list);
	while ((jobacct = list_next(itr))) {
		uint32_t cpu_calc;
		uint32_t last_total_cputime;
		if (!(prec = list_find_first(prec_list, _find_prec, jobacct)))
			continue;

#if _DEBUG
		info("pid:%u ppid:%u rss:%d KB",
		     prec->pid, prec->ppid, prec->rss);
#endif
		/* find all my descendents */
		if (callbacks->get_offspring_data)
			(*(callbacks->get_offspring_data))
				(prec_list, prec, prec->pid);

		last_total_cputime = jobacct->tot_cpu;

		cpu_calc = (prec->ssec + prec->usec)/hertz;
		/* tally their usage */
		jobacct->max_rss =
			MAX(jobacct->max_rss, prec->rss);
		jobacct->tot_rss = prec->rss;
		total_job_mem += prec->rss;
		jobacct->max_vsize =
			MAX(jobacct->max_vsize, prec->vsize);
		jobacct->tot_vsize = prec->vsize;
		total_job_vsize += prec->vsize;
		jobacct->max_pages =
			MAX(jobacct->max_pages, prec->pages);
		jobacct->tot_pages = prec->pages;
		jobacct->max_disk_read = MAX(
			jobacct->max_disk_read,
			prec->disk_read);
		jobacct->tot_disk_read = prec->disk_read;
		jobacct->max_disk_write = MAX(
			jobacct->max_disk_write,
			prec->disk_write);

		jobacct->tot_disk_write = prec->disk_write;
		jobacct->min_cpu =
			MAX(jobacct->min_cpu, cpu_calc);

		/* Update the cpu times
		 */
		jobacct->tot_cpu = cpu_calc;
		jobacct->user_cpu_sec = prec->usec/hertz;
		jobacct->sys_cpu_sec = prec->ssec/hertz;
		debug2("%s: %d mem size %"PRIu64" %"PRIu64" "
		       "time %u(%u+%u)", __func__,
		       jobacct->pid, jobacct->max_rss,
		       jobacct->max_vsize, jobacct->tot_cpu,
		       jobacct->user_cpu_sec,
		       jobacct->sys_cpu_sec);
		/* compute frequency */
		jobacct->this_sampled_cputime =
			cpu_calc - last_total_cputime;
		_get_sys_interface_freq_line(
			prec->last_cpu,
			"cpuinfo_cur_freq", sbuf);
		jobacct->act_cpufreq =
			_update_weighted_freq(jobacct, sbuf);
		debug("%s: Task average frequency = %u "
		       "pid %d mem size %"PRIu64" %"PRIu64" "
		       "time %u(%u+%u)", __func__,
		       jobacct->act_cpufreq,
		       jobacct->pid, jobacct->max_rss,
		       jobacct->max_vsize, jobacct->tot_cpu,
		       jobacct->user_cpu_sec,
		       jobacct->sys_cpu_sec);
		/* get energy consumption
		 * only once is enough since we
		 * report per node energy consumption */
		debug2("energycounted = %d", energy_counted);
		if (energy_counted == 0) {
			acct_gather_energy_g_get_data(
				energy_profile,
				&jobacct->energy);
			debug2("getjoules_task energy = %"PRIu64,
			       jobacct->energy.consumed_energy);
			energy_counted = 1;
		}
		if (profile &&
		    acct_gather_profile_g_is_active(ACCT_GATHER_PROFILE_TASK)) {
			jobacct->cur_time = ct;

			_record_profile(jobacct);

			jobacct->last_tot_disk_read = jobacct->tot_disk_read;
			jobacct->last_tot_disk_write = jobacct->tot_disk_write;
			jobacct->last_total_cputime = jobacct->tot_cpu;
			jobacct->last_time = jobacct->cur_time;
		}
	}
	list_iterator_destroy(itr);

	if (!no_over_memory_kill)
		jobacct_gather_handle_mem_limit(total_job_mem, total_job_vsize);

finished:
	FREE_NULL_LIST(prec_list);
	processing = 0;
}
Exemple #4
0
static void _handle_stats(List prec_list, char *proc_stat_file,
			  char *proc_io_file, char *proc_smaps_file,
			  jag_callbacks_t *callbacks,
			  int tres_count)
{
	static int no_share_data = -1;
	static int use_pss = -1;
	FILE *stat_fp = NULL;
	FILE *io_fp = NULL;
	int fd, fd2, i;
	jag_prec_t *prec = NULL;

	if (no_share_data == -1) {
		char *acct_params = slurm_get_jobacct_gather_params();
		if (acct_params && xstrcasestr(acct_params, "NoShare"))
			no_share_data = 1;
		else
			no_share_data = 0;

		if (acct_params && xstrcasestr(acct_params, "UsePss"))
			use_pss = 1;
		else
			use_pss = 0;
		xfree(acct_params);
	}

	if (!(stat_fp = fopen(proc_stat_file, "r")))
		return;  /* Assume the process went away */
	/*
	 * Close the file on exec() of user tasks.
	 *
	 * NOTE: If we fork() slurmstepd after the
	 * fopen() above and before the fcntl() below,
	 * then the user task may have this extra file
	 * open, which can cause problems for
	 * checkpoint/restart, but this should be a very rare
	 * problem in practice.
	 */
	fd = fileno(stat_fp);
	if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)
		error("%s: fcntl(%s): %m", __func__, proc_stat_file);

	prec = xmalloc(sizeof(jag_prec_t));

	if (!tres_count) {
		assoc_mgr_lock_t locks = {
			NO_LOCK, NO_LOCK, NO_LOCK, NO_LOCK,
			READ_LOCK, NO_LOCK, NO_LOCK };
		assoc_mgr_lock(&locks);
		tres_count = g_tres_count;
		assoc_mgr_unlock(&locks);
	}

	prec->tres_count = tres_count;
	prec->tres_data = xmalloc(prec->tres_count *
				  sizeof(acct_gather_data_t));

	/* Initialize read/writes */
	for (i = 0; i < prec->tres_count; i++) {
		prec->tres_data[i].num_reads = INFINITE64;
		prec->tres_data[i].num_writes = INFINITE64;
		prec->tres_data[i].size_read = INFINITE64;
		prec->tres_data[i].size_write = INFINITE64;
	}

	if (!_get_process_data_line(fd, prec)) {
		xfree(prec->tres_data);
		xfree(prec);
		fclose(stat_fp);
		return;
	}
	fclose(stat_fp);

	if (acct_gather_filesystem_g_get_data(prec->tres_data) < 0) {
		debug2("problem retrieving filesystem data");
	}

	if (acct_gather_interconnect_g_get_data(prec->tres_data) < 0) {
		debug2("problem retrieving interconnect data");
	}

	/* Remove shared data from rss */
	if (no_share_data)
		_remove_share_data(proc_stat_file, prec);

	/* Use PSS instead if RSS */
	if (use_pss) {
		if (_get_pss(proc_smaps_file, prec) == -1) {
			xfree(prec->tres_data);
			xfree(prec);
			return;
		}
	}

	list_append(prec_list, prec);

	if ((io_fp = fopen(proc_io_file, "r"))) {
		fd2 = fileno(io_fp);
		if (fcntl(fd2, F_SETFD, FD_CLOEXEC) == -1)
			error("%s: fcntl: %m", __func__);
		_get_process_io_data_line(fd2, prec);
		fclose(io_fp);
	}
}