Beispiel #1
0
/*
 * _merge_node_step_data - fold one node-step profile file into the job file.
 *
 * Creates a group named node_name under jgid_nodes, opens the node-step file
 * and its "/<GRP_NODE>_<node_name>" group, copies the start-time attribute
 * and then merges node totals, task totals and the series data. When the
 * merge ran and params.keepfiles is not set, the node-step file is removed.
 *
 * IN fid_job    - job file id (not referenced by this function)
 * IN file_name  - path of the node-step HDF5 file
 * IN nodeIndex  - index of the node (not referenced by this function)
 * IN node_name  - name of the node being merged
 * IN jgid_nodes - job-file group under which the node group is created
 * IN jgid_tasks - job-file group handed to the task and series merge helpers
 */
static void _merge_node_step_data(hid_t fid_job, char* file_name, int nodeIndex,
				  char* node_name, hid_t jgid_nodes,
				  hid_t jgid_tasks)
{
	hid_t	fid_nodestep, jgid_node, nsgid_root, nsgid_node;
	char	*start_time;
	char	group_name[MAX_GROUP_NAME+1];

	jgid_node = H5Gcreate(jgid_nodes, node_name,
			      H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
	if (jgid_node < 0) {
		error("Failed to create group %s",node_name);
		return;
	}
	put_string_attribute(jgid_node, ATTR_NODENAME, node_name);

	// Process node step file
	// Open the file and the node group.
	fid_nodestep = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT);
	if (fid_nodestep < 0) {
		H5Gclose(jgid_node);
		error("Failed to open %s",file_name);
		return;
	}
	nsgid_root = H5Gopen(fid_nodestep,"/", H5P_DEFAULT);
	// Bounded write: node_name comes from the caller and could be long.
	snprintf(group_name, sizeof(group_name), "/%s_%s",
		 GRP_NODE, node_name);
	nsgid_node = H5Gopen(nsgid_root, group_name, H5P_DEFAULT);
	if (nsgid_node < 0) {
		if (nsgid_root >= 0)
			H5Gclose(nsgid_root);
		// fid_nodestep is a file id, so it must go through H5Fclose.
		H5Fclose(fid_nodestep);
		H5Gclose(jgid_node);
		error("Failed to open node group");
		return;
	}
	start_time = get_string_attribute(nsgid_node,ATTR_STARTTIME);
	if (start_time == NULL) {
		info("No %s attribute", ATTR_STARTTIME);
	} else {
		put_string_attribute(jgid_node, ATTR_STARTTIME, start_time);
		xfree(start_time);
	}
	_merge_node_totals(jgid_node, nsgid_node);
	_merge_task_totals(jgid_tasks, nsgid_node, node_name);
	_merge_series_data(jgid_tasks, jgid_node, nsgid_node);

	// Release every object opened in the node-step file before closing
	// the file itself, so no handle is left open when it is removed.
	H5Gclose(nsgid_node);
	H5Gclose(nsgid_root);
	H5Fclose(fid_nodestep);
	H5Gclose(jgid_node);

	if (!params.keepfiles)
		remove(file_name);

	return;
}
/*
 * acct_gather_profile_p_add_sample_data - store one sample in the HDF5 file.
 *
 * The sample pointed to by data is interpreted according to type, copied into
 * the matching profile_*_t record and written under the samples group of the
 * node group. Each call that reaches the write uses the next value of a
 * function-static sample counter as part of the sample name.
 *
 * IN type - one of the ACCT_GATHER_PROFILE_* sample types handled below
 * IN data - pointer to the sample; its layout depends on type
 * RET SLURM_SUCCESS when the sample was written or profiling of this type is
 *     not active; SLURM_ERROR for an unknown type or an unresolved task id;
 *     SLURM_FAILURE when the file is not open or a group cannot be created.
 */
extern int acct_gather_profile_p_add_sample_data(uint32_t type, void *data)
{
	hid_t   g_sample_grp;
	char    group[MAX_GROUP_NAME+1];
	char 	group_sample[MAX_GROUP_NAME+1];
	static uint32_t sample_no = 0;
	uint32_t task_id = 0;
	void *send_profile = NULL;
	char *type_name = NULL;

	profile_task_t  profile_task;
	profile_network_t  profile_network;
	profile_energy_t  profile_energy;
	profile_io_t  profile_io;

	/* Only the view matching 'type' is dereferenced below. */
	struct jobacctinfo *jobacct = (struct jobacctinfo *)data;
	acct_network_data_t *net = (acct_network_data_t *)data;
	acct_energy_data_t *ener = (acct_energy_data_t *)data;
	struct lustre_data *lus = (struct lustre_data *)data;

	xassert(_run_in_daemon());
	xassert(g_job);

	if (g_job->stepid == NO_VAL)
		return SLURM_SUCCESS;

	xassert(g_profile_running != ACCT_GATHER_PROFILE_NOT_SET);

	if (!_do_profile(type, g_profile_running))
		return SLURM_SUCCESS;

	switch (type) {
	case ACCT_GATHER_PROFILE_ENERGY:
		snprintf(group, sizeof(group), "%s", GRP_ENERGY);

		memset(&profile_energy, 0, sizeof(profile_energy_t));
		profile_energy.time = ener->time;
		profile_energy.cpu_freq = ener->cpu_freq;
		profile_energy.power = ener->power;

		send_profile = &profile_energy;
		break;
	case ACCT_GATHER_PROFILE_TASK:
		if (_get_taskid_from_pid(jobacct->pid, &task_id)
		    != SLURM_SUCCESS)
			return SLURM_ERROR;

		snprintf(group, sizeof(group), "%s_%u", GRP_TASK, task_id);

		memset(&profile_task, 0, sizeof(profile_task_t));
		profile_task.time = time(NULL);
		profile_task.cpu_freq = jobacct->act_cpufreq;
		profile_task.cpu_time = jobacct->tot_cpu;
		/* NOTE(review): cpu_utilization is filled with the same
		 * tot_cpu value as cpu_time; presumably the real utilization
		 * is derived later -- confirm. */
		profile_task.cpu_utilization = jobacct->tot_cpu;
		profile_task.pages = jobacct->tot_pages;
		profile_task.read_size = jobacct->tot_disk_read;
		profile_task.rss = jobacct->tot_rss;
		profile_task.vm_size = jobacct->tot_vsize;
		profile_task.write_size = jobacct->tot_disk_write;

		send_profile = &profile_task;
		break;
	case ACCT_GATHER_PROFILE_LUSTRE:
		snprintf(group, sizeof(group), "%s", GRP_LUSTRE);

		memset(&profile_io, 0, sizeof(profile_io_t));
		profile_io.time = time(NULL);
		profile_io.reads = lus->reads;
		profile_io.read_size = lus->read_size;
		profile_io.writes = lus->writes;
		profile_io.write_size = lus->write_size;

		send_profile = &profile_io;

		break;
	case ACCT_GATHER_PROFILE_NETWORK:

		snprintf(group, sizeof(group), "%s", GRP_NETWORK);

		memset(&profile_network, 0, sizeof(profile_network_t));
		profile_network.time = time(NULL);
		profile_network.packets_in = net->packets_in;
		profile_network.size_in = net->size_in;
		profile_network.packets_out = net->packets_out;
		profile_network.size_out = net->size_out;

		send_profile = &profile_network;

		break;
	default:
		/* type is unsigned, so print it with %u. */
		error("acct_gather_profile_p_add_sample_data: "
		      "Unknown type %u sent", type);
		return SLURM_ERROR;
	}

	type_name = acct_gather_profile_type_to_string(type);

	if (debug_flags & DEBUG_FLAG_PROFILE)
		info("PROFILE: add_sample_data Group-%s Type=%s",
		     group, type_name);

	if (file_id == -1) {
		if (debug_flags & DEBUG_FLAG_PROFILE) {
			// This can happen from samples from the gather threads
			// before the step actually starts.
			info("PROFILE: add_sample_data, HDF5 file not open");
		}
		return SLURM_FAILURE;
	}
	/* The samples group is created on first use. */
	if (gid_samples < 0) {
		gid_samples = make_group(gid_node, GRP_SAMPLES);
		if (gid_samples < 1) {
			info("PROFILE: failed to create TimeSeries group");
			return SLURM_FAILURE;
		}
	}
	g_sample_grp = get_group(gid_samples, group);
	if (g_sample_grp < 0) {
		g_sample_grp = make_group(gid_samples, group);
		if (g_sample_grp < 0) {
			info("PROFILE: failed to open TimeSeries %s", group);
			return SLURM_FAILURE;
		}
		put_string_attribute(g_sample_grp, ATTR_DATATYPE, type_name);
	}
	/* group may already fill its buffer; bound the write so appending
	 * the 11-character suffix cannot overrun group_sample. */
	snprintf(group_sample, sizeof(group_sample), "%s_%10.10u",
		 group, ++sample_no);
	put_hdf5_data(g_sample_grp, type, SUBDATA_SAMPLE,
		      group_sample, send_profile, 1);
	H5Gclose(g_sample_grp);

	return SLURM_SUCCESS;
}
/*
 * acct_gather_profile_p_node_step_start - open the node-step profile file.
 *
 * Records job in g_job, decides whether profiling is active and, if so,
 * creates "<hdf5_conf.dir>/<user>/<jobid>_<stepid>_<node>.h5", hands it to the
 * job's user with mode 0600 and creates the node group with its node-name,
 * task-count and start-time attributes.
 *
 * IN job - step record; stored in the file-level g_job
 * RET SLURM_SUCCESS when profiling is off or the file and node group were
 *     created; SLURM_FAILURE when the file or the node group cannot be
 *     created.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;

	time_t start_time;
	char    *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	/* A step id of NO_VAL disables profiling for this step. */
	if (g_job->stepid == NO_VAL) {
		g_profile_running = ACCT_GATHER_PROFILE_NONE;
		return rc;
	}

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	profile_file_name = xstrdup_printf(
		"%s/%s/%u_%u_%s.h5",
		hdf5_conf.dir, g_job->pwd->pw_name,
		g_job->jobid, g_job->stepid, g_job->node_name);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	// Create a new file using the default properties.
	profile_init();
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);
	/* Check the create result first: there is no point changing owner
	 * and mode of a file that could not be created. */
	if (file_id < 1) {
		info("PROFILE: Failed to create file %s", profile_file_name);
		xfree(profile_file_name);
		file_id = -1;
		return SLURM_FAILURE;
	}

	if (chown(profile_file_name, (uid_t)g_job->pwd->pw_uid,
		  (gid_t)g_job->pwd->pw_gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name,  0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/* NOTE(review): group_node is declared outside this block, so its
	 * capacity is not visible here -- confirm it can hold the name. */
	sprintf(group_node, "/%s_%s", GRP_NODE, g_job->node_name);
	gid_node = H5Gcreate(file_id, group_node, H5P_DEFAULT,
			     H5P_DEFAULT, H5P_DEFAULT);
	if (gid_node < 1) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}
	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	start_time = time(NULL);
	put_string_attribute(gid_node, ATTR_STARTTIME, ctime(&start_time));

	return rc;
}
Beispiel #4
0
/*
 * _merge_task_totals - copy per-task totals from a node-step file to the job.
 *
 * For every link in the node-step tasks group, a "<GRP_TASK>_<taskid>" group
 * is created under jg_tasks carrying the node name, task id and cpus-per-task
 * attributes. Every entry of the task's totals group is then read and written
 * into a totals group of the new job task group.
 *
 * IN jg_tasks  - job-file group that receives the per-task groups
 * IN nsg_node  - node group of the node-step file
 * IN node_name - node name stored on each created task group
 */
static void _merge_task_totals(hid_t jg_tasks, hid_t nsg_node, char* node_name)
{
	hid_t   jg_task, jg_totals, nsg_totals,
		g_total, nsg_tasks, nsg_task = -1;
	hsize_t nobj, ntasks, i, taskx;
	int	len, taskid, taskcpus, size_data;
	void    *data;
	uint32_t type;
	char    buf[MAX_GROUP_NAME+1];
	char    group_name[MAX_GROUP_NAME+1];
	H5G_info_t group_info;

	if (jg_tasks < 0) {
		info("Job Tasks is not HDF5 object");
		return;
	}
	if (nsg_node < 0) {
		info("Node-Step is not HDF5 object");
		return;
	}

	nsg_tasks = get_group(nsg_node, GRP_TASKS);
	if (nsg_tasks < 0) {
		debug("No Tasks group in node-step file");
		return;
	}

	/* group_info is only meaningful when the query succeeds. */
	if (H5Gget_info(nsg_tasks, &group_info) < 0) {
		debug("Failed to get Tasks group info");
		H5Gclose(nsg_tasks);
		return;
	}
	ntasks = group_info.nlinks;
	for (taskx = 0; taskx < ntasks; taskx++) {
		// Get the name of the group.
		/* Pass the full buffer size so a name of MAX_GROUP_NAME
		 * characters is not silently truncated, and clear buf so the
		 * message below never prints stale bytes. */
		buf[0] = '\0';
		len = H5Lget_name_by_idx(nsg_tasks, ".", H5_INDEX_NAME,
					 H5_ITER_INC, taskx, buf,
					 sizeof(buf), H5P_DEFAULT);
		if (len<1 || len>MAX_GROUP_NAME) {
			info("Invalid group name %s", buf);
			continue;
		}
		nsg_task = H5Gopen(nsg_tasks, buf, H5P_DEFAULT);
		if (nsg_task < 0) {
			debug("Failed to open %s", buf);
			continue;
		}
		taskid = get_int_attribute(nsg_task, ATTR_TASKID);
		snprintf(group_name, sizeof(group_name), "%s_%d",
			 GRP_TASK, taskid);
		jg_task = H5Gcreate(jg_tasks, group_name,
				    H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
		if (jg_task < 0) {
			H5Gclose(nsg_task);
			info("Failed to create job task group");
			continue;
		}
		put_string_attribute(jg_task, ATTR_NODENAME, node_name);
		put_int_attribute(jg_task, ATTR_TASKID, taskid);
		taskcpus = get_int_attribute(nsg_task, ATTR_CPUPERTASK);
		put_int_attribute(jg_task, ATTR_CPUPERTASK, taskcpus);
		nsg_totals = get_group(nsg_task, GRP_TOTALS);
		if (nsg_totals < 0) {
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			continue;
		}
		jg_totals = H5Gcreate(jg_task, GRP_TOTALS,
				      H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
		if (jg_totals < 0) {
			/* nsg_totals is open at this point and must be
			 * released together with the task groups. */
			H5Gclose(nsg_totals);
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			info("Failed to create job task totals");
			continue;
		}
		/* Treat a failed query as an empty totals group. */
		if (H5Gget_info(nsg_totals, &group_info) < 0)
			nobj = 0;
		else
			nobj = group_info.nlinks;
		for (i = 0; i < nobj; i++) {
			// Get the name of the group.
			buf[0] = '\0';
			len = H5Lget_name_by_idx(nsg_totals, ".", H5_INDEX_NAME,
						 H5_ITER_INC, i, buf,
						 sizeof(buf), H5P_DEFAULT);

			if (len<1 || len>MAX_GROUP_NAME) {
				info("Invalid group name %s", buf);
				continue;
			}
			g_total = H5Gopen(nsg_totals, buf, H5P_DEFAULT);
			if (g_total < 0) {
				info("Failed to open %s", buf);
				continue;
			}
			type = get_uint32_attribute(g_total, ATTR_DATATYPE);
			if (!type) {
				H5Gclose(g_total);
				info("No %s attribute", ATTR_DATATYPE);
				continue;
			}
			data = get_hdf5_data(g_total, type, buf, &size_data);
			if (data == NULL) {
				H5Gclose(g_total);
				info("Failed to get group %s type %s data", buf,
				     acct_gather_profile_type_to_string(type));
				continue;
			}
			put_hdf5_data(jg_totals, type, SUBDATA_DATA,
				      buf, data, 1);
			xfree(data);
			H5Gclose(g_total);
		}
		H5Gclose(nsg_totals);
		H5Gclose(nsg_task);
		H5Gclose(jg_totals);
		H5Gclose(jg_task);
	}
	H5Gclose(nsg_tasks);
}
Beispiel #5
0
/*
 * acct_gather_profile_p_node_step_start - open the node-step profile file.
 *
 * Records job in g_job, decides whether profiling is active and, if so,
 * creates the per-node HDF5 file under "<hdf5_conf.dir>/<user_name>/", hands
 * it to the job's uid/gid with mode 0600 and creates the node group with its
 * node-index, node-name, task-count, cpus-per-task and start-time attributes.
 *
 * IN job - step record; stored in the file-level g_job
 * RET SLURM_SUCCESS when profiling is off or the file and node group were
 *     created; SLURM_FAILURE when the file or the node group cannot be
 *     created.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;

	char *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	/* Use a more user friendly string "batch" rather
	 * than 4294967294.
	 */
	if (g_job->stepid == NO_VAL) {
		profile_file_name = xstrdup_printf("%s/%s/%u_%s_%s.h5",
						   hdf5_conf.dir,
						   g_job->user_name,
						   g_job->jobid,
						   "batch",
						   g_job->node_name);
	} else {
		profile_file_name = xstrdup_printf(
			"%s/%s/%u_%u_%s.h5",
			hdf5_conf.dir, g_job->user_name,
			g_job->jobid, g_job->stepid, g_job->node_name);
	}

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	// Create a new file using the default properties.
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);
	/* Check the create result first: there is no point changing owner
	 * and mode of a file that could not be created. */
	if (file_id < 1) {
		info("PROFILE: Failed to create file %s", profile_file_name);
		xfree(profile_file_name);
		file_id = -1;
		return SLURM_FAILURE;
	}

	if (chown(profile_file_name, (uid_t)g_job->uid,
		  (gid_t)g_job->gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name,  0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/* fd_set_close_on_exec(file_id); Not supported for HDF5 */
	/* NOTE(review): group_node is declared outside this block, so its
	 * capacity is not visible here -- confirm it can hold the name. */
	sprintf(group_node, "/%s", g_job->node_name);
	gid_node = make_group(file_id, group_node);
	if (gid_node < 0) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}
	put_int_attribute(gid_node, ATTR_NODEINX, g_job->nodeid);
	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	put_int_attribute(gid_node, ATTR_CPUPERTASK, g_job->cpus_per_task);

	step_start_time = time(NULL);
	put_string_attribute(gid_node, ATTR_STARTTIME,
			     slurm_ctime2(&step_start_time));

	return rc;
}