Пример #1
0
/*
 * acct_gather_profile_p_task_end - make sure the per-task group for a
 *	finished task exists in the node-step HDF5 file and stamp it with
 *	the CPUs-per-task attribute.
 *
 * IN taskpid - pid of the task; translated to a task id through
 *	_get_taskid_from_pid().
 * RET SLURM_SUCCESS when nothing has to be done (no step id, profiling
 *	not enabled) or when the attributes were written, SLURM_FAILURE
 *	when the pid cannot be mapped, the file is not open, or a group
 *	cannot be opened/created.
 *
 * Relies on state declared outside this function: g_job,
 * g_profile_running, file_id, gid_node and gid_tasks.
 */
extern int acct_gather_profile_p_task_end(pid_t taskpid)
{
	hid_t   gid_task;
	char 	group_task[MAX_GROUP_NAME+1];
	uint32_t task_id;
	int rc = SLURM_SUCCESS;

	xassert(_run_in_daemon());
	xassert(g_job);

	if (g_job->stepid == NO_VAL)
		return rc;

	xassert(g_profile_running != ACCT_GATHER_PROFILE_NOT_SET);

	if (!_do_profile(ACCT_GATHER_PROFILE_NOT_SET, g_profile_running))
		return rc;

	if (_get_taskid_from_pid(taskpid, &task_id) != SLURM_SUCCESS)
		return SLURM_FAILURE;
	if (file_id == -1) {
		info("PROFILE: add_task_data, HDF5 file is not open");
		return SLURM_FAILURE;
	}
	/* The Tasks container group is created lazily on first use. */
	if (gid_tasks < 0) {
		gid_tasks = make_group(gid_node, GRP_TASKS);
		if (gid_tasks < 1) {
			info("PROFILE: Failed to create Tasks group");
			return SLURM_FAILURE;
		}
	}
	/* Bounded write; task_id is unsigned, so format it as such. */
	snprintf(group_task, sizeof(group_task), "%s_%u", GRP_TASK,
		 (unsigned int) task_id);
	gid_task = get_group(gid_tasks, group_task);
	if (gid_task == -1) {
		gid_task = make_group(gid_tasks, group_task);
		if (gid_task < 0) {
			info("Failed to open tasks %s", group_task);
			return SLURM_FAILURE;
		}
		put_int_attribute(gid_task, ATTR_TASKID, task_id);
	}
	put_int_attribute(gid_task, ATTR_CPUPERTASK, g_job->cpus_per_task);

	/*
	 * gid_task is a local handle: release it here, otherwise one HDF5
	 * group identifier is leaked for every task that ends.
	 */
	H5Gclose(gid_task);

	if (debug_flags & DEBUG_FLAG_PROFILE)
		info("PROFILE: task_end");
	return rc;
}
Пример #2
0
/*
 * acct_gather_profile_p_node_step_start - create the node-step HDF5 profile
 *	file for a step and its top-level node group.
 *
 * IN job - step record; stored in g_job for later calls.
 * RET SLURM_SUCCESS when profiling is not requested or the file and node
 *	group were created, SLURM_FAILURE when either creation fails.
 *
 * On success file_id and gid_node (declared outside this function) hold
 * the open file and node group. On failure file_id is left at -1.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;

	time_t start_time;
	char    *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	/* No step id: nothing is profiled for this record. */
	if (g_job->stepid == NO_VAL) {
		g_profile_running = ACCT_GATHER_PROFILE_NONE;
		return rc;
	}

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	/* <dir>/<user>/<jobid>_<stepid>_<node>.h5 */
	profile_file_name = xstrdup_printf(
		"%s/%s/%u_%u_%s.h5",
		hdf5_conf.dir, g_job->pwd->pw_name,
		g_job->jobid, g_job->stepid, g_job->node_name);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	// Create a new file using the default properties.
	profile_init();
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);

	/*
	 * Check the creation result before touching ownership and mode, so
	 * a failed create is reported as such and not followed by a chown
	 * error on a file that may not exist.
	 */
	if (file_id < 1) {
		info("PROFILE: Failed to create HDF5 file %s",
		     profile_file_name);
		file_id = -1;
		xfree(profile_file_name);
		return SLURM_FAILURE;
	}

	/* Hand the file to the job's user and make it private to them. */
	if (chown(profile_file_name, (uid_t)g_job->pwd->pw_uid,
		  (gid_t)g_job->pwd->pw_gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name, 0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/*
	 * NOTE(review): group_node is declared outside this function, so its
	 * capacity is not visible here; confirm it can hold
	 * "/<GRP_NODE>_<node_name>".
	 */
	sprintf(group_node, "/%s_%s", GRP_NODE, g_job->node_name);
	gid_node = H5Gcreate(file_id, group_node, H5P_DEFAULT,
			     H5P_DEFAULT, H5P_DEFAULT);
	if (gid_node < 1) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}
	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	start_time = time(NULL);
	/*
	 * NOTE(review): ctime() text ends with '\n', so the stored start
	 * time carries a trailing newline; confirm readers expect that.
	 */
	put_string_attribute(gid_node, ATTR_STARTTIME, ctime(&start_time));

	return rc;
}
Пример #3
0
/*
 * _merge_step_files - merge every node-step file of params.job_id found in
 *	<params.dir>/<params.user> into the single job file params.output.
 *
 * The directory is scanned once per step index (stepx). File names are
 * expected to look like <jobid>_<stepid>_<node>.h5. During each scan the
 * highest step id seen for the job is tracked in max_step, and scanning
 * continues until stepx passes it.
 *
 * Exits the process if the directory cannot be opened; calls fatal() if
 * the job file cannot be created.
 */
static void _merge_step_files(void)
{
	hid_t 	fid_job = -1, jgid_step = -1, jgid_nodes = -1, jgid_tasks = -1;
	DIR    *dir;
	struct  dirent *de;
	char	file_name[MAX_PROFILE_PATH+1];
	char	step_dir[MAX_PROFILE_PATH+1];
	char	step_path[MAX_PROFILE_PATH+1];
	char	jgrp_step_name[MAX_GROUP_NAME+1];
	char	jgrp_nodes_name[MAX_GROUP_NAME+1];
	char	jgrp_tasks_name[MAX_GROUP_NAME+1];
	char 	*step_node, *pos_char, *stepno;
	int	stepx = 0, num_steps = 0, nodex = -1, max_step = -1;
	int	jobid, stepid;
	bool	found_files = false;

	snprintf(step_dir, sizeof(step_dir), "%s/%s", params.dir, params.user);
	while (max_step == -1 || stepx <= max_step) {
		if (!(dir = opendir(step_dir))) {
			error("opendir for job profile directory: %m");
			exit(1);
		}
		nodex = 0;
		while ((de = readdir(dir))) {
			/* Work on a bounded private copy; it is cut in place. */
			snprintf(file_name, sizeof(file_name), "%s",
				 de->d_name);
			if (file_name[0] == '.')
				continue; // Not HDF5 file
			pos_char = strstr(file_name,".h5");
			if (!pos_char) {
				error("error processing this file, %s, "
				      "(not .h5)", de->d_name);
				continue; // Not HDF5 file
			}
			*pos_char = 0; // truncate .h5
			pos_char = strchr(file_name,'_');
			if (!pos_char)
				continue; // not right format
			*pos_char = 0; // make jobid string
			jobid = strtol(file_name, NULL, 10);
			if (jobid != params.job_id)
				continue; // not desired job
			stepno = pos_char + 1;
			pos_char = strchr(stepno,'_');
			if (!pos_char) {
				continue; // not right format
			}
			*pos_char = 0; // make stepid string
			stepid = strtol(stepno, NULL, 10);
			/* Track the highest step id, also for other steps. */
			if (stepid > max_step)
				max_step = stepid;
			if (stepid != stepx)
				continue; // Not step we are merging
			step_node = pos_char + 1;
			// Found a node step file for this job
			if (!found_files) {
				// Need to create the job file
				fid_job = H5Fcreate(params.output,
						    H5F_ACC_TRUNC,
						    H5P_DEFAULT,
						    H5P_DEFAULT);
				if (fid_job < 0) {
					fatal("Failed to %s %s",
					      "create HDF5 file:",
					      params.output);
				}
				found_files = true;
			}
			/* First node of this step: create the step groups. */
			if (nodex == 0) {
				num_steps++;
				snprintf(jgrp_step_name,
					 sizeof(jgrp_step_name), "/%s_%d",
					 GRP_STEP, stepx);
				jgid_step = make_group(fid_job, jgrp_step_name);
				if (jgid_step < 0) {
					error("Failed to create %s",
					      jgrp_step_name);
					continue;
				}
				snprintf(jgrp_nodes_name,
					 sizeof(jgrp_nodes_name), "%s/%s",
					 jgrp_step_name,
					 GRP_NODES);
				jgid_nodes = make_group(jgid_step,
							jgrp_nodes_name);
				if (jgid_nodes < 0) {
					error("Failed to create %s",
					      jgrp_nodes_name);
					continue;
				}
				snprintf(jgrp_tasks_name,
					 sizeof(jgrp_tasks_name), "%s/%s",
					 jgrp_step_name,
					 GRP_TASKS);
				jgid_tasks = make_group(jgid_step,
							jgrp_tasks_name);
				if (jgid_tasks < 0) {
					error("Failed to create %s",
					      jgrp_tasks_name);
					continue;
				}
			}
			snprintf(step_path, sizeof(step_path), "%s/%s",
				 step_dir, de->d_name);
			debug("Adding %s to the job file", step_path);
			_merge_node_step_data(fid_job, step_path,
					      nodex, step_node,
					      jgid_nodes, jgid_tasks);
			nodex++;
		}
		closedir(dir);
		if (nodex > 0) {
			put_int_attribute(jgid_step, ATTR_NNODES, nodex);
			H5Gclose(jgid_tasks);
			H5Gclose(jgid_nodes);
			H5Gclose(jgid_step);
		}
		/*
		 * A full scan that saw no step file for this job leaves
		 * max_step at -1. Without this exit the loop condition
		 * would stay true forever.
		 */
		if (max_step == -1)
			break;
		stepx++;
	}
	if (!found_files)
		info("No node-step files found for jobid=%d", params.job_id);
	else
		put_int_attribute(fid_job, ATTR_NSTEPS, num_steps);
	if (fid_job != -1)
		H5Fclose(fid_job);
}
Пример #4
0
/*
 * _merge_task_totals - copy the per-task "totals" data of one node-step
 *	file into the job file.
 *
 * IN jg_tasks  - tasks group of the job file (destination)
 * IN nsg_node  - node group of the node-step file (source)
 * IN node_name - node name recorded on each created job task group
 *
 * For every task group under the source Tasks group a "<GRP_TASK>_<id>"
 * group is created in jg_tasks, the task id / cpus-per-task attributes are
 * copied, and each entry of the source totals group is re-written into a
 * new totals group. Failures are logged and the affected task or entry is
 * skipped.
 */
static void _merge_task_totals(hid_t jg_tasks, hid_t nsg_node, char* node_name)
{
	hid_t   jg_task, jg_totals, nsg_totals,
		g_total, nsg_tasks, nsg_task = -1;
	hsize_t nobj, ntasks = 0;
	int	i, len, taskx, taskid, taskcpus, size_data;
	void    *data;
	uint32_t type;
	char    buf[MAX_GROUP_NAME+1];
	char    group_name[MAX_GROUP_NAME+1];
	H5G_info_t group_info;

	if (jg_tasks < 0) {
		info("Job Tasks is not HDF5 object");
		return;
	}
	if (nsg_node < 0) {
		info("Node-Step is not HDF5 object");
		return;
	}

	nsg_tasks = get_group(nsg_node, GRP_TASKS);
	if (nsg_tasks < 0) {
		debug("No Tasks group in node-step file");
		return;
	}

	/* group_info is only meaningful when the query succeeds. */
	if (H5Gget_info(nsg_tasks, &group_info) < 0) {
		debug("Failed to get Tasks group info");
		H5Gclose(nsg_tasks);
		return;
	}
	ntasks = group_info.nlinks;
	for (taskx = 0; ((int)ntasks>0) && (taskx<((int)ntasks)); taskx++) {
		// Get the name of the group.
		buf[0] = '\0';	/* keep the log below defined on failure */
		len = H5Lget_name_by_idx(nsg_tasks, ".", H5_INDEX_NAME,
					 H5_ITER_INC, taskx, buf,
					 sizeof(buf), H5P_DEFAULT);
		if (len<1 || len>MAX_GROUP_NAME) {
			info("Invalid group name %s", buf);
			continue;
		}
		nsg_task = H5Gopen(nsg_tasks, buf, H5P_DEFAULT);
		if (nsg_task < 0) {
			debug("Failed to open %s", buf);
			continue;
		}
		taskid = get_int_attribute(nsg_task, ATTR_TASKID);
		snprintf(group_name, sizeof(group_name), "%s_%d",
			 GRP_TASK, taskid);
		jg_task = H5Gcreate(jg_tasks, group_name,
				    H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
		if (jg_task < 0) {
			H5Gclose(nsg_task);
			info("Failed to create job task group");
			continue;
		}
		put_string_attribute(jg_task, ATTR_NODENAME, node_name);
		put_int_attribute(jg_task, ATTR_TASKID, taskid);
		taskcpus = get_int_attribute(nsg_task, ATTR_CPUPERTASK);
		put_int_attribute(jg_task, ATTR_CPUPERTASK, taskcpus);
		nsg_totals = get_group(nsg_task, GRP_TOTALS);
		if (nsg_totals < 0) {
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			continue;
		}
		jg_totals = H5Gcreate(jg_task, GRP_TOTALS,
				      H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
		if (jg_totals < 0) {
			/* nsg_totals is open at this point: release it too. */
			H5Gclose(nsg_totals);
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			info("Failed to create job task totals");
			continue;
		}
		if (H5Gget_info(nsg_totals, &group_info) < 0) {
			info("Failed to get %s group info", GRP_TOTALS);
			nobj = 0;
		} else {
			nobj = group_info.nlinks;
		}
		for (i = 0; (nobj>0) && ((hsize_t)i<nobj); i++) {
			// Get the name of the group.
			buf[0] = '\0';
			len = H5Lget_name_by_idx(nsg_totals, ".", H5_INDEX_NAME,
						 H5_ITER_INC, i, buf,
						 sizeof(buf), H5P_DEFAULT);

			if (len<1 || len>MAX_GROUP_NAME) {
				info("Invalid group name %s", buf);
				continue;
			}
			g_total = H5Gopen(nsg_totals, buf, H5P_DEFAULT);
			if (g_total < 0) {
				info("Failed to open %s", buf);
				continue;
			}
			type = get_uint32_attribute(g_total, ATTR_DATATYPE);
			if (!type) {
				H5Gclose(g_total);
				info("No %s attribute", ATTR_DATATYPE);
				continue;
			}
			data = get_hdf5_data(g_total, type, buf, &size_data);
			if (data == NULL) {
				H5Gclose(g_total);
				info("Failed to get group %s type %s data", buf,
				     acct_gather_profile_type_to_string(type));
				continue;
			}
			put_hdf5_data(jg_totals, type, SUBDATA_DATA,
				      buf, data, 1);
			xfree(data);
			H5Gclose(g_total);
		}
		H5Gclose(nsg_totals);
		H5Gclose(nsg_task);
		H5Gclose(jg_totals);
		H5Gclose(jg_task);
	}
	H5Gclose(nsg_tasks);
}
Пример #5
0
/*
 * acct_gather_profile_p_node_step_start - create the node-step HDF5 profile
 *	file for a step (or the batch step) and its top-level node group.
 *
 * IN job - step record; stored in g_job for later calls.
 * RET SLURM_SUCCESS when profiling is not requested or the file and node
 *	group were created, SLURM_FAILURE when either creation fails.
 *
 * On success file_id, gid_node and step_start_time (declared outside this
 * function) are set. On failure file_id is left at -1.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;

	char *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	/* Use a more user friendly string "batch" rather
	 * than 4294967294.
	 */
	if (g_job->stepid == NO_VAL) {
		profile_file_name = xstrdup_printf("%s/%s/%u_%s_%s.h5",
						   hdf5_conf.dir,
						   g_job->user_name,
						   g_job->jobid,
						   "batch",
						   g_job->node_name);
	} else {
		profile_file_name = xstrdup_printf(
			"%s/%s/%u_%u_%s.h5",
			hdf5_conf.dir, g_job->user_name,
			g_job->jobid, g_job->stepid, g_job->node_name);
	}

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	// Create a new file using the default properties.
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);

	/*
	 * Check the creation result before touching ownership and mode, so
	 * a failed create is reported as such and not followed by a chown
	 * error on a file that may not exist.
	 */
	if (file_id < 1) {
		info("PROFILE: Failed to create HDF5 file %s",
		     profile_file_name);
		file_id = -1;
		xfree(profile_file_name);
		return SLURM_FAILURE;
	}

	/* Hand the file to the job's user and make it private to them. */
	if (chown(profile_file_name, (uid_t)g_job->uid,
		  (gid_t)g_job->gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name, 0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/* fd_set_close_on_exec(file_id); Not supported for HDF5 */
	/*
	 * NOTE(review): group_node is declared outside this function, so its
	 * capacity is not visible here; confirm it can hold "/<node_name>".
	 */
	sprintf(group_node, "/%s", g_job->node_name);
	gid_node = make_group(file_id, group_node);
	if (gid_node < 0) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}
	put_int_attribute(gid_node, ATTR_NODEINX, g_job->nodeid);
	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	put_int_attribute(gid_node, ATTR_CPUPERTASK, g_job->cpus_per_task);

	step_start_time = time(NULL);
	put_string_attribute(gid_node, ATTR_STARTTIME,
			     slurm_ctime2(&step_start_time));

	return rc;
}
Пример #6
0
/*
 * _merge_step_files - merge every node-step file of params.job_id found in
 *	<params.dir>/<params.user> into the single job file params.output.
 *
 * The directory is scanned once per step index (stepx). File names are
 * expected to look like <jobid>_<stepid>_<node>.h5. The highest step id
 * seen for the job bounds the number of scans.
 *
 * RET 0 on completion (including "no files found"), -1 when the directory
 *	cannot be opened or the job file cannot be created.
 */
static int _merge_step_files(void)
{
	hid_t fid_job = -1;
	hid_t jgid_step = -1;
	hid_t jgid_nodes = -1;
	hid_t jgid_tasks = -1;
	DIR *dir;
	struct  dirent *de;
	char file_name[MAX_PROFILE_PATH+1];
	char step_dir[MAX_PROFILE_PATH+1];
	char step_path[MAX_PROFILE_PATH+1];
	char jgrp_step_name[MAX_GROUP_NAME+1];
	char jgrp_nodes_name[MAX_GROUP_NAME+1];
	char jgrp_tasks_name[MAX_GROUP_NAME+1];
	char *step_node;
	char *pos_char;
	char *stepno;
	int	stepx = 0;
	int num_steps = 0;
	int nodex = -1;
	int max_step = -1;
	int	jobid, stepid;
	bool found_files = false;

	snprintf(step_dir, sizeof(step_dir), "%s/%s", params.dir, params.user);

	while (max_step == -1 || stepx <= max_step) {

		if (!(dir = opendir(step_dir))) {
			error("Cannot open %s job profile directory: %m", step_dir);
			/* A job file opened by an earlier pass must not leak. */
			if (fid_job != -1)
				H5Fclose(fid_job);
			return -1;
		}

		nodex = 0;
		while ((de = readdir(dir))) {

			/* Work on a bounded private copy; it is cut in place. */
			snprintf(file_name, sizeof(file_name), "%s",
			         de->d_name);
			if (file_name[0] == '.')
				continue;

			/* Strip the ".h5" extension. */
			pos_char = strstr(file_name,".h5");
			if (!pos_char)
				continue;
			*pos_char = 0;

			/* Isolate and check the job id. */
			pos_char = strchr(file_name,'_');
			if (!pos_char)
				continue;
			*pos_char = 0;

			jobid = strtol(file_name, NULL, 10);
			if (jobid != params.job_id)
				continue;

			/* Isolate the step id. */
			stepno = pos_char + 1;
			pos_char = strchr(stepno,'_');
			if (!pos_char) {
				continue;
			}
			*pos_char = 0;

			stepid = strtol(stepno, NULL, 10);
			/* Track the highest step id, also for other steps. */
			if (stepid > max_step)
				max_step = stepid;
			if (stepid != stepx)
				continue;

			step_node = pos_char + 1;

			/* First matching file: create the job file. */
			if (!found_files) {
				fid_job = H5Fcreate(params.output,
				                    H5F_ACC_TRUNC,
				                    H5P_DEFAULT,
				                    H5P_DEFAULT);
				if (fid_job < 0) {
					error("Failed create HDF5 file %s", params.output);
					/* Do not leak the open directory stream. */
					closedir(dir);
					return -1;
				}
				found_files = true;
			}

			/* First node of this step: create the step groups. */
			if (nodex == 0) {

				num_steps++;
				snprintf(jgrp_step_name,
				         sizeof(jgrp_step_name), "/%s_%d",
				         GRP_STEP, stepx);

				jgid_step = make_group(fid_job, jgrp_step_name);
				if (jgid_step < 0) {
					error("Failed to create %s", jgrp_step_name);
					continue;
				}

				snprintf(jgrp_nodes_name,
				         sizeof(jgrp_nodes_name), "%s/%s",
				         jgrp_step_name,
				         GRP_NODES);
				jgid_nodes = make_group(jgid_step,
				                        jgrp_nodes_name);
				if (jgid_nodes < 0) {
					error("Failed to create %s", jgrp_nodes_name);
					continue;
				}

				snprintf(jgrp_tasks_name,
				         sizeof(jgrp_tasks_name), "%s/%s",
				         jgrp_step_name,
				         GRP_TASKS);
				jgid_tasks = make_group(jgid_step,
				                        jgrp_tasks_name);
				if (jgid_tasks < 0) {
					error("Failed to create %s", jgrp_tasks_name);
					continue;
				}
			}

			snprintf(step_path, sizeof(step_path), "%s/%s",
			         step_dir, de->d_name);
			debug("Adding %s to the job file", step_path);
			_merge_node_step_data(fid_job, step_path,
			                      nodex, step_node,
			                      jgid_nodes, jgid_tasks);
			nodex++;
		}

		closedir(dir);

		if (nodex > 0) {
			put_int_attribute(jgid_step, ATTR_NNODES, nodex);
			H5Gclose(jgid_tasks);
			H5Gclose(jgid_nodes);
			H5Gclose(jgid_step);
		}

		/* If we did not find the step 0
		 * bail out.
		 */
		if (stepx == 0
			&& !found_files)
			break;

		stepx++;
	}

	if (!found_files)
		info("No node-step files found for jobid %d", params.job_id);
	else
		put_int_attribute(fid_job, ATTR_NSTEPS, num_steps);

	if (fid_job != -1)
		H5Fclose(fid_job);

	return 0;
}