extern int acct_gather_profile_p_task_end(pid_t taskpid) { hid_t gid_task; char group_task[MAX_GROUP_NAME+1]; uint32_t task_id; int rc = SLURM_SUCCESS; xassert(_run_in_daemon()); xassert(g_job); if (g_job->stepid == NO_VAL) return rc; xassert(g_profile_running != ACCT_GATHER_PROFILE_NOT_SET); if (!_do_profile(ACCT_GATHER_PROFILE_NOT_SET, g_profile_running)) return rc; if (_get_taskid_from_pid(taskpid, &task_id) != SLURM_SUCCESS) return SLURM_FAILURE; if (file_id == -1) { info("PROFILE: add_task_data, HDF5 file is not open"); return SLURM_FAILURE; } if (gid_tasks < 0) { gid_tasks = make_group(gid_node, GRP_TASKS); if (gid_tasks < 1) { info("PROFILE: Failed to create Tasks group"); return SLURM_FAILURE; } } sprintf(group_task, "%s_%d", GRP_TASK, task_id); gid_task = get_group(gid_tasks, group_task); if (gid_task == -1) { gid_task = make_group(gid_tasks, group_task); if (gid_task < 0) { info("Failed to open tasks %s", group_task); return SLURM_FAILURE; } put_int_attribute(gid_task, ATTR_TASKID, task_id); } put_int_attribute(gid_task, ATTR_CPUPERTASK, g_job->cpus_per_task); if (debug_flags & DEBUG_FLAG_PROFILE) info("PROFILE: task_end"); return rc; }
/*
 * Start profiling for a job step on this node: create the node-step HDF5
 * file "<dir>/<user>/<jobid>_<stepid>_<node>.h5", hand it over to the job
 * user, and create the node group with its initial attributes.
 *
 * IN job - step record; stored in the global g_job
 * RET SLURM_SUCCESS when profiling is not requested (step id NO_VAL or
 *     profile <= ACCT_GATHER_PROFILE_NONE) or when the file and node group
 *     were created; SLURM_FAILURE when the HDF5 file or the node group
 *     cannot be created. On failure file_id is left at -1.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;
	time_t start_time;
	char *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	if (g_job->stepid == NO_VAL) {
		g_profile_running = ACCT_GATHER_PROFILE_NONE;
		return rc;
	}

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	profile_file_name = xstrdup_printf(
		"%s/%s/%u_%u_%s.h5",
		hdf5_conf.dir, g_job->pwd->pw_name,
		g_job->jobid, g_job->stepid, g_job->node_name);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	/* Create a new file using the default properties. */
	profile_init();
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);
	if (file_id < 1) {
		/*
		 * Check the create result before touching the file, and
		 * report the real failure (the file, not the node group).
		 */
		info("PROFILE: Failed to create HDF5 file %s",
		     profile_file_name);
		xfree(profile_file_name);
		/* Other entry points test file_id against -1. */
		file_id = -1;
		return SLURM_FAILURE;
	}

	/* Best effort: ownership/mode problems are logged, not fatal. */
	if (chown(profile_file_name, (uid_t)g_job->pwd->pw_uid,
		  (gid_t)g_job->pwd->pw_gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name, 0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/*
	 * NOTE(review): group_node is declared outside this function; its
	 * capacity is not visible here, so this write is left unbounded.
	 */
	sprintf(group_node, "/%s_%s", GRP_NODE, g_job->node_name);
	gid_node = H5Gcreate(file_id, group_node, H5P_DEFAULT,
			     H5P_DEFAULT, H5P_DEFAULT);
	if (gid_node < 1) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}

	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	start_time = time(NULL);
	put_string_attribute(gid_node, ATTR_STARTTIME, ctime(&start_time));

	return rc;
}
static void _merge_step_files(void) { hid_t fid_job = -1, jgid_step = -1, jgid_nodes = -1, jgid_tasks = -1; DIR *dir; struct dirent *de; char file_name[MAX_PROFILE_PATH+1]; char step_dir[MAX_PROFILE_PATH+1]; char step_path[MAX_PROFILE_PATH+1]; char jgrp_step_name[MAX_GROUP_NAME+1]; char jgrp_nodes_name[MAX_GROUP_NAME+1]; char jgrp_tasks_name[MAX_GROUP_NAME+1]; char *step_node, *pos_char, *stepno; int stepx = 0, num_steps = 0, nodex = -1, max_step = -1; int jobid, stepid; bool found_files = false; sprintf(step_dir, "%s/%s", params.dir, params.user); while (max_step == -1 || stepx <= max_step) { if (!(dir = opendir(step_dir))) { error("opendir for job profile directory: %m"); exit(1); } nodex = 0; while ((de = readdir(dir))) { strcpy(file_name, de->d_name); if (file_name[0] == '.') continue; // Not HDF5 file pos_char = strstr(file_name,".h5"); if (!pos_char) { error("error processing this file, %s, " "(not .h5)", de->d_name); continue; // Not HDF5 file } *pos_char = 0; // truncate .hf pos_char = strchr(file_name,'_'); if (!pos_char) continue; // not right format *pos_char = 0; // make jobid string jobid = strtol(file_name, NULL, 10); if (jobid != params.job_id) continue; // not desired job stepno = pos_char + 1; pos_char = strchr(stepno,'_'); if (!pos_char) { continue; // not right format } *pos_char = 0; // make stepid string stepid = strtol(stepno, NULL, 10); if (stepid > max_step) max_step = stepid; if (stepid != stepx) continue; // Not step we are merging step_node = pos_char + 1; // Found a node step file for this job if (!found_files) { // Need to create the job file fid_job = H5Fcreate(params.output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); if (fid_job < 0) { fatal("Failed to %s %s", "create HDF5 file:", params.output); } found_files = true; } if (nodex == 0) { num_steps++; sprintf(jgrp_step_name, "/%s_%d", GRP_STEP, stepx); jgid_step = make_group(fid_job, jgrp_step_name); if (jgid_step < 0) { error("Failed to create %s", jgrp_step_name); continue; } 
sprintf(jgrp_nodes_name,"%s/%s", jgrp_step_name, GRP_NODES); jgid_nodes = make_group(jgid_step, jgrp_nodes_name); if (jgid_nodes < 0) { error("Failed to create %s", jgrp_nodes_name); continue; } sprintf(jgrp_tasks_name,"%s/%s", jgrp_step_name, GRP_TASKS); jgid_tasks = make_group(jgid_step, jgrp_tasks_name); if (jgid_tasks < 0) { error("Failed to create %s", jgrp_tasks_name); continue; } } sprintf(step_path, "%s/%s", step_dir, de->d_name); debug("Adding %s to the job file", step_path); _merge_node_step_data(fid_job, step_path, nodex, step_node, jgid_nodes, jgid_tasks); nodex++; } closedir(dir); if (nodex > 0) { put_int_attribute(jgid_step, ATTR_NNODES, nodex); H5Gclose(jgid_tasks); H5Gclose(jgid_nodes); H5Gclose(jgid_step); } stepx++; } if (!found_files) info("No node-step files found for jobid=%d", params.job_id); else put_int_attribute(fid_job, ATTR_NSTEPS, num_steps); if (fid_job != -1) H5Fclose(fid_job); }
/*
 * Copy per-task totals from a node-step file into the job file.
 *
 * For each link in the node-step Tasks group, a "<GRP_TASK>_<taskid>"
 * group is created under jg_tasks carrying the node name, task id and
 * cpus-per-task attributes. Each child of the task's Totals group that
 * has an ATTR_DATATYPE attribute is then read and written into a Totals
 * group of the new job task group.
 *
 * IN jg_tasks  - Tasks group of the job file
 * IN nsg_node  - node group of the node-step file
 * IN node_name - name stored as ATTR_NODENAME on every job task group
 */
static void _merge_task_totals(hid_t jg_tasks, hid_t nsg_node, char* node_name)
{
	hid_t jg_task, jg_totals, nsg_totals, g_total, nsg_tasks;
	hid_t nsg_task = -1;
	hsize_t nobj;
	int i, len, taskx, taskid, taskcpus, size_data;
	int n_tasks;
	void *data;
	uint32_t type;
	char buf[MAX_GROUP_NAME+1];
	char group_name[MAX_GROUP_NAME+1];
	H5G_info_t group_info;

	if (jg_tasks < 0) {
		info("Job Tasks is not HDF5 object");
		return;
	}
	if (nsg_node < 0) {
		info("Node-Step is not HDF5 object");
		return;
	}

	nsg_tasks = get_group(nsg_node, GRP_TASKS);
	if (nsg_tasks < 0) {
		debug("No Tasks group in node-step file");
		return;
	}

	/* Without valid group info, nlinks would be read uninitialised. */
	if (H5Gget_info(nsg_tasks, &group_info) < 0) {
		debug("Failed to get info for Tasks group");
		H5Gclose(nsg_tasks);
		return;
	}
	n_tasks = (int) group_info.nlinks;

	for (taskx = 0; taskx < n_tasks; taskx++) {
		/* Get the name of the group. */
		buf[0] = '\0';	/* keep the log below well-defined */
		len = H5Lget_name_by_idx(nsg_tasks, ".", H5_INDEX_NAME,
					 H5_ITER_INC, taskx, buf,
					 MAX_GROUP_NAME, H5P_DEFAULT);
		if (len < 1 || len > MAX_GROUP_NAME) {
			info("Invalid group name %s", buf);
			continue;
		}

		nsg_task = H5Gopen(nsg_tasks, buf, H5P_DEFAULT);
		if (nsg_task < 0) {
			debug("Failed to open %s", buf);
			continue;
		}

		taskid = get_int_attribute(nsg_task, ATTR_TASKID);
		snprintf(group_name, sizeof(group_name), "%s_%d",
			 GRP_TASK, taskid);
		jg_task = H5Gcreate(jg_tasks, group_name, H5P_DEFAULT,
				    H5P_DEFAULT, H5P_DEFAULT);
		if (jg_task < 0) {
			H5Gclose(nsg_task);
			info("Failed to create job task group");
			continue;
		}

		put_string_attribute(jg_task, ATTR_NODENAME, node_name);
		put_int_attribute(jg_task, ATTR_TASKID, taskid);
		taskcpus = get_int_attribute(nsg_task, ATTR_CPUPERTASK);
		put_int_attribute(jg_task, ATTR_CPUPERTASK, taskcpus);

		nsg_totals = get_group(nsg_task, GRP_TOTALS);
		if (nsg_totals < 0) {
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			continue;
		}

		jg_totals = H5Gcreate(jg_task, GRP_TOTALS, H5P_DEFAULT,
				      H5P_DEFAULT, H5P_DEFAULT);
		if (jg_totals < 0) {
			/* nsg_totals is open here and must be released too. */
			H5Gclose(nsg_totals);
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			info("Failed to create job task totals");
			continue;
		}

		/* On failure treat the Totals group as empty. */
		if (H5Gget_info(nsg_totals, &group_info) < 0)
			nobj = 0;
		else
			nobj = group_info.nlinks;

		for (i = 0; (hsize_t) i < nobj; i++) {
			/* Get the name of the group. */
			buf[0] = '\0';
			len = H5Lget_name_by_idx(nsg_totals, ".",
						 H5_INDEX_NAME, H5_ITER_INC,
						 i, buf, MAX_GROUP_NAME,
						 H5P_DEFAULT);
			if (len < 1 || len > MAX_GROUP_NAME) {
				info("Invalid group name %s", buf);
				continue;
			}

			g_total = H5Gopen(nsg_totals, buf, H5P_DEFAULT);
			if (g_total < 0) {
				info("Failed to open %s", buf);
				continue;
			}

			type = get_uint32_attribute(g_total, ATTR_DATATYPE);
			if (!type) {
				H5Gclose(g_total);
				info("No %s attribute", ATTR_DATATYPE);
				continue;
			}

			data = get_hdf5_data(g_total, type, buf, &size_data);
			if (data == NULL) {
				H5Gclose(g_total);
				info("Failed to get group %s type %s data",
				     buf,
				     acct_gather_profile_type_to_string(type));
				continue;
			}

			put_hdf5_data(jg_totals, type, SUBDATA_DATA,
				      buf, data, 1);
			xfree(data);
			H5Gclose(g_total);
		}

		H5Gclose(nsg_totals);
		H5Gclose(nsg_task);
		H5Gclose(jg_totals);
		H5Gclose(jg_task);
	}

	H5Gclose(nsg_tasks);
}
/*
 * Start profiling for a job step on this node: create the node-step HDF5
 * file, hand it over to the job user, and create the node group with its
 * initial attributes.
 *
 * The file is "<dir>/<user>/<jobid>_<stepid>_<node>.h5"; when the step id
 * is NO_VAL the literal "batch" replaces the numeric step id.
 *
 * IN job - step record; stored in the global g_job
 * RET SLURM_SUCCESS when the profile is <= ACCT_GATHER_PROFILE_NONE or
 *     when the file and node group were created; SLURM_FAILURE when the
 *     HDF5 file or the node group cannot be created. On failure file_id is
 *     left at -1.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;
	char *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	/*
	 * Use a more user friendly string "batch" rather
	 * than 4294967294.
	 */
	if (g_job->stepid == NO_VAL) {
		profile_file_name = xstrdup_printf("%s/%s/%u_%s_%s.h5",
						   hdf5_conf.dir,
						   g_job->user_name,
						   g_job->jobid,
						   "batch",
						   g_job->node_name);
	} else {
		profile_file_name = xstrdup_printf(
			"%s/%s/%u_%u_%s.h5",
			hdf5_conf.dir, g_job->user_name,
			g_job->jobid, g_job->stepid, g_job->node_name);
	}

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	/* Create a new file using the default properties. */
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);
	if (file_id < 1) {
		/*
		 * Check the create result before touching the file, and
		 * report the real failure (the file, not the node group).
		 */
		info("PROFILE: Failed to create HDF5 file %s",
		     profile_file_name);
		xfree(profile_file_name);
		/* Keep the "not open" sentinel consistent. */
		file_id = -1;
		return SLURM_FAILURE;
	}

	/* Best effort: ownership/mode problems are logged, not fatal. */
	if (chown(profile_file_name, (uid_t)g_job->uid,
		  (gid_t)g_job->gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name, 0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/* fd_set_close_on_exec(file_id); Not supported for HDF5 */

	/*
	 * NOTE(review): group_node is declared outside this function; its
	 * capacity is not visible here, so this write is left unbounded.
	 */
	sprintf(group_node, "/%s", g_job->node_name);
	gid_node = make_group(file_id, group_node);
	if (gid_node < 0) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}

	put_int_attribute(gid_node, ATTR_NODEINX, g_job->nodeid);
	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	put_int_attribute(gid_node, ATTR_CPUPERTASK, g_job->cpus_per_task);

	step_start_time = time(NULL);
	put_string_attribute(gid_node, ATTR_STARTTIME,
			     slurm_ctime2(&step_start_time));

	return rc;
}
static int _merge_step_files(void) { hid_t fid_job = -1; hid_t jgid_step = -1; hid_t jgid_nodes = -1; hid_t jgid_tasks = -1; DIR *dir; struct dirent *de; char file_name[MAX_PROFILE_PATH+1]; char step_dir[MAX_PROFILE_PATH+1]; char step_path[MAX_PROFILE_PATH+1]; char jgrp_step_name[MAX_GROUP_NAME+1]; char jgrp_nodes_name[MAX_GROUP_NAME+1]; char jgrp_tasks_name[MAX_GROUP_NAME+1]; char *step_node; char *pos_char; char *stepno; int stepx = 0; int num_steps = 0; int nodex = -1; int max_step = -1; int jobid, stepid; bool found_files = false; sprintf(step_dir, "%s/%s", params.dir, params.user); while (max_step == -1 || stepx <= max_step) { if (!(dir = opendir(step_dir))) { error("Cannot open %s job profile directory: %m", step_dir); return -1; } nodex = 0; while ((de = readdir(dir))) { strcpy(file_name, de->d_name); if (file_name[0] == '.') continue; pos_char = strstr(file_name,".h5"); if (!pos_char) continue; *pos_char = 0; pos_char = strchr(file_name,'_'); if (!pos_char) continue; *pos_char = 0; jobid = strtol(file_name, NULL, 10); if (jobid != params.job_id) continue; stepno = pos_char + 1; pos_char = strchr(stepno,'_'); if (!pos_char) { continue; } *pos_char = 0; stepid = strtol(stepno, NULL, 10); if (stepid > max_step) max_step = stepid; if (stepid != stepx) continue; step_node = pos_char + 1; if (!found_files) { fid_job = H5Fcreate(params.output, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); if (fid_job < 0) { error("Failed create HDF5 file %s", params.output); return -1; } found_files = true; } if (nodex == 0) { num_steps++; sprintf(jgrp_step_name, "/%s_%d", GRP_STEP, stepx); jgid_step = make_group(fid_job, jgrp_step_name); if (jgid_step < 0) { error("Failed to create %s", jgrp_step_name); continue; } sprintf(jgrp_nodes_name,"%s/%s", jgrp_step_name, GRP_NODES); jgid_nodes = make_group(jgid_step, jgrp_nodes_name); if (jgid_nodes < 0) { error("Failed to create %s", jgrp_nodes_name); continue; } sprintf(jgrp_tasks_name,"%s/%s", jgrp_step_name, GRP_TASKS); jgid_tasks = 
make_group(jgid_step, jgrp_tasks_name); if (jgid_tasks < 0) { error("Failed to create %s", jgrp_tasks_name); continue; } } sprintf(step_path, "%s/%s", step_dir, de->d_name); debug("Adding %s to the job file", step_path); _merge_node_step_data(fid_job, step_path, nodex, step_node, jgid_nodes, jgid_tasks); nodex++; } closedir(dir); if (nodex > 0) { put_int_attribute(jgid_step, ATTR_NNODES, nodex); H5Gclose(jgid_tasks); H5Gclose(jgid_nodes); H5Gclose(jgid_step); } /* If we did not find the step 0 * bail out. */ if (stepx == 0 && !found_files) break; stepx++; } if (!found_files) info("No node-step files found for jobid %d", params.job_id); else put_int_attribute(fid_job, ATTR_NSTEPS, num_steps); if (fid_job != -1) H5Fclose(fid_job); return 0; }