/*
 * Merge the contents of one node-step HDF5 file into the job file.
 *
 * A group named node_name is created under jgid_nodes, the node-step file is
 * opened read-only, its start-time attribute (if present) is copied across,
 * and the node group is then handed to the totals / task-totals / series
 * merge helpers.  Unless params.keepfiles is set, the node-step file is
 * removed once it has been processed.
 *
 * fid_job    - job file id (not referenced in this function)
 * file_name  - path of the node-step file to merge
 * nodeIndex  - node index (not referenced in this function)
 * node_name  - name used for the new job-side node group and to locate the
 *              "/<GRP_NODE>_<node_name>" group in the node-step file
 * jgid_nodes - job file group that receives the new node group
 * jgid_tasks - job file group passed on to the task merge helpers
 *
 * Failures are logged and the function returns without merging.
 */
static void _merge_node_step_data(hid_t fid_job, char* file_name, int nodeIndex,
				  char* node_name, hid_t jgid_nodes,
				  hid_t jgid_tasks)
{
	hid_t fid_nodestep, jgid_node, nsgid_root, nsgid_node;
	char *start_time;
	char group_name[MAX_GROUP_NAME+1];

	jgid_node = H5Gcreate(jgid_nodes, node_name, H5P_DEFAULT,
			      H5P_DEFAULT, H5P_DEFAULT);
	if (jgid_node < 0) {
		error("Failed to create group %s",node_name);
		return;
	}
	put_string_attribute(jgid_node, ATTR_NODENAME, node_name);

	/* Process node step file: open the file and the node group. */
	fid_nodestep = H5Fopen(file_name, H5F_ACC_RDONLY, H5P_DEFAULT);
	if (fid_nodestep < 0) {
		H5Gclose(jgid_node);
		error("Failed to open %s",file_name);
		return;
	}

	nsgid_root = H5Gopen(fid_nodestep,"/", H5P_DEFAULT);
	/* Bounded write: node_name has no length guarantee visible here. */
	snprintf(group_name, sizeof(group_name), "/%s_%s", GRP_NODE, node_name);
	/* If the root group failed to open, this open fails as well. */
	nsgid_node = H5Gopen(nsgid_root, group_name, H5P_DEFAULT);
	if (nsgid_node < 0) {
		if (nsgid_root >= 0)
			H5Gclose(nsgid_root);
		/* fid_nodestep is a file id, so it needs H5Fclose. */
		H5Fclose(fid_nodestep);
		H5Gclose(jgid_node);
		error("Failed to open node group");
		return;
	}

	start_time = get_string_attribute(nsgid_node,ATTR_STARTTIME);
	if (start_time == NULL) {
		info("No %s attribute", ATTR_STARTTIME);
	} else {
		put_string_attribute(jgid_node, ATTR_STARTTIME, start_time);
		xfree(start_time);
	}

	_merge_node_totals(jgid_node, nsgid_node);
	_merge_task_totals(jgid_tasks, nsgid_node, node_name);
	_merge_series_data(jgid_tasks, jgid_node, nsgid_node);

	H5Gclose(nsgid_node);
	/* Release the root group handle too so nothing is left open. */
	H5Gclose(nsgid_root);
	H5Fclose(fid_nodestep);
	H5Gclose(jgid_node);

	if (!params.keepfiles)
		remove(file_name);

	return;
}
extern int acct_gather_profile_p_add_sample_data(uint32_t type, void *data) { hid_t g_sample_grp; char group[MAX_GROUP_NAME+1]; char group_sample[MAX_GROUP_NAME+1]; static uint32_t sample_no = 0; uint32_t task_id = 0; void *send_profile = NULL; char *type_name = NULL; profile_task_t profile_task; profile_network_t profile_network; profile_energy_t profile_energy; profile_io_t profile_io; struct jobacctinfo *jobacct = (struct jobacctinfo *)data; acct_network_data_t *net = (acct_network_data_t *)data; acct_energy_data_t *ener = (acct_energy_data_t *)data; struct lustre_data *lus = (struct lustre_data *)data; xassert(_run_in_daemon()); xassert(g_job); if (g_job->stepid == NO_VAL) return SLURM_SUCCESS; xassert(g_profile_running != ACCT_GATHER_PROFILE_NOT_SET); if (!_do_profile(type, g_profile_running)) return SLURM_SUCCESS; switch (type) { case ACCT_GATHER_PROFILE_ENERGY: snprintf(group, sizeof(group), "%s", GRP_ENERGY); memset(&profile_energy, 0, sizeof(profile_energy_t)); profile_energy.time = ener->time; profile_energy.cpu_freq = ener->cpu_freq; profile_energy.power = ener->power; send_profile = &profile_energy; break; case ACCT_GATHER_PROFILE_TASK: if (_get_taskid_from_pid(jobacct->pid, &task_id) != SLURM_SUCCESS) return SLURM_ERROR; snprintf(group, sizeof(group), "%s_%u", GRP_TASK, task_id); memset(&profile_task, 0, sizeof(profile_task_t)); profile_task.time = time(NULL); profile_task.cpu_freq = jobacct->act_cpufreq; profile_task.cpu_time = jobacct->tot_cpu; profile_task.cpu_utilization = jobacct->tot_cpu; profile_task.pages = jobacct->tot_pages; profile_task.read_size = jobacct->tot_disk_read; profile_task.rss = jobacct->tot_rss; profile_task.vm_size = jobacct->tot_vsize; profile_task.write_size = jobacct->tot_disk_write; send_profile = &profile_task; break; case ACCT_GATHER_PROFILE_LUSTRE: snprintf(group, sizeof(group), "%s", GRP_LUSTRE); memset(&profile_io, 0, sizeof(profile_io_t)); profile_io.time = time(NULL); profile_io.reads = lus->reads; 
profile_io.read_size = lus->read_size; profile_io.writes = lus->writes; profile_io.write_size = lus->write_size; send_profile = &profile_io; break; case ACCT_GATHER_PROFILE_NETWORK: snprintf(group, sizeof(group), "%s", GRP_NETWORK); memset(&profile_network, 0, sizeof(profile_network_t)); profile_network.time = time(NULL); profile_network.packets_in = net->packets_in; profile_network.size_in = net->size_in; profile_network.packets_out = net->packets_out; profile_network.size_out = net->size_out; send_profile = &profile_network; break; default: error("acct_gather_profile_p_add_sample_data: " "Unknown type %d sent", type); return SLURM_ERROR; } type_name = acct_gather_profile_type_to_string(type); if (debug_flags & DEBUG_FLAG_PROFILE) info("PROFILE: add_sample_data Group-%s Type=%s", group, type_name); if (file_id == -1) { if (debug_flags & DEBUG_FLAG_PROFILE) { // This can happen from samples from the gather threads // before the step actually starts. info("PROFILE: add_sample_data, HDF5 file not open"); } return SLURM_FAILURE; } if (gid_samples < 0) { gid_samples = make_group(gid_node, GRP_SAMPLES); if (gid_samples < 1) { info("PROFILE: failed to create TimeSeries group"); return SLURM_FAILURE; } } g_sample_grp = get_group(gid_samples, group); if (g_sample_grp < 0) { g_sample_grp = make_group(gid_samples, group); if (g_sample_grp < 0) { info("PROFILE: failed to open TimeSeries %s", group); return SLURM_FAILURE; } put_string_attribute(g_sample_grp, ATTR_DATATYPE, type_name); } sprintf(group_sample, "%s_%10.10d", group, ++sample_no); put_hdf5_data(g_sample_grp, type, SUBDATA_SAMPLE, group_sample, send_profile, 1); H5Gclose(g_sample_grp); return SLURM_SUCCESS; }
/*
 * Start profiling for a job step on this node.
 *
 * Stores job in g_job, decides which profile types are active, creates the
 * per-node HDF5 file "<hdf5_conf.dir>/<user>/<jobid>_<stepid>_<node>.h5",
 * hands ownership of it to the job's user with mode 0600, and creates the
 * node group with its node name, task count and start time attributes.
 *
 * job - step record for the step being started
 *
 * Returns SLURM_SUCCESS when profiling was started or is not requested
 * (stepid is NO_VAL, or the resolved profile is NONE); returns SLURM_FAILURE
 * when the file or the node group cannot be created.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;
	time_t start_time;
	char *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	if (g_job->stepid == NO_VAL) {
		g_profile_running = ACCT_GATHER_PROFILE_NONE;
		return rc;
	}

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	profile_file_name = xstrdup_printf(
		"%s/%s/%u_%u_%s.h5",
		hdf5_conf.dir, g_job->pwd->pw_name,
		g_job->jobid, g_job->stepid, g_job->node_name);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	/* Create a new file using the default properties. */
	profile_init();
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);
	if (file_id < 1) {
		/* Skip chown/chmod on a file that was never created and
		 * report what actually failed. */
		info("PROFILE: Failed to create file %s", profile_file_name);
		xfree(profile_file_name);
		/* Same "not open" value the node-group failure path uses. */
		file_id = -1;
		return SLURM_FAILURE;
	}

	if (chown(profile_file_name, (uid_t)g_job->pwd->pw_uid,
		  (gid_t)g_job->pwd->pw_gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name, 0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/* NOTE(review): group_node is declared outside this function; its
	 * capacity is not visible here, so this write is left unbounded. */
	sprintf(group_node, "/%s_%s", GRP_NODE, g_job->node_name);
	gid_node = H5Gcreate(file_id, group_node, H5P_DEFAULT,
			     H5P_DEFAULT, H5P_DEFAULT);
	if (gid_node < 1) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}

	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	start_time = time(NULL);
	/* ctime() output ends with a newline; it is stored unchanged. */
	put_string_attribute(gid_node, ATTR_STARTTIME, ctime(&start_time));

	return rc;
}
/*
 * Copy the per-task totals of one node-step file into the job file.
 *
 * For every link in the node-step GRP_TASKS group a "<GRP_TASK>_<taskid>"
 * group is created under jg_tasks, tagged with the node name, task id and
 * cpus-per-task, and every entry found in that task's GRP_TOTALS group is
 * read with get_hdf5_data() and written to a GRP_TOTALS group on the job
 * side.
 *
 * jg_tasks  - job file group that receives the task groups
 * nsg_node  - node group of the node-step file
 * node_name - node name stored on each job-side task group
 *
 * Problems with a single task or a single totals entry are logged and that
 * item is skipped; the remaining items are still processed.
 */
static void _merge_task_totals(hid_t jg_tasks, hid_t nsg_node, char* node_name)
{
	hid_t jg_task, jg_totals, nsg_totals, g_total, nsg_tasks,
	      nsg_task = -1;
	hsize_t nobj, ntasks = -1;
	int i, len, taskx, taskid, taskcpus, size_data;
	void *data;
	uint32_t type;
	char buf[MAX_GROUP_NAME+1];
	char group_name[MAX_GROUP_NAME+1];
	H5G_info_t group_info;

	if (jg_tasks < 0) {
		info("Job Tasks is not HDF5 object");
		return;
	}
	if (nsg_node < 0) {
		info("Node-Step is not HDF5 object");
		return;
	}

	nsg_tasks = get_group(nsg_node, GRP_TASKS);
	if (nsg_tasks < 0) {
		debug("No Tasks group in node-step file");
		return;
	}

	/* Treat a failed info query as an empty group instead of reading an
	 * uninitialised link count. */
	if (H5Gget_info(nsg_tasks, &group_info) < 0)
		ntasks = 0;
	else
		ntasks = group_info.nlinks;

	for (taskx = 0; ((int)ntasks>0) && (taskx<((int)ntasks)); taskx++) {
		/* Get the name of the group. */
		buf[0] = '\0';	/* keep the log below defined on failure */
		len = H5Lget_name_by_idx(nsg_tasks, ".", H5_INDEX_NAME,
					 H5_ITER_INC, taskx, buf,
					 MAX_GROUP_NAME, H5P_DEFAULT);
		if (len<1 || len>MAX_GROUP_NAME) {
			info("Invalid group name %s", buf);
			continue;
		}
		nsg_task = H5Gopen(nsg_tasks, buf, H5P_DEFAULT);
		if (nsg_task < 0) {
			debug("Failed to open %s", buf);
			continue;
		}

		taskid = get_int_attribute(nsg_task, ATTR_TASKID);
		snprintf(group_name, sizeof(group_name), "%s_%d",
			 GRP_TASK, taskid);
		jg_task = H5Gcreate(jg_tasks, group_name, H5P_DEFAULT,
				    H5P_DEFAULT, H5P_DEFAULT);
		if (jg_task < 0) {
			H5Gclose(nsg_task);
			info("Failed to create job task group");
			continue;
		}
		put_string_attribute(jg_task, ATTR_NODENAME, node_name);
		put_int_attribute(jg_task, ATTR_TASKID, taskid);
		taskcpus = get_int_attribute(nsg_task, ATTR_CPUPERTASK);
		put_int_attribute(jg_task, ATTR_CPUPERTASK, taskcpus);

		nsg_totals = get_group(nsg_task, GRP_TOTALS);
		if (nsg_totals < 0) {
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			continue;
		}
		jg_totals = H5Gcreate(jg_task, GRP_TOTALS, H5P_DEFAULT,
				      H5P_DEFAULT, H5P_DEFAULT);
		if (jg_totals < 0) {
			/* nsg_totals is open here and must be released too. */
			H5Gclose(nsg_totals);
			H5Gclose(jg_task);
			H5Gclose(nsg_task);
			info("Failed to create job task totals");
			continue;
		}

		if (H5Gget_info(nsg_totals, &group_info) < 0)
			nobj = 0;
		else
			nobj = group_info.nlinks;

		for (i = 0; (nobj>0) && (i<nobj); i++) {
			/* Get the name of the group. */
			buf[0] = '\0';
			len = H5Lget_name_by_idx(nsg_totals, ".",
						 H5_INDEX_NAME, H5_ITER_INC,
						 i, buf, MAX_GROUP_NAME,
						 H5P_DEFAULT);
			if (len<1 || len>MAX_GROUP_NAME) {
				info("Invalid group name %s", buf);
				continue;
			}
			g_total = H5Gopen(nsg_totals, buf, H5P_DEFAULT);
			if (g_total < 0) {
				info("Failed to open %s", buf);
				continue;
			}
			type = get_uint32_attribute(g_total, ATTR_DATATYPE);
			if (!type) {
				H5Gclose(g_total);
				info("No %s attribute", ATTR_DATATYPE);
				continue;
			}
			data = get_hdf5_data(g_total, type, buf, &size_data);
			if (data == NULL) {
				H5Gclose(g_total);
				info("Failed to get group %s type %s data",
				     buf,
				     acct_gather_profile_type_to_string(type));
				continue;
			}
			put_hdf5_data(jg_totals, type, SUBDATA_DATA,
				      buf, data, 1);
			xfree(data);
			H5Gclose(g_total);
		}

		H5Gclose(nsg_totals);
		H5Gclose(nsg_task);
		H5Gclose(jg_totals);
		H5Gclose(jg_task);
	}
	H5Gclose(nsg_tasks);
}
/*
 * Start profiling for a job step (or the batch step) on this node.
 *
 * Stores job in g_job, decides which profile types are active, creates the
 * per-node HDF5 file under "<hdf5_conf.dir>/<user_name>/", hands ownership
 * of it to the job's uid/gid with mode 0600, and creates the node group with
 * its node index, node name, task count, cpus-per-task and start time
 * attributes.  The start time is also kept in step_start_time.
 *
 * job - step record for the step being started
 *
 * Returns SLURM_SUCCESS when profiling was started or the resolved profile
 * is NONE; returns SLURM_FAILURE when the file or the node group cannot be
 * created.
 */
extern int acct_gather_profile_p_node_step_start(stepd_step_rec_t* job)
{
	int rc = SLURM_SUCCESS;
	char *profile_file_name;
	char *profile_str;

	xassert(_run_in_daemon());

	g_job = job;

	xassert(hdf5_conf.dir);

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_job->profile);
		info("PROFILE: option --profile=%s", profile_str);
	}

	if (g_profile_running == ACCT_GATHER_PROFILE_NOT_SET)
		g_profile_running = _determine_profile();

	if (g_profile_running <= ACCT_GATHER_PROFILE_NONE)
		return rc;

	_create_directories();

	/* Use a more user friendly string "batch" rather
	 * than 4294967294.
	 */
	if (g_job->stepid == NO_VAL) {
		profile_file_name = xstrdup_printf("%s/%s/%u_%s_%s.h5",
						   hdf5_conf.dir,
						   g_job->user_name,
						   g_job->jobid,
						   "batch",
						   g_job->node_name);
	} else {
		profile_file_name = xstrdup_printf(
			"%s/%s/%u_%u_%s.h5",
			hdf5_conf.dir, g_job->user_name,
			g_job->jobid, g_job->stepid, g_job->node_name);
	}

	if (debug_flags & DEBUG_FLAG_PROFILE) {
		profile_str = acct_gather_profile_to_string(g_profile_running);
		info("PROFILE: node_step_start, opt=%s file=%s",
		     profile_str, profile_file_name);
	}

	/* Create a new file using the default properties. */
	file_id = H5Fcreate(profile_file_name, H5F_ACC_TRUNC, H5P_DEFAULT,
			    H5P_DEFAULT);
	if (file_id < 1) {
		/* Skip chown/chmod on a file that was never created and
		 * report what actually failed. */
		info("PROFILE: Failed to create file %s", profile_file_name);
		xfree(profile_file_name);
		/* Same "not open" value the node-group failure path uses. */
		file_id = -1;
		return SLURM_FAILURE;
	}

	if (chown(profile_file_name, (uid_t)g_job->uid,
		  (gid_t)g_job->gid) < 0)
		error("chown(%s): %m", profile_file_name);
	if (chmod(profile_file_name, 0600) < 0)
		error("chmod(%s): %m", profile_file_name);
	xfree(profile_file_name);

	/* fd_set_close_on_exec(file_id); Not supported for HDF5 */

	/* NOTE(review): group_node is declared outside this function; its
	 * capacity is not visible here, so this write is left unbounded. */
	sprintf(group_node, "/%s", g_job->node_name);
	gid_node = make_group(file_id, group_node);
	if (gid_node < 0) {
		H5Fclose(file_id);
		file_id = -1;
		info("PROFILE: Failed to create Node group");
		return SLURM_FAILURE;
	}

	put_int_attribute(gid_node, ATTR_NODEINX, g_job->nodeid);
	put_string_attribute(gid_node, ATTR_NODENAME, g_job->node_name);
	put_int_attribute(gid_node, ATTR_NTASKS, g_job->node_tasks);
	put_int_attribute(gid_node, ATTR_CPUPERTASK, g_job->cpus_per_task);

	step_start_time = time(NULL);
	put_string_attribute(gid_node, ATTR_STARTTIME,
			     slurm_ctime2(&step_start_time));

	return rc;
}