/*
 * jag_common_poll_data - legacy polling entry point: refresh accounting
 * data for every task in task_list from freshly-gathered process records.
 *
 * task_list   - list of struct jobacctinfo, one per tracked task
 * pgid_plugin - true when tracking by pgid (scan all of /proc) rather
 *               than by a proctrack container
 * cont_id     - proctrack container id; must be set unless pgid_plugin
 * callbacks   - pluggable gather hooks; get_precs defaults to _get_precs
 *
 * Not reentrant: the static `processing` flag makes overlapping calls
 * return immediately.  The static `first` flag suppresses profile
 * sampling on the very first poll.
 */
extern void jag_common_poll_data(
	List task_list, bool pgid_plugin, uint64_t cont_id,
	jag_callbacks_t *callbacks)
{
	/* Update the data */
	List prec_list = NULL;
	/* NOTE(review): 32-bit node-wide totals (KB) could wrap above 4TB
	 * of aggregate rss/vsize — later revisions widen these to 64 bit. */
	uint32_t total_job_mem = 0, total_job_vsize = 0;
	ListIterator itr;
	ListIterator itr2;
	jag_prec_t *prec = NULL;
	struct jobacctinfo *jobacct = NULL;
	static int processing = 0;	/* simple reentrancy guard */
	char sbuf[72];
	int energy_counted = 0;		/* node energy read at most once/poll */
	static int first = 1;		/* true only during the first poll */

	xassert(callbacks);

	if (!pgid_plugin && (cont_id == (uint64_t)NO_VAL)) {
		debug("cont_id hasn't been set yet not running poll");
		return;
	}

	if (processing) {
		debug("already running, returning");
		return;
	}
	processing = 1;

	/* Fall back to the built-in /proc scanner if the plugin did not
	 * install its own gatherer. */
	if (!callbacks->get_precs)
		callbacks->get_precs = _get_precs;

	prec_list = (*(callbacks->get_precs))(task_list, pgid_plugin,
					      cont_id, callbacks);

	if (!list_count(prec_list) || !task_list || !list_count(task_list))
		goto finished;	/* We have no business being here! */

	/* O(tasks * precs) scan: match each task to its process record
	 * by pid. */
	itr = list_iterator_create(task_list);
	while ((jobacct = list_next(itr))) {
		itr2 = list_iterator_create(prec_list);
		while ((prec = list_next(itr2))) {
			if (prec->pid == jobacct->pid) {
				/* system + user ticks -> whole seconds */
				uint32_t cpu_calc =
					(prec->ssec + prec->usec)/hertz;
#if _DEBUG
				info("pid:%u ppid:%u rss:%d KB",
				     prec->pid, prec->ppid, prec->rss);
#endif
				/* find all my descendents */
				if (callbacks->get_offspring_data)
					(*(callbacks->get_offspring_data))
						(prec_list, prec, prec->pid);
				/* tally their usage */
				jobacct->max_rss =
					MAX(jobacct->max_rss, prec->rss);
				jobacct->tot_rss = prec->rss;
				total_job_mem += prec->rss;
				jobacct->max_vsize =
					MAX(jobacct->max_vsize, prec->vsize);
				jobacct->tot_vsize = prec->vsize;
				total_job_vsize += prec->vsize;
				jobacct->max_pages =
					MAX(jobacct->max_pages, prec->pages);
				jobacct->tot_pages = prec->pages;
				jobacct->max_disk_read = MAX(
					jobacct->max_disk_read,
					prec->disk_read);
				jobacct->tot_disk_read = prec->disk_read;
				jobacct->max_disk_write = MAX(
					jobacct->max_disk_write,
					prec->disk_write);
				jobacct->tot_disk_write = prec->disk_write;
				/* min_cpu deliberately tracks this task's
				 * running MAX; the cross-rank minimum is
				 * taken later when ranks are combined. */
				jobacct->min_cpu =
					MAX(jobacct->min_cpu, cpu_calc);
				/* Remember the previous total before
				 * overwriting it; the delta feeds the
				 * frequency computation below. */
				jobacct->last_total_cputime
					= jobacct->tot_cpu;
				jobacct->tot_cpu = cpu_calc;
				debug2("%d mem size %u %u time %u(%u+%u)",
				       jobacct->pid, jobacct->max_rss,
				       jobacct->max_vsize, jobacct->tot_cpu,
				       prec->usec, prec->ssec);
				/* compute frequency */
				jobacct->this_sampled_cputime =
					cpu_calc - jobacct->last_total_cputime;
				_get_sys_interface_freq_line(
					prec->last_cpu,
					"cpuinfo_cur_freq", sbuf);
				jobacct->act_cpufreq =
					_update_weighted_freq(jobacct, sbuf);
				debug2("Task average frequency = %u "
				       "pid %d mem size %u %u time %u(%u+%u)",
				       jobacct->act_cpufreq,
				       jobacct->pid, jobacct->max_rss,
				       jobacct->max_vsize, jobacct->tot_cpu,
				       prec->usec, prec->ssec);
				/* get energy consumption
				 * only once is enough since we
				 * report per node energy consumption */
				debug2("energycounted = %d", energy_counted);
				if (energy_counted == 0) {
					acct_gather_energy_g_get_data(
						energy_profile,
						&jobacct->energy);
					debug2("getjoules_task energy = %u",
					       jobacct->energy.consumed_energy);
					energy_counted = 1;
				}
				/* We only profile on after the first poll. */
				if (!first)
					acct_gather_profile_g_add_sample_data(
						ACCT_GATHER_PROFILE_TASK,
						jobacct);
				break;
			}
		}
		list_iterator_destroy(itr2);
	}
	list_iterator_destroy(itr);

	jobacct_gather_handle_mem_limit(total_job_mem, total_job_vsize);

finished:
	list_destroy(prec_list);
	processing = 0;
	first = 0;
}
/*
 * _get_precs - build a list of jag_prec_t process records, one per
 * process of interest, by reading /proc/<pid>/stat and /proc/<pid>/io.
 *
 * When pgid_plugin is false, only the pids in the proctrack container
 * cont_id are examined; otherwise every numeric entry of /proc is
 * scanned (the directory stream is kept open across calls in the
 * static slash_proc).
 *
 * Returns a List of jag_prec_t (possibly empty) owned by the caller;
 * elements are destroyed with destroy_jag_prec.
 */
static List _get_precs(List task_list, bool pgid_plugin, uint64_t cont_id,
		       jag_callbacks_t *callbacks)
{
	List prec_list = list_create(destroy_jag_prec);
	char proc_stat_file[256];	/* Allow ~20x extra length */
	char proc_io_file[256];		/* Allow ~20x extra length */
	static int slash_proc_open = 0;
	int i;

	if (!pgid_plugin) {
		pid_t *pids = NULL;
		int npids = 0;
		/* get only the processes in the proctrack container */
		proctrack_g_get_pids(cont_id, &pids, &npids);
		if (!npids) {
			/* update consumed energy even if pids do not exist */
			ListIterator itr = list_iterator_create(task_list);
			struct jobacctinfo *jobacct = NULL;
			if ((jobacct = list_next(itr))) {
				acct_gather_energy_g_get_data(
					energy_profile,
					&jobacct->energy);
				debug2("getjoules_task energy = %u",
				       jobacct->energy.consumed_energy);
			}
			list_iterator_destroy(itr);
			debug4("no pids in this container %"PRIu64"",
			       cont_id);
			/* FIX: the success path below frees pids, but this
			 * early exit previously leaked any array the
			 * proctrack plugin allocated.  xfree(NULL) is a
			 * no-op, so this is safe in every case. */
			xfree(pids);
			goto finished;
		}
		for (i = 0; i < npids; i++) {
			snprintf(proc_stat_file, 256,
				 "/proc/%d/stat", pids[i]);
			snprintf(proc_io_file, 256,
				 "/proc/%d/io", pids[i]);
			_handle_stats(prec_list, proc_stat_file,
				      proc_io_file, callbacks);
		}
		xfree(pids);
	} else {
		struct dirent *slash_proc_entry;
		char *iptr = NULL, *optr = NULL, *optr2 = NULL;

		if (slash_proc_open) {
			rewinddir(slash_proc);
		} else {
			slash_proc=opendir("/proc");
			if (slash_proc == NULL) {
				perror("opening /proc");
				goto finished;
			}
			slash_proc_open=1;
		}
		strcpy(proc_stat_file, "/proc/");
		strcpy(proc_io_file, "/proc/");

		while ((slash_proc_entry = readdir(slash_proc))) {
			/* Save a few cyles by simulating
			 * strcat(statFileName, slash_proc_entry->d_name);
			 * strcat(statFileName, "/stat");
			 * while checking for a numeric filename (which really
			 * should be a pid). Then do the same for the
			 * /proc/<pid>/io file name.
			 */
			/* sizeof("/proc") == 6 == strlen("/proc/"), so optr
			 * points just past the "/proc/" prefix copied above.
			 * NOTE(review): these copies assume d_name is a pid
			 * (bounded digits); a pathological long numeric
			 * d_name could overrun the 256-byte buffers — /proc
			 * entry names are kernel-generated pids, so this is
			 * believed unreachable in practice. */
			optr = proc_stat_file + sizeof("/proc");
			iptr = slash_proc_entry->d_name;
			i = 0;
			do {
				/* Copy while verifying every character is a
				 * decimal digit; bail on the first that
				 * is not. */
				if ((*iptr < '0') ||
				    ((*optr++ = *iptr++) > '9')) {
					i = -1;
					break;
				}
			} while (*iptr);
			if (i == -1)
				continue;
			iptr = (char*)"/stat";
			do {
				*optr++ = *iptr++;
			} while (*iptr);
			*optr = 0;

			/* Repeat the same construction for /proc/<pid>/io. */
			optr2 = proc_io_file + sizeof("/proc");
			iptr = slash_proc_entry->d_name;
			i = 0;
			do {
				if ((*iptr < '0') ||
				    ((*optr2++ = *iptr++) > '9')) {
					i = -1;
					break;
				}
			} while (*iptr);
			if (i == -1)
				continue;
			iptr = (char*)"/io";
			do {
				*optr2++ = *iptr++;
			} while (*iptr);
			*optr2 = 0;

			_handle_stats(prec_list, proc_stat_file,
				      proc_io_file, callbacks);
		}
	}

finished:
	return prec_list;
}
/*
 * jag_common_poll_data - refresh accounting data for every task in
 * task_list from freshly-gathered process records.
 *
 * task_list   - list of struct jobacctinfo, one per tracked task
 * pgid_plugin - true when tracking by pgid (scan all of /proc) rather
 *               than by a proctrack container
 * cont_id     - proctrack container id; must be set unless pgid_plugin
 * callbacks   - pluggable gather hooks; get_precs defaults to _get_precs
 * profile     - when true and task profiling is active, record a
 *               profile sample per task for this poll
 *
 * Not reentrant: the static `processing` flag makes overlapping calls
 * return immediately.  The NoOverMemoryKill jobacct-gather parameter is
 * read once (static cache) to decide whether to enforce memory limits.
 */
extern void jag_common_poll_data(
	List task_list, bool pgid_plugin, uint64_t cont_id,
	jag_callbacks_t *callbacks, bool profile)
{
	/* Update the data */
	List prec_list = NULL;
	uint64_t total_job_mem = 0, total_job_vsize = 0;
	ListIterator itr;
	jag_prec_t *prec = NULL;
	struct jobacctinfo *jobacct = NULL;
	static int processing = 0;	/* simple reentrancy guard */
	char sbuf[72];
	int energy_counted = 0;		/* node energy read at most once/poll */
	time_t ct;			/* timestamp shared by all samples */
	static int no_over_memory_kill = -1; /* -1 = not yet read from conf */

	xassert(callbacks);

	if (!pgid_plugin && (cont_id == (uint64_t)NO_VAL)) {
		debug("cont_id hasn't been set yet not running poll");
		return;
	}

	if (processing) {
		debug("already running, returning");
		return;
	}
	processing = 1;

	/* Cache the NoOverMemoryKill setting on first use. */
	if (no_over_memory_kill == -1) {
		char *acct_params = slurm_get_jobacct_gather_params();
		if (acct_params && strstr(acct_params, "NoOverMemoryKill"))
			no_over_memory_kill = 1;
		else
			no_over_memory_kill = 0;
		xfree(acct_params);
	}

	/* Fall back to the built-in /proc scanner if the plugin did not
	 * install its own gatherer. */
	if (!callbacks->get_precs)
		callbacks->get_precs = _get_precs;

	ct = time(NULL);
	prec_list = (*(callbacks->get_precs))(task_list, pgid_plugin,
					      cont_id, callbacks);

	if (!list_count(prec_list) || !task_list || !list_count(task_list))
		goto finished;	/* We have no business being here! */

	itr = list_iterator_create(task_list);
	while ((jobacct = list_next(itr))) {
		uint32_t cpu_calc;
		uint32_t last_total_cputime;
		/* Skip tasks with no matching process record this poll. */
		if (!(prec = list_find_first(prec_list,
					     _find_prec, jobacct)))
			continue;
#if _DEBUG
		info("pid:%u ppid:%u rss:%d KB",
		     prec->pid, prec->ppid, prec->rss);
#endif
		/* find all my descendents */
		if (callbacks->get_offspring_data)
			(*(callbacks->get_offspring_data))
				(prec_list, prec, prec->pid);

		/* Previous total must be captured before tot_cpu is
		 * overwritten; the delta feeds the frequency computation. */
		last_total_cputime = jobacct->tot_cpu;
		/* system + user ticks -> whole seconds */
		cpu_calc = (prec->ssec + prec->usec)/hertz;
		/* tally their usage */
		jobacct->max_rss = MAX(jobacct->max_rss, prec->rss);
		jobacct->tot_rss = prec->rss;
		total_job_mem += prec->rss;
		jobacct->max_vsize = MAX(jobacct->max_vsize, prec->vsize);
		jobacct->tot_vsize = prec->vsize;
		total_job_vsize += prec->vsize;
		jobacct->max_pages = MAX(jobacct->max_pages, prec->pages);
		jobacct->tot_pages = prec->pages;
		jobacct->max_disk_read = MAX(
			jobacct->max_disk_read,
			prec->disk_read);
		jobacct->tot_disk_read = prec->disk_read;
		jobacct->max_disk_write = MAX(
			jobacct->max_disk_write,
			prec->disk_write);
		jobacct->tot_disk_write = prec->disk_write;
		/* min_cpu deliberately tracks this task's running MAX;
		 * the cross-rank minimum is taken later when combining. */
		jobacct->min_cpu = MAX(jobacct->min_cpu, cpu_calc);

		/* Update the cpu times */
		jobacct->tot_cpu = cpu_calc;
		jobacct->user_cpu_sec = prec->usec/hertz;
		jobacct->sys_cpu_sec = prec->ssec/hertz;
		debug2("%s: %d mem size %"PRIu64" %"PRIu64" "
		       "time %u(%u+%u)", __func__,
		       jobacct->pid, jobacct->max_rss,
		       jobacct->max_vsize, jobacct->tot_cpu,
		       jobacct->user_cpu_sec,
		       jobacct->sys_cpu_sec);

		/* compute frequency */
		jobacct->this_sampled_cputime =
			cpu_calc - last_total_cputime;
		_get_sys_interface_freq_line(
			prec->last_cpu,
			"cpuinfo_cur_freq", sbuf);
		jobacct->act_cpufreq =
			_update_weighted_freq(jobacct, sbuf);

		debug("%s: Task average frequency = %u "
		      "pid %d mem size %"PRIu64" %"PRIu64" "
		      "time %u(%u+%u)", __func__,
		      jobacct->act_cpufreq,
		      jobacct->pid, jobacct->max_rss,
		      jobacct->max_vsize, jobacct->tot_cpu,
		      jobacct->user_cpu_sec,
		      jobacct->sys_cpu_sec);

		/* get energy consumption
		 * only once is enough since we
		 * report per node energy consumption */
		debug2("energycounted = %d", energy_counted);
		if (energy_counted == 0) {
			acct_gather_energy_g_get_data(
				energy_profile,
				&jobacct->energy);
			debug2("getjoules_task energy = %"PRIu64,
			       jobacct->energy.consumed_energy);
			energy_counted = 1;
		}

		/* Record a profile sample and remember the values used,
		 * so the next sample can report deltas. */
		if (profile &&
		    acct_gather_profile_g_is_active(ACCT_GATHER_PROFILE_TASK)) {
			jobacct->cur_time = ct;

			_record_profile(jobacct);

			jobacct->last_tot_disk_read = jobacct->tot_disk_read;
			jobacct->last_tot_disk_write = jobacct->tot_disk_write;
			jobacct->last_total_cputime = jobacct->tot_cpu;
			jobacct->last_time = jobacct->cur_time;
		}
	}
	list_iterator_destroy(itr);

	if (!no_over_memory_kill)
		jobacct_gather_handle_mem_limit(total_job_mem,
						total_job_vsize);

finished:
	FREE_NULL_LIST(prec_list);
	processing = 0;
}
/*
 * _fill_registration_msg - populate a node registration message with
 * this node's hardware configuration, gres info, uptime, arch/os,
 * currently-running job steps, and current energy reading.
 *
 * msg - registration message to fill; this function xstrdup/xmalloc's
 *       several members which the message owner must free.
 *
 * The full configuration line is logged at info level only for the
 * first message after startup (static first_msg), debug3 thereafter.
 * slurmd's start time is latched once in a static.
 */
static void _fill_registration_msg(slurm_node_registration_status_msg_t *msg)
{
	List steps;
	ListIterator i;
	step_loc_t *stepd;
	int n;
	char *arch, *os;
	struct utsname buf;
	static bool first_msg = true;
	static time_t slurmd_start_time = 0;
	Buf gres_info;

	msg->node_name = xstrdup (conf->node_name);
	msg->cpus = conf->cpus;
	msg->boards = conf->boards;
	msg->sockets = conf->sockets;
	msg->cores = conf->cores;
	msg->threads = conf->threads;
	msg->real_memory = conf->real_memory_size;
	msg->tmp_disk = conf->tmp_disk_space;
	msg->hash_val = slurm_get_hash_val();
	get_cpu_load(&msg->cpu_load);

	gres_info = init_buf(1024);
	if (gres_plugin_node_config_pack(gres_info) != SLURM_SUCCESS) {
		error("error packing gres configuration");
		/* FIX: ownership of gres_info only transfers to msg on
		 * success; the error path previously leaked the buffer. */
		free_buf(gres_info);
	} else
		msg->gres_info = gres_info;

	get_up_time(&conf->up_time);
	msg->up_time = conf->up_time;
	if (slurmd_start_time == 0)
		slurmd_start_time = time(NULL);
	msg->slurmd_start_time = slurmd_start_time;

	if (first_msg) {
		first_msg = false;
		info("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		     "Memory=%u TmpDisk=%u Uptime=%u",
		     msg->cpus, msg->boards, msg->sockets, msg->cores,
		     msg->threads, msg->real_memory, msg->tmp_disk,
		     msg->up_time);
	} else {
		debug3("CPUs=%u Boards=%u Sockets=%u Cores=%u Threads=%u "
		       "Memory=%u TmpDisk=%u Uptime=%u",
		       msg->cpus, msg->boards, msg->sockets, msg->cores,
		       msg->threads, msg->real_memory, msg->tmp_disk,
		       msg->up_time);
	}
	uname(&buf);
	/* Allow the reported arch/os to be overridden via environment. */
	if ((arch = getenv("SLURM_ARCH")))
		msg->arch = xstrdup(arch);
	else
		msg->arch = xstrdup(buf.machine);
	if ((os = getenv("SLURM_OS")))
		msg->os = xstrdup(os);
	else
		msg->os = xstrdup(buf.sysname);

	if (msg->startup) {
		if (switch_g_alloc_node_info(&msg->switch_nodeinfo))
			error("switch_g_alloc_node_info: %m");
		if (switch_g_build_node_info(msg->switch_nodeinfo))
			error("switch_g_build_node_info: %m");
	}

	/* Enumerate stepd spool entries; job_count starts at the number
	 * of entries and is decremented for each stale/unreachable one. */
	steps = stepd_available(conf->spooldir, conf->node_name);
	msg->job_count = list_count(steps);
	msg->job_id = xmalloc(msg->job_count * sizeof(*msg->job_id));
	/* Note: Running batch jobs will have step_id == NO_VAL */
	msg->step_id = xmalloc(msg->job_count * sizeof(*msg->step_id));

	i = list_iterator_create(steps);
	n = 0;
	while ((stepd = list_next(i))) {
		int fd;
		fd = stepd_connect(stepd->directory, stepd->nodename,
				   stepd->jobid, stepd->stepid);
		if (fd == -1) {
			--(msg->job_count);
			continue;
		}
		if (stepd_state(fd) == SLURMSTEPD_NOT_RUNNING) {
			debug("stale domain socket for stepd %u.%u ",
			      stepd->jobid, stepd->stepid);
			--(msg->job_count);
			close(fd);
			continue;
		}
		close(fd);
		if (stepd->stepid == NO_VAL)
			debug("found apparently running job %u",
			      stepd->jobid);
		else
			debug("found apparently running step %u.%u",
			      stepd->jobid, stepd->stepid);
		msg->job_id[n] = stepd->jobid;
		msg->step_id[n] = stepd->stepid;
		n++;
	}
	list_iterator_destroy(i);
	list_destroy(steps);

	if (!msg->energy)
		msg->energy = acct_gather_energy_alloc();
	acct_gather_energy_g_get_data(ENERGY_DATA_STRUCT, msg->energy);

	msg->timestamp = time(NULL);

	return;
}
/*
 * jag_common_poll_data - refresh TRES accounting data for every task in
 * task_list from freshly-gathered process records.
 *
 * task_list   - list of struct jobacctinfo, one per tracked task
 * pgid_plugin - true when tracking by pgid (scan all of /proc) rather
 *               than by a proctrack container
 * cont_id     - proctrack container id; must be set unless pgid_plugin
 * callbacks   - pluggable gather hooks; get_precs defaults to _get_precs
 * profile     - when true and task profiling is active, record a
 *               profile sample per task for this poll
 *
 * Not reentrant: the static `processing` flag makes overlapping calls
 * return immediately.  CPU time is stored scaled by CPU_TIME_ADJ so the
 * integer TRES field keeps sub-second precision.
 *
 * FIX: the big debug() below previously computed
 * (double)(tres_usage_in_tot[CPU] / CPU_TIME_ADJ), performing integer
 * division before the cast and truncating the fractional seconds the
 * %f specifier was meant to show.  The cast now precedes the division.
 */
extern void jag_common_poll_data(
	List task_list, bool pgid_plugin, uint64_t cont_id,
	jag_callbacks_t *callbacks, bool profile)
{
	/* Update the data */
	List prec_list = NULL;
	uint64_t total_job_mem = 0, total_job_vsize = 0;
	ListIterator itr;
	jag_prec_t *prec = NULL;
	struct jobacctinfo *jobacct = NULL;
	static int processing = 0;	/* simple reentrancy guard */
	char sbuf[72];
	int energy_counted = 0;		/* node energy read at most once/poll */
	time_t ct;			/* timestamp shared by all samples */
	static int over_memory_kill = -1; /* -1 = not yet read from conf */
	int i = 0;

	xassert(callbacks);

	if (!pgid_plugin && (cont_id == NO_VAL64)) {
		debug("cont_id hasn't been set yet not running poll");
		return;
	}

	if (processing) {
		debug("already running, returning");
		return;
	}
	processing = 1;

	/* Fall back to the built-in /proc scanner if the plugin did not
	 * install its own gatherer. */
	if (!callbacks->get_precs)
		callbacks->get_precs = _get_precs;

	ct = time(NULL);
	prec_list = (*(callbacks->get_precs))(task_list, pgid_plugin,
					      cont_id, callbacks);

	if (!list_count(prec_list) || !task_list || !list_count(task_list))
		goto finished;	/* We have no business being here! */

	itr = list_iterator_create(task_list);
	while ((jobacct = list_next(itr))) {
		double cpu_calc;
		double last_total_cputime;
		/* Skip tasks with no matching process record this poll. */
		if (!(prec = list_find_first(prec_list, _find_prec, jobacct)))
			continue;

		/*
		 * Only jobacct_gather/cgroup uses prec_extra, and we want to
		 * make sure we call it once per task, so call it here as we
		 * iterate through the tasks instead of in get_precs.
		 */
		if (callbacks->prec_extra)
			(*(callbacks->prec_extra))(prec, jobacct->id.taskid);
#if _DEBUG
		info("pid:%u ppid:%u rss:%"PRIu64" B",
		     prec->pid, prec->ppid,
		     prec->tres_data[TRES_ARRAY_MEM].size_read);
#endif
		/* find all my descendents */
		if (callbacks->get_offspring_data)
			(*(callbacks->get_offspring_data))
				(prec_list, prec, prec->pid);

		/* Previous total must be captured before the CPU TRES
		 * field is overwritten; the delta feeds the frequency
		 * computation below. */
		last_total_cputime =
			(double)jobacct->tres_usage_in_tot[TRES_ARRAY_CPU];

		cpu_calc = (prec->ssec + prec->usec) / (double)hertz;

		/*
		 * Since we are not storing things as a double anymore make it
		 * bigger so we don't loose precision.
		 */
		cpu_calc *= CPU_TIME_ADJ;
		prec->tres_data[TRES_ARRAY_CPU].size_read =
			(uint64_t)cpu_calc;

		/* get energy consumption
		 * only once is enough since we
		 * report per node energy consumption.
		 * Energy is stored in read fields, while power is stored
		 * in write fields.*/
		debug2("energycounted = %d", energy_counted);
		if (energy_counted == 0) {
			acct_gather_energy_g_get_data(
				energy_profile,
				&jobacct->energy);
			prec->tres_data[TRES_ARRAY_ENERGY].size_read =
				jobacct->energy.consumed_energy;
			prec->tres_data[TRES_ARRAY_ENERGY].size_write =
				jobacct->energy.current_watts;
			debug2("%s: energy = %"PRIu64" watts = %"PRIu64" ave_watts = %u",
			       __func__,
			       prec->tres_data[TRES_ARRAY_ENERGY].size_read,
			       prec->tres_data[TRES_ARRAY_ENERGY].size_write,
			       jobacct->energy.ave_watts);
			energy_counted = 1;
		}

		/* tally their usage */
		for (i = 0; i < jobacct->tres_count; i++) {
			/* INFINITE64 marks a TRES with no reading. */
			if (prec->tres_data[i].size_read == INFINITE64)
				continue;
			if (jobacct->tres_usage_in_max[i] == INFINITE64)
				jobacct->tres_usage_in_max[i] =
					prec->tres_data[i].size_read;
			else
				jobacct->tres_usage_in_max[i] =
					MAX(jobacct->tres_usage_in_max[i],
					    prec->tres_data[i].size_read);
			/*
			 * Even with min we want to get the max as we are
			 * looking at a specific task aso we are always looking
			 * at the max that task had, not the min (or lots of
			 * things will be zero).  The min is from compairing
			 * ranks later when combining.  So here it will be the
			 * same as the max value set above.
			 * (same thing goes for the out)
			 */
			jobacct->tres_usage_in_min[i] =
				jobacct->tres_usage_in_max[i];
			jobacct->tres_usage_in_tot[i] =
				prec->tres_data[i].size_read;

			if (jobacct->tres_usage_out_max[i] == INFINITE64)
				jobacct->tres_usage_out_max[i] =
					prec->tres_data[i].size_write;
			else
				jobacct->tres_usage_out_max[i] =
					MAX(jobacct->tres_usage_out_max[i],
					    prec->tres_data[i].size_write);
			jobacct->tres_usage_out_min[i] =
				jobacct->tres_usage_out_max[i];
			jobacct->tres_usage_out_tot[i] =
				prec->tres_data[i].size_write;
		}

		total_job_mem += jobacct->tres_usage_in_tot[TRES_ARRAY_MEM];
		total_job_vsize += jobacct->tres_usage_in_tot[TRES_ARRAY_VMEM];

		/* Update the cpu times */
		jobacct->user_cpu_sec = (uint32_t)(prec->usec /
						   (double)hertz);
		jobacct->sys_cpu_sec = (uint32_t)(prec->ssec /
						  (double)hertz);

		/* compute frequency */
		jobacct->this_sampled_cputime =
			cpu_calc - last_total_cputime;
		_get_sys_interface_freq_line(
			prec->last_cpu,
			"cpuinfo_cur_freq", sbuf);
		jobacct->act_cpufreq =
			_update_weighted_freq(jobacct, sbuf);

		debug("%s: Task %u pid %d ave_freq = %u mem size/max %"PRIu64"/%"PRIu64" vmem size/max %"PRIu64"/%"PRIu64", disk read size/max (%"PRIu64"/%"PRIu64"), disk write size/max (%"PRIu64"/%"PRIu64"), time %f(%u+%u) Energy tot/max %"PRIu64"/%"PRIu64" TotPower %"PRIu64" MaxPower %"PRIu64" MinPower %"PRIu64,
		      __func__, jobacct->id.taskid, jobacct->pid,
		      jobacct->act_cpufreq,
		      jobacct->tres_usage_in_tot[TRES_ARRAY_MEM],
		      jobacct->tres_usage_in_max[TRES_ARRAY_MEM],
		      jobacct->tres_usage_in_tot[TRES_ARRAY_VMEM],
		      jobacct->tres_usage_in_max[TRES_ARRAY_VMEM],
		      jobacct->tres_usage_in_tot[TRES_ARRAY_FS_DISK],
		      jobacct->tres_usage_in_max[TRES_ARRAY_FS_DISK],
		      jobacct->tres_usage_out_tot[TRES_ARRAY_FS_DISK],
		      jobacct->tres_usage_out_max[TRES_ARRAY_FS_DISK],
		      /* FIX: cast before dividing so %f shows fractional
		       * seconds instead of a truncated integer quotient */
		      (double)jobacct->tres_usage_in_tot[TRES_ARRAY_CPU] /
		      CPU_TIME_ADJ,
		      jobacct->user_cpu_sec,
		      jobacct->sys_cpu_sec,
		      jobacct->tres_usage_in_tot[TRES_ARRAY_ENERGY],
		      jobacct->tres_usage_in_max[TRES_ARRAY_ENERGY],
		      jobacct->tres_usage_out_tot[TRES_ARRAY_ENERGY],
		      jobacct->tres_usage_out_max[TRES_ARRAY_ENERGY],
		      jobacct->tres_usage_out_min[TRES_ARRAY_ENERGY]);

		/* Record a profile sample and remember the values used,
		 * so the next sample can report deltas. */
		if (profile &&
		    acct_gather_profile_g_is_active(ACCT_GATHER_PROFILE_TASK)) {
			jobacct->cur_time = ct;

			_record_profile(jobacct);

			jobacct->last_tres_usage_in_tot =
				jobacct->tres_usage_in_tot[TRES_ARRAY_FS_DISK];
			jobacct->last_tres_usage_out_tot =
				jobacct->tres_usage_out_tot[TRES_ARRAY_FS_DISK];
			jobacct->last_total_cputime =
				jobacct->tres_usage_in_tot[TRES_ARRAY_CPU];

			jobacct->last_time = jobacct->cur_time;
		}
	}
	list_iterator_destroy(itr);

	/* Cache the JobAcctGatherParam OverMemoryKill setting on first use. */
	if (over_memory_kill == -1)
		over_memory_kill = slurm_get_job_acct_oom_kill();

	if (over_memory_kill)
		jobacct_gather_handle_mem_limit(total_job_mem,
						total_job_vsize);

finished:
	FREE_NULL_LIST(prec_list);
	processing = 0;
}