/*
 * slurm_job_step_get_pids - get the complete list of pids for a given
 *	job step
 *
 * IN job_id    - job to query
 * IN step_id   - step within the job
 * IN node_list - optional, if NULL then all nodes in the step are queried
 * OUT resp     - response message; if *resp is NULL a new message is
 *	allocated here (and freed again on send failure), otherwise the
 *	caller's existing message is appended to
 * RET SLURM_SUCCESS on success SLURM_ERROR else
 */
extern int slurm_job_step_get_pids(uint32_t job_id, uint32_t step_id,
				   char *node_list,
				   job_step_pids_response_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t req_msg;
	job_step_id_msg_t req;
	ListIterator itr;
	List ret_list = NULL;
	ret_data_info_t *ret_data_info = NULL;
	slurm_step_layout_t *step_layout = NULL;
	job_step_pids_response_msg_t *resp_out;
	bool created = 0;	/* true iff we allocated *resp ourselves */

	xassert(resp);

	/* No node list given: derive it from the step's layout.  The
	 * layout (and thus node_list) stays alive until the cleanup
	 * label destroys it. */
	if(!node_list) {
		if(!(step_layout =
		     slurm_job_step_layout_get(job_id, step_id))) {
			rc = errno;
			error("slurm_job_step_get_pids: "
			      "problem getting step_layout for %u.%u: %s",
			      job_id, step_id, slurm_strerror(rc));
			return rc;
		}
		node_list = step_layout->node_list;
	}

	/* Allocate the response message only if the caller did not
	 * supply one; remember which case we are in so the error path
	 * below frees only what we created. */
	if(!*resp) {
		resp_out = xmalloc(sizeof(job_step_pids_response_msg_t));
		*resp = resp_out;
		created = 1;
	} else
		resp_out = *resp;

	debug("slurm_job_step_get_pids: "
	      "getting pid information of job %u.%u on nodes %s",
	      job_id, step_id, node_list);

	slurm_msg_t_init(&req_msg);

	memset(&req, 0, sizeof(job_step_id_msg_t));
	resp_out->job_id = req.job_id = job_id;
	resp_out->step_id = req.step_id = step_id;

	req_msg.msg_type = REQUEST_JOB_STEP_PIDS;
	req_msg.data = &req;

	/* Fan the request out to every node in node_list; a NULL return
	 * means the send itself failed (not a per-node error). */
	if(!(ret_list = slurm_send_recv_msgs(node_list, &req_msg, 0,
					     false))) {
		error("slurm_job_step_get_pids: "
		      "got an error no list returned");
		rc = SLURM_ERROR;
		if(created) {
			slurm_job_step_pids_response_msg_free(resp_out);
			*resp = NULL;
		}
		goto cleanup;
	}

	itr = list_iterator_create(ret_list);
	while((ret_data_info = list_next(itr))) {
		switch (ret_data_info->type) {
		case RESPONSE_JOB_STEP_PIDS:
			if(!resp_out->pid_list)
				resp_out->pid_list = list_create(
					slurm_free_job_step_pids);
			/* Transfer ownership of the per-node pids into
			 * the response; NULL-ing data keeps the
			 * ret_list destructor from freeing it. */
			list_push(resp_out->pid_list,
				  ret_data_info->data);
			ret_data_info->data = NULL;
			break;
		case RESPONSE_SLURM_RC:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_get_pids: "
			      "there was an error with the "
			      "list pid request rc = %s",
			      slurm_strerror(rc));
			break;
		default:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_get_pids: "
			      "unknown return given %d rc = %s",
			      ret_data_info->type, slurm_strerror(rc));
			break;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(ret_list);

	/* Present the per-node pid lists in a stable order. */
	if(resp_out->pid_list)
		list_sort(resp_out->pid_list,
			  (ListCmpF)_sort_pids_by_name);

cleanup:
	/* Safe when step_layout is NULL (node_list was caller-supplied). */
	slurm_step_layout_destroy(step_layout);
	return rc;
}
int _do_stat(uint32_t jobid, uint32_t stepid, char *nodelist, uint32_t req_cpufreq) { job_step_stat_response_msg_t *step_stat_response = NULL; int rc = SLURM_SUCCESS; ListIterator itr; slurmdb_stats_t temp_stats; job_step_stat_t *step_stat = NULL; int ntasks = 0; int tot_tasks = 0; hostlist_t hl = NULL; debug("requesting info for job %u.%u", jobid, stepid); if ((rc = slurm_job_step_stat(jobid, stepid, nodelist, &step_stat_response)) != SLURM_SUCCESS) { if (rc == ESLURM_INVALID_JOB_ID) { debug("job step %u.%u has already completed", jobid, stepid); } else { error("problem getting step_layout for %u.%u: %s", jobid, stepid, slurm_strerror(rc)); } return rc; } memset(&job, 0, sizeof(slurmdb_job_rec_t)); job.jobid = jobid; memset(&step, 0, sizeof(slurmdb_step_rec_t)); memset(&temp_stats, 0, sizeof(slurmdb_stats_t)); temp_stats.cpu_min = NO_VAL; memset(&step.stats, 0, sizeof(slurmdb_stats_t)); step.stats.cpu_min = NO_VAL; step.job_ptr = &job; step.stepid = stepid; step.nodes = xmalloc(BUF_SIZE); step.req_cpufreq = req_cpufreq; step.stepname = NULL; step.state = JOB_RUNNING; hl = hostlist_create(NULL); itr = list_iterator_create(step_stat_response->stats_list); while ((step_stat = list_next(itr))) { if (!step_stat->step_pids || !step_stat->step_pids->node_name) continue; if (step_stat->step_pids->pid_cnt > 0 ) { int i; for(i=0; i<step_stat->step_pids->pid_cnt; i++) { if (step.pid_str) xstrcat(step.pid_str, ","); xstrfmtcat(step.pid_str, "%u", step_stat->step_pids->pid[i]); } } if (params.pid_format) { step.nodes = step_stat->step_pids->node_name; print_fields(&step); xfree(step.pid_str); } else { hostlist_push(hl, step_stat->step_pids->node_name); jobacctinfo_2_stats(&temp_stats, step_stat->jobacct); ntasks += step_stat->num_tasks; aggregate_stats(&step.stats, &temp_stats); } } list_iterator_destroy(itr); slurm_job_step_pids_response_msg_free(step_stat_response); /* we printed it out already */ if (params.pid_format) return rc; hostlist_sort(hl); 
hostlist_ranged_string(hl, BUF_SIZE, step.nodes); hostlist_destroy(hl); tot_tasks += ntasks; if (tot_tasks) { step.stats.cpu_ave /= (double)tot_tasks; step.stats.rss_ave /= (double)tot_tasks; step.stats.vsize_ave /= (double)tot_tasks; step.stats.pages_ave /= (double)tot_tasks; step.stats.disk_read_ave /= (double)tot_tasks; step.stats.disk_write_ave /= (double)tot_tasks; step.stats.act_cpufreq /= (double)tot_tasks; step.ntasks = tot_tasks; } print_fields(&step); return rc; }