Ejemplo n.º 1
0
/*
 * slurm_job_step_get_pids - get the complete list of pids for a given
 *      job step
 *
 * IN job_id
 * IN step_id
 * IN node_list, optional, if NULL then all nodes in step are returned.
 * OUT resp
 * RET SLURM_SUCCESS on success SLURM_ERROR else
 */
extern int slurm_job_step_get_pids(uint32_t job_id, uint32_t step_id,
				   char *node_list,
				   job_step_pids_response_msg_t **resp)
{
        int rc = SLURM_SUCCESS;
        slurm_msg_t req_msg;
        job_step_id_msg_t req;
        ListIterator itr;
        List ret_list = NULL;
        ret_data_info_t *ret_data_info = NULL;
	slurm_step_layout_t *step_layout = NULL;
	job_step_pids_response_msg_t *resp_out;
	bool created = 0;

	xassert(resp);

	if(!node_list) {
		if(!(step_layout =
		     slurm_job_step_layout_get(job_id, step_id))) {
			rc = errno;
			error("slurm_job_step_get_pids: "
			      "problem getting step_layout for %u.%u: %s",
			      job_id, step_id, slurm_strerror(rc));
			return rc;
		}
		node_list = step_layout->node_list;
	}

	if(!*resp) {
		resp_out = xmalloc(sizeof(job_step_pids_response_msg_t));
		*resp = resp_out;
		created = 1;
	} else
		resp_out = *resp;

        debug("slurm_job_step_get_pids: "
	      "getting pid information of job %u.%u on nodes %s",
              job_id, step_id, node_list);

	slurm_msg_t_init(&req_msg);

	memset(&req, 0, sizeof(job_step_id_msg_t));
        resp_out->job_id = req.job_id = job_id;
	resp_out->step_id = req.step_id = step_id;

	req_msg.msg_type = REQUEST_JOB_STEP_PIDS;
        req_msg.data = &req;

        if(!(ret_list = slurm_send_recv_msgs(node_list,
					     &req_msg, 0, false))) {
                error("slurm_job_step_get_pids: got an error no list returned");
                rc = SLURM_ERROR;
		if(created) {
			slurm_job_step_pids_response_msg_free(resp_out);
			*resp = NULL;
		}
		goto cleanup;
        }

        itr = list_iterator_create(ret_list);
        while((ret_data_info = list_next(itr))) {
                switch (ret_data_info->type) {
			case RESPONSE_JOB_STEP_PIDS:
				if(!resp_out->pid_list)
					resp_out->pid_list = list_create(
						slurm_free_job_step_pids);
				list_push(resp_out->pid_list,
					  ret_data_info->data);
				ret_data_info->data = NULL;
                              break;
                      case RESPONSE_SLURM_RC:
                              rc = slurm_get_return_code(ret_data_info->type,
                                                         ret_data_info->data);
                              error("slurm_job_step_get_pids: "
				    "there was an error with the "
				    "list pid request rc = %s",
                                    slurm_strerror(rc));
                              break;
                      default:
                              rc = slurm_get_return_code(ret_data_info->type,
                                                         ret_data_info->data);
                              error("slurm_job_step_get_pids: "
				    "unknown return given %d rc = %s",
                                    ret_data_info->type, slurm_strerror(rc));
                              break;
                }
        }
        list_iterator_destroy(itr);
        list_destroy(ret_list);

 	if(resp_out->pid_list)
		list_sort(resp_out->pid_list, (ListCmpF)_sort_pids_by_name);
cleanup:
	slurm_step_layout_destroy(step_layout);

        return rc;
}
Ejemplo n.º 2
0
Archivo: sstat.c Proyecto: Cray/slurm
int _do_stat(uint32_t jobid, uint32_t stepid, char *nodelist,
	     uint32_t req_cpufreq)
{
	job_step_stat_response_msg_t *step_stat_response = NULL;
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	slurmdb_stats_t temp_stats;
	job_step_stat_t *step_stat = NULL;
	int ntasks = 0;
	int tot_tasks = 0;
	hostlist_t hl = NULL;

	debug("requesting info for job %u.%u", jobid, stepid);
	if ((rc = slurm_job_step_stat(jobid, stepid, nodelist,
				      &step_stat_response)) != SLURM_SUCCESS) {
		if (rc == ESLURM_INVALID_JOB_ID) {
			debug("job step %u.%u has already completed",
			      jobid, stepid);
		} else {
			error("problem getting step_layout for %u.%u: %s",
			      jobid, stepid, slurm_strerror(rc));
		}
		return rc;
	}

	memset(&job, 0, sizeof(slurmdb_job_rec_t));
	job.jobid = jobid;

	memset(&step, 0, sizeof(slurmdb_step_rec_t));

	memset(&temp_stats, 0, sizeof(slurmdb_stats_t));
	temp_stats.cpu_min = NO_VAL;
	memset(&step.stats, 0, sizeof(slurmdb_stats_t));
	step.stats.cpu_min = NO_VAL;

	step.job_ptr = &job;
	step.stepid = stepid;
	step.nodes = xmalloc(BUF_SIZE);
	step.req_cpufreq = req_cpufreq;
	step.stepname = NULL;
	step.state = JOB_RUNNING;

	hl = hostlist_create(NULL);
	itr = list_iterator_create(step_stat_response->stats_list);
	while ((step_stat = list_next(itr))) {
		if (!step_stat->step_pids || !step_stat->step_pids->node_name)
			continue;
		if (step_stat->step_pids->pid_cnt > 0 ) {
			int i;
			for(i=0; i<step_stat->step_pids->pid_cnt; i++) {
				if (step.pid_str)
					xstrcat(step.pid_str, ",");
				xstrfmtcat(step.pid_str, "%u",
					   step_stat->step_pids->pid[i]);
			}
		}

		if (params.pid_format) {
			step.nodes = step_stat->step_pids->node_name;
			print_fields(&step);
			xfree(step.pid_str);
		} else {
			hostlist_push(hl, step_stat->step_pids->node_name);
			jobacctinfo_2_stats(&temp_stats, step_stat->jobacct);
			ntasks += step_stat->num_tasks;
			aggregate_stats(&step.stats, &temp_stats);
		}
	}
	list_iterator_destroy(itr);
	slurm_job_step_pids_response_msg_free(step_stat_response);
	/* we printed it out already */
	if (params.pid_format)
		return rc;

	hostlist_sort(hl);
	hostlist_ranged_string(hl, BUF_SIZE, step.nodes);
	hostlist_destroy(hl);
	tot_tasks += ntasks;

	if (tot_tasks) {
		step.stats.cpu_ave /= (double)tot_tasks;
		step.stats.rss_ave /= (double)tot_tasks;
		step.stats.vsize_ave /= (double)tot_tasks;
		step.stats.pages_ave /= (double)tot_tasks;
		step.stats.disk_read_ave /= (double)tot_tasks;
		step.stats.disk_write_ave /= (double)tot_tasks;
		step.stats.act_cpufreq /= (double)tot_tasks;
		step.ntasks = tot_tasks;
	}

	print_fields(&step);

	return rc;
}