Example #1
/*
 * slurm_job_step_stat - report the status of a currently running step
 *
 * IN job_id
 * IN step_id
 * IN node_list - optional; if NULL, all nodes in the step are queried.
 * OUT resp
 * RET SLURM_SUCCESS on success, SLURM_ERROR otherwise
 */
extern int slurm_job_step_stat(uint32_t job_id, uint32_t step_id,
			       char *node_list,
			       job_step_stat_response_msg_t **resp)
{
	slurm_msg_t req_msg;
	ListIterator itr;
	job_step_id_msg_t req;
	List ret_list = NULL;
	ret_data_info_t *ret_data_info = NULL;
	int rc = SLURM_SUCCESS;
	slurm_step_layout_t *step_layout = NULL;
	job_step_stat_response_msg_t *resp_out;
	bool created = false;

	xassert(resp);

	if (!node_list) {
		if (!(step_layout =
		      slurm_job_step_layout_get(job_id, step_id))) {
			rc = errno;
			error("slurm_job_step_stat: "
			      "problem getting step_layout for %u.%u: %s",
			      job_id, step_id, slurm_strerror(rc));
			return rc;
		}
		node_list = step_layout->node_list;
	}

 	if (!*resp) {
		resp_out = xmalloc(sizeof(job_step_stat_response_msg_t));
		*resp = resp_out;
		created = 1;
	} else
		resp_out = *resp;

        debug("slurm_job_step_stat: "
	      "getting pid information of job %u.%u on nodes %s",
              job_id, step_id, node_list);

	slurm_msg_t_init(&req_msg);

	memset(&req, 0, sizeof(job_step_id_msg_t));
	resp_out->job_id = req.job_id = job_id;
	resp_out->step_id = req.step_id = step_id;

	req_msg.msg_type = REQUEST_JOB_STEP_STAT;
	req_msg.data = &req;

	if (!(ret_list = slurm_send_recv_msgs(node_list, &req_msg, 0, false))) {
		error("slurm_job_step_stat: no response list returned");
		rc = SLURM_ERROR;
		if (created) {
			slurm_job_step_stat_response_msg_free(resp_out);
			*resp = NULL;
		}
		goto cleanup;
	}

	itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr))) {
		switch (ret_data_info->type) {
		case RESPONSE_JOB_STEP_STAT:
			if (!resp_out->stats_list)
				resp_out->stats_list = list_create(
					slurm_free_job_step_stat);
			list_push(resp_out->stats_list,
				  ret_data_info->data);
			ret_data_info->data = NULL;
 			break;
		case RESPONSE_SLURM_RC:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_stat: "
			      "there was an error with the request to "
			      "%s rc = %s",
			      ret_data_info->node_name, slurm_strerror(rc));
			break;
		default:
			rc = slurm_get_return_code(ret_data_info->type,
						   ret_data_info->data);
			error("slurm_job_step_stat: "
			      "unknown return given from %s: %d rc = %s",
			      ret_data_info->node_name, ret_data_info->type,
			      slurm_strerror(rc));
			break;
		}
	}
	list_iterator_destroy(itr);
	list_destroy(ret_list);

	if (resp_out->stats_list)
		list_sort(resp_out->stats_list, (ListCmpF)_sort_stats_by_name);
cleanup:
	slurm_step_layout_destroy(step_layout);

	return rc;
}
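
A minimal calling sketch for the routine above. The job and step IDs are made up for illustration; passing NULL for node_list lets the function resolve the step's nodes itself, and the response is released with the same free routine used in the error path.

	job_step_stat_response_msg_t *resp = NULL;
	int rc;

	/* NULL node_list: the function looks up the step layout itself. */
	rc = slurm_job_step_stat(1234, 0, NULL, &resp);
	if (rc != SLURM_SUCCESS)
		error("stat of 1234.0 failed: %s", slurm_strerror(rc));
	else
		slurm_job_step_stat_response_msg_free(resp);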
Example #2
File: env.c Project: donaghy1/slurm
/*
 * Set in "dest" the environment variables relevant to a SLURM job
 * allocation, overwriting any environment variables of the same name.
 * If the address pointed to by "dest" is NULL, memory will automatically be
 * xmalloc'ed.  The array is terminated by a NULL pointer, and thus is
 * suitable for use by execle() and other env_array_* functions.
 *
 * Sets the variables:
 *	SLURM_JOB_ID
 *	SLURM_JOB_NUM_NODES
 *	SLURM_JOB_NODELIST
 *	SLURM_JOB_CPUS_PER_NODE
 *	LOADLBATCH (AIX only)
 *	SLURM_BG_NUM_NODES, MPIRUN_PARTITION, MPIRUN_NOFREE, and
 *	MPIRUN_NOALLOCATE (BG only)
 *
 * Sets OBSOLETE variables (needed for MPI, do not remove):
 *	SLURM_JOBID
 *	SLURM_NNODES
 *	SLURM_NODELIST
 *	SLURM_TASKS_PER_NODE
 */
int
env_array_for_job(char ***dest, const resource_allocation_response_msg_t *alloc,
		  const job_desc_msg_t *desc)
{
	char *tmp = NULL;
	char *dist = NULL, *lllp_dist = NULL;
	slurm_step_layout_t *step_layout = NULL;
	uint32_t num_tasks = desc->num_tasks;
	int rc = SLURM_SUCCESS;
	uint32_t node_cnt = alloc->node_cnt;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	_setup_particulars(cluster_flags, dest, alloc->select_jobinfo);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(alloc->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if (!node_cnt)
			node_cnt = alloc->node_cnt;

		env_array_overwrite_fmt(dest, "SLURM_BG_NUM_NODES",
					"%u", node_cnt);
	}

	env_array_overwrite_fmt(dest, "SLURM_JOB_ID", "%u", alloc->job_id);
	env_array_overwrite_fmt(dest, "SLURM_JOB_NUM_NODES", "%u", node_cnt);
	env_array_overwrite_fmt(dest, "SLURM_JOB_NODELIST", "%s",
				alloc->node_list);

	_set_distribution(desc->task_dist, &dist, &lllp_dist);
	if (dist)
		env_array_overwrite_fmt(dest, "SLURM_DISTRIBUTION", "%s",
					dist);

	if (desc->task_dist == SLURM_DIST_PLANE)
		env_array_overwrite_fmt(dest, "SLURM_DIST_PLANESIZE",
					"%u", desc->plane_size);

	if (lllp_dist)
		env_array_overwrite_fmt(dest, "SLURM_DIST_LLLP", "%s",
					lllp_dist);

	tmp = uint32_compressed_to_str(alloc->num_cpu_groups,
					alloc->cpus_per_node,
					alloc->cpu_count_reps);
	env_array_overwrite_fmt(dest, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp);
	xfree(tmp);

	/* OBSOLETE, but needed by MPI, do not remove */
	env_array_overwrite_fmt(dest, "SLURM_JOBID", "%u", alloc->job_id);
	env_array_overwrite_fmt(dest, "SLURM_NNODES", "%u", node_cnt);
	env_array_overwrite_fmt(dest, "SLURM_NODELIST", "%s", alloc->node_list);

	if (num_tasks == NO_VAL) {
		/* No task count was given, so derive one by totalling
		 * up the allocated CPUs and dividing by the number of
		 * CPUs per task; the result only feeds the
		 * SLURM_TASKS_PER_NODE layout below. */
		int i;

		num_tasks = 0;
		for (i = 0; i < alloc->num_cpu_groups; i++) {
			num_tasks += alloc->cpu_count_reps[i]
				* alloc->cpus_per_node[i];
		}
		if ((int)desc->cpus_per_task > 1
		    && desc->cpus_per_task != (uint16_t)NO_VAL)
			num_tasks /= desc->cpus_per_task;
		//num_tasks = desc->min_cpus;
	}

	if (desc->task_dist == SLURM_DIST_ARBITRARY) {
		tmp = desc->req_nodes;
		env_array_overwrite_fmt(dest, "SLURM_ARBITRARY_NODELIST",
					"%s", tmp);
	} else
		tmp = alloc->node_list;

	if(!(step_layout = slurm_step_layout_create(tmp,
						    alloc->cpus_per_node,
						    alloc->cpu_count_reps,
						    node_cnt,
						    num_tasks,
						    desc->cpus_per_task,
						    desc->task_dist,
						    desc->plane_size)))
		return SLURM_ERROR;

	tmp = _uint16_array_to_str(step_layout->node_cnt,
				   step_layout->tasks);
	slurm_step_layout_destroy(step_layout);
	env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", "%s", tmp);
	xfree(tmp);
	return rc;
}
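
A hedged usage sketch for env_array_for_job(): here "alloc" is assumed to come from a successful slurm_allocate_resources() call, "desc" is the job_desc_msg_t that requested it, and env_array_create()/env_array_free() are the companion helpers in env.c.

	char **env = env_array_create();

	if (env_array_for_job(&env, alloc, &desc) != SLURM_SUCCESS)
		error("failed to build job environment");
	/* "env" is NULL-terminated, so it can be handed to execle()
	 * or merged into the caller's environment. */
	env_array_free(env);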
Example #3
extern void slurm_job_step_layout_free(slurm_step_layout_t *layout)
{
	slurm_step_layout_destroy(layout);
}
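
A short pairing sketch for the wrapper above, assuming the layout was obtained with slurm_job_step_layout_get() (as in example #1); the job and step IDs are hypothetical.

	slurm_step_layout_t *layout = slurm_job_step_layout_get(1234, 0);

	if (layout) {
		debug("step 1234.0 runs %u tasks on %u nodes (%s)",
		      layout->task_cnt, layout->node_cnt,
		      layout->node_list);
		slurm_job_step_layout_free(layout);
	}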
Example #4
File: env.c Project: donaghy1/slurm
/*
 * Set in "dest" the environment variables strings relevant to a SLURM batch
 * job allocation, overwriting any environment variables of the same name.
 * If the address pointed to by "dest" is NULL, memory will automatically be
 * xmalloc'ed.  The array is terminated by a NULL pointer, and thus is
 * suitable for use by execle() and other env_array_* functions.
 *
 * Sets the variables:
 *	SLURM_JOB_ID
 *	SLURM_JOB_NUM_NODES
 *	SLURM_JOB_NODELIST
 *	SLURM_JOB_CPUS_PER_NODE
 *	ENVIRONMENT=BATCH
 *	HOSTNAME
 *	LOADLBATCH (AIX only)
 *
 * Sets OBSOLETE variables (needed for MPI, do not remove):
 *	SLURM_JOBID
 *	SLURM_NNODES
 *	SLURM_NODELIST
 *	SLURM_NTASKS
 *	SLURM_TASKS_PER_NODE
 */
extern int
env_array_for_batch_job(char ***dest, const batch_job_launch_msg_t *batch,
			const char *node_name)
{
	char *tmp = NULL;
	uint32_t num_nodes = 0;
	uint32_t num_cpus = 0;
	int i;
	slurm_step_layout_t *step_layout = NULL;
	uint32_t num_tasks = batch->ntasks;
	uint16_t cpus_per_task;
	uint16_t task_dist;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	_setup_particulars(cluster_flags, dest, batch->select_jobinfo);

	/* There is no explicit node count in the batch structure,
	 * so we need to calculate the node count. */
	for (i = 0; i < batch->num_cpu_groups; i++) {
		num_nodes += batch->cpu_count_reps[i];
		num_cpus += batch->cpu_count_reps[i] * batch->cpus_per_node[i];
	}

	env_array_overwrite_fmt(dest, "SLURM_JOB_ID", "%u", batch->job_id);
	env_array_overwrite_fmt(dest, "SLURM_JOB_NUM_NODES", "%u", num_nodes);
	if (cluster_flags & CLUSTER_FLAG_BG)
		env_array_overwrite_fmt(dest, "SLURM_BG_NUM_NODES",
					"%u", num_nodes);

	env_array_overwrite_fmt(dest, "SLURM_JOB_NODELIST", "%s", batch->nodes);

	tmp = uint32_compressed_to_str(batch->num_cpu_groups,
				       batch->cpus_per_node,
				       batch->cpu_count_reps);
	env_array_overwrite_fmt(dest, "SLURM_JOB_CPUS_PER_NODE", "%s", tmp);
	xfree(tmp);

	env_array_overwrite_fmt(dest, "ENVIRONMENT", "BATCH");
	if (node_name)
		env_array_overwrite_fmt(dest, "HOSTNAME", "%s", node_name);

	/* OBSOLETE, but needed by MPI, do not remove */
	env_array_overwrite_fmt(dest, "SLURM_JOBID", "%u", batch->job_id);
	env_array_overwrite_fmt(dest, "SLURM_NNODES", "%u", num_nodes);
	env_array_overwrite_fmt(dest, "SLURM_NODELIST", "%s", batch->nodes);

	if ((batch->cpus_per_task != 0) &&
	    (batch->cpus_per_task != (uint16_t) NO_VAL))
		cpus_per_task = batch->cpus_per_task;
	else
		cpus_per_task = 1;	/* default value */
	if (cpus_per_task > 1) {
		env_array_overwrite_fmt(dest, "SLURM_CPUS_PER_TASK", "%u",
					cpus_per_task);
	}

	if (num_tasks) {
		env_array_overwrite_fmt(dest, "SLURM_NTASKS", "%u",
					num_tasks);
		/* keep around for old scripts */
		env_array_overwrite_fmt(dest, "SLURM_NPROCS", "%u",
					num_tasks);
	} else {
		num_tasks = num_cpus / cpus_per_task;
	}

	if ((tmp = getenvp(*dest, "SLURM_ARBITRARY_NODELIST"))) {
		task_dist = SLURM_DIST_ARBITRARY;
	} else {
		tmp = batch->nodes;
		task_dist = SLURM_DIST_BLOCK;
	}

	if(!(step_layout = slurm_step_layout_create(tmp,
						    batch->cpus_per_node,
						    batch->cpu_count_reps,
						    num_nodes,
						    num_tasks,
						    cpus_per_task,
						    task_dist,
						    (uint16_t)NO_VAL)))
		return SLURM_ERROR;

	tmp = _uint16_array_to_str(step_layout->node_cnt,
				   step_layout->tasks);
	slurm_step_layout_destroy(step_layout);
	env_array_overwrite_fmt(dest, "SLURM_TASKS_PER_NODE", "%s", tmp);
	xfree(tmp);
	return SLURM_SUCCESS;
}
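
A hedged sketch of how slurmd-side code might call the routine above; "msg" stands in for the received batch_job_launch_msg_t and "node_name" for the local hostname, while env_array_create(), env_array_free() and getenvp() are the helpers from env.c.

	char **env = env_array_create();

	if (env_array_for_batch_job(&env, msg, node_name) == SLURM_SUCCESS)
		debug("batch env built, SLURM_TASKS_PER_NODE=%s",
		      getenvp(env, "SLURM_TASKS_PER_NODE"));
	env_array_free(env);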
Example #5
extern int unpack_slurm_step_layout(slurm_step_layout_t **layout, Buf buffer,
				    uint16_t protocol_version)
{
	uint16_t uint16_tmp;
	uint32_t num_tids, uint32_tmp;
	slurm_step_layout_t *step_layout = NULL;
	int i;

	if (protocol_version >= SLURM_2_3_PROTOCOL_VERSION) {
		safe_unpack16(&uint16_tmp, buffer);
		if (!uint16_tmp)
			return SLURM_SUCCESS;

		step_layout = xmalloc(sizeof(slurm_step_layout_t));
		*layout = step_layout;

		safe_unpackstr_xmalloc(&step_layout->front_end,
				       &uint32_tmp, buffer);
		safe_unpackstr_xmalloc(&step_layout->node_list,
				       &uint32_tmp, buffer);
		safe_unpack32(&step_layout->node_cnt, buffer);
		safe_unpack32(&step_layout->task_cnt, buffer);
		safe_unpack16(&step_layout->task_dist, buffer);

		step_layout->tasks =
			xmalloc(sizeof(uint32_t) * step_layout->node_cnt);
		step_layout->tids = xmalloc(sizeof(uint32_t *)
					    * step_layout->node_cnt);
		for (i = 0; i < step_layout->node_cnt; i++) {
			safe_unpack32_array(&(step_layout->tids[i]),
					    &num_tids,
					    buffer);
			step_layout->tasks[i] = num_tids;
		}
	} else if (protocol_version >= SLURM_2_1_PROTOCOL_VERSION) {
		safe_unpack16(&uint16_tmp, buffer);
		if (!uint16_tmp)
			return SLURM_SUCCESS;

		step_layout = xmalloc(sizeof(slurm_step_layout_t));
		*layout = step_layout;

		safe_unpackstr_xmalloc(&step_layout->node_list,
				       &uint32_tmp, buffer);
		safe_unpack32(&step_layout->node_cnt, buffer);
		safe_unpack32(&step_layout->task_cnt, buffer);
		safe_unpack16(&step_layout->task_dist, buffer);

		step_layout->tasks =
			xmalloc(sizeof(uint32_t) * step_layout->node_cnt);
		step_layout->tids = xmalloc(sizeof(uint32_t *)
					    * step_layout->node_cnt);
		for (i = 0; i < step_layout->node_cnt; i++) {
			safe_unpack32_array(&(step_layout->tids[i]),
					    &num_tids,
					    buffer);
			step_layout->tasks[i] = num_tids;
		}
	}
	return SLURM_SUCCESS;

unpack_error:
	slurm_step_layout_destroy(step_layout);
	*layout = NULL;
	return SLURM_ERROR;
}
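
A hedged round-trip sketch for the unpack routine: pack_slurm_step_layout() is assumed to be the matching serializer, the buffer helpers (init_buf, set_buf_offset, free_buf) are the ones from SLURM's pack.h, and "orig" is an existing layout.

	Buf buffer = init_buf(BUF_SIZE);
	slurm_step_layout_t *copy = NULL;

	pack_slurm_step_layout(orig, buffer, SLURM_PROTOCOL_VERSION);
	set_buf_offset(buffer, 0);	/* rewind before unpacking */

	if (unpack_slurm_step_layout(&copy, buffer,
				     SLURM_PROTOCOL_VERSION) != SLURM_SUCCESS)
		error("could not unpack step layout");
	slurm_step_layout_destroy(copy);
	free_buf(buffer);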