Code example #1
File: allocate.c  Project: VURM/slurm
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN job_desc_msg - description of resource allocation request
 * OUT slurm_alloc_msg - response to request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int
slurm_allocate_resources (job_desc_msg_t *req,
			  resource_allocation_response_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	bool host_set = false;
	char host[64];

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if ( (req->alloc_node == NULL)
	    && (gethostname_short(host, sizeof(host)) == 0) ) {
		req->alloc_node = host;
		host_set = true;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	/*
	 *  Clear this hostname if set internally to this function
	 *    (memory is on the stack)
	 */
	if (host_set)
		req->alloc_node = NULL;

	if (rc == SLURM_SOCKET_ERROR)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		*resp = (resource_allocation_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
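
Usage sketch (not part of the project source): a minimal caller for slurm_allocate_resources(), assuming the public <slurm/slurm.h> API and that the job_desc_msg_t fields used here (name, min_nodes, user_id, ...) match your Slurm release. The response is released with slurm_free_resource_allocation_response_msg(), as the NOTE above requires.

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int main(void)
{
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *alloc = NULL;

	slurm_init_job_desc_msg(&job_req);	/* fill every field with defaults */
	job_req.name      = "alloc_example";
	job_req.min_nodes = 1;
	job_req.max_nodes = 1;
	job_req.user_id   = getuid();
	job_req.group_id  = getgid();

	if (slurm_allocate_resources(&job_req, &alloc) != SLURM_SUCCESS) {
		slurm_perror("slurm_allocate_resources");
		return 1;
	}
	if (alloc) {
		if (alloc->node_cnt > 0)
			printf("job %u allocated on %s\n",
			       alloc->job_id, alloc->node_list);
		else
			printf("job %u is pending\n", alloc->job_id);
		slurm_free_resource_allocation_response_msg(alloc);
	}
	return 0;
}
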
Code example #2
File: update_config.c  Project: SchedMD/slurm
/*
 * slurm_update_job2 - issue RPC to update a job's configuration per request,
 *	only usable by user root or (for some parameters) the job's owner
 * IN job_msg - description of job updates
 * OUT resp - per task response to the request,
 *	      free using slurm_free_job_array_resp()
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int
slurm_update_job2 (job_desc_msg_t * job_msg, job_array_resp_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t req_msg, resp_msg;
	slurmdb_cluster_rec_t *save_working_cluster_rec = working_cluster_rec;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type	= REQUEST_UPDATE_JOB;
	req_msg.data		= job_msg;

tryagain:
	slurm_msg_t_init(&resp_msg);

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_REROUTE_MSG:
	{
		reroute_msg_t *rr_msg = (reroute_msg_t *)resp_msg.data;

		/* Don't expect multiple hops, but in case it does
		 * happen, free the previous rr cluster_rec. */
		if (working_cluster_rec &&
		    working_cluster_rec != save_working_cluster_rec)
			slurmdb_destroy_cluster_rec(
						working_cluster_rec);

		working_cluster_rec = rr_msg->working_cluster_rec;
		slurmdb_setup_cluster_rec(working_cluster_rec);
		rr_msg->working_cluster_rec = NULL;
		goto tryagain;
	}
	case RESPONSE_JOB_ARRAY_ERRORS:
		*resp = (job_array_resp_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno(rc);
		break;
	default:
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
	}

	if (working_cluster_rec != save_working_cluster_rec) {
		slurmdb_destroy_cluster_rec(working_cluster_rec);
		working_cluster_rec = save_working_cluster_rec;
	}

	return rc;
}
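
Usage sketch (not part of the project source): changing the time limit of an existing job through slurm_update_job2(), assuming job_desc_msg_t carries job_id and time_limit (in minutes) as in current slurm.h. Only the fields to change are filled in; the per-task response is freed with slurm_free_job_array_resp() as documented above.

#include <stdint.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

/* Bump the time limit of job (or job array) job_id to new_minutes. */
int update_time_limit(uint32_t job_id, uint32_t new_minutes)
{
	job_desc_msg_t update;
	job_array_resp_msg_t *resp = NULL;
	int rc;

	slurm_init_job_desc_msg(&update);	/* untouched fields mean "no change" */
	update.job_id     = job_id;
	update.time_limit = new_minutes;

	rc = slurm_update_job2(&update, &resp);
	if (resp)
		slurm_free_job_array_resp(resp);
	return rc;
}
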
Code example #3
File: checkpoint.c  Project: HPCNow/slurm
/*
 * slurm_checkpoint_error - gather error information for the last checkpoint
 *	operation for some job step
 * IN job_id  - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * OUT error_code - error number associated with the last checkpoint operation,
 *	this value is dependent upon the checkpoint plugin used and may be
 *	completely unrelated to slurm error codes, the highest value for all
 *	complete calls is preserved
 * OUT error_msg - error message, preserved for highest error_code, value
 *	must be freed by the caller to prevent memory leak
 * RET 0 or a slurm error code
 */
extern int slurm_checkpoint_error (uint32_t job_id, uint32_t step_id,
				   uint32_t *error_code, char **error_msg)
{
	int rc;
	slurm_msg_t msg;
	checkpoint_msg_t req;
	slurm_msg_t resp_msg;
	checkpoint_resp_msg_t *ckpt_resp;

	if ((error_code == NULL) || (error_msg == NULL))
		return EINVAL;

	/*
	 * Request message:
	 */
	req.op        = CHECK_ERROR;
	req.job_id    = job_id;
	req.step_id   = step_id;
	req.image_dir = NULL;
	slurm_msg_t_init(&msg);
	slurm_msg_t_init(&resp_msg);
	msg.msg_type  = REQUEST_CHECKPOINT;
	msg.data      = &req;

	rc = slurm_send_recv_controller_msg(&msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_SOCKET_ERROR)
		return rc;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		*error_code = 0;
		*error_msg = strdup("");
		rc = _handle_rc_msg(&resp_msg);
		break;
	case RESPONSE_CHECKPOINT:
		ckpt_resp = (checkpoint_resp_msg_t *) resp_msg.data;
		*error_code = ckpt_resp->error_code;
		if (ckpt_resp->error_msg)
			*error_msg = strdup(ckpt_resp->error_msg);
		else
			*error_msg = strdup("");
		slurm_free_checkpoint_resp_msg(ckpt_resp);
		rc = SLURM_SUCCESS;
		break;
	default:
		rc = SLURM_UNEXPECTED_MSG_ERROR;
	}

	return rc;
}
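
Usage sketch (not part of the project source): retrieving and printing the last checkpoint error for a step. Because the message is strdup()'d above, the caller releases it with plain free().

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int print_checkpoint_error(uint32_t job_id, uint32_t step_id)
{
	uint32_t error_code = 0;
	char *error_msg = NULL;
	int rc;

	rc = slurm_checkpoint_error(job_id, step_id, &error_code, &error_msg);
	if (rc == SLURM_SUCCESS)
		printf("step %u.%u: checkpoint error %u (%s)\n",
		       job_id, step_id, error_code,
		       error_msg ? error_msg : "");
	free(error_msg);	/* strdup()'d by slurm_checkpoint_error() */
	return rc;
}
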
Code example #4
File: submit.c  Project: chrisdukey/slurm
/*
 * slurm_submit_batch_pack_job - issue RPC to submit a heterogeneous job for
 *				 later execution
 * NOTE: free the response using slurm_free_submit_response_response_msg
 * IN job_req_list - List of resource allocation requests, type job_desc_msg_t
 * OUT resp - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int slurm_submit_batch_pack_job(List job_req_list,
				       submit_response_msg_t **resp)
{
	int rc;
	job_desc_msg_t *req;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	char *local_hostname = NULL;
	ListIterator iter;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set Node and session id for this request
	 */
	local_hostname = xshort_hostname();
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		if (req->alloc_sid == NO_VAL)
			req->alloc_sid = getsid(0);
		if (!req->alloc_node)
			req->alloc_node = local_hostname;
	}
	list_iterator_destroy(iter);

	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB_PACK;
	req_msg.data     = job_req_list;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	xfree(local_hostname);
	if (rc == SLURM_ERROR)
		return SLURM_ERROR;
	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	case RESPONSE_SUBMIT_BATCH_JOB:
		*resp = (submit_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_SUCCESS;
}
Code example #5
File: submit.c  Project: chrisdukey/slurm
/*
 * slurm_submit_batch_job - issue RPC to submit a job for later execution
 * NOTE: free the response using slurm_free_submit_response_response_msg
 * IN job_desc_msg - description of batch job request
 * OUT resp - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int slurm_submit_batch_job(job_desc_msg_t *req,
				  submit_response_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	char *local_hostname = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if (req->alloc_node == NULL) {
		local_hostname = xshort_hostname();
		req->alloc_node = local_hostname;
	}

	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);
	xfree(local_hostname);
	if (rc == SLURM_ERROR)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	case RESPONSE_SUBMIT_BATCH_JOB:
		*resp = (submit_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_SUCCESS;
}
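
Usage sketch (not part of the project source): submitting a trivial batch script. Field names follow the job_desc_msg_t of current slurm.h; a real submission normally also propagates the environment, working directory and output paths, which are omitted here for brevity. The response is freed with slurm_free_submit_response_response_msg() as the NOTE above requires.

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int submit_sleep_job(void)
{
	job_desc_msg_t job;
	submit_response_msg_t *resp = NULL;

	slurm_init_job_desc_msg(&job);
	job.name      = "submit_example";
	job.script    = "#!/bin/sh\nsleep 30\n";
	job.min_nodes = 1;
	job.user_id   = getuid();
	job.group_id  = getgid();

	if (slurm_submit_batch_job(&job, &resp) != SLURM_SUCCESS) {
		slurm_perror("slurm_submit_batch_job");
		return SLURM_ERROR;
	}
	if (resp) {
		printf("submitted batch job %u\n", resp->job_id);
		slurm_free_submit_response_response_msg(resp);
	}
	return SLURM_SUCCESS;
}
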
Code example #6
File: license_info.c  Project: HPCNow/slurm
/* slurm_load_licenses()
 *
 * Load requested licenses from the controller.
 *
 */
extern int
slurm_load_licenses(time_t t,
                    license_info_msg_t **lic_info,
                    uint16_t show_flags)
{
	int cc;
	slurm_msg_t msg_request;
	slurm_msg_t msg_reply;
	struct license_info_request_msg req;

	memset(&req, 0, sizeof(struct license_info_request_msg));
	slurm_msg_t_init(&msg_request);
	slurm_msg_t_init(&msg_reply);

	msg_request.msg_type = REQUEST_LICENSE_INFO;
	req.last_update = t;
	req.show_flags = show_flags;
	msg_request.data = &req;

	cc = slurm_send_recv_controller_msg(&msg_request, &msg_reply,
					    working_cluster_rec);
	if (cc < 0)
		return SLURM_ERROR;

	switch (msg_reply.msg_type) {
		case RESPONSE_LICENSE_INFO:
			*lic_info = msg_reply.data;
			break;
		case RESPONSE_SLURM_RC:
			cc = ((return_code_msg_t *)msg_reply.data)->return_code;
			slurm_free_return_code_msg(msg_reply.data);
			if (cc) /* slurm_seterrno_ret() is a macro ... sigh */
				slurm_seterrno(cc);
			*lic_info = NULL;
			return -1;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
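
Usage sketch (not part of the project source): loading the current license table. A last-update time of 0 forces a full reload; slurm_free_license_info_msg() is assumed to be the matching free routine declared in slurm.h.

#include <stdio.h>
#include <time.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int load_licenses_example(void)
{
	license_info_msg_t *lic = NULL;

	if (slurm_load_licenses((time_t) 0, &lic, SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_licenses");
		return SLURM_ERROR;
	}
	printf("license table loaded from the controller\n");
	slurm_free_license_info_msg(lic);
	return SLURM_SUCCESS;
}
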
Code example #7
File: allocate.c  Project: HPCNow/slurm
/*
 * slurm_job_step_create - create a job step for a given job id
 * IN slurm_step_alloc_req_msg - description of job step request
 * OUT slurm_step_alloc_resp_msg - response to request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_create_response_msg
 */
int
slurm_job_step_create (job_step_create_request_msg_t *req,
                       job_step_create_response_msg_t **resp)
{
	slurm_msg_t req_msg, resp_msg;
	int delay, rc, retry = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_STEP_CREATE;
	req_msg.data     = req;

re_send:
	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = _handle_rc_msg(&resp_msg);
		if ((rc < 0) && (errno == EAGAIN)) {
			if (retry++ == 0) {
				verbose("Slurm is busy, step creation delayed");
				delay = (getpid() % 10) + 10;	/* 10-19 secs */
			}
			sleep(delay);
			goto re_send;
		}
		if (rc < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_JOB_STEP_CREATE:
		*resp = (job_step_create_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS ;
}
Code example #8
File: block_info.c  Project: HPCNow/slurm
/*
 * slurm_load_block_info - issue RPC to get all slurm node select plugin
 *	information if changed since update_time
 * IN update_time - time of current configuration data
 * OUT block_info_msg_pptr - place to store a node select configuration
 *	pointer
 * IN show_flags - controls output form or filtering, see SHOW_FLAGS in slurm.h
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_block_info_msg
 */
extern int slurm_load_block_info (time_t update_time,
				  block_info_msg_t **block_info_msg_pptr,
				  uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	block_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_BLOCK_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_BLOCK_INFO:
		*block_info_msg_pptr = (block_info_msg_t *)
			resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*block_info_msg_pptr = NULL;
		break;
	default:
		*block_info_msg_pptr = NULL;
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
Code example #9
File: allocate.c  Project: iakovos-panourgias/slurm
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN job_desc_msg - description of resource allocation request
 * OUT slurm_alloc_msg - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int
slurm_allocate_resources (job_desc_msg_t *req,
			  resource_allocation_response_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_ERROR)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		*resp = (resource_allocation_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_SUCCESS;
}
Code example #10
/*
 * slurm_get_job_steps - issue RPC to get specific slurm job step
 *	configuration information if changed since update_time.
 *	a job_id value of NO_VAL implies all jobs, a step_id value of
 *	NO_VAL implies all steps
 * IN update_time - time of current configuration data
 * IN job_id - get information for specific job id, NO_VAL for all jobs
 * IN step_id - get information for specific job step id, NO_VAL for all
 *	job steps
 * OUT resp - place to store a job step configuration pointer
 * IN show_flags - job step filtering options
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_info_response_msg
 */
int
slurm_get_job_steps (time_t update_time, uint32_t job_id, uint32_t step_id,
		     job_step_info_response_msg_t **resp, uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	job_step_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.last_update  = update_time;
	req.job_id	= job_id;
	req.step_id	= step_id;
	req.show_flags	= show_flags;
	req_msg.msg_type = REQUEST_JOB_STEP_INFO;
	req_msg.data	= &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_STEP_INFO:
		*resp = (job_step_info_response_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
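
Usage sketch (not part of the project source): listing every step of every job. NO_VAL for both ids means "all", as documented above, and slurm_print_job_step_info_msg() from slurm.h does the formatting.

#include <stdio.h>
#include <time.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int list_all_steps(void)
{
	job_step_info_response_msg_t *steps = NULL;

	if (slurm_get_job_steps((time_t) 0, NO_VAL, NO_VAL,
				&steps, SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_get_job_steps");
		return SLURM_ERROR;
	}
	slurm_print_job_step_info_msg(stdout, steps, 0);	/* 0 = multi-line output */
	slurm_free_job_step_info_response_msg(steps);
	return SLURM_SUCCESS;
}
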
Code example #11
/*
 * slurm_load_partitions - issue RPC to get all slurm partition configuration
 *	information if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a partition configuration pointer
 * IN show_flags - partition filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_partition_info_msg
 */
extern int slurm_load_partitions (time_t update_time,
				  partition_info_msg_t **resp,
				  uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	part_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_PARTITION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_PARTITION_INFO:
		*resp = (partition_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
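
Usage sketch (not part of the project source): dumping the partition table, roughly what sinfo does at a high level; slurm_print_partition_info_msg() from slurm.h does the formatting and the message is freed as the NOTE above requires.

#include <stdio.h>
#include <time.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int dump_partitions(void)
{
	partition_info_msg_t *parts = NULL;

	if (slurm_load_partitions((time_t) 0, &parts, SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_partitions");
		return SLURM_ERROR;
	}
	slurm_print_partition_info_msg(stdout, parts, 0);
	slurm_free_partition_info_msg(parts);
	return SLURM_SUCCESS;
}
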
Code example #12
/*
 * slurm_load_node_single - issue RPC to get slurm configuration information
 *	for a specific node
 * OUT resp - place to store a node configuration pointer
 * IN node_name - name of the node for which information is requested
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node_single (node_info_msg_t **resp,
				   char *node_name, uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	node_info_single_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.node_name    = node_name;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*resp = (node_info_msg_t *) resp_msg.data;
		if (show_flags & SHOW_MIXED)
			_set_node_mixed(*resp);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
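
Usage sketch (not part of the project source): looking up a single node by name and printing its record; the node name is a placeholder for whatever exists on your cluster, and slurm_print_node_info_msg() from slurm.h does the formatting.

#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int dump_one_node(char *node_name)
{
	node_info_msg_t *node = NULL;

	if (slurm_load_node_single(&node, node_name, SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_node_single");
		return SLURM_ERROR;
	}
	slurm_print_node_info_msg(stdout, node, 0);
	slurm_free_node_info_msg(node);
	return SLURM_SUCCESS;
}
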
Code example #13
File: allocate.c  Project: HPCNow/slurm
/*
 * slurm_allocation_lookup - retrieve info for an existing resource allocation
 * 			     without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
				   resource_allocation_response_msg_t **info)
{
	job_alloc_info_msg_t req = {0};
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = jobid;
	req.req_cluster  = slurmctld_conf.cluster_name;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	req.req_cluster = NULL;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_ALLOCATION_INFO:
		*info = (resource_allocation_response_msg_t *) resp_msg.data;
		return SLURM_PROTOCOL_SUCCESS;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
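
Usage sketch (not part of the project source): looking up an existing allocation by job id and printing where it runs; the response is freed with slurm_free_resource_allocation_response_msg() as the NOTE above requires.

#include <stdio.h>
#include <stdint.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int where_is_my_job(uint32_t job_id)
{
	resource_allocation_response_msg_t *info = NULL;

	if (slurm_allocation_lookup(job_id, &info) != SLURM_SUCCESS) {
		slurm_perror("slurm_allocation_lookup");
		return SLURM_ERROR;
	}
	if (info) {
		printf("job %u runs on %s\n", info->job_id, info->node_list);
		slurm_free_resource_allocation_response_msg(info);
	}
	return SLURM_SUCCESS;
}
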
Code example #14
File: checkpoint.c  Project: HPCNow/slurm
/*
 * slurm_checkpoint_able - determine if the specified job step can presently
 *	be checkpointed
 * IN job_id  - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * OUT start_time - time at which checkpoint request was issued
 * RET 0 (can be checkpointed) or a slurm error code
 */
extern int slurm_checkpoint_able (uint32_t job_id, uint32_t step_id,
		time_t *start_time)
{
	int rc;
	slurm_msg_t req_msg, resp_msg;
	checkpoint_msg_t ckp_req;
	checkpoint_resp_msg_t *resp;

	ckp_req.op        = CHECK_ABLE;
	ckp_req.job_id    = job_id;
	ckp_req.step_id   = step_id;
	ckp_req.image_dir = NULL;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type  = REQUEST_CHECKPOINT;
	req_msg.data      = &ckp_req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_CHECKPOINT:
		resp = (checkpoint_resp_msg_t *) resp_msg.data;
		*start_time = resp->event_time;
		slurm_free_checkpoint_resp_msg(resp_msg.data);
		rc = SLURM_SUCCESS;
		break;
	case RESPONSE_SLURM_RC:
		rc = _handle_rc_msg(&resp_msg);
		break;
	default:
		*start_time = (time_t) NULL;
		rc = SLURM_ERROR;
	}
	return rc;
}
Code example #15
File: job_info.c  Project: fafik23/slurm
/*
 * slurm_job_node_ready - report if nodes are ready for job to execute now
 * IN job_id - slurm job id
 * RET: READY_* values as defined in slurm.h
 */
extern int slurm_job_node_ready(uint32_t job_id)
{
	slurm_msg_t req, resp;
	job_id_msg_t msg;
	int rc;

	slurm_msg_t_init(&req);
	slurm_msg_t_init(&resp);

	bzero(&msg, sizeof(job_id_msg_t));
	msg.job_id   = job_id;
	req.msg_type = REQUEST_JOB_READY;
	req.data     = &msg;

	if (slurm_send_recv_controller_msg(&req, &resp) < 0)
		return READY_JOB_ERROR;

	if (resp.msg_type == RESPONSE_JOB_READY) {
		rc = ((return_code_msg_t *) resp.data)->return_code;
		slurm_free_return_code_msg(resp.data);
	} else if (resp.msg_type == RESPONSE_SLURM_RC) {
		int job_rc = ((return_code_msg_t *) resp.data) ->
				return_code;
		if ((job_rc == ESLURM_INVALID_PARTITION_NAME) ||
		    (job_rc == ESLURM_INVALID_JOB_ID))
			rc = READY_JOB_FATAL;
		else	/* EAGAIN */
			rc = READY_JOB_ERROR;
		slurm_free_return_code_msg(resp.data);
	} else if (resp.msg_type == RESPONSE_PROLOG_EXECUTING) {
		rc = READY_JOB_ERROR;
	} else {
		rc = READY_JOB_ERROR;
	}

	return rc;
}
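
Usage sketch (not part of the project source): polling until a job's nodes are ready. READY_JOB_FATAL and READY_JOB_ERROR are the values used in the function above; READY_JOB_STATE and READY_NODE_STATE are assumed to be the usual bit flags from slurm.h.

#include <stdint.h>
#include <unistd.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int wait_until_ready(uint32_t job_id)
{
	int rc;

	for (;;) {
		rc = slurm_job_node_ready(job_id);
		if (rc == READY_JOB_FATAL)
			return SLURM_ERROR;	/* invalid job id or partition */
		if ((rc != READY_JOB_ERROR) &&
		    (rc & READY_JOB_STATE) && (rc & READY_NODE_STATE))
			return SLURM_SUCCESS;	/* job running and nodes booted */
		sleep(5);			/* transient condition, retry */
	}
}
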
Code example #16
File: job_info.c  Project: IFCA/slurm
/*
 * slurm_load_job_user - issue RPC to get slurm information about all jobs
 *	to be run as the specified user
 * IN/OUT job_info_msg_pptr - place to store a job configuration pointer
 * IN user_id - ID of user we want information for
 * IN show_flags - job filtering options
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int slurm_load_job_user (job_info_msg_t **job_info_msg_pptr,
				uint32_t user_id,
				uint16_t show_flags)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_user_id_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.show_flags   = show_flags;
	req.user_id      = user_id;
	req_msg.msg_type = REQUEST_JOB_USER_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_INFO:
		*job_info_msg_pptr = (job_info_msg_t *)resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
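
Usage sketch (not part of the project source): listing the calling user's jobs, roughly what "squeue -u $USER" does; slurm_print_job_info_msg() from slurm.h does the formatting and the message is freed as the NOTE above requires.

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int list_my_jobs(void)
{
	job_info_msg_t *jobs = NULL;

	if (slurm_load_job_user(&jobs, (uint32_t) getuid(),
				SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_job_user");
		return SLURM_ERROR;
	}
	slurm_print_job_info_msg(stdout, jobs, 0);
	slurm_free_job_info_msg(jobs);
	return SLURM_SUCCESS;
}
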
Code example #17
File: allocate.c  Project: iakovos-panourgias/slurm
/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN job_id - job allocation identifier (or pack job ID)
 * IN pack_job_offset - pack job index (or NO_VAL if not a pack job)
 * IN step_id - step allocation identifier (or NO_VAL for entire job)
 * OUT info - job allocation information including a credential for sbcast
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the "resp" using slurm_free_sbcast_cred_msg
 */
extern int slurm_sbcast_lookup(uint32_t job_id, uint32_t pack_job_offset,
			       uint32_t step_id, job_sbcast_cred_msg_t **info)
{
	step_alloc_info_msg_t req;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = job_id;
	req.pack_job_offset = pack_job_offset;
	req.step_id = step_id;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_SBCAST_CRED;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg,
					   &resp_msg,working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_SBCAST_CRED:
		*info = (job_sbcast_cred_msg_t *)resp_msg.data;
		return SLURM_SUCCESS;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
Code example #18
File: allocate.c  Project: HPCNow/slurm
/*
 * slurm_allocate_pack_job_blocking
 *	allocate resources for a list of job requests.  This call will block
 *	until the entire allocation is granted, or the specified timeout limit
 *	is reached.
 * IN req - List of resource allocation requests
 * IN timeout - amount of time, in seconds, to wait for a response before
 * 	giving up.
 *	A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *      the controller will put the job in the PENDING state.  If
 *      pending callback is not NULL, it will be called with the job_id
 *      of the pending job as the sole parameter.
 *
 * RET List of allocation structures on success, NULL on error, with errno set
 *	to indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *      with no allocation granted)
 * NOTE: free the response using list_destroy()
 */
List slurm_allocate_pack_job_blocking(List job_req_list, time_t timeout,
				      void(*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	List resp = NULL;
	char *local_hostname = NULL;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;
	ListIterator iter;
	bool immediate_flag = false;
	bool immediate_logged = false;
	uint32_t node_cnt = 0, job_id = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set node name and session ID for this request
	 */

	if (!immediate_flag) {
		listen = _create_allocation_response_socket(local_hostname);
		if (listen == NULL)
			return NULL;
	}

	local_hostname = xshort_hostname();
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		if (req->alloc_sid == NO_VAL)
			req->alloc_sid = getsid(0);
		if (listen)
			req->alloc_resp_port = listen->port;

		if (!req->alloc_node) {
			if (local_hostname) {
				req->alloc_node = local_hostname;
			} else if (immediate_logged) {
				req->immediate = 1;
			} else {
				req->immediate = 1;
				error("Could not get local hostname, forcing "
				      "immediate allocation mode");
				immediate_logged = true;
			}
		}
		if (req->immediate)
			immediate_flag = true;
	}
	list_iterator_destroy(iter);

	req_msg.msg_type = REQUEST_JOB_PACK_ALLOCATION;
	req_msg.data     = job_req_list;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_SOCKET_ERROR) {
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (listen)
			_destroy_allocation_response_socket(listen);
		iter = list_iterator_create(job_req_list);
		while ((req = (job_desc_msg_t *)list_next(iter))) {
			if (req->alloc_node == local_hostname)
				req->alloc_node = NULL;
		}
		list_iterator_destroy(iter);
		xfree(local_hostname);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = SLURM_ERROR;
		}
		break;
	case RESPONSE_JOB_PACK_ALLOCATION:
		/* Yay, the controller has acknowledged our request!
		 * Test if we have an allocation yet? */
		resp = (List) resp_msg.data;
		_pack_alloc_test(resp, &node_cnt, &job_id);
		if (node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_PROTOCOL_SUCCESS;
		} else if (immediate_flag) {
			debug("Immediate allocation not granted");
		} else {
			/* no, we need to wait for a response */
			FREE_NULL_LIST(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
			_wait_for_allocation_response(job_id, listen,
						RESPONSE_JOB_PACK_ALLOCATION,
						timeout, (void **) &resp);
			/* If NULL, we didn't get the allocation in
			 * the time desired, so just free the job id */
			if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (listen)
		_destroy_allocation_response_socket(listen);
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *)list_next(iter))) {
		if (req->alloc_node == local_hostname)
			req->alloc_node = NULL;
	}
	list_iterator_destroy(iter);
	xfree(local_hostname);
	errno = errnum;

	return resp;
}
Code example #19
File: mult_cluster.c  Project: IFCA/slurm
/*
 * We don't use the api here because it does things we don't need,
 * like printing out information, and it doesn't return times.
 */
local_cluster_rec_t *_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	int rc;
	char buf[64];
	char *type = "processors";
	local_cluster_rec_t *local_cluster = NULL;
	/* req.immediate = true;    implicit */

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (rc < 0) {
		slurm_seterrno(SLURM_SOCKET_ERROR);
		return NULL;
	}
	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno(rc);
		break;
	case RESPONSE_JOB_WILL_RUN:
		if (working_cluster_rec->flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		debug("Job %u to start at %s on cluster %s using %u %s on %s",
		      will_run_resp->job_id, buf, working_cluster_rec->name,
		      will_run_resp->proc_cnt, type,
		      will_run_resp->node_list);
		local_cluster = xmalloc(sizeof(local_cluster_rec_t));
		local_cluster->cluster_rec = working_cluster_rec;
		local_cluster->start_time = will_run_resp->start_time;
		if (will_run_resp->preemptee_job_id) {
			local_cluster->preempt_cnt =
				list_count(will_run_resp->preemptee_job_id);
			if (opt.verbose >= LOG_LEVEL_DEBUG) {
				ListIterator itr;
				uint32_t *job_id_ptr;
				char *job_list = NULL, *sep = "";
				itr = list_iterator_create(will_run_resp->
							   preemptee_job_id);
				while ((job_id_ptr = list_next(itr))) {
					if (job_list)
						sep = ",";
					xstrfmtcat(job_list, "%s%u",
						   sep, *job_id_ptr);
				}
				debug("  Preempts: %s", job_list);
				xfree(job_list);
			}
		}

		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
		return NULL;
		break;
	}
	return local_cluster;
}
Code example #20
File: allocate.c  Project: alepharchives/slurm
/*
 * slurm_allocate_resources_blocking
 *	allocate resources for a job request.  This call will block until
 *	the allocation is granted, or the specified timeout limit is reached.
 * IN req - description of resource allocation request
 * IN timeout - amount of time, in seconds, to wait for a response before
 * 	giving up.
 *	A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *      the controller will put the job in the PENDING state.  If
 *      pending callback is not NULL, it will be called with the job_id
 *      of the pending job as the sole parameter.
 *
 * RET allocation structure on success, NULL on error, with errno set to
 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *      with no allocation granted)
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
resource_allocation_response_msg_t *
slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
				   time_t timeout,
				   void(*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resource_allocation_response_msg_t *resp = NULL;
	char *hostname = NULL;
	uint32_t job_id;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* make a copy of the user's job description struct so that we
	 * can make changes before contacting the controller */
	req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t));
	if (req == NULL)
		return NULL;
	memcpy(req, user_req, sizeof(job_desc_msg_t));

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if (user_req->alloc_node != NULL) {
		req->alloc_node = xstrdup(user_req->alloc_node);
	} else if ((hostname = xshort_hostname()) != NULL) {
		req->alloc_node = hostname;
	} else {
		error("Could not get local hostname,"
		      " forcing immediate allocation mode.");
		req->immediate = 1;
	}

	if (!req->immediate) {
		listen = _create_allocation_response_socket(hostname);
		if (listen == NULL) {
			xfree(req);
			return NULL;
		}
		req->alloc_resp_port = listen->port;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (rc == SLURM_SOCKET_ERROR) {
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (!req->immediate)
			_destroy_allocation_response_socket(listen);
		xfree(req);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = -1;
		}
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		/* Yay, the controller has acknowledged our request!  But did
		   we really get an allocation yet? */
		resp = (resource_allocation_response_msg_t *) resp_msg.data;
		if (resp->node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_PROTOCOL_SUCCESS;
		} else if (!req->immediate) {
			if (resp->error_code != SLURM_SUCCESS)
				info("%s", slurm_strerror(resp->error_code));
			/* no, we need to wait for a response */
			job_id = resp->job_id;
			slurm_free_resource_allocation_response_msg(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
 			resp = _wait_for_allocation_response(job_id, listen,
							     timeout);
			/* If NULL, we didn't get the allocation in
			   the time desired, so just free the job id */
			if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
		resp = NULL;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (!req->immediate)
		_destroy_allocation_response_socket(listen);
	xfree(req);
	errno = errnum;
	return resp;
}
Code example #21
File: job_info.c  Project: IFCA/slurm
/*
 * slurm_get_end_time - get the expected end time for a given slurm job
 * IN jobid     - slurm job id
 * end_time_ptr - location in which to store scheduled end time for job
 * RET 0 or -1 on error
 */
extern int
slurm_get_end_time(uint32_t jobid, time_t *end_time_ptr)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_alloc_info_msg_t job_msg;
	srun_timeout_msg_t *timeout_msg;
	time_t now = time(NULL);
	static uint32_t jobid_cache = 0;
	static uint32_t jobid_env = 0;
	static time_t endtime_cache = 0;
	static time_t last_test_time = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	if (!end_time_ptr)
		slurm_seterrno_ret(EINVAL);

	if (jobid == 0) {
		if (jobid_env) {
			jobid = jobid_env;
		} else {
			char *env = getenv("SLURM_JOB_ID");
			if (env) {
				jobid = (uint32_t) atol(env);
				jobid_env = jobid;
			}
		}
		if (jobid == 0) {
			slurm_seterrno(ESLURM_INVALID_JOB_ID);
			return SLURM_ERROR;
		}
	}

	/* Just use cached data if data less than 60 seconds old */
	if ((jobid == jobid_cache)
	&&  (difftime(now, last_test_time) < 60)) {
		*end_time_ptr  = endtime_cache;
		return SLURM_SUCCESS;
	}

	job_msg.job_id     = jobid;
	req_msg.msg_type   = REQUEST_JOB_END_TIME;
	req_msg.data       = &job_msg;

	if (slurm_send_recv_controller_msg(
		    &req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case SRUN_TIMEOUT:
		timeout_msg = (srun_timeout_msg_t *) resp_msg.data;
		last_test_time = time(NULL);
		jobid_cache    = jobid;
		endtime_cache  = timeout_msg->timeout;
		*end_time_ptr  = endtime_cache;
		slurm_free_srun_timeout_msg(resp_msg.data);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (endtime_cache)
			*end_time_ptr  = endtime_cache;
		else if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		if (endtime_cache)
			*end_time_ptr  = endtime_cache;
		else
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
Code example #22
File: allocate.c  Project: alepharchives/slurm
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	char buf[64];
	bool host_set = false;
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";
	/* req.immediate = true;    implicit */
	if ((req->alloc_node == NULL) &&
	    (gethostname_short(buf, sizeof(buf)) == 0)) {
		req->alloc_node = buf;
		host_set = true;
	}
	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (host_set)
		req->alloc_node = NULL;

	if (rc < 0)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		break;
	case RESPONSE_JOB_WILL_RUN:
		if(cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s"
		     " on %s",
		     will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
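
Usage sketch (not part of the project source): asking whether a minimal one-node job could start now. slurm_job_will_run() logs the projected start time itself (the info() call above), so the caller only inspects the return code; the job_desc_msg_t fields are assumed to match your Slurm release.

#include <unistd.h>
#include <sys/types.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

int check_will_run(void)
{
	job_desc_msg_t job;

	slurm_init_job_desc_msg(&job);
	job.name      = "will_run_example";
	job.min_nodes = 1;
	job.user_id   = getuid();
	job.group_id  = getgid();

	if (slurm_job_will_run(&job) != SLURM_SUCCESS) {
		slurm_perror("slurm_job_will_run");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
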
Code example #23
File: allocate.c  Project: iakovos-panourgias/slurm
/*
 * slurm_allocate_resources_blocking
 *	allocate resources for a job request.  This call will block until
 *	the allocation is granted, or the specified timeout limit is reached.
 * IN req - description of resource allocation request
 * IN timeout - amount of time, in seconds, to wait for a response before
 * 	giving up.
 *	A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *      the controller will put the job in the PENDING state.  If
 *      pending callback is not NULL, it will be called with the job_id
 *      of the pending job as the sole parameter.
 *
 * RET allocation structure on success, NULL on error, with errno set to
 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *      with no allocation granted)
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
resource_allocation_response_msg_t *
slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
				   time_t timeout,
				   void(*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resource_allocation_response_msg_t *resp = NULL;
	uint32_t job_id;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;
	bool already_done = false;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* make a copy of the user's job description struct so that we
	 * can make changes before contacting the controller */
	req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t));
	if (req == NULL)
		return NULL;
	memcpy(req, user_req, sizeof(job_desc_msg_t));

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if (!req->immediate) {
		listen = _create_allocation_response_socket();
		if (listen == NULL) {
			xfree(req);
			return NULL;
		}
		req->alloc_resp_port = listen->port;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_ERROR) {
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (!req->immediate)
			_destroy_allocation_response_socket(listen);
		xfree(req);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = SLURM_ERROR;
		}
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		/* Yay, the controller has acknowledged our request!
		 * Test if we have an allocation yet? */
		resp = (resource_allocation_response_msg_t *) resp_msg.data;
		if (resp->node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_SUCCESS;
		} else if (!req->immediate) {
			if (resp->error_code != SLURM_SUCCESS)
				info("%s", slurm_strerror(resp->error_code));
			/* no, we need to wait for a response */

			/* print out any user messages before we wait. */
			print_multi_line_string(resp->job_submit_user_msg,
						-1, LOG_LEVEL_INFO);

			job_id = resp->job_id;
			slurm_free_resource_allocation_response_msg(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
			_wait_for_allocation_response(job_id, listen,
						RESPONSE_RESOURCE_ALLOCATION,
						timeout, (void **) &resp);
			/* If NULL, we didn't get the allocation in
			   the time desired, so just free the job id */
			if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
			if ((resp == NULL) && (errno == ESLURM_ALREADY_DONE))
				already_done = true;
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
		resp = NULL;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (!req->immediate)
		_destroy_allocation_response_socket(listen);
	xfree(req);
	if (!resp && already_done && (errnum == SLURM_SUCCESS))
		errnum = ESLURM_ALREADY_DONE;
	errno = errnum;
	return resp;
}
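
Usage sketch (not part of the project source): blocking for up to ten minutes on a one-node allocation, reporting the pending job id through the callback and freeing the response as the NOTE above requires; errno is ETIMEDOUT if the wait runs out.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/types.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void report_pending(uint32_t job_id)
{
	printf("job %u queued, waiting for resources...\n", job_id);
}

int allocate_blocking_example(void)
{
	job_desc_msg_t job;
	resource_allocation_response_msg_t *alloc;

	slurm_init_job_desc_msg(&job);
	job.name      = "blocking_example";
	job.min_nodes = 1;
	job.max_nodes = 1;
	job.user_id   = getuid();
	job.group_id  = getgid();

	alloc = slurm_allocate_resources_blocking(&job, 600, report_pending);
	if (alloc == NULL) {
		slurm_perror("slurm_allocate_resources_blocking");
		return SLURM_ERROR;
	}
	printf("job %u allocated on %s\n", alloc->job_id, alloc->node_list);
	slurm_free_resource_allocation_response_msg(alloc);
	return SLURM_SUCCESS;
}
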