Exemple #1
0
/*
 * slurm_load_jobs - issue RPC to get all job configuration
 *	information if changed since update_time
 * IN update_time - time of current configuration data
 * IN/OUT job_info_msg_pptr - place to store a job configuration pointer
 * IN show_flags -  job filtering option: 0, SHOW_ALL or SHOW_DETAIL
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int
slurm_load_jobs (time_t update_time, job_info_msg_t **job_info_msg_pptr,
		 uint16_t show_flags)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_JOB_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_INFO:
		*job_info_msg_pptr = (job_info_msg_t *)resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #2
0
/*
 * slurm_load_job - issue RPC to get job information for one job ID
 * IN job_info_msg_pptr - place to store a job configuration pointer
 * IN job_id -  ID of job we want information about
 * IN show_flags -  job filtering option: 0, SHOW_ALL or SHOW_DETAIL
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int
slurm_load_job (job_info_msg_t **resp, uint32_t job_id, uint16_t show_flags)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_id_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	bzero(&req, sizeof(job_id_msg_t));
	req.job_id = job_id;
	req.show_flags = show_flags;
	req_msg.msg_type = REQUEST_JOB_INFO_SINGLE;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_INFO:
		*resp = (job_info_msg_t *)resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS ;
}
Exemple #3
0
/*
 * slurm_load_node_single - issue RPC to get slurm configuration information
 *	for a specific node
 * OUT resp - place to store a node configuration pointer
 * IN node_name - name of the node for which information is requested
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node_single (node_info_msg_t **resp,
				   char *node_name, uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	node_info_single_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.node_name    = node_name;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*resp = (node_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #4
0
/*
 * slurm_load_reservations - issue RPC to get all slurm reservation
 *	configuration information if changed since update_time
 * IN update_time - time of current configuration data
 * IN reserve_info_msg_pptr - place to store a reservation configuration
 *	pointer
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_reservation_info_msg
 */
extern int slurm_load_reservations (time_t update_time,
				    reserve_info_msg_t **resp)
{
        int rc;
        slurm_msg_t req_msg;
        slurm_msg_t resp_msg;
        resv_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

        req.last_update  = update_time;
        req_msg.msg_type = REQUEST_RESERVATION_INFO;
        req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_RESERVATION_INFO:
		*resp = (reserve_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #5
0
int
_send_message_controller (enum controller_id dest, slurm_msg_t *req)
{
	int rc = SLURM_PROTOCOL_SUCCESS;
	slurm_fd_t fd = -1;
	slurm_msg_t *resp_msg = NULL;

	/* always going to one node (primary or backup per value of "dest") */
	if ((fd = slurm_open_controller_conn_spec(dest)) < 0)
		slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_CONNECTION_ERROR);

	if (slurm_send_node_msg(fd, req) < 0) {
		slurm_shutdown_msg_conn(fd);
		slurm_seterrno_ret(SLURMCTLD_COMMUNICATIONS_SEND_ERROR);
	}
	resp_msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(resp_msg);

	if((rc = slurm_receive_msg(fd, resp_msg, 0)) != 0) {
		slurm_shutdown_msg_conn(fd);
		return SLURMCTLD_COMMUNICATIONS_RECEIVE_ERROR;
	}

	if (slurm_shutdown_msg_conn(fd) != SLURM_SUCCESS)
		rc = SLURMCTLD_COMMUNICATIONS_SHUTDOWN_ERROR;
	else if (resp_msg->msg_type != RESPONSE_SLURM_RC)
		rc = SLURM_UNEXPECTED_MSG_ERROR;
	else
		rc = slurm_get_return_code(resp_msg->msg_type,
					   resp_msg->data);
	slurm_free_msg(resp_msg);

	if (rc)
		slurm_seterrno_ret(rc);
        return rc;
}
Exemple #6
0
int slurm_job_cpus_allocated_str_on_node_id(char *cpus,
					    size_t cpus_len,
					    job_resources_t *job_resrcs_ptr,
					    int node_id)
{
	int start_node = -1; /* start with -1 less so the array reps
			      * lines up correctly */
	uint32_t threads = 1;
	int inx = 0;
	bitstr_t *cpu_bitmap;
	int j, k, bit_inx, bit_reps;

	if (!job_resrcs_ptr || node_id < 0)
		slurm_seterrno_ret(EINVAL);

	/* find index in sock_core_rep_count[] for this node id
	 */
	do {
		start_node += job_resrcs_ptr->sock_core_rep_count[inx];
		inx++;
	} while (start_node < node_id);
	/* back to previous index since inx is always one step further
	 * after previous loop
	 */
	inx--;

	bit_reps = job_resrcs_ptr->sockets_per_node[inx] *
		job_resrcs_ptr->cores_per_socket[inx];

	/* get the number of threads per core on this node
	 */
	if (job_node_ptr)
		threads = job_node_ptr->node_array[node_id].threads;
	bit_inx = 0;
	cpu_bitmap = bit_alloc(bit_reps * threads);
	for (j = 0; j < bit_reps; j++) {
		if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx)){
			for (k = 0; k < threads; k++)
				bit_set(cpu_bitmap,
					(j * threads) + k);
		}
		bit_inx++;
	}
	bit_fmt(cpus, cpus_len, cpu_bitmap);
	FREE_NULL_BITMAP(cpu_bitmap);

	return SLURM_SUCCESS;
}
Exemple #7
0
extern int slurm_job_cpus_allocated_on_node(job_resources_t *job_resrcs_ptr,
					    const char *node)
{
	hostlist_t node_hl;
	int node_id;

	if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
		slurm_seterrno_ret(EINVAL);

	node_hl = hostlist_create(job_resrcs_ptr->nodes);
	node_id = hostlist_find(node_hl, node);
	hostlist_destroy(node_hl);
	if (node_id == -1)
		return (0); /* No cpus allocated on this node */

	return slurm_job_cpus_allocated_on_node_id(job_resrcs_ptr, node_id);
}
Exemple #8
0
/* _slurm_update - issue RPC for all update requests */
static int
_slurm_update (void *data, slurm_msg_type_t msg_type)
{
	int rc;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = msg_type;
	req_msg.data     = data;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	if (rc != SLURM_SUCCESS)
		slurm_seterrno_ret(rc);

        return SLURM_PROTOCOL_SUCCESS;
}
Exemple #9
0
/*
 * slurm_reconfigure - issue RPC to have Slurm controller (slurmctld)
 *	reload its configuration file
 * RET 0 or a slurm error code
 */
int
slurm_reconfigure (void)
{
	int rc;
	slurm_msg_t req;

	slurm_msg_t_init(&req);

	req.msg_type = REQUEST_RECONFIGURE;

	if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
		return SLURM_ERROR;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #10
0
/*
 * slurm_job_step_create - create a job step for a given job id
 * IN slurm_step_alloc_req_msg - description of job step request
 * OUT slurm_step_alloc_resp_msg - response to request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_create_response_msg
 */
int
slurm_job_step_create (job_step_create_request_msg_t *req,
                       job_step_create_response_msg_t **resp)
{
	slurm_msg_t req_msg, resp_msg;
	int delay, rc, retry = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_STEP_CREATE;
	req_msg.data     = req;

re_send:
	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = _handle_rc_msg(&resp_msg);
		if ((rc < 0) && (errno == EAGAIN)) {
			if (retry++ == 0) {
				verbose("Slurm is busy, step creation delayed");
				delay = (getpid() % 10) + 10;	/* 10-19 secs */
			}
			sleep(delay);
			goto re_send;
		}
		if (rc < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_JOB_STEP_CREATE:
		*resp = (job_step_create_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS ;
}
Exemple #11
0
/* slurm_load_licenses()
 *
 * Load requested licenses from the controller.
 *
 */
extern int
slurm_load_licenses(time_t t,
                    license_info_msg_t **lic_info,
                    uint16_t show_flags)
{
	int cc;
	slurm_msg_t msg_request;
	slurm_msg_t msg_reply;
	struct license_info_request_msg req;

	memset(&req, 0, sizeof(struct license_info_request_msg));
	slurm_msg_t_init(&msg_request);
	slurm_msg_t_init(&msg_reply);

	msg_request.msg_type = REQUEST_LICENSE_INFO;
	req.last_update = t;
	req.show_flags = show_flags;
	msg_request.data = &req;

	cc = slurm_send_recv_controller_msg(&msg_request, &msg_reply,
					    working_cluster_rec);
	if (cc < 0)
		return SLURM_ERROR;

	switch (msg_reply.msg_type) {
		case RESPONSE_LICENSE_INFO:
			*lic_info = msg_reply.data;
			break;
		case RESPONSE_SLURM_RC:
			cc = ((return_code_msg_t *)msg_reply.data)->return_code;
			slurm_free_return_code_msg(msg_reply.data);
			if (cc) /* slurm_seterrno_ret() is a macro ... sigh */
				slurm_seterrno(cc);
			*lic_info = NULL;
			return -1;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #12
0
extern int slurm_job_cpus_allocated_on_node_id(
	job_resources_t *job_resrcs_ptr, int node_id)
{
	int i;
	int start_node=-1; /* start with -1 less so the array reps
			    * lines up correctly */

	if (!job_resrcs_ptr || node_id < 0)
		slurm_seterrno_ret(EINVAL);

	for (i = 0; i < job_resrcs_ptr->cpu_array_cnt; i++) {
		start_node += job_resrcs_ptr->cpu_array_reps[i];
		if (start_node >= node_id)
			break;
	}

	if (i >= job_resrcs_ptr->cpu_array_cnt)
		return (0); /* nodeid not in this job */

	return job_resrcs_ptr->cpu_array_value[i];
}
Exemple #13
0
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN job_desc_msg - description of resource allocation request
 * OUT slurm_alloc_msg - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int
slurm_allocate_resources (job_desc_msg_t *req,
			  resource_allocation_response_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_ERROR)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		*resp = (resource_allocation_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_SUCCESS;
}
Exemple #14
0
int slurm_job_cpus_allocated_str_on_node(char *cpus,
					 size_t cpus_len,
					 job_resources_t *job_resrcs_ptr,
					 const char *node)
{
	hostlist_t node_hl;
	int node_id;

	if (!job_resrcs_ptr || !node || !job_resrcs_ptr->nodes)
		slurm_seterrno_ret(EINVAL);

	node_hl = hostlist_create(job_resrcs_ptr->nodes);
	node_id = hostlist_find(node_hl, node);
	hostlist_destroy(node_hl);
	if (node_id == -1)
		return SLURM_ERROR;

	return slurm_job_cpus_allocated_str_on_node_id(cpus,
						       cpus_len,
						       job_resrcs_ptr,
						       node_id);
}
Exemple #15
0
int optz_add(struct option **optz, const struct option *opt)
{
	int len = 0;
	struct option *op = *optz;
	struct option *t = *optz;

	for (; op->name != NULL; op++) {
		if (strcmp(op->name, opt->name) == 0)
			slurm_seterrno_ret(EEXIST);
		len++;
	}

	++len;			/* Add one for incoming option */

	t = xrealloc(t, (len + 1) * sizeof(struct option));

	t[len - 1] = *opt;
	t[len] = opt_table_end;

	*optz = t;

	return (0);
}
Exemple #16
0
/*
 * slurm_clear_trigger - Clear (remove) an existing event trigger
 * RET 0 or a slurm error code
 */
extern int slurm_clear_trigger (trigger_info_t *trigger_clear)
{
	int rc;
	slurm_msg_t msg;
	trigger_info_msg_t req;

	slurm_msg_t_init(&msg);
	/*
	 * Request message:
	 */
	req.record_count  = 1;
	req.trigger_array = trigger_clear;
	msg.msg_type      = REQUEST_TRIGGER_CLEAR;
        msg.data          = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
		return SLURM_FAILURE;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_SUCCESS;
}
Exemple #17
0
/*
 * slurm_pull_trigger - Pull (fire) an event trigger
 * RET 0 or a slurm error code
 */
extern int slurm_pull_trigger (trigger_info_t *trigger_pull)
{
	int rc;
	slurm_msg_t msg;
	trigger_info_msg_t req;

	/*
	 * Request message:
	 */
	slurm_msg_t_init(&msg);
	memset(&req, 0, sizeof(trigger_info_msg_t));
	req.record_count  = 1;
	req.trigger_array = trigger_pull;
	msg.msg_type      = REQUEST_TRIGGER_PULL;
	msg.data	  = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
		return SLURM_FAILURE;
	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_SUCCESS;
}
Exemple #18
0
/*
 * slurm_checkpoint_complete - note the completion of a job step's checkpoint
 *	operation.
 * IN job_id  - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * IN begin_time - time at which checkpoint began
 * IN error_code - error code, highest value for all complete calls is preserved
 * IN error_msg - error message, preserved for highest error_code
 * RET 0 or a slurm error code
 */
extern int slurm_checkpoint_complete (uint32_t job_id, uint32_t step_id,
		time_t begin_time, uint32_t error_code, char *error_msg)
{
	int rc;
	slurm_msg_t msg;
	checkpoint_comp_msg_t req;

	slurm_msg_t_init(&msg);
	req.job_id       = job_id;
	req.step_id      = step_id;
	req.begin_time   = begin_time;
	req.error_code   = error_code;
	req.error_msg    = error_msg;
	msg.msg_type     = REQUEST_CHECKPOINT_COMP;
	msg.data         = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc,
					      working_cluster_rec) < 0)
		return SLURM_ERROR;
	if (rc)
		slurm_seterrno_ret(rc);
	return SLURM_SUCCESS;
}
Exemple #19
0
/*
 * slurm_allocation_lookup - retrieve info for an existing resource allocation
 * 			     without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
				   resource_allocation_response_msg_t **info)
{
	job_alloc_info_msg_t req = {0};
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = jobid;
	req.req_cluster  = slurmctld_conf.cluster_name;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	req.req_cluster = NULL;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_ALLOCATION_INFO:
		*info = (resource_allocation_response_msg_t *) resp_msg.data;
		return SLURM_PROTOCOL_SUCCESS;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #20
0
/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN job_id - job allocation identifier (or pack job ID)
 * IN pack_job_offset - pack job  index (or NO_VAL if not pack job)
 * IN step_id - step allocation identifier (or NO_VAL for entire job)
 * OUT info - job allocation information including a credential for sbcast
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the "resp" using slurm_free_sbcast_cred_msg
 */
extern int slurm_sbcast_lookup(uint32_t job_id, uint32_t pack_job_offset,
			       uint32_t step_id, job_sbcast_cred_msg_t **info)
{
	step_alloc_info_msg_t req;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = job_id;
	req.pack_job_offset = pack_job_offset;
	req.step_id = step_id;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_SBCAST_CRED;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg,
					   &resp_msg,working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_SBCAST_CRED:
		*info = (job_sbcast_cred_msg_t *)resp_msg.data;
		return SLURM_SUCCESS;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
Exemple #21
0
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	char buf[64];
	bool host_set = false;
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";
	/* req.immediate = true;    implicit */
	if ((req->alloc_node == NULL) &&
	    (gethostname_short(buf, sizeof(buf)) == 0)) {
		req->alloc_node = buf;
		host_set = true;
	}
	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (host_set)
		req->alloc_node = NULL;

	if (rc < 0)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		break;
	case RESPONSE_JOB_WILL_RUN:
		if(cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s"
		     " on %s",
		     will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #22
0
/*
 * slurm_load_slurmd_status - issue RPC to get the status of slurmd
 *	daemon on this machine
 * IN slurmd_info_ptr - place to store slurmd status information
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_slurmd_status()
 */
extern int
slurm_load_slurmd_status(slurmd_status_t **slurmd_status_ptr)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];

		/*
		 *  Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);
	}
	req_msg.msg_type = REQUEST_DAEMON_STATUS;
	req_msg.data     = NULL;

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	if ((rc != 0) || !resp_msg.auth_cred) {
		error("slurm_slurmd_info: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	if (resp_msg.auth_cred)
		g_slurm_auth_destroy(resp_msg.auth_cred);

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURMD_STATUS:
		*slurmd_status_ptr = (slurmd_status_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #23
0
/*
 * check_header_version checks to see that the specified header was sent
 * from a node running the same version of the protocol as the current node
 * IN header - the message header received
 * RET - SLURM error code
 */
int check_header_version(header_t * header)
{
	uint16_t check_version = SLURM_PROTOCOL_VERSION;

	if (working_cluster_rec)
		check_version = _get_slurm_version(
			working_cluster_rec->rpc_version);

	if (slurmdbd_conf) {
		if ((header->version != SLURM_PROTOCOL_VERSION)     &&
		    (header->version != SLURM_2_2_PROTOCOL_VERSION) &&
		    (header->version != SLURM_2_1_PROTOCOL_VERSION))
			slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
	} else if (header->version != check_version) {
		/* Starting with 2.2 we will handle previous versions
		 * of SLURM for some calls */
		switch(header->msg_type) {
		case REQUEST_BLOCK_INFO:
		case REQUEST_BUILD_INFO:
		case REQUEST_CANCEL_JOB_STEP:
		case REQUEST_CHECKPOINT:
		case REQUEST_CHECKPOINT_COMP:
		case REQUEST_CHECKPOINT_TASK_COMP:
		case REQUEST_COMPLETE_BATCH_SCRIPT:	/* From slurmstepd */
		case REQUEST_COMPLETE_JOB_ALLOCATION:
		case REQUEST_CREATE_PARTITION:
		case REQUEST_CREATE_RESERVATION:
		case REQUEST_JOB_END_TIME:
		case REQUEST_JOB_INFO:
		case REQUEST_JOB_INFO_SINGLE:
		case REQUEST_JOB_READY:
		case REQUEST_JOB_REQUEUE:
		case REQUEST_JOB_STEP_INFO:
		case REQUEST_JOB_WILL_RUN:
		case REQUEST_NODE_INFO:
		case REQUEST_PARTITION_INFO:
		case REQUEST_PRIORITY_FACTORS:
		case REQUEST_RECONFIGURE:
		case REQUEST_RESERVATION_INFO:
		case REQUEST_SET_DEBUG_FLAGS:
		case REQUEST_SET_DEBUG_LEVEL:
		case REQUEST_SHARE_INFO:
		case REQUEST_SHUTDOWN:
		case REQUEST_SHUTDOWN_IMMEDIATE:
		case REQUEST_STEP_COMPLETE:		/* From slurmstepd */
		case REQUEST_STEP_LAYOUT:
		case REQUEST_SUBMIT_BATCH_JOB:
		case REQUEST_SUSPEND:
		case REQUEST_TOPO_INFO:
		case REQUEST_UPDATE_BLOCK:
		case REQUEST_UPDATE_JOB:
		case REQUEST_UPDATE_PARTITION:
			if ((header->version == SLURM_2_2_PROTOCOL_VERSION)
			    || (header->version == SLURM_2_1_PROTOCOL_VERSION))
				break;
		default:
			slurm_seterrno_ret(SLURM_PROTOCOL_VERSION_ERROR);
			break;
		}
	}
	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #24
0
static int _load_fed_parts(slurm_msg_t *req_msg,
			   partition_info_msg_t **part_info_msg_pptr,
			   uint16_t show_flags, char *cluster_name,
			   slurmdb_federation_rec_t *fed)
{
	int cluster_inx = 0, i;
	load_part_resp_struct_t *part_resp;
	partition_info_msg_t *orig_msg = NULL, *new_msg = NULL;
	uint32_t new_rec_cnt;
	slurmdb_cluster_rec_t *cluster;
	ListIterator iter;
	pthread_attr_t load_attr;
	int pthread_count = 0;
	pthread_t *load_thread = 0;
	load_part_req_struct_t *load_args;
	List resp_msg_list;

	*part_info_msg_pptr = NULL;

	/* Spawn one pthread per cluster to collect partition information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_attr_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
		int retries = 0;
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		load_args = xmalloc(sizeof(load_part_req_struct_t));
		load_args->cluster = cluster;
		load_args->cluster_inx = cluster_inx++;
		load_args->req_msg = req_msg;
		load_args->resp_msg_list = resp_msg_list;
		load_args->show_flags = show_flags;
		slurm_attr_init(&load_attr);
		if (pthread_attr_setdetachstate(&load_attr,
						PTHREAD_CREATE_JOINABLE))
			error("pthread_attr_setdetachstate error %m");
		while (pthread_create(&load_thread[pthread_count], &load_attr,
				      _load_part_thread, (void *) load_args)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(10000);	/* sleep and retry */
		}
		pthread_count++;
		slurm_attr_destroy(&load_attr);

	}
	list_iterator_destroy(iter);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	/* Maintain a consistent cluster/node ordering */
	list_sort(resp_msg_list, _sort_by_cluster_inx);

	/* Merge the responses into a single response message */
	iter = list_iterator_create(resp_msg_list);
	while ((part_resp = (load_part_resp_struct_t *) list_next(iter))) {
		new_msg = part_resp->new_msg;
		if (!orig_msg) {
			orig_msg = new_msg;
			*part_info_msg_pptr = orig_msg;
		} else {
			/* Merge the node records */
			orig_msg->last_update = MIN(orig_msg->last_update,
						    new_msg->last_update);
			new_rec_cnt = orig_msg->record_count +
				      new_msg->record_count;
			if (new_msg->record_count) {
				orig_msg->partition_array =
					xrealloc(orig_msg->partition_array,
						 sizeof(partition_info_t) *
						 new_rec_cnt);
				(void) memcpy(orig_msg->partition_array +
					      orig_msg->record_count,
					      new_msg->partition_array,
					      sizeof(partition_info_t) *
					      new_msg->record_count);
				orig_msg->record_count = new_rec_cnt;
			}
			xfree(new_msg->partition_array);
			xfree(new_msg);
		}
		xfree(part_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	if (!orig_msg)
		slurm_seterrno_ret(SLURM_ERROR);

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_get_node_energy_n - issue RPC to get the energy data of all
 * configured sensors on the target machine
 * IN  host  - name of node to query, NULL if localhost
 * IN  delta - Use cache if data is newer than this in seconds
 * OUT sensors_cnt - number of sensors
 * OUT energy - array of acct_gather_energy_t structures on success or
 *                NULL other wise
 * RET 0 on success or a slurm error code
 * NOTE: free the response using xfree
 */
extern int slurm_get_node_energy(char *host, uint16_t delta,
				 uint16_t *sensor_cnt,
				 acct_gather_energy_t **energy)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	acct_gather_energy_req_msg_t req;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	xassert(sensor_cnt);
	xassert(energy);

	*sensor_cnt = 0;
	*energy = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	if (host)
		slurm_conf_get_addr(host, &req_msg.address);
	else if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];
		/*
		 *  Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);
	}

	req.delta        = delta;
	req_msg.msg_type = REQUEST_ACCT_GATHER_ENERGY;
	req_msg.data     = &req;

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	if (rc != 0 || !resp_msg.auth_cred) {
		error("slurm_get_node_energy: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	if (resp_msg.auth_cred)
		g_slurm_auth_destroy(resp_msg.auth_cred);
	switch (resp_msg.msg_type) {
	case RESPONSE_ACCT_GATHER_ENERGY:
		*sensor_cnt = ((acct_gather_node_resp_msg_t *)
			       resp_msg.data)->sensor_cnt;
		*energy = ((acct_gather_node_resp_msg_t *)
			   resp_msg.data)->energy;
		((acct_gather_node_resp_msg_t *) resp_msg.data)->energy = NULL;
		slurm_free_acct_gather_node_resp_msg(resp_msg.data);
		break;
	case RESPONSE_SLURM_RC:
	        rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemple #26
0
static int _load_fed_nodes(slurm_msg_t *req_msg,
			   node_info_msg_t **node_info_msg_pptr,
			   uint16_t show_flags, char *cluster_name,
			   slurmdb_federation_rec_t *fed)
{
	int cluster_inx = 0, i;
	load_node_resp_struct_t *node_resp;
	node_info_msg_t *orig_msg = NULL, *new_msg = NULL;
	uint32_t new_rec_cnt;
	slurmdb_cluster_rec_t *cluster;
	ListIterator iter;
	int pthread_count = 0;
	pthread_t *load_thread = 0;
	load_node_req_struct_t *load_args;
	List resp_msg_list;

	*node_info_msg_pptr = NULL;

	/* Spawn one pthread per cluster to collect node information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		load_args = xmalloc(sizeof(load_node_req_struct_t));
		load_args->cluster = cluster;
		load_args->cluster_inx = cluster_inx++;
		load_args->req_msg = req_msg;
		load_args->resp_msg_list = resp_msg_list;
		load_args->show_flags = show_flags;
		slurm_thread_create(&load_thread[pthread_count],
				    _load_node_thread, load_args);
		pthread_count++;
	}
	list_iterator_destroy(iter);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	/* Maintain a consistent cluster/node ordering */
	list_sort(resp_msg_list, _sort_by_cluster_inx);

	/* Merge the responses into a single response message */
	iter = list_iterator_create(resp_msg_list);
	while ((node_resp = (load_node_resp_struct_t *) list_next(iter))) {
		new_msg = node_resp->new_msg;
		if (!orig_msg) {
			orig_msg = new_msg;
			*node_info_msg_pptr = orig_msg;
		} else {
			/* Merge the node records */
			orig_msg->last_update = MIN(orig_msg->last_update,
						    new_msg->last_update);
			new_rec_cnt = orig_msg->record_count +
				      new_msg->record_count;
			if (new_msg->record_count) {
				orig_msg->node_array =
					xrealloc(orig_msg->node_array,
						 sizeof(node_info_t) *
						 new_rec_cnt);
				(void) memcpy(orig_msg->node_array +
					      orig_msg->record_count,
					      new_msg->node_array,
					      sizeof(node_info_t) *
					      new_msg->record_count);
				orig_msg->record_count = new_rec_cnt;
			}
			xfree(new_msg->node_array);
			xfree(new_msg);
		}
		xfree(node_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	if (!orig_msg)
		slurm_seterrno_ret(SLURM_ERROR);

	return SLURM_SUCCESS;
}
Exemple #27
0
/*
 * slurm_get_end_time - get the expected end time for a given slurm job
 * IN jobid     - slurm job id
 * end_time_ptr - location in which to store scheduled end time for job
 * RET 0 or -1 on error
 */
extern int
slurm_get_end_time(uint32_t jobid, time_t *end_time_ptr)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_alloc_info_msg_t job_msg;
	srun_timeout_msg_t *timeout_msg;
	time_t now = time(NULL);
	static uint32_t jobid_cache = 0;
	static uint32_t jobid_env = 0;
	static time_t endtime_cache = 0;
	static time_t last_test_time = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	if (!end_time_ptr)
		slurm_seterrno_ret(EINVAL);

	if (jobid == 0) {
		if (jobid_env) {
			jobid = jobid_env;
		} else {
			char *env = getenv("SLURM_JOB_ID");
			if (env) {
				jobid = (uint32_t) atol(env);
				jobid_env = jobid;
			}
		}
		if (jobid == 0) {
			slurm_seterrno(ESLURM_INVALID_JOB_ID);
			return SLURM_ERROR;
		}
	}

	/* Just use cached data if data less than 60 seconds old */
	if ((jobid == jobid_cache)
	&&  (difftime(now, last_test_time) < 60)) {
		*end_time_ptr  = endtime_cache;
		return SLURM_SUCCESS;
	}

	job_msg.job_id     = jobid;
	req_msg.msg_type   = REQUEST_JOB_END_TIME;
	req_msg.data       = &job_msg;

	if (slurm_send_recv_controller_msg(
		    &req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case SRUN_TIMEOUT:
		timeout_msg = (srun_timeout_msg_t *) resp_msg.data;
		last_test_time = time(NULL);
		jobid_cache    = jobid;
		endtime_cache  = timeout_msg->timeout;
		*end_time_ptr  = endtime_cache;
		slurm_free_srun_timeout_msg(resp_msg.data);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (endtime_cache)
			*end_time_ptr  = endtime_cache;
		else if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		if (endtime_cache)
			*end_time_ptr  = endtime_cache;
		else
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
Exemple #28
0
/*
 * slurm_network_callerid - issue RPC to get the job id of a job from a remote
 * slurmd based upon network socket information.
 *
 * IN req - Information about network connection in question
 * OUT job_id -  ID of the job or NO_VAL
 * OUT node_name - name of the remote slurmd
 * IN node_name_size - size of the node_name buffer
 * RET SLURM_PROTOCOL_SUCCESS or SLURM_FAILURE on error
 */
extern int
slurm_network_callerid (network_callerid_msg_t req, uint32_t *job_id,
	char *node_name, int node_name_size)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	network_callerid_resp_t *resp;
	struct sockaddr_in addr;
	uint32_t target_slurmd; /* change for IPv6 support */

	debug("slurm_network_callerid RPC: start");

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* ip_src is the IP we want to talk to. Hopefully there's a slurmd
	 * listening there */
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = req.af;

	/* TODO: until IPv6 support is added to Slurm, we must hope that the
	 * other end is IPv4 */
	if (req.af == AF_INET6) {
		error("IPv6 is not yet supported in Slurm");
		/* For testing IPv6 callerid prior to Slurm IPv6 RPC support,
		 * set a sane target, uncomment the following and comment out
		 * the return code:
		addr.sin_family = AF_INET;
		target_slurmd = inet_addr("127.0.0.1"); //choose a test target
		*/
		return SLURM_FAILURE;
	} else
		memcpy(&target_slurmd, req.ip_src, 4);

	addr.sin_addr.s_addr = target_slurmd;
	addr.sin_port = htons(slurm_get_slurmd_port());
	req_msg.address = addr;

	req_msg.msg_type = REQUEST_NETWORK_CALLERID;
	req_msg.data     = &req;

	if (slurm_send_recv_node_msg(&req_msg, &resp_msg, 0) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
		case RESPONSE_NETWORK_CALLERID:
			resp = (network_callerid_resp_t*)resp_msg.data;
			*job_id = resp->job_id;
			strncpy(node_name, resp->node_name, node_name_size);
			break;
		case RESPONSE_SLURM_RC:
			rc = ((return_code_msg_t *) resp_msg.data)->return_code;
			if (rc)
				slurm_seterrno_ret(rc);
			break;
		default:
			slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
			break;
	}

	slurm_free_network_callerid_msg(resp_msg.data);
	return SLURM_PROTOCOL_SUCCESS;
}