/*
 * slurm_load_node - issue RPC to get all Slurm node configuration information
 *	if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a node configuration pointer
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node (time_t update_time,
			    node_info_msg_t **resp, uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	node_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*resp = (node_info_msg_t *) resp_msg.data;
		if (show_flags & SHOW_MIXED)
			_set_node_mixed(*resp);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
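/*
 * Usage sketch, not part of the original listing: one way a caller might use
 * slurm_load_node() and release the response. Assumes the node_info_msg_t
 * layout from slurm.h (record_count / node_array) and the helpers
 * slurm_perror() and slurm_node_state_string().
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void _print_nodes(void)
{
	node_info_msg_t *node_info = NULL;
	uint32_t i;

	/* update_time of 0 forces a full dump of the node records */
	if (slurm_load_node((time_t) 0, &node_info, SHOW_ALL) !=
	    SLURM_SUCCESS) {
		slurm_perror("slurm_load_node");
		return;
	}

	for (i = 0; i < node_info->record_count; i++)
		printf("%s: %s\n", node_info->node_array[i].name,
		       slurm_node_state_string(
				node_info->node_array[i].node_state));

	slurm_free_node_info_msg(node_info);	/* per the NOTE above */
}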
Example #2
/*
 * slurm_checkpoint_able - determine if the specified job step can presently
 *	be checkpointed
 * IN job_id  - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * OUT start_time - time at which checkpoint request was issued
 * RET 0 (can be checkpointed) or a slurm error code
 */
extern int slurm_checkpoint_able (uint32_t job_id, uint32_t step_id,
		time_t *start_time)
{
	int rc;
	slurm_msg_t req_msg, resp_msg;
	checkpoint_msg_t ckp_req;
	checkpoint_resp_msg_t *resp;

	ckp_req.op        = CHECK_ABLE;
	ckp_req.job_id    = job_id;
	ckp_req.step_id   = step_id;
	ckp_req.image_dir = NULL;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type  = REQUEST_CHECKPOINT;
	req_msg.data      = &ckp_req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_CHECKPOINT:
		resp = (checkpoint_resp_msg_t *) resp_msg.data;
		*start_time = resp->event_time;
		slurm_free_checkpoint_resp_msg(resp_msg.data);
		rc = SLURM_SUCCESS;
		break;
	case RESPONSE_SLURM_RC:
		rc = _handle_rc_msg(&resp_msg);
		break;
	default:
		*start_time = (time_t) NULL;
		rc = SLURM_ERROR;
	}
	return rc;
}
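/*
 * Usage sketch, not part of the original listing: ask whether a step can be
 * checkpointed right now. Only the function above plus slurm_strerror() is
 * used; the job/step IDs are placeholders supplied by the caller.
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void _check_step(uint32_t job_id, uint32_t step_id)
{
	time_t when = 0;
	int rc = slurm_checkpoint_able(job_id, step_id, &when);

	if (rc == SLURM_SUCCESS)
		printf("step %u.%u can be checkpointed (event time %ld)\n",
		       job_id, step_id, (long) when);
	else
		printf("step %u.%u: %s\n", job_id, step_id,
		       slurm_strerror(rc));
}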
Example #3
/*
 * slurm_job_node_ready - report if nodes are ready for job to execute now
 * IN job_id - slurm job id
 * RET: READY_* values as defined in slurm.h
 */
extern int slurm_job_node_ready(uint32_t job_id)
{
	slurm_msg_t req, resp;
	job_id_msg_t msg;
	int rc;

	slurm_msg_t_init(&req);
	slurm_msg_t_init(&resp);

	memset(&msg, 0, sizeof(job_id_msg_t));
	msg.job_id   = job_id;
	req.msg_type = REQUEST_JOB_READY;
	req.data     = &msg;

	if (slurm_send_recv_controller_msg(&req, &resp) < 0)
		return READY_JOB_ERROR;

	if (resp.msg_type == RESPONSE_JOB_READY) {
		rc = ((return_code_msg_t *) resp.data)->return_code;
		slurm_free_return_code_msg(resp.data);
	} else if (resp.msg_type == RESPONSE_SLURM_RC) {
		int job_rc = ((return_code_msg_t *) resp.data)->return_code;
		if ((job_rc == ESLURM_INVALID_PARTITION_NAME) ||
		    (job_rc == ESLURM_INVALID_JOB_ID))
			rc = READY_JOB_FATAL;
		else	/* EAGAIN */
			rc = READY_JOB_ERROR;
		slurm_free_return_code_msg(resp.data);
	} else if (resp.msg_type == RESPONSE_PROLOG_EXECUTING) {
		rc = READY_JOB_ERROR;
	} else {
		rc = READY_JOB_ERROR;
	}

	return rc;
}
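/*
 * Usage sketch, not part of the original listing: poll slurm_job_node_ready()
 * until a job's nodes are booted. READY_JOB_FATAL, READY_JOB_ERROR and the
 * READY_*_STATE bits are the slurm.h values tested by the function above.
 */
#include <unistd.h>
#include <slurm/slurm.h>

static int _wait_for_nodes(uint32_t job_id, int max_tries)
{
	int i;

	for (i = 0; i < max_tries; i++) {
		int rc = slurm_job_node_ready(job_id);

		if (rc == READY_JOB_FATAL)
			return -1;	/* invalid job id or partition */
		if ((rc != READY_JOB_ERROR) &&
		    (rc & READY_JOB_STATE) && (rc & READY_NODE_STATE))
			return 0;	/* job and its nodes are ready */
		sleep(5);		/* not ready yet or transient error */
	}
	return -1;			/* gave up */
}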
Example #4
/*
 * slurm_allocation_lookup - retrieve info for an existing resource allocation
 * 			     without the node addresses and related details
 * IN jobid - job allocation identifier
 * OUT info - job allocation information
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
				   resource_allocation_response_msg_t **info)
{
	job_alloc_info_msg_t req = {0};
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = jobid;
	req.req_cluster  = slurmctld_conf.cluster_name;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	req.req_cluster = NULL;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_ALLOCATION_INFO:
		*info = (resource_allocation_response_msg_t *) resp_msg.data;
		return SLURM_PROTOCOL_SUCCESS;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
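/*
 * Usage sketch, not part of the original listing: look up an existing
 * allocation, print its node list, and free the response as the NOTE above
 * prescribes. Assumes the job_id, node_list and node_cnt members of
 * resource_allocation_response_msg_t from slurm.h.
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void _show_allocation(uint32_t jobid)
{
	resource_allocation_response_msg_t *alloc = NULL;

	if (slurm_allocation_lookup(jobid, &alloc) != SLURM_SUCCESS) {
		slurm_perror("slurm_allocation_lookup");
		return;
	}

	printf("job %u runs on %s (%u node(s))\n",
	       alloc->job_id, alloc->node_list, alloc->node_cnt);
	slurm_free_resource_allocation_response_msg(alloc);
}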
Example #5
/*
 * slurm_load_job_user - issue RPC to get slurm information about all jobs
 *	to be run as the specified user
 * IN/OUT job_info_msg_pptr - place to store a job configuration pointer
 * IN user_id - ID of user we want information for
 * IN show_flags - job filtering options
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int slurm_load_job_user (job_info_msg_t **job_info_msg_pptr,
				uint32_t user_id,
				uint16_t show_flags)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_user_id_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.show_flags   = show_flags;
	req.user_id      = user_id;
	req_msg.msg_type = REQUEST_JOB_USER_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_INFO:
		*job_info_msg_pptr = (job_info_msg_t *)resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
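/*
 * Usage sketch, not part of the original listing: list one user's jobs.
 * Assumes the job_info_msg_t layout from slurm.h (record_count / job_array)
 * with job_id and name members on each record.
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void _list_user_jobs(uint32_t uid)
{
	job_info_msg_t *jobs = NULL;
	uint32_t i;

	if (slurm_load_job_user(&jobs, uid, SHOW_ALL) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_job_user");
		return;
	}

	for (i = 0; i < jobs->record_count; i++)
		printf("%u %s\n", jobs->job_array[i].job_id,
		       jobs->job_array[i].name);

	slurm_free_job_info_msg(jobs);	/* per the NOTE above */
}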
Example #6
/*
 * slurm_load_node_single - issue RPC to get slurm configuration information
 *	for a specific node
 * OUT resp - place to store a node configuration pointer
 * IN node_name - name of the node for which information is requested
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node_single (node_info_msg_t **resp,
				   char *node_name, uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	node_info_single_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.node_name    = node_name;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*resp = (node_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Example #7
/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN job_id - job allocation identifier (or pack job ID)
 * IN pack_job_offset - pack job index (or NO_VAL if not a pack job)
 * IN step_id - step allocation identifier (or NO_VAL for entire job)
 * OUT info - job allocation information including a credential for sbcast
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response ("info") using slurm_free_sbcast_cred_msg
 */
extern int slurm_sbcast_lookup(uint32_t job_id, uint32_t pack_job_offset,
			       uint32_t step_id, job_sbcast_cred_msg_t **info)
{
	step_alloc_info_msg_t req;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = job_id;
	req.pack_job_offset = pack_job_offset;
	req.step_id = step_id;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_SBCAST_CRED;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_SBCAST_CRED:
		*info = (job_sbcast_cred_msg_t *)resp_msg.data;
		return SLURM_SUCCESS;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
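/*
 * Usage sketch, not part of the original listing: fetch an sbcast credential
 * for a whole, non-pack job. NO_VAL marks "no pack offset" and "entire job"
 * as described in the header comment above; node_list is the same member
 * used by _file_bcast() further down in this listing.
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void _show_sbcast_cred(uint32_t job_id)
{
	job_sbcast_cred_msg_t *cred = NULL;

	if (slurm_sbcast_lookup(job_id, NO_VAL, NO_VAL, &cred) !=
	    SLURM_SUCCESS) {
		slurm_perror("slurm_sbcast_lookup");
		return;
	}

	printf("sbcast credential covers nodes: %s\n", cred->node_list);
	slurm_free_sbcast_cred_msg(cred);	/* per the NOTE above */
}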
Example #8
/*
 * slurm_load_reservations - issue RPC to get all slurm reservation
 *	configuration information if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a reservation configuration pointer
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_reservation_info_msg
 */
extern int slurm_load_reservations (time_t update_time,
				    reserve_info_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resv_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	req.last_update  = update_time;
	req_msg.msg_type = REQUEST_RESERVATION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_RESERVATION_INFO:
		*resp = (reserve_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
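/*
 * Usage sketch, not part of the original listing: dump reservation names and
 * node lists. Assumes the reserve_info_msg_t layout from slurm.h
 * (record_count / reservation_array) with name and node_list members.
 */
#include <stdio.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static void _list_reservations(void)
{
	reserve_info_msg_t *resv = NULL;
	uint32_t i;

	if (slurm_load_reservations((time_t) 0, &resv) != SLURM_SUCCESS) {
		slurm_perror("slurm_load_reservations");
		return;
	}

	for (i = 0; i < resv->record_count; i++)
		printf("%s: %s\n", resv->reservation_array[i].name,
		       resv->reservation_array[i].node_list);

	slurm_free_reservation_info_msg(resv);	/* per the NOTE above */
}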
Example #9
/*
 * slurm_load_node2 - equivalent to slurm_load_node() with addition
 *	of cluster record for communications in a federation
 */
extern int slurm_load_node2(time_t update_time, node_info_msg_t **resp,
			    uint16_t show_flags, slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg;
	node_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data     = &req;

	return _load_cluster_nodes(&req_msg, resp, cluster, show_flags);
}
Example #10
/*
 * slurm_load_node - issue RPC to get all Slurm node configuration information
 *	if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a node configuration pointer
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node(time_t update_time, node_info_msg_t **resp,
			   uint16_t show_flags)
{
	slurm_msg_t req_msg;
	node_info_request_msg_t req;
	char *cluster_name = NULL;
	void *ptr = NULL;
	slurmdb_federation_rec_t *fed;
	int rc;

	if (working_cluster_rec)
		cluster_name = xstrdup(working_cluster_rec->name);
	else
		cluster_name = slurm_get_cluster_name();
	if ((show_flags & SHOW_FEDERATION) && !(show_flags & SHOW_LOCAL) &&
	    (slurm_load_federation(&ptr) == SLURM_SUCCESS) &&
	    cluster_in_federation(ptr, cluster_name)) {
		/* In federation. Need full info from all clusters */
		update_time = (time_t) 0;
		show_flags &= (~SHOW_LOCAL);
	} else {
		/* Report local cluster info only */
		show_flags |= SHOW_LOCAL;
		show_flags &= (~SHOW_FEDERATION);
	}

	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data     = &req;

	if ((show_flags & SHOW_FEDERATION) && ptr) { /* "ptr" check for CLANG */
		fed = (slurmdb_federation_rec_t *) ptr;
		rc = _load_fed_nodes(&req_msg, resp, show_flags, cluster_name,
				     fed);
	} else {
		rc = _load_cluster_nodes(&req_msg, resp, working_cluster_rec,
					 show_flags);
	}

	if (ptr)
		slurm_destroy_federation_rec(ptr);
	xfree(cluster_name);

	return rc;
}
Example #11
/*
 * slurm_load_node_single2 - equivalent to slurm_load_node_single() with
 *	addition of cluster record for communications in a federation
 */
extern int slurm_load_node_single2(node_info_msg_t **resp, char *node_name,
				   uint16_t show_flags,
				   slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg;
	node_info_single_msg_t req;

	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.node_name    = node_name;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	req_msg.data     = &req;

	return _load_cluster_nodes(&req_msg, resp, cluster, show_flags);
}
Example #12
/* Transmit PMI Keyval space data */
int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr,
		int pmi_rank, int pmi_size)
{
	slurm_msg_t msg_send;
	int rc, retries = 0, timeout = 0;

	if (kvs_set_ptr == NULL)
		return EINVAL;

	if ((rc = _get_addr()) != SLURM_SUCCESS)
		return rc;
	_set_pmi_time();

	slurm_msg_t_init(&msg_send);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_PUT_REQ;
	msg_send.data = (void *) kvs_set_ptr;

	/* Send the RPC to the local srun communication manager.
	 * Since srun can be sent thousands of messages at the
	 * same time and may refuse some connections, retry as
	 * needed. Spread out the messages by task rank. Also
	 * increase the timeout (default 10 secs) when there are
	 * many tasks, since the srun command can get very
	 * overloaded. */
	_delay_rpc(pmi_rank, pmi_size);
	if      (pmi_size > 4000)	/* 240 secs */
		timeout = slurm_get_msg_timeout() * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_get_msg_timeout() * 12000;
	else if (pmi_size > 100)	/* 50 secs */
		timeout = slurm_get_msg_timeout() * 5000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_get_msg_timeout() * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_send_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("send_kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}

	return rc;
}
Example #13
/*
 * slurm_requeue - re-queue a batch job; if it is already running,
 *	terminate it first
 * IN job_id  - job on which to perform operation
 * RET 0 or a slurm error code
 */
extern int slurm_requeue (uint32_t job_id)
{
    int rc;
    job_id_msg_t requeue_req;
    slurm_msg_t req_msg;

    slurm_msg_t_init(&req_msg);
    requeue_req.job_id	= job_id;
    req_msg.msg_type	= REQUEST_JOB_REQUEUE;
    req_msg.data		= &requeue_req;

    if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
        return SLURM_ERROR;

    slurm_seterrno(rc);
    return rc;
}
Example #14
/*
 * slurm_load_partitions2 - equivalent to slurm_load_partitions() with addition
 *	of cluster record for communications in a federation
 */
extern int slurm_load_partitions2(time_t update_time,
				  partition_info_msg_t **resp,
				  uint16_t show_flags,
				  slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg;
	part_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);

	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_PARTITION_INFO;
	req_msg.data     = &req;

	return _load_cluster_parts(&req_msg, resp, cluster);
}
/* _acct_kill_step() issues an RPC to kill a Slurm job step */
static void _acct_kill_step(void)
{
	slurm_msg_t msg;
	job_step_kill_msg_t req;

	slurm_msg_t_init(&msg);
	/*
	 * Request message:
	 */
	req.job_id      = jobacct_job_id;
	req.job_step_id = jobacct_step_id;
	req.signal      = SIGKILL;
	req.batch_flag  = 0;
	msg.msg_type    = REQUEST_CANCEL_JOB_STEP;
	msg.data        = &req;

	slurm_send_only_controller_msg(&msg);
}
Example #16
/*
 * Move the specified job ID to the top of the queue for a given user ID,
 *	partition, account, and QOS.
 * IN job_id_str - a job id
 * RET 0 or -1 on error */
extern int
slurm_top_job(char *job_id_str)
{
	int rc = SLURM_SUCCESS;
	top_job_msg_t top_job_req;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	top_job_req.job_id_str = job_id_str;
	req_msg.msg_type       = REQUEST_TOP_JOB;
	req_msg.data           = &top_job_req;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	slurm_seterrno(rc);
	return rc;
}
Example #17
/* _slurm_update - issue RPC for all update requests */
static int
_slurm_update (void *data, slurm_msg_type_t msg_type)
{
	int rc;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = msg_type;
	req_msg.data     = data;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	if (rc != SLURM_SUCCESS)
		slurm_seterrno_ret(rc);

	return SLURM_PROTOCOL_SUCCESS;
}
Example #18
/*
 * _suspend_op - perform a suspend/resume operation for some job.
 * IN op      - operation to perform
 * IN job_id  - job on which to perform operation
 * RET 0 or a slurm error code
 */
static int _suspend_op (uint16_t op, uint32_t job_id)
{
    int rc;
    suspend_msg_t sus_req;
    slurm_msg_t req_msg;

    slurm_msg_t_init(&req_msg);
    sus_req.op       = op;
    sus_req.job_id   = job_id;
    req_msg.msg_type = REQUEST_SUSPEND;
    req_msg.data     = &sus_req;

    if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
        return SLURM_ERROR;

    slurm_seterrno(rc);
    return rc;
}
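/*
 * Sketch, not part of the original listing: the public wrappers a caller
 * would use simply delegate to _suspend_op(). SUSPEND_JOB and RESUME_JOB are
 * assumed to be the suspend_opts values from slurm.h that this RPC expects.
 */
extern int slurm_suspend (uint32_t job_id)
{
    return _suspend_op(SUSPEND_JOB, job_id);
}

extern int slurm_resume (uint32_t job_id)
{
    return _suspend_op(RESUME_JOB, job_id);
}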
Example #19
/*
 * slurm_shutdown - issue RPC to have Slurm controller (slurmctld)
 *	cease operations; both the primary and backup controllers
 *	are shut down.
 * IN options - 0: all slurm daemons are shutdown
 *              1: slurmctld generates a core file
 *              2: only the slurmctld is shut down (no core file)
 * RET 0 or a slurm error code
 */
int
slurm_shutdown (uint16_t options)
{
	slurm_msg_t req_msg;
	shutdown_msg_t shutdown_msg;

	slurm_msg_t_init(&req_msg);
	shutdown_msg.options = options;
	req_msg.msg_type     = REQUEST_SHUTDOWN;
	req_msg.data         = &shutdown_msg;

	/*
	 * Explicitly send the message to both the primary
	 *   and backup controllers
	 */
	(void) _send_message_controller(SECONDARY_CONTROLLER, &req_msg);
	return _send_message_controller(PRIMARY_CONTROLLER,   &req_msg);
}
Example #20
/*
 * slurm_reconfigure - issue RPC to have Slurm controller (slurmctld)
 *	reload its configuration file
 * RET 0 or a slurm error code
 */
int
slurm_reconfigure (void)
{
	int rc;
	slurm_msg_t req;

	slurm_msg_t_init(&req);

	req.msg_type = REQUEST_RECONFIGURE;

	if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
		return SLURM_ERROR;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_PROTOCOL_SUCCESS;
}
Example #21
extern int slurm_persist_msg_unpack(slurm_persist_conn_t *persist_conn,
				    persist_msg_t *resp_msg, Buf buffer)
{
	int rc;

	xassert(persist_conn);
	xassert(resp_msg);

	if (persist_conn->flags & PERSIST_FLAG_DBD) {
		rc = unpack_slurmdbd_msg((slurmdbd_msg_t *)resp_msg,
					 persist_conn->version,
					 buffer);
	} else {
		slurm_msg_t msg;

		slurm_msg_t_init(&msg);

		msg.protocol_version = persist_conn->version;

		safe_unpack16(&msg.msg_type, buffer);

		rc = unpack_msg(&msg, buffer);

		resp_msg->msg_type = msg.msg_type;
		resp_msg->data = msg.data;
	}

	/* Here we transfer the auth_cred to the persist_conn just in case we
	 * need to use it in the future to verify messages that don't carry
	 * one and that will follow on this connection.
	 */
	if (resp_msg->msg_type == REQUEST_PERSIST_INIT) {
		slurm_msg_t *msg = resp_msg->data;
		if (persist_conn->auth_cred)
			g_slurm_auth_destroy(persist_conn->auth_cred);

		persist_conn->auth_cred = msg->auth_cred;
		msg->auth_cred = NULL;
	}

	return rc;
unpack_error:
	return SLURM_ERROR;
}
Example #22
/* Accept RPC from slurmctld and process it.
 * IN listen_fd: file descriptor on which to accept the slurmctld connection
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, 0 otherwise */
static int
_accept_msg_connection(int listen_fd,
		       resource_allocation_response_msg_t **resp)
{
	int	     conn_fd;
	slurm_msg_t  *msg = NULL;
	slurm_addr_t   cli_addr;
	char         host[256];
	uint16_t     port;
	int          rc = 0;

	conn_fd = slurm_accept_msg_conn(listen_fd, &cli_addr);
	if (conn_fd < 0) {
		error("Unable to accept connection: %m");
		return rc;
	}

	slurm_get_addr(&cli_addr, &port, host, sizeof(host));
	debug2("got message connection from %s:%hu", host, port);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);

	if ((rc = slurm_receive_msg(conn_fd, msg, 0)) != 0) {
		slurm_free_msg(msg);

		if (errno == EINTR) {
			slurm_close_accepted_conn(conn_fd);
			*resp = NULL;
			return 0;
		}

		error("_accept_msg_connection[%s]: %m", host);
		slurm_close_accepted_conn(conn_fd);
		return SLURM_ERROR;
	}

	rc = _handle_msg(msg, resp);
	slurm_free_msg(msg);

	slurm_close_accepted_conn(conn_fd);
	return rc;
}
Example #23
/*
 * Tell the primary_controller to relinquish control; the primary
 *	control_machine has to suspend operation
 * Based on _shutdown_backup_controller from controller.c
 * wait_time - How long to wait for primary controller to write state, seconds.
 * RET 0 or an error code
 * NOTE: READ lock_slurmctld config before entry (or be single-threaded)
 */
static int _shutdown_primary_controller(int wait_time)
{
	int rc;
	slurm_msg_t req;

	slurm_msg_t_init(&req);
	if ((slurmctld_conf.control_addr == NULL) ||
	    (slurmctld_conf.control_addr[0] == '\0')) {
		error("_shutdown_primary_controller: "
		      "no primary controller to shutdown");
		return SLURM_ERROR;
	}

	slurm_set_addr(&req.address, slurmctld_conf.slurmctld_port,
		       slurmctld_conf.control_addr);

	/* send request message */
	req.msg_type = REQUEST_CONTROL;

	if (slurm_send_recv_rc_msg_only_one(&req, &rc,
				(CONTROL_TIMEOUT * 1000)) < 0) {
		error("_shutdown_primary_controller:send/recv: %m");
		return SLURM_ERROR;
	}
	if (rc == ESLURM_DISABLED)
		debug("primary controller responding");
	else if (rc == 0) {
		debug("primary controller has relinquished control");
	} else {
		error("_shutdown_primary_controller: %s", slurm_strerror(rc));
		return SLURM_ERROR;
	}

	/* FIXME: Ideally the REQUEST_CONTROL RPC does not return until all
	 * other activity has ceased and the state has been saved. That is
	 * not presently the case (it returns when no other work is pending,
	 * so the state save should occur right away). We sleep for a while
	 * here and give the primary controller time to shutdown */
	if (wait_time)
		sleep(wait_time);

	return SLURM_SUCCESS;
}
Example #24
/*
 * slurm_requeue - re-queue a batch job; if it is already running,
 *	terminate it first
 * IN job_id     - job on which to perform operation
 * IN state      - state in which to place the job
 * RET 0 or a slurm error code
 */
extern int slurm_requeue(uint32_t job_id, uint32_t state)
{
	int rc = SLURM_SUCCESS;
	requeue_msg_t requeue_req;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);

	requeue_req.job_id	= job_id;
	requeue_req.job_id_str	= NULL;
	requeue_req.state	= state;
	req_msg.msg_type	= REQUEST_JOB_REQUEUE;
	req_msg.data		= &requeue_req;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	slurm_seterrno(rc);
	return rc;
}
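/*
 * Usage sketch, not part of the original listing: requeue a job with the
 * two-argument API above. Passing 0 for "state" is assumed here to request a
 * plain requeue; version-specific flags (e.g. a hold flag) would be OR'd
 * into this argument instead.
 */
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static int _requeue_job(uint32_t job_id)
{
	if (slurm_requeue(job_id, 0) != SLURM_SUCCESS) {
		slurm_perror("slurm_requeue");
		return -1;
	}
	return 0;
}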
Example #25
/*
 * slurm_ping - issue RPC to ping the Slurm controller (slurmctld)
 * IN primary - 1==primary controller, 2==secondary controller
 * RET 0 or a slurm error code
 */
int
slurm_ping (int primary)
{
	int rc;
	slurm_msg_t request_msg;

	slurm_msg_t_init(&request_msg);
	request_msg.msg_type = REQUEST_PING;

	if (primary == 1)
		rc = _send_message_controller(PRIMARY_CONTROLLER,
					      &request_msg);
	else if (primary == 2)
		rc = _send_message_controller(SECONDARY_CONTROLLER,
					      &request_msg);
	else
		rc = SLURM_ERROR;

	return rc;
}
Example #26
static int _drain_node(char *reason)
{
	slurm_msg_t req_msg;
	update_node_msg_t update_node_msg;

	memset(&update_node_msg, 0, sizeof(update_node_msg_t));
	update_node_msg.node_names = conf->node_name;
	update_node_msg.node_state = NODE_STATE_DRAIN;
	update_node_msg.reason = reason;
	update_node_msg.reason_uid = getuid();
	update_node_msg.weight = NO_VAL;
	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_UPDATE_NODE;
	req_msg.data = &update_node_msg;

	if (slurm_send_only_controller_msg(&req_msg) < 0)
		return SLURM_ERROR;

	return SLURM_SUCCESS;
}
Example #27
static int _send_to_stepds(hostlist_t hl, const char *addr, uint32_t len,
		char *data)
{
	List ret_list = NULL;
	int temp_rc = 0, rc = 0;
	ret_data_info_t *ret_data_info = NULL;
	slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t));
	forward_data_msg_t req;
	char *nodelist = NULL;

	slurm_msg_t_init(msg);
	req.address = xstrdup(addr);
	req.len = len;
	req.data = data;

	msg->msg_type = REQUEST_FORWARD_DATA;
	msg->data = &req;

	nodelist = hostlist_ranged_string_xmalloc(hl);

	if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0, false))) {
		while ((ret_data_info = list_pop(ret_list))) {
			temp_rc = slurm_get_return_code(ret_data_info->type,
					ret_data_info->data);
			if (temp_rc) {
				rc = temp_rc;
			} else {
				hostlist_delete_host(hl,
						ret_data_info->node_name);
			}
		}
	} else {
		error("_send_to_stepds: no list was returned");
		rc = SLURM_ERROR;
	}

	slurm_free_msg(msg);
	xfree(nodelist);
	xfree(req.address);
	return rc;
}
Example #28
extern void msg_aggr_add_comp(Buf buffer, void *auth_cred, header_t *header)
{
	slurm_msg_t *msg;

	if (!msg_collection.running)
		return;

	msg = xmalloc_nz(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);

	msg->protocol_version = header->version;
	msg->msg_type = header->msg_type;
	msg->flags = header->flags;

	msg->auth_cred = auth_cred;

	msg->data = buffer;
	msg->data_size = remaining_buf(buffer);

	msg_aggr_add_msg(msg, 0, NULL);
}
Example #29
/* Issue the RPC to transfer the file's data */
static int _file_bcast(struct bcast_parameters *params,
		       file_bcast_msg_t *bcast_msg,
		       job_sbcast_cred_msg_t *sbcast_cred)
{
	List ret_list = NULL;
	ListIterator itr;
	ret_data_info_t *ret_data_info = NULL;
	int rc = 0, msg_rc;
	slurm_msg_t msg;

	slurm_msg_t_init(&msg);
	msg.data = bcast_msg;
	msg.msg_type = REQUEST_FILE_BCAST;

	ret_list = slurm_send_recv_msgs(
		sbcast_cred->node_list, &msg, params->timeout, true);
	if (ret_list == NULL) {
		error("slurm_send_recv_msgs: %m");
		exit(1);
	}

	itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr))) {
		msg_rc = slurm_get_return_code(ret_data_info->type,
					       ret_data_info->data);
		if (msg_rc == SLURM_SUCCESS)
			continue;

		error("REQUEST_FILE_BCAST(%s): %s",
		      ret_data_info->node_name,
		      slurm_strerror(msg_rc));
		rc = MAX(rc, msg_rc);
	}
	list_iterator_destroy(itr);
	FREE_NULL_LIST(ret_list);

	return rc;
}
Example #30
/*
 * slurm_clear_trigger - Clear (remove) an existing event trigger
 * RET 0 or a slurm error code
 */
extern int slurm_clear_trigger (trigger_info_t *trigger_clear)
{
	int rc;
	slurm_msg_t msg;
	trigger_info_msg_t req;

	slurm_msg_t_init(&msg);
	/*
	 * Request message:
	 */
	req.record_count  = 1;
	req.trigger_array = trigger_clear;
	msg.msg_type      = REQUEST_TRIGGER_CLEAR;
	msg.data          = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
		return SLURM_FAILURE;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_SUCCESS;
}
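/*
 * Usage sketch, not part of the original listing: clear a previously created
 * trigger by its ID. Assumes the trigger_info_t structure from slurm.h with
 * a trig_id member identifying the trigger to remove.
 */
#include <string.h>
#include <slurm/slurm.h>
#include <slurm/slurm_errno.h>

static int _clear_trigger_by_id(uint32_t trig_id)
{
	trigger_info_t ti;

	memset(&ti, 0, sizeof(ti));
	ti.trig_id = trig_id;

	if (slurm_clear_trigger(&ti) != SLURM_SUCCESS) {
		slurm_perror("slurm_clear_trigger");
		return -1;
	}
	return 0;
}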