Exemplo n.º 1
0
/*
 * _send_step_complete_rpc - send a REQUEST_STEP_COMPLETE message for a step
 *	to slurmctld.
 * IN srun_job - supplies the job id and step id placed in the message
 * IN step_rc  - return code placed in the message
 *
 * A failed send is logged with error(); the reply code filled in by the
 * send/receive call is not examined.
 */
static void _send_step_complete_rpc(srun_job_t *srun_job, int step_rc)
{
	step_complete_msg_t complete;
	slurm_msg_t rpc;
	int reply_rc;

	slurm_msg_t_init(&rpc);

	/* Build the payload from a zeroed structure */
	memset(&complete, 0, sizeof(step_complete_msg_t));
	complete.step_rc     = step_rc;
	complete.job_step_id = srun_job->stepid;
	complete.job_id      = srun_job->jobid;
	complete.range_first = 0;
	complete.range_last  = 0;
	complete.jobacct     = jobacctinfo_create(NULL);

	rpc.data     = &complete;
	rpc.msg_type = REQUEST_STEP_COMPLETE;

	debug3("Sending step complete RPC to slurmctld");
	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc,
					      working_cluster_rec) < 0) {
		error("Error sending step complete RPC to slurmctld");
	}

	/* The accounting record created above is released on every path */
	jobacctinfo_destroy(complete.jobacct);
}
Exemplo n.º 2
0
/*
 * slurm_checkpoint_task_complete - tell the controller that one task has
 *	finished its checkpoint operation.
 * IN job_id     - job on which the operation was performed
 * IN step_id    - job step on which the operation was performed
 * IN task_id    - task which completed the operation
 * IN begin_time - time at which the checkpoint began
 * IN error_code - error code forwarded in the request
 * IN error_msg  - error message forwarded in the request (pointer is passed
 *		   through, not copied)
 * RET SLURM_SUCCESS when the reply code is zero, SLURM_ERROR when the
 *	send/receive call fails; a non-zero reply code is handed to
 *	slurm_seterrno_ret()
 */
extern int slurm_checkpoint_task_complete (uint32_t job_id, uint32_t step_id,
					   uint32_t task_id, time_t begin_time,
					   uint32_t error_code,
					   char *error_msg)
{
	checkpoint_task_comp_msg_t comp;
	slurm_msg_t rpc;
	int reply_rc;

	/* Payload */
	comp.error_msg  = error_msg;
	comp.error_code = error_code;
	comp.begin_time = begin_time;
	comp.task_id    = task_id;
	comp.step_id    = step_id;
	comp.job_id     = job_id;

	/* Envelope */
	slurm_msg_t_init(&rpc);
	rpc.data     = &comp;
	rpc.msg_type = REQUEST_CHECKPOINT_TASK_COMP;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc,
					      working_cluster_rec) < 0) {
		return SLURM_ERROR;
	}

	if (reply_rc != 0) {
		slurm_seterrno_ret(reply_rc);
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 3
0
/*
 * send_registration_msg - build a node registration status message and send
 *	it to the controller.
 * IN status  - value stored in the message's status field
 * IN startup - stored (as uint16_t) in the message's startup field
 * RET SLURM_SUCCESS if the send/receive call succeeded, else SLURM_FAILURE
 *
 * On success the file-level sent_reg_time is updated to the current time.
 * The reply code returned by the controller is not examined.
 */
extern int
send_registration_msg(uint32_t status, bool startup)
{
	slurm_node_registration_status_msg_t *reg_msg;
	slurm_msg_t rpc;
	int reply_rc;
	int result = SLURM_SUCCESS;

	reg_msg = xmalloc(sizeof(slurm_node_registration_status_msg_t));
	slurm_msg_t_init(&rpc);

	/*
	 * startup is set before, and status after, the helper fills in the
	 * rest of the message; keep this ordering.
	 */
	reg_msg->startup = (uint16_t) startup;
	_fill_registration_msg(reg_msg);
	reg_msg->status = status;

	rpc.data     = reg_msg;
	rpc.msg_type = MESSAGE_NODE_REGISTRATION_STATUS;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) >= 0) {
		sent_reg_time = time(NULL);
	} else {
		error("Unable to register: %m");
		result = SLURM_FAILURE;
	}

	slurm_free_node_registration_status_msg(reg_msg);
	return result;
}
Exemplo n.º 4
0
/*
 * slurm_notify_job - send a message to the job's stdout,
 *	usable only by user root
 * IN job_id  - slurm job_id or 0 for all jobs
 * IN message - arbitrary message (pointer is passed through, not copied)
 * RET SLURM_SUCCESS, or SLURM_FAILURE if the send/receive call fails;
 *	a non-zero reply code is handed to slurm_seterrno_ret()
 */
extern int slurm_notify_job (uint32_t job_id, char *message)
{
	job_notify_msg_t notify;
	slurm_msg_t rpc;
	int reply_rc;

	/* Payload; the step id field is currently not used */
	notify.message     = message;
	notify.job_step_id = NO_VAL;
	notify.job_id      = job_id;

	/* Envelope */
	slurm_msg_t_init(&rpc);
	rpc.data     = &notify;
	rpc.msg_type = REQUEST_JOB_NOTIFY;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_FAILURE;
	}

	if (reply_rc != 0) {
		slurm_seterrno_ret(reply_rc);
		/* NOTE(review): only reached if the call above does not return */
		return SLURM_FAILURE;
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 5
0
Arquivo: cancel.c Projeto: IFCA/slurm
/*
 * slurm_kill_job_step - ask the controller to cancel a job step, passing the
 *	signal number to be delivered to the step's processes.
 * IN job_id  - the job's id
 * IN step_id - the job step's id
 * IN signal  - signal number
 * RET SLURM_SUCCESS, or SLURM_FAILURE if the send/receive call fails;
 *	a non-zero reply code is handed to slurm_seterrno_ret()
 */
extern int
slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal)
{
	job_step_kill_msg_t kill_req;
	slurm_msg_t rpc;
	int reply_rc;

	/* Payload */
	kill_req.flags       = 0;
	kill_req.signal      = signal;
	kill_req.job_step_id = step_id;
	kill_req.job_id      = job_id;

	/* Envelope */
	slurm_msg_t_init(&rpc);
	rpc.data     = &kill_req;
	rpc.msg_type = REQUEST_CANCEL_JOB_STEP;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_FAILURE;
	}

	if (reply_rc != 0) {
		slurm_seterrno_ret(reply_rc);
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 6
0
/*
 * slurm_requeue - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id - job on which to perform the operation
 * RET SLURM_ERROR if the send/receive call fails, otherwise the reply code
 *	(which is also passed to slurm_seterrno())
 */
extern int slurm_requeue (uint32_t job_id)
{
	slurm_msg_t rpc;
	job_id_msg_t payload;
	int reply_rc;

	payload.job_id = job_id;

	slurm_msg_t_init(&rpc);
	rpc.data     = &payload;
	rpc.msg_type = REQUEST_JOB_REQUEUE;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_ERROR;
	}

	slurm_seterrno(reply_rc);
	return reply_rc;
}
Exemplo n.º 7
0
/*
 * _suspend_op - send a REQUEST_SUSPEND message carrying the given operation
 *	for a job.
 * IN op     - operation to perform
 * IN job_id - job on which to perform the operation
 * RET SLURM_ERROR if the send/receive call fails, otherwise the reply code
 *	(which is also passed to slurm_seterrno())
 */
static int _suspend_op (uint16_t op, uint32_t job_id)
{
	slurm_msg_t rpc;
	suspend_msg_t payload;
	int reply_rc;

	payload.job_id = job_id;
	payload.op     = op;

	slurm_msg_t_init(&rpc);
	rpc.data     = &payload;
	rpc.msg_type = REQUEST_SUSPEND;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_ERROR;
	}

	slurm_seterrno(reply_rc);
	return reply_rc;
}
Exemplo n.º 8
0
/*
 * slurm_reconfigure - issue RPC to have the Slurm controller (slurmctld)
 *	reload its configuration file
 * RET SLURM_PROTOCOL_SUCCESS, or SLURM_ERROR if the send/receive call fails;
 *	a non-zero reply code is handed to slurm_seterrno_ret()
 */
int
slurm_reconfigure (void)
{
	slurm_msg_t rpc;
	int reply_rc;

	/* This request carries a message type only; no payload is attached */
	slurm_msg_t_init(&rpc);
	rpc.msg_type = REQUEST_RECONFIGURE;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_ERROR;
	}

	if (reply_rc != 0) {
		slurm_seterrno_ret(reply_rc);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemplo n.º 9
0
/*
 * _slurm_update - issue RPC for all update requests
 * IN data     - request payload, attached to the message unchanged
 * IN msg_type - message type to send
 * RET SLURM_PROTOCOL_SUCCESS, or SLURM_ERROR if the send/receive call fails;
 *	a reply code other than SLURM_SUCCESS is handed to slurm_seterrno_ret()
 */
static int
_slurm_update (void *data, slurm_msg_type_t msg_type)
{
	slurm_msg_t rpc;
	int reply_rc;

	slurm_msg_t_init(&rpc);
	rpc.data     = data;
	rpc.msg_type = msg_type;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_ERROR;
	}

	if (reply_rc != SLURM_SUCCESS) {
		slurm_seterrno_ret(reply_rc);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
Exemplo n.º 10
0
/*
 * slurm_top_job - move the specified job ID to the top of the queue for a
 *	given user ID, partition, account, and QOS.
 * IN job_id_str - a job id (pointer is passed through, not copied)
 * RET SLURM_ERROR if the send/receive call fails, otherwise the reply code
 *	(which is also passed to slurm_seterrno())
 */
extern int
slurm_top_job(char *job_id_str)
{
	slurm_msg_t rpc;
	top_job_msg_t payload;
	int reply_rc = SLURM_SUCCESS;

	payload.job_id_str = job_id_str;

	slurm_msg_t_init(&rpc);
	rpc.data     = &payload;
	rpc.msg_type = REQUEST_TOP_JOB;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_ERROR;
	}

	slurm_seterrno(reply_rc);
	return reply_rc;
}
Exemplo n.º 11
0
/*
 * slurm_requeue - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id - job on which to perform the operation
 * IN state  - state in which to place the job
 * RET SLURM_ERROR if the send/receive call fails, otherwise the reply code
 *	(which is also passed to slurm_seterrno())
 */
extern int slurm_requeue(uint32_t job_id, uint32_t state)
{
	slurm_msg_t rpc;
	requeue_msg_t payload;
	int reply_rc = SLURM_SUCCESS;

	/* The job is identified numerically, so the string form is left NULL */
	payload.job_id_str = NULL;
	payload.job_id     = job_id;
	payload.state      = state;

	slurm_msg_t_init(&rpc);
	rpc.data     = &payload;
	rpc.msg_type = REQUEST_JOB_REQUEUE;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_ERROR;
	}

	slurm_seterrno(reply_rc);
	return reply_rc;
}
Exemplo n.º 12
0
/*
 * slurm_clear_trigger - Clear (remove) an existing event trigger
 * IN trigger_clear - description of the trigger to clear; the pointer is
 *		      placed in the request as a one-element array
 * RET SLURM_SUCCESS, or SLURM_FAILURE if the send/receive call fails;
 *	a non-zero reply code is handed to slurm_seterrno_ret()
 */
extern int slurm_clear_trigger (trigger_info_t *trigger_clear)
{
	int rc;
	slurm_msg_t msg;
	/*
	 * Start from an all-zero request, as slurm_pull_trigger() does, so
	 * that no field not assigned below is sent as stack garbage.
	 */
	trigger_info_msg_t req = {0};

	slurm_msg_t_init(&msg);
	/*
	 * Request message:
	 */
	req.record_count  = 1;
	req.trigger_array = trigger_clear;
	msg.msg_type      = REQUEST_TRIGGER_CLEAR;
	msg.data          = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
		return SLURM_FAILURE;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_SUCCESS;
}
Exemplo n.º 13
0
/*
 * slurm_pull_trigger - Pull (fire) an event trigger
 * IN trigger_pull - description of the trigger to pull; the pointer is
 *		     placed in the request as a one-element array
 * RET SLURM_SUCCESS, or SLURM_FAILURE if the send/receive call fails;
 *	a non-zero reply code is handed to slurm_seterrno_ret()
 */
extern int slurm_pull_trigger (trigger_info_t *trigger_pull)
{
	trigger_info_msg_t pull_req;
	slurm_msg_t rpc;
	int reply_rc;

	/* Payload, built from a zeroed structure */
	memset(&pull_req, 0, sizeof(trigger_info_msg_t));
	pull_req.trigger_array = trigger_pull;
	pull_req.record_count  = 1;

	/* Envelope */
	slurm_msg_t_init(&rpc);
	rpc.data     = &pull_req;
	rpc.msg_type = REQUEST_TRIGGER_PULL;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc) < 0) {
		return SLURM_FAILURE;
	}

	if (reply_rc != 0) {
		slurm_seterrno_ret(reply_rc);
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 14
0
/*
 * _checkpoint_op - send a REQUEST_CHECKPOINT message carrying the given
 *	operation for a job step.
 * IN op        - operation to perform
 * IN data      - operation-specific data
 * IN job_id    - job on which to perform the operation
 * IN step_id   - job step on which to perform the operation
 * IN image_dir - directory used to get/put checkpoint images (pointer is
 *		  passed through, not copied)
 * RET SLURM_ERROR if the send/receive call fails, otherwise the reply code
 *	(which is also passed to slurm_seterrno())
 */
static int _checkpoint_op (uint16_t op, uint16_t data,
			   uint32_t job_id, uint32_t step_id,
			   char *image_dir)
{
	slurm_msg_t rpc;
	checkpoint_msg_t payload;
	int reply_rc;

	/* Payload */
	payload.image_dir = image_dir;
	payload.step_id   = step_id;
	payload.job_id    = job_id;
	payload.data      = data;
	payload.op        = op;

	/* Envelope */
	slurm_msg_t_init(&rpc);
	rpc.data     = &payload;
	rpc.msg_type = REQUEST_CHECKPOINT;

	if (slurm_send_recv_controller_rc_msg(&rpc, &reply_rc,
					      working_cluster_rec) < 0) {
		return SLURM_ERROR;
	}

	slurm_seterrno(reply_rc);
	return reply_rc;
}