/*
 * _send_step_complete_rpc - tell slurmctld that a job step has completed
 * IN srun_job - supplies the job id and step id placed in the request
 * IN step_rc - return code recorded in the request for the step
 *
 * A transmission failure is logged with error(); no status is returned.
 * The reply code received from the controller is not examined.
 */
static void _send_step_complete_rpc(srun_job_t *srun_job, int step_rc)
{
	step_complete_msg_t complete;
	slurm_msg_t rpc;
	int ctld_rc;

	/* Build the step completion record */
	memset(&complete, 0, sizeof(complete));
	complete.job_id      = srun_job->jobid;
	complete.job_step_id = srun_job->stepid;
	complete.range_first = 0;
	complete.range_last  = 0;
	complete.step_rc     = step_rc;
	complete.jobacct     = jobacctinfo_create(NULL);

	/* Wrap it in a controller request */
	slurm_msg_t_init(&rpc);
	rpc.msg_type = REQUEST_STEP_COMPLETE;
	rpc.data     = &complete;

	debug3("Sending step complete RPC to slurmctld");
	if (slurm_send_recv_controller_rc_msg(&rpc, &ctld_rc,
					      working_cluster_rec) < 0) {
		error("Error sending step complete RPC to slurmctld");
	}

	/* The accounting record was created here, so release it here */
	jobacctinfo_destroy(complete.jobacct);
}
/* * slurm_checkpoint_task_complete - note the completion of a task's checkpoint * operation. * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * IN task_id - task which completed the operation * IN begin_time - time at which checkpoint began * IN error_code - error code, highest value for all complete calls is preserved * IN error_msg - error message, preserved for highest error_code * RET 0 or a slurm error code */ extern int slurm_checkpoint_task_complete (uint32_t job_id, uint32_t step_id, uint32_t task_id, time_t begin_time, uint32_t error_code, char *error_msg) { int rc; slurm_msg_t msg; checkpoint_task_comp_msg_t req; slurm_msg_t_init(&msg); req.job_id = job_id; req.step_id = step_id; req.task_id = task_id; req.begin_time = begin_time; req.error_code = error_code; req.error_msg = error_msg; msg.msg_type = REQUEST_CHECKPOINT_TASK_COMP; msg.data = &req; if (slurm_send_recv_controller_rc_msg(&msg, &rc, working_cluster_rec) < 0) return SLURM_ERROR; if (rc) slurm_seterrno_ret(rc); return SLURM_SUCCESS; }
extern int send_registration_msg(uint32_t status, bool startup) { int rc, ret_val = SLURM_SUCCESS; slurm_msg_t req; slurm_node_registration_status_msg_t *msg = xmalloc (sizeof (slurm_node_registration_status_msg_t)); slurm_msg_t_init(&req); msg->startup = (uint16_t) startup; _fill_registration_msg(msg); msg->status = status; req.msg_type = MESSAGE_NODE_REGISTRATION_STATUS; req.data = msg; if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0) { error("Unable to register: %m"); ret_val = SLURM_FAILURE; } else { sent_reg_time = time(NULL); } slurm_free_node_registration_status_msg (msg); return ret_val; }
/* * slurm_notify_job - send message to the job's stdout, * usable only by user root * IN job_id - slurm job_id or 0 for all jobs * IN message - arbitrary message * RET 0 or -1 on error */ extern int slurm_notify_job (uint32_t job_id, char *message) { int rc; slurm_msg_t msg; job_notify_msg_t req; slurm_msg_t_init(&msg); /* * Request message: */ req.job_id = job_id; req.job_step_id = NO_VAL; /* currently not used */ req.message = message; msg.msg_type = REQUEST_JOB_NOTIFY; msg.data = &req; if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0) return SLURM_FAILURE; if (rc) { slurm_seterrno_ret(rc); return SLURM_FAILURE; } return SLURM_SUCCESS; }
/* * Kill a job step with job id "job_id" and step id "step_id", optionally * sending the processes in the job step a signal "signal" * IN job_id - the job's id * IN step_id - the job step's id * IN signal - signal number * RET 0 on success, otherwise return -1 and set errno to indicate the error */ extern int slurm_kill_job_step (uint32_t job_id, uint32_t step_id, uint16_t signal) { int rc; slurm_msg_t msg; job_step_kill_msg_t req; slurm_msg_t_init(&msg); /* * Request message: */ req.job_id = job_id; req.job_step_id = step_id; req.signal = signal; req.flags = 0; msg.msg_type = REQUEST_CANCEL_JOB_STEP; msg.data = &req; if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0) return SLURM_FAILURE; if (rc) slurm_seterrno_ret(rc); return SLURM_SUCCESS; }
/*
 * slurm_requeue - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id - job on which to perform operation
 * RET SLURM_ERROR if the RPC fails, otherwise the code returned by the
 *	controller (also passed to slurm_seterrno())
 */
extern int slurm_requeue(uint32_t job_id)
{
	job_id_msg_t id_req;
	slurm_msg_t rpc;
	int ctld_rc;

	id_req.job_id = job_id;

	slurm_msg_t_init(&rpc);
	rpc.msg_type = REQUEST_JOB_REQUEUE;
	rpc.data     = &id_req;

	if (slurm_send_recv_controller_rc_msg(&rpc, &ctld_rc) < 0) {
		return SLURM_ERROR;
	}

	/* Publish the controller's code before handing it back */
	slurm_seterrno(ctld_rc);
	return ctld_rc;
}
/* * _suspend_op - perform a suspend/resume operation for some job. * IN op - operation to perform * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * RET 0 or a slurm error code */ static int _suspend_op (uint16_t op, uint32_t job_id) { int rc; suspend_msg_t sus_req; slurm_msg_t req_msg; slurm_msg_t_init(&req_msg); sus_req.op = op; sus_req.job_id = job_id; req_msg.msg_type = REQUEST_SUSPEND; req_msg.data = &sus_req; if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0) return SLURM_ERROR; slurm_seterrno(rc); return rc; }
/*
 * slurm_reconfigure - issue RPC to have Slurm controller (slurmctld)
 *	reload its configuration file
 * RET SLURM_PROTOCOL_SUCCESS when the controller replies with a zero code,
 *	SLURM_ERROR when the RPC itself fails; a non-zero reply code is
 *	handed to slurm_seterrno_ret()
 */
int slurm_reconfigure(void)
{
	slurm_msg_t rpc;
	int ctld_rc;

	/* The request carries no payload, only its type */
	slurm_msg_t_init(&rpc);
	rpc.msg_type = REQUEST_RECONFIGURE;

	if (slurm_send_recv_controller_rc_msg(&rpc, &ctld_rc) < 0) {
		return SLURM_ERROR;
	}

	if (ctld_rc) {
		slurm_seterrno_ret(ctld_rc);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * _slurm_update - issue RPC for all update requests
 * IN data - request payload, attached to the message unchanged
 * IN msg_type - message type to send
 * RET SLURM_PROTOCOL_SUCCESS when the controller replies with SLURM_SUCCESS,
 *	SLURM_ERROR when the RPC itself fails; any other reply code is
 *	handed to slurm_seterrno_ret()
 */
static int _slurm_update(void *data, slurm_msg_type_t msg_type)
{
	slurm_msg_t rpc;
	int ctld_rc;

	slurm_msg_t_init(&rpc);
	rpc.data     = data;
	rpc.msg_type = msg_type;

	if (slurm_send_recv_controller_rc_msg(&rpc, &ctld_rc) < 0) {
		return SLURM_ERROR;
	}

	if (ctld_rc != SLURM_SUCCESS) {
		slurm_seterrno_ret(ctld_rc);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/* * Move the specified job ID to the top of the queue for a given user ID, * partition, account, and QOS. * IN job_id_str - a job id * RET 0 or -1 on error */ extern int slurm_top_job(char *job_id_str) { int rc = SLURM_SUCCESS; top_job_msg_t top_job_req; slurm_msg_t req_msg; slurm_msg_t_init(&req_msg); top_job_req.job_id_str = job_id_str; req_msg.msg_type = REQUEST_TOP_JOB; req_msg.data = &top_job_req; if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0) return SLURM_ERROR; slurm_seterrno(rc); return rc; }
/* * slurm_requeue - re-queue a batch job, if already running * then terminate it first * IN job_id - job on which to perform operation * IN state - state in which to place the job * RET 0 or a slurm error code */ extern int slurm_requeue(uint32_t job_id, uint32_t state) { int rc = SLURM_SUCCESS; requeue_msg_t requeue_req; slurm_msg_t req_msg; slurm_msg_t_init(&req_msg); requeue_req.job_id = job_id; requeue_req.job_id_str = NULL; requeue_req.state = state; req_msg.msg_type = REQUEST_JOB_REQUEUE; req_msg.data = &requeue_req; if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0) return SLURM_ERROR; slurm_seterrno(rc); return rc; }
/*
 * slurm_clear_trigger - Clear (remove) an existing event trigger
 * IN trigger_clear - the single trigger record sent in the request
 * RET SLURM_SUCCESS when the controller replies with a zero code,
 *	SLURM_FAILURE when the RPC itself fails; a non-zero reply code is
 *	handed to slurm_seterrno_ret()
 */
extern int slurm_clear_trigger(trigger_info_t *trigger_clear)
{
	int rc;
	slurm_msg_t msg;
	/*
	 * Zero the request first so that any member not assigned below
	 * has a defined value when the message is sent.
	 */
	trigger_info_msg_t req = { 0 };

	slurm_msg_t_init(&msg);

	/*
	 * Request message:
	 */
	req.record_count  = 1;
	req.trigger_array = trigger_clear;
	msg.msg_type      = REQUEST_TRIGGER_CLEAR;
	msg.data          = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
		return SLURM_FAILURE;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_SUCCESS;
}
/*
 * slurm_pull_trigger - Pull (fire) an event trigger
 * IN trigger_pull - the single trigger record sent in the request
 * RET SLURM_SUCCESS when the controller replies with a zero code,
 *	SLURM_FAILURE when the RPC itself fails; a non-zero reply code is
 *	handed to slurm_seterrno_ret()
 */
extern int slurm_pull_trigger(trigger_info_t *trigger_pull)
{
	trigger_info_msg_t trig_req;
	slurm_msg_t rpc;
	int ctld_rc;

	/* Payload: a zeroed request holding exactly one trigger record */
	memset(&trig_req, 0, sizeof(trig_req));
	trig_req.trigger_array = trigger_pull;
	trig_req.record_count  = 1;

	/* Envelope */
	slurm_msg_t_init(&rpc);
	rpc.msg_type = REQUEST_TRIGGER_PULL;
	rpc.data     = &trig_req;

	if (slurm_send_recv_controller_rc_msg(&rpc, &ctld_rc) < 0) {
		return SLURM_FAILURE;
	}

	if (ctld_rc) {
		slurm_seterrno_ret(ctld_rc);
	}

	return SLURM_SUCCESS;
}
/* * _checkpoint_op - perform many checkpoint operation for some job step. * IN op - operation to perform * IN data - operation-specific data * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * IN image_dir - directory used to get/put checkpoint images * RET 0 or a slurm error code */ static int _checkpoint_op (uint16_t op, uint16_t data, uint32_t job_id, uint32_t step_id, char *image_dir) { int rc; checkpoint_msg_t ckp_req; slurm_msg_t req_msg; slurm_msg_t_init(&req_msg); ckp_req.op = op; ckp_req.data = data; ckp_req.job_id = job_id; ckp_req.step_id = step_id; ckp_req.image_dir = image_dir; req_msg.msg_type = REQUEST_CHECKPOINT; req_msg.data = &ckp_req; if (slurm_send_recv_controller_rc_msg(&req_msg, &rc, working_cluster_rec) < 0) return SLURM_ERROR; slurm_seterrno(rc); return rc; }