/*
 * slurm_load_node - issue RPC to get all Slurm node configuration information
 *	if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a node configuration pointer
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node (time_t update_time, node_info_msg_t **resp,
			    uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	node_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*resp = (node_info_msg_t *) resp_msg.data;
		if (show_flags & SHOW_MIXED)
			_set_node_mixed(*resp);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
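/*
 * Usage sketch (illustrative, not part of this file): a minimal caller
 * that dumps all node names. Assumes <stdio.h> and <slurm/slurm.h> are
 * included and the program links against libslurm; record_count and
 * node_array are the public node_info_msg_t fields.
 */
static void _example_print_nodes(void)
{
	node_info_msg_t *node_info = NULL;
	uint32_t i;

	/* A last_update of 0 requests a full dump */
	if (slurm_load_node((time_t) 0, &node_info, SHOW_ALL)) {
		slurm_perror("slurm_load_node");
		return;
	}
	for (i = 0; i < node_info->record_count; i++)
		printf("%s\n", node_info->node_array[i].name);
	slurm_free_node_info_msg(node_info);
}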
/*
 * slurm_checkpoint_able - determine if the specified job step can presently
 *	be checkpointed
 * IN job_id - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * OUT start_time - time at which checkpoint request was issued
 * RET 0 (can be checkpointed) or a slurm error code
 */
extern int slurm_checkpoint_able (uint32_t job_id, uint32_t step_id,
				  time_t *start_time)
{
	int rc;
	slurm_msg_t req_msg, resp_msg;
	checkpoint_msg_t ckp_req;
	checkpoint_resp_msg_t *resp;

	ckp_req.op        = CHECK_ABLE;
	ckp_req.job_id    = job_id;
	ckp_req.step_id   = step_id;
	ckp_req.image_dir = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_CHECKPOINT;
	req_msg.data     = &ckp_req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_CHECKPOINT:
		resp = (checkpoint_resp_msg_t *) resp_msg.data;
		*start_time = resp->event_time;
		slurm_free_checkpoint_resp_msg(resp_msg.data);
		rc = SLURM_SUCCESS;
		break;
	case RESPONSE_SLURM_RC:
		rc = _handle_rc_msg(&resp_msg);
		break;
	default:
		*start_time = (time_t) 0;
		rc = SLURM_ERROR;
	}

	return rc;
}
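/*
 * Usage sketch (hedged, illustrative): probe whether step 0 of a job can
 * be checkpointed right now. The job_id argument is a placeholder; the
 * call signature follows slurm_checkpoint_able() above.
 */
static void _example_ckpt_probe(uint32_t job_id)
{
	time_t start_time = 0;

	if (slurm_checkpoint_able(job_id, 0, &start_time) == SLURM_SUCCESS)
		printf("checkpoint possible (request time %ld)\n",
		       (long) start_time);
	else
		slurm_perror("slurm_checkpoint_able");
}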
/*
 * slurm_job_node_ready - report if nodes are ready for the job to execute now
 * IN job_id - slurm job id
 * RET: READY_* values as defined in slurm.h
 */
extern int slurm_job_node_ready(uint32_t job_id)
{
	slurm_msg_t req, resp;
	job_id_msg_t msg;
	int rc;

	slurm_msg_t_init(&req);
	slurm_msg_t_init(&resp);

	memset(&msg, 0, sizeof(job_id_msg_t));
	msg.job_id   = job_id;
	req.msg_type = REQUEST_JOB_READY;
	req.data     = &msg;

	if (slurm_send_recv_controller_msg(&req, &resp) < 0)
		return READY_JOB_ERROR;

	if (resp.msg_type == RESPONSE_JOB_READY) {
		rc = ((return_code_msg_t *) resp.data)->return_code;
		slurm_free_return_code_msg(resp.data);
	} else if (resp.msg_type == RESPONSE_SLURM_RC) {
		int job_rc = ((return_code_msg_t *) resp.data)->return_code;
		if ((job_rc == ESLURM_INVALID_PARTITION_NAME) ||
		    (job_rc == ESLURM_INVALID_JOB_ID))
			rc = READY_JOB_FATAL;
		else	/* EAGAIN */
			rc = READY_JOB_ERROR;
		slurm_free_return_code_msg(resp.data);
	} else if (resp.msg_type == RESPONSE_PROLOG_EXECUTING) {
		rc = READY_JOB_ERROR;
	} else {
		rc = READY_JOB_ERROR;
	}

	return rc;
}
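/*
 * Usage sketch (hedged): poll until the job's nodes are booted, in the
 * style of srun waiting for an allocation. READY_* values are the bits
 * defined in slurm.h; max_tries and the 1-second interval are arbitrary
 * choices for illustration. Assumes <unistd.h> for sleep().
 */
static int _example_wait_nodes_ready(uint32_t job_id, int max_tries)
{
	int i, rc;

	for (i = 0; i < max_tries; i++) {
		rc = slurm_job_node_ready(job_id);
		if (rc == READY_JOB_FATAL)
			return -1;	/* invalid job or partition */
		if ((rc >= 0) && (rc & READY_NODE_STATE))
			return 0;	/* nodes are ready for use */
		sleep(1);
	}
	return -1;
}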
/*
 * slurm_allocation_lookup - retrieve info for an existing resource
 *	allocation without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
				   resource_allocation_response_msg_t **info)
{
	job_alloc_info_msg_t req = {0};
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id      = jobid;
	req.req_cluster = slurmctld_conf.cluster_name;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	req.req_cluster = NULL;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_ALLOCATION_INFO:
		*info = (resource_allocation_response_msg_t *) resp_msg.data;
		return SLURM_PROTOCOL_SUCCESS;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
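/*
 * Usage sketch (hedged): look up an existing allocation and print its
 * node list; node_list is the public field name in
 * resource_allocation_response_msg_t.
 */
static void _example_lookup_alloc(uint32_t jobid)
{
	resource_allocation_response_msg_t *alloc = NULL;

	if (slurm_allocation_lookup(jobid, &alloc)) {
		slurm_perror("slurm_allocation_lookup");
		return;
	}
	printf("job %u runs on %s\n", jobid, alloc->node_list);
	slurm_free_resource_allocation_response_msg(alloc);
}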
/*
 * slurm_load_job_user - issue RPC to get slurm information about all jobs
 *	to be run as the specified user
 * IN/OUT job_info_msg_pptr - place to store a job configuration pointer
 * IN user_id - ID of user we want information for
 * IN show_flags - job filtering options
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int slurm_load_job_user (job_info_msg_t **job_info_msg_pptr,
				uint32_t user_id, uint16_t show_flags)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_user_id_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.show_flags   = show_flags;
	req.user_id      = user_id;
	req_msg.msg_type = REQUEST_JOB_USER_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_INFO:
		*job_info_msg_pptr = (job_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
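/*
 * Usage sketch (hedged): list the job IDs belonging to one user;
 * record_count and job_array are the public job_info_msg_t fields.
 */
static void _example_user_jobs(uint32_t user_id)
{
	job_info_msg_t *jobs = NULL;
	uint32_t i;

	if (slurm_load_job_user(&jobs, user_id, SHOW_DETAIL)) {
		slurm_perror("slurm_load_job_user");
		return;
	}
	for (i = 0; i < jobs->record_count; i++)
		printf("%u\n", jobs->job_array[i].job_id);
	slurm_free_job_info_msg(jobs);
}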
/*
 * slurm_load_node_single - issue RPC to get slurm configuration information
 *	for a specific node
 * OUT resp - place to store a node configuration pointer
 * IN node_name - name of the node for which information is requested
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node_single (node_info_msg_t **resp, char *node_name,
				   uint16_t show_flags)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	node_info_single_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.node_name    = node_name;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*resp = (node_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN job_id - job allocation identifier (or pack job ID)
 * IN pack_job_offset - pack job index (or NO_VAL if not pack job)
 * IN step_id - step allocation identifier (or NO_VAL for entire job)
 * OUT info - job allocation information including a credential for sbcast
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free "info" using slurm_free_sbcast_cred_msg
 */
extern int slurm_sbcast_lookup(uint32_t job_id, uint32_t pack_job_offset,
			       uint32_t step_id, job_sbcast_cred_msg_t **info)
{
	step_alloc_info_msg_t req;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id          = job_id;
	req.pack_job_offset = pack_job_offset;
	req.step_id         = step_id;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_SBCAST_CRED;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_SBCAST_CRED:
		*info = (job_sbcast_cred_msg_t *) resp_msg.data;
		return SLURM_SUCCESS;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
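/*
 * Usage sketch (hedged): fetch an sbcast credential covering a whole
 * (non-pack) job, roughly as the sbcast command does; NO_VAL marks both
 * the pack offset and the step as unused, per the comment above.
 */
static void _example_sbcast_cred(uint32_t job_id)
{
	job_sbcast_cred_msg_t *cred = NULL;

	if (slurm_sbcast_lookup(job_id, NO_VAL, NO_VAL, &cred)) {
		slurm_perror("slurm_sbcast_lookup");
		return;
	}
	printf("credential covers nodes %s\n", cred->node_list);
	slurm_free_sbcast_cred_msg(cred);
}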
/*
 * slurm_load_reservations - issue RPC to get all Slurm reservation
 *	configuration information if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a reservation configuration pointer
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_reservation_info_msg
 */
extern int slurm_load_reservations (time_t update_time,
				    reserve_info_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resv_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.last_update  = update_time;
	req_msg.msg_type = REQUEST_RESERVATION_INFO;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_RESERVATION_INFO:
		*resp = (reserve_info_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
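/*
 * Usage sketch (hedged): dump all reservation names; record_count and
 * reservation_array are the public reserve_info_msg_t fields.
 */
static void _example_print_reservations(void)
{
	reserve_info_msg_t *resv = NULL;
	uint32_t i;

	if (slurm_load_reservations((time_t) 0, &resv)) {
		slurm_perror("slurm_load_reservations");
		return;
	}
	for (i = 0; i < resv->record_count; i++)
		printf("%s\n", resv->reservation_array[i].name);
	slurm_free_reservation_info_msg(resv);
}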
/*
 * slurm_load_node2 - equivalent to slurm_load_node() with addition of a
 *	cluster record for communications in a federation
 */
extern int slurm_load_node2(time_t update_time, node_info_msg_t **resp,
			    uint16_t show_flags,
			    slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg;
	node_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data     = &req;

	return _load_cluster_nodes(&req_msg, resp, cluster, show_flags);
}
/*
 * slurm_load_node - issue RPC to get all Slurm node configuration information
 *	if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a node configuration pointer
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node(time_t update_time, node_info_msg_t **resp,
			   uint16_t show_flags)
{
	slurm_msg_t req_msg;
	node_info_request_msg_t req;
	char *cluster_name = NULL;
	void *ptr = NULL;
	slurmdb_federation_rec_t *fed;
	int rc;

	if (working_cluster_rec)
		cluster_name = xstrdup(working_cluster_rec->name);
	else
		cluster_name = slurm_get_cluster_name();

	if ((show_flags & SHOW_FEDERATION) && !(show_flags & SHOW_LOCAL) &&
	    (slurm_load_federation(&ptr) == SLURM_SUCCESS) &&
	    cluster_in_federation(ptr, cluster_name)) {
		/* In federation. Need full info from all clusters */
		update_time = (time_t) 0;
		show_flags &= (~SHOW_LOCAL);
	} else {
		/* Report local cluster info only */
		show_flags |= SHOW_LOCAL;
		show_flags &= (~SHOW_FEDERATION);
	}

	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data     = &req;

	if ((show_flags & SHOW_FEDERATION) && ptr) { /* "ptr" check for CLANG */
		fed = (slurmdb_federation_rec_t *) ptr;
		rc = _load_fed_nodes(&req_msg, resp, show_flags, cluster_name,
				     fed);
	} else {
		rc = _load_cluster_nodes(&req_msg, resp, working_cluster_rec,
					 show_flags);
	}

	if (ptr)
		slurm_destroy_federation_rec(ptr);
	xfree(cluster_name);

	return rc;
}
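/*
 * Usage sketch (hedged): the incremental-update pattern that the
 * last_update argument enables. Passing the previous response's
 * last_update lets the controller answer with SLURM_NO_CHANGE_IN_DATA
 * instead of resending the whole node table; old_msg/new_msg and the
 * caching policy are illustrative choices, not part of the API.
 */
static node_info_msg_t *_example_refresh_nodes(node_info_msg_t *old_msg)
{
	node_info_msg_t *new_msg = NULL;
	time_t since = old_msg ? old_msg->last_update : (time_t) 0;

	if (slurm_load_node(since, &new_msg, SHOW_ALL) == SLURM_SUCCESS) {
		slurm_free_node_info_msg(old_msg);
		return new_msg;		/* fresh data */
	}
	if (slurm_get_errno() == SLURM_NO_CHANGE_IN_DATA)
		return old_msg;		/* cached copy is still current */
	slurm_perror("slurm_load_node");
	return old_msg;
}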
/*
 * slurm_load_node_single2 - equivalent to slurm_load_node_single() with
 *	addition of a cluster record for communications in a federation
 */
extern int slurm_load_node_single2(node_info_msg_t **resp, char *node_name,
				   uint16_t show_flags,
				   slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg;
	node_info_single_msg_t req;

	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.node_name    = node_name;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	req_msg.data     = &req;

	return _load_cluster_nodes(&req_msg, resp, cluster, show_flags);
}
/* Transmit PMI Keyval space data */
int slurm_send_kvs_comm_set(struct kvs_comm_set *kvs_set_ptr,
			    int pmi_rank, int pmi_size)
{
	slurm_msg_t msg_send;
	int rc, retries = 0, timeout = 0;

	if (kvs_set_ptr == NULL)
		return EINVAL;
	if ((rc = _get_addr()) != SLURM_SUCCESS)
		return rc;

	_set_pmi_time();

	slurm_msg_t_init(&msg_send);
	msg_send.address  = srun_addr;
	msg_send.msg_type = PMI_KVS_PUT_REQ;
	msg_send.data     = (void *) kvs_set_ptr;

	/* Send the RPC to the local srun communication manager.
	 * Since the srun can be sent thousands of messages at
	 * the same time and refuse some connections, retry as
	 * needed. Spread out messages by the task's rank, and
	 * increase the timeout (default 10 secs) with the task
	 * count, since a heavily loaded srun responds slowly. */
	_delay_rpc(pmi_rank, pmi_size);
	if (pmi_size > 4000)		/* 240 secs */
		timeout = slurm_get_msg_timeout() * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_get_msg_timeout() * 12000;
	else if (pmi_size > 100)	/* 50 secs */
		timeout = slurm_get_msg_timeout() * 5000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_get_msg_timeout() * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_send_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("send_kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}

	return rc;
}
/*
 * slurm_requeue - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id - job on which to perform operation
 * RET 0 or a slurm error code
 */
extern int slurm_requeue (uint32_t job_id)
{
	int rc;
	job_id_msg_t requeue_req;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	requeue_req.job_id = job_id;
	req_msg.msg_type   = REQUEST_JOB_REQUEUE;
	req_msg.data       = &requeue_req;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	slurm_seterrno(rc);
	return rc;
}
/*
 * slurm_load_partitions2 - equivalent to slurm_load_partitions() with
 *	addition of a cluster record for communications in a federation
 */
extern int slurm_load_partitions2(time_t update_time,
				  partition_info_msg_t **resp,
				  uint16_t show_flags,
				  slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg;
	part_info_request_msg_t req;

	slurm_msg_t_init(&req_msg);
	req.last_update  = update_time;
	req.show_flags   = show_flags;
	req_msg.msg_type = REQUEST_PARTITION_INFO;
	req_msg.data     = &req;

	return _load_cluster_parts(&req_msg, resp, cluster);
}
/* _acct_kill_step() issue RPC to kill a slurm job step */
static void _acct_kill_step(void)
{
	slurm_msg_t msg;
	job_step_kill_msg_t req;

	slurm_msg_t_init(&msg);

	/*
	 * Request message:
	 */
	req.job_id      = jobacct_job_id;
	req.job_step_id = jobacct_step_id;
	req.signal      = SIGKILL;
	req.batch_flag  = 0;
	msg.msg_type    = REQUEST_CANCEL_JOB_STEP;
	msg.data        = &req;

	slurm_send_only_controller_msg(&msg);
}
/*
 * Move the specified job ID to the top of the queue for a given user ID,
 *	partition, account, and QOS.
 * IN job_id_str - a job id
 * RET 0 or -1 on error
 */
extern int slurm_top_job(char *job_id_str)
{
	int rc = SLURM_SUCCESS;
	top_job_msg_t top_job_req;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	top_job_req.job_id_str = job_id_str;
	req_msg.msg_type       = REQUEST_TOP_JOB;
	req_msg.data           = &top_job_req;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	slurm_seterrno(rc);
	return rc;
}
/* _slurm_update - issue RPC for all update requests */
static int _slurm_update (void *data, slurm_msg_type_t msg_type)
{
	int rc;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = msg_type;
	req_msg.data     = data;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	if (rc != SLURM_SUCCESS)
		slurm_seterrno_ret(rc);

	return SLURM_PROTOCOL_SUCCESS;
}
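/*
 * Sketch (hedged): how the public update calls are expected to wrap
 * _slurm_update - each simply pairs its message struct with the matching
 * RPC type. This mirrors the slurm_update_node wrapper in the Slurm API.
 */
extern int slurm_update_node (update_node_msg_t *node_msg)
{
	return _slurm_update ((void *) node_msg, REQUEST_UPDATE_NODE);
}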
/*
 * _suspend_op - perform a suspend/resume operation on a job
 * IN op - operation to perform
 * IN job_id - job on which to perform operation
 * RET 0 or a slurm error code
 */
static int _suspend_op (uint16_t op, uint32_t job_id)
{
	int rc;
	suspend_msg_t sus_req;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	sus_req.op       = op;
	sus_req.job_id   = job_id;
	req_msg.msg_type = REQUEST_SUSPEND;
	req_msg.data     = &sus_req;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	slurm_seterrno(rc);
	return rc;
}
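/*
 * Sketch (hedged): the public suspend/resume entry points are thin
 * wrappers over _suspend_op; SUSPEND_JOB and RESUME_JOB are the
 * operation codes defined in slurm.h.
 */
extern int slurm_suspend (uint32_t job_id)
{
	return _suspend_op (SUSPEND_JOB, job_id);
}

extern int slurm_resume (uint32_t job_id)
{
	return _suspend_op (RESUME_JOB, job_id);
}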
/*
 * slurm_shutdown - issue RPC to have the Slurm controller (slurmctld)
 *	cease operations; both the primary and backup controller
 *	are shutdown
 * IN options - 0: all slurm daemons are shutdown
 *              1: slurmctld generates a core file
 *              2: only the slurmctld is shutdown (no core file)
 * RET 0 or a slurm error code
 */
int slurm_shutdown (uint16_t options)
{
	slurm_msg_t req_msg;
	shutdown_msg_t shutdown_msg;

	slurm_msg_t_init(&req_msg);
	shutdown_msg.options = options;
	req_msg.msg_type     = REQUEST_SHUTDOWN;
	req_msg.data         = &shutdown_msg;

	/*
	 * Explicitly send the message to both the primary
	 * and backup controllers
	 */
	(void) _send_message_controller(SECONDARY_CONTROLLER, &req_msg);
	return _send_message_controller(PRIMARY_CONTROLLER, &req_msg);
}
/*
 * slurm_reconfigure - issue RPC to have the Slurm controller (slurmctld)
 *	reload its configuration file
 * RET 0 or a slurm error code
 */
int slurm_reconfigure (void)
{
	int rc;
	slurm_msg_t req;

	slurm_msg_t_init(&req);
	req.msg_type = REQUEST_RECONFIGURE;

	if (slurm_send_recv_controller_rc_msg(&req, &rc) < 0)
		return SLURM_ERROR;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_PROTOCOL_SUCCESS;
}
extern int slurm_persist_msg_unpack(slurm_persist_conn_t *persist_conn,
				    persist_msg_t *resp_msg, Buf buffer)
{
	int rc;

	xassert(persist_conn);
	xassert(resp_msg);

	if (persist_conn->flags & PERSIST_FLAG_DBD) {
		rc = unpack_slurmdbd_msg((slurmdbd_msg_t *) resp_msg,
					 persist_conn->version, buffer);
	} else {
		slurm_msg_t msg;

		slurm_msg_t_init(&msg);
		msg.protocol_version = persist_conn->version;

		safe_unpack16(&msg.msg_type, buffer);

		rc = unpack_msg(&msg, buffer);

		resp_msg->msg_type = msg.msg_type;
		resp_msg->data     = msg.data;
	}

	/* Transfer the auth_cred to the persist_conn in case we later need
	 * it to verify messages on this connection that do not carry one of
	 * their own. */
	if (resp_msg->msg_type == REQUEST_PERSIST_INIT) {
		slurm_msg_t *msg = resp_msg->data;
		if (persist_conn->auth_cred)
			g_slurm_auth_destroy(persist_conn->auth_cred);
		persist_conn->auth_cred = msg->auth_cred;
		msg->auth_cred = NULL;
	}

	return rc;

unpack_error:
	return SLURM_ERROR;
}
/* Accept an RPC from slurmctld and process it.
 * IN listen_fd: file descriptor for slurmctld communications
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, 0 otherwise */
static int _accept_msg_connection(int listen_fd,
				  resource_allocation_response_msg_t **resp)
{
	int conn_fd;
	slurm_msg_t *msg = NULL;
	slurm_addr_t cli_addr;
	char host[256];
	uint16_t port;
	int rc = 0;

	conn_fd = slurm_accept_msg_conn(listen_fd, &cli_addr);
	if (conn_fd < 0) {
		error("Unable to accept connection: %m");
		return rc;
	}

	slurm_get_addr(&cli_addr, &port, host, sizeof(host));
	debug2("got message connection from %s:%hu", host, port);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);

	if ((rc = slurm_receive_msg(conn_fd, msg, 0)) != 0) {
		slurm_free_msg(msg);
		if (errno == EINTR) {
			slurm_close_accepted_conn(conn_fd);
			*resp = NULL;
			return 0;
		}
		error("_accept_msg_connection[%s]: %m", host);
		slurm_close_accepted_conn(conn_fd);
		return SLURM_ERROR;
	}

	rc = _handle_msg(msg, resp); /* handle_msg frees msg */
	slurm_free_msg(msg);

	slurm_close_accepted_conn(conn_fd);
	return rc;
}
/*
 * Tell the primary controller to relinquish control; the primary
 * control machine must then suspend operation.
 * Based on _shutdown_backup_controller from controller.c
 * wait_time - How long to wait for the primary controller to write state,
 *	in seconds
 * RET 0 or an error code
 * NOTE: READ lock_slurmctld config before entry (or be single-threaded)
 */
static int _shutdown_primary_controller(int wait_time)
{
	int rc;
	slurm_msg_t req;

	slurm_msg_t_init(&req);
	if ((slurmctld_conf.control_addr == NULL) ||
	    (slurmctld_conf.control_addr[0] == '\0')) {
		error("_shutdown_primary_controller: "
		      "no primary controller to shutdown");
		return SLURM_ERROR;
	}

	slurm_set_addr(&req.address, slurmctld_conf.slurmctld_port,
		       slurmctld_conf.control_addr);

	/* send request message */
	req.msg_type = REQUEST_CONTROL;

	if (slurm_send_recv_rc_msg_only_one(&req, &rc,
					    (CONTROL_TIMEOUT * 1000)) < 0) {
		error("_shutdown_primary_controller:send/recv: %m");
		return SLURM_ERROR;
	}
	if (rc == ESLURM_DISABLED)
		debug("primary controller responding");
	else if (rc == 0) {
		debug("primary controller has relinquished control");
	} else {
		error("_shutdown_primary_controller: %s", slurm_strerror(rc));
		return SLURM_ERROR;
	}

	/* FIXME: Ideally the REQUEST_CONTROL RPC does not return until all
	 * other activity has ceased and the state has been saved. That is
	 * not presently the case (it returns when no other work is pending,
	 * so the state save should occur right away). We sleep for a while
	 * here and give the primary controller time to shutdown */
	if (wait_time)
		sleep(wait_time);

	return SLURM_SUCCESS;
}
/*
 * slurm_requeue - re-queue a batch job, if already running
 *	then terminate it first
 * IN job_id - job on which to perform operation
 * IN state - state in which to place the job
 * RET 0 or a slurm error code
 */
extern int slurm_requeue(uint32_t job_id, uint32_t state)
{
	int rc = SLURM_SUCCESS;
	requeue_msg_t requeue_req;
	slurm_msg_t req_msg;

	slurm_msg_t_init(&req_msg);
	requeue_req.job_id     = job_id;
	requeue_req.job_id_str = NULL;
	requeue_req.state      = state;
	req_msg.msg_type       = REQUEST_JOB_REQUEUE;
	req_msg.data           = &requeue_req;

	if (slurm_send_recv_controller_rc_msg(&req_msg, &rc) < 0)
		return SLURM_ERROR;

	slurm_seterrno(rc);
	return rc;
}
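/*
 * Usage sketch (hedged): requeue a job and leave it held until released
 * by an operator, as "scontrol requeuehold" does; JOB_REQUEUE_HOLD is
 * the state flag defined in slurm.h.
 */
static void _example_requeue_hold(uint32_t job_id)
{
	if (slurm_requeue(job_id, JOB_REQUEUE_HOLD))
		slurm_perror("slurm_requeue");
}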
/*
 * slurm_ping - issue RPC to have the Slurm controller (slurmctld)
 *	respond to a ping
 * IN primary - 1==primary controller, 2==secondary controller
 * RET 0 or a slurm error code
 */
int slurm_ping (int primary)
{
	int rc;
	slurm_msg_t request_msg;

	slurm_msg_t_init(&request_msg);
	request_msg.msg_type = REQUEST_PING;

	if (primary == 1)
		rc = _send_message_controller(PRIMARY_CONTROLLER,
					      &request_msg);
	else if (primary == 2)
		rc = _send_message_controller(SECONDARY_CONTROLLER,
					      &request_msg);
	else
		rc = SLURM_ERROR;

	return rc;
}
static int _drain_node(char *reason)
{
	slurm_msg_t req_msg;
	update_node_msg_t update_node_msg;

	memset(&update_node_msg, 0, sizeof(update_node_msg_t));
	update_node_msg.node_names = conf->node_name;
	update_node_msg.node_state = NODE_STATE_DRAIN;
	update_node_msg.reason     = reason;
	update_node_msg.reason_uid = getuid();
	update_node_msg.weight     = NO_VAL;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_UPDATE_NODE;
	req_msg.data     = &update_node_msg;

	if (slurm_send_only_controller_msg(&req_msg) < 0)
		return SLURM_ERROR;

	return SLURM_SUCCESS;
}
static int _send_to_stepds(hostlist_t hl, const char *addr, uint32_t len,
			   char *data)
{
	List ret_list = NULL;
	int temp_rc = 0, rc = 0;
	ret_data_info_t *ret_data_info = NULL;
	slurm_msg_t *msg = xmalloc(sizeof(slurm_msg_t));
	forward_data_msg_t req;
	char *nodelist = NULL;

	slurm_msg_t_init(msg);

	req.address = xstrdup(addr);
	req.len     = len;
	req.data    = data;

	msg->msg_type = REQUEST_FORWARD_DATA;
	msg->data     = &req;

	nodelist = hostlist_ranged_string_xmalloc(hl);

	if ((ret_list = slurm_send_recv_msgs(nodelist, msg, 0, false))) {
		while ((ret_data_info = list_pop(ret_list))) {
			temp_rc = slurm_get_return_code(ret_data_info->type,
							ret_data_info->data);
			if (temp_rc) {
				rc = temp_rc;
			} else {
				hostlist_delete_host(hl,
						     ret_data_info->node_name);
			}
		}
	} else {
		error("_send_to_stepds: no list was returned");
		rc = SLURM_ERROR;
	}

	slurm_free_msg(msg);
	xfree(nodelist);
	xfree(req.address);

	return rc;
}
extern void msg_aggr_add_comp(Buf buffer, void *auth_cred, header_t *header)
{
	slurm_msg_t *msg;

	if (!msg_collection.running)
		return;

	msg = xmalloc_nz(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);
	msg->protocol_version = header->version;
	msg->msg_type         = header->msg_type;
	msg->flags            = header->flags;
	msg->auth_cred        = auth_cred;
	msg->data             = buffer;
	msg->data_size        = remaining_buf(buffer);

	msg_aggr_add_msg(msg, 0, NULL);
}
/* Issue the RPC to transfer the file's data */
static int _file_bcast(struct bcast_parameters *params,
		       file_bcast_msg_t *bcast_msg,
		       job_sbcast_cred_msg_t *sbcast_cred)
{
	List ret_list = NULL;
	ListIterator itr;
	ret_data_info_t *ret_data_info = NULL;
	int rc = 0, msg_rc;
	slurm_msg_t msg;

	slurm_msg_t_init(&msg);
	msg.data     = bcast_msg;
	msg.msg_type = REQUEST_FILE_BCAST;

	ret_list = slurm_send_recv_msgs(sbcast_cred->node_list, &msg,
					params->timeout, true);
	if (ret_list == NULL) {
		error("slurm_send_recv_msgs: %m");
		exit(1);
	}

	itr = list_iterator_create(ret_list);
	while ((ret_data_info = list_next(itr))) {
		msg_rc = slurm_get_return_code(ret_data_info->type,
					       ret_data_info->data);
		if (msg_rc == SLURM_SUCCESS)
			continue;

		error("REQUEST_FILE_BCAST(%s): %s",
		      ret_data_info->node_name, slurm_strerror(msg_rc));
		rc = MAX(rc, msg_rc);
	}
	list_iterator_destroy(itr);
	FREE_NULL_LIST(ret_list);

	return rc;
}
/*
 * slurm_clear_trigger - Clear (remove) an existing event trigger
 * RET 0 or a slurm error code
 */
extern int slurm_clear_trigger (trigger_info_t *trigger_clear)
{
	int rc;
	slurm_msg_t msg;
	trigger_info_msg_t req;

	slurm_msg_t_init(&msg);

	/*
	 * Request message:
	 */
	req.record_count  = 1;
	req.trigger_array = trigger_clear;
	msg.msg_type      = REQUEST_TRIGGER_CLEAR;
	msg.data          = &req;

	if (slurm_send_recv_controller_rc_msg(&msg, &rc) < 0)
		return SLURM_FAILURE;

	if (rc)
		slurm_seterrno_ret(rc);

	return SLURM_SUCCESS;
}
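/*
 * Usage sketch (hedged): clear a trigger by its ID, as "strigger --clear"
 * does; only trig_id needs to be set in trigger_info_t for an ID-based
 * clear, with the rest of the struct zeroed.
 */
static void _example_clear_trigger(uint32_t trig_id)
{
	trigger_info_t ti;

	memset(&ti, 0, sizeof(trigger_info_t));
	ti.trig_id = trig_id;
	if (slurm_clear_trigger(&ti))
		slurm_perror("slurm_clear_trigger");
}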