/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN req - description of resource allocation request
 * OUT resp - response to request; NULL if controller replied with only a
 *	return code
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 * NOTE: req may be temporarily modified (alloc_sid, alloc_node) but alloc_node
 *	is restored to NULL before returning when set here.
 */
int slurm_allocate_resources (job_desc_msg_t *req,
			      resource_allocation_response_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	bool host_set = false;	/* true iff alloc_node points at our stack buf */
	char host[64];		/* short hostname lives on the stack */

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if ( (req->alloc_node == NULL)
	    && (gethostname_short(host, sizeof(host)) == 0) ) {
		req->alloc_node = host;
		host_set = true;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	/*
	 * Clear this hostname if set internally to this function
	 * (memory is on the stack), must happen before any return path.
	 */
	if (host_set)
		req->alloc_node = NULL;

	if (rc == SLURM_SOCKET_ERROR)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		/* _handle_rc_msg() sets errno from the embedded return code */
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		/* ownership of resp_msg.data transfers to the caller */
		*resp = (resource_allocation_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_update_job2 - issue RPC to update a job's configuration per request,
 *	only usable by user root or (for some parameters) the job's owner
 * IN job_msg - description of job updates
 * OUT resp - per task response to the request,
 *	free using slurm_free_job_array_resp()
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: may be rerouted to another cluster's controller; the global
 *	working_cluster_rec is temporarily repointed and always restored
 *	before returning.
 */
extern int slurm_update_job2 (job_desc_msg_t * job_msg,
			      job_array_resp_msg_t **resp)
{
	int rc = SLURM_SUCCESS;
	slurm_msg_t req_msg, resp_msg;
	/* remember the caller's cluster so we can restore it on exit */
	slurmdb_cluster_rec_t *save_working_cluster_rec = working_cluster_rec;

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_UPDATE_JOB;
	req_msg.data = job_msg;

tryagain:
	slurm_msg_t_init(&resp_msg);
	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_REROUTE_MSG:
	{
		reroute_msg_t *rr_msg = (reroute_msg_t *)resp_msg.data;

		/* Don't expect multiple hops but in the case it does
		 * happen, free the previous rr cluster_rec. */
		if (working_cluster_rec &&
		    working_cluster_rec != save_working_cluster_rec)
			slurmdb_destroy_cluster_rec(
				working_cluster_rec);

		/* take ownership of the cluster record from the message */
		working_cluster_rec = rr_msg->working_cluster_rec;
		slurmdb_setup_cluster_rec(working_cluster_rec);
		rr_msg->working_cluster_rec = NULL;
		goto tryagain;
	}
	case RESPONSE_JOB_ARRAY_ERRORS:
		/* ownership of resp_msg.data transfers to the caller */
		*resp = (job_array_resp_msg_t *) resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno(rc);
		break;
	default:
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
	}

	/* restore the original cluster and free any rerouted record */
	if (working_cluster_rec != save_working_cluster_rec) {
		slurmdb_destroy_cluster_rec(working_cluster_rec);
		working_cluster_rec = save_working_cluster_rec;
	}
	return rc;
}
/*
 * slurm_checkpoint_error - gather error information for the last checkpoint
 *	operation for some job step
 * IN job_id - job on which to perform operation
 * IN step_id - job step on which to perform operation
 * OUT error_code - error number associated with the last checkpoint operation,
 *	this value is dependent upon the checkpoint plugin used and may be
 *	completely unrelated to slurm error codes, the highest value for all
 *	complete calls is preserved
 * OUT error_msg - error message, preserved for highest error_code, value
 *	must be freed by the caller (with free(), it comes from strdup())
 *	to prevent memory leak
 * RET 0 or a slurm error code
 * NOTE(review): returns raw EINVAL on bad arguments, unlike siblings which
 *	use slurm_seterrno_ret(); preserved for compatibility.
 */
extern int slurm_checkpoint_error (uint32_t job_id, uint32_t step_id,
				   uint32_t *error_code, char **error_msg)
{
	int rc;
	slurm_msg_t msg;
	checkpoint_msg_t req;
	slurm_msg_t resp_msg;
	checkpoint_resp_msg_t *ckpt_resp;

	if ((error_code == NULL) || (error_msg == NULL))
		return EINVAL;

	/*
	 * Request message:
	 */
	req.op = CHECK_ERROR;
	req.job_id = job_id;
	req.step_id = step_id;
	req.image_dir = NULL;
	slurm_msg_t_init(&msg);
	slurm_msg_t_init(&resp_msg);
	msg.msg_type = REQUEST_CHECKPOINT;
	msg.data = &req;

	rc = slurm_send_recv_controller_msg(&msg, &resp_msg,
					    working_cluster_rec);
	if (rc == SLURM_SOCKET_ERROR)
		return rc;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		/* no checkpoint details available; provide empty message */
		*error_code = 0;
		*error_msg = strdup("");
		rc = _handle_rc_msg(&resp_msg);
		break;
	case RESPONSE_CHECKPOINT:
		ckpt_resp = (checkpoint_resp_msg_t *) resp_msg.data;
		*error_code = ckpt_resp->error_code;
		if (ckpt_resp->error_msg)
			*error_msg = strdup(ckpt_resp->error_msg);
		else
			*error_msg = strdup("");
		slurm_free_checkpoint_resp_msg(ckpt_resp);
		rc = SLURM_SUCCESS;
		break;
	default:
		rc = SLURM_UNEXPECTED_MSG_ERROR;
	}
	return rc;
}
/*
 * slurm_submit_batch_pack_job - issue RPC to submit a heterogeneous job for
 *	later execution
 * NOTE: free the response using slurm_free_submit_response_response_msg
 * IN job_req_list - List of resource allocation requests, type job_desc_msg_t
 * OUT resp - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int slurm_submit_batch_pack_job(List job_req_list,
				       submit_response_msg_t **resp)
{
	int rc;
	job_desc_msg_t *req;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	char *local_hostname = NULL;
	ListIterator iter;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set Node and session id for this request
	 */
	local_hostname = xshort_hostname();
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		if (req->alloc_sid == NO_VAL)
			req->alloc_sid = getsid(0);
		if (!req->alloc_node)
			req->alloc_node = local_hostname;
	}
	list_iterator_destroy(iter);

	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB_PACK;
	req_msg.data = job_req_list;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	/*
	 * BUGFIX: clear any alloc_node fields that were pointed at our
	 * locally allocated hostname before freeing it, so the caller's
	 * job descriptors are not left holding dangling pointers.
	 * (Mirrors the cleanup done in slurm_allocate_pack_job_blocking().)
	 */
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		if (req->alloc_node == local_hostname)
			req->alloc_node = NULL;
	}
	list_iterator_destroy(iter);
	xfree(local_hostname);

	if (rc == SLURM_ERROR)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	case RESPONSE_SUBMIT_BATCH_JOB:
		/* ownership of resp_msg.data transfers to the caller */
		*resp = (submit_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}
	return SLURM_SUCCESS;
}
/*
 * slurm_submit_batch_job - issue RPC to submit a job for later execution
 * NOTE: free the response using slurm_free_submit_response_response_msg
 * IN req - description of batch job request
 * OUT resp - response to request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int slurm_submit_batch_job(job_desc_msg_t *req,
				  submit_response_msg_t **resp)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	char *local_hostname = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);
	if (req->alloc_node == NULL) {
		local_hostname = xshort_hostname();
		req->alloc_node = local_hostname;
	}

	req_msg.msg_type = REQUEST_SUBMIT_BATCH_JOB;
	req_msg.data = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	/*
	 * BUGFIX: if we pointed alloc_node at our locally allocated hostname,
	 * clear it before freeing so the caller's descriptor is not left
	 * holding a dangling pointer (same pattern as the stack-hostname
	 * cleanup in slurm_allocate_resources()).
	 */
	if (req->alloc_node == local_hostname)
		req->alloc_node = NULL;
	xfree(local_hostname);

	if (rc == SLURM_ERROR)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		if (rc)
			slurm_seterrno_ret(rc);
		*resp = NULL;
		break;
	case RESPONSE_SUBMIT_BATCH_JOB:
		/* ownership of resp_msg.data transfers to the caller */
		*resp = (submit_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}
	return SLURM_SUCCESS;
}
/* slurm_load_licenses()
 *
 * Ask the controller for license information newer than time t.
 * On success *lic_info receives the reply (caller frees); returns
 * SLURM_PROTOCOL_SUCCESS, or a negative value with errno set on failure.
 */
extern int slurm_load_licenses(time_t t,
			       license_info_msg_t **lic_info,
			       uint16_t show_flags)
{
	struct license_info_request_msg args;
	slurm_msg_t request;
	slurm_msg_t reply;
	int rc;

	memset(&args, 0, sizeof(struct license_info_request_msg));
	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);

	args.last_update = t;
	args.show_flags = show_flags;
	request.msg_type = REQUEST_LICENSE_INFO;
	request.data = &args;

	rc = slurm_send_recv_controller_msg(&request, &reply,
					    working_cluster_rec);
	if (rc < 0)
		return SLURM_ERROR;

	if (reply.msg_type == RESPONSE_LICENSE_INFO) {
		/* caller takes ownership of the reply payload */
		*lic_info = reply.data;
		return SLURM_PROTOCOL_SUCCESS;
	}

	if (reply.msg_type == RESPONSE_SLURM_RC) {
		rc = ((return_code_msg_t *) reply.data)->return_code;
		slurm_free_return_code_msg(reply.data);
		if (rc)
			/* slurm_seterrno_ret() is a macro ... sigh */
			slurm_seterrno(rc);
		*lic_info = NULL;
		return -1;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_job_step_create - create a job step for a given job id
 * IN req - description of job step request
 * OUT resp - response to request; NULL if only a return code came back
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_create_response_msg
 * NOTE: retries indefinitely while the controller reports EAGAIN.
 */
int slurm_job_step_create (job_step_create_request_msg_t *req,
			   job_step_create_response_msg_t **resp)
{
	slurm_msg_t req_msg, resp_msg;
	/* delay is assigned on the first EAGAIN (retry == 0) before any
	 * sleep(), so it is never read uninitialized */
	int delay, rc, retry = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_STEP_CREATE;
	req_msg.data = req;

re_send:
	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = _handle_rc_msg(&resp_msg);
		if ((rc < 0) && (errno == EAGAIN)) {
			/* controller busy: back off and resend */
			if (retry++ == 0) {
				verbose("Slurm is busy, step creation delayed");
				delay = (getpid() % 10) + 10;	/* 10-19 secs */
			}
			sleep(delay);
			goto re_send;
		}
		if (rc < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
		break;
	case RESPONSE_JOB_STEP_CREATE:
		/* ownership of resp_msg.data transfers to the caller */
		*resp = (job_step_create_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS ;
}
/*
 * slurm_load_block_info - fetch the node select plugin's block state from
 *	the controller if it changed since update_time
 * IN update_time - time of current configuration data
 * OUT block_info_msg_pptr - where to store the reply (NULL on RC reply)
 * IN show_flags - controls output form or filtering, see SHOW_FLAGS in slurm.h
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_block_info_msg
 */
extern int slurm_load_block_info (time_t update_time,
				  block_info_msg_t **block_info_msg_pptr,
				  uint16_t show_flags)
{
	block_info_request_msg_t args;
	slurm_msg_t request, response;
	int err;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	args.last_update = update_time;
	args.show_flags = show_flags;
	request.msg_type = REQUEST_BLOCK_INFO;
	request.data = &args;

	if (slurm_send_recv_controller_msg(&request, &response,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_BLOCK_INFO) {
		/* caller takes ownership of the reply payload */
		*block_info_msg_pptr = (block_info_msg_t *) response.data;
		return SLURM_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		err = ((return_code_msg_t *) response.data)->return_code;
		slurm_free_return_code_msg(response.data);
		if (err)
			slurm_seterrno_ret(err);
		*block_info_msg_pptr = NULL;
		return SLURM_SUCCESS;
	}

	*block_info_msg_pptr = NULL;
	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN req - description of resource allocation request
 * OUT resp - response to request; NULL when only a return code came back
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int slurm_allocate_resources (job_desc_msg_t *req,
			      resource_allocation_response_msg_t **resp)
{
	slurm_msg_t request, response;
	int rc;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	/* Fill in the session id if the caller left it unset. */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	request.msg_type = REQUEST_RESOURCE_ALLOCATION;
	request.data = req;

	rc = slurm_send_recv_controller_msg(&request, &response,
					    working_cluster_rec);
	if (rc == SLURM_ERROR)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_RESOURCE_ALLOCATION) {
		/* caller takes ownership of the reply payload */
		*resp = (resource_allocation_response_msg_t *) response.data;
		return SLURM_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		/* _handle_rc_msg() sets errno from the embedded code */
		if (_handle_rc_msg(&response) < 0)
			return SLURM_ERROR;
		*resp = NULL;
		return SLURM_SUCCESS;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_get_job_steps - fetch job step configuration info from the
 *	controller if changed since update_time. A job_id of NO_VAL means
 *	all jobs; a step_id of NO_VAL means all steps of the job.
 * IN update_time - time of current configuration data
 * IN job_id - specific job id, or NO_VAL for all jobs
 * IN step_id - specific job step id, or NO_VAL for all steps
 * OUT resp - where to store the reply (NULL on RC reply)
 * IN show_flags - job step filtering options
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_info_response_msg
 */
int slurm_get_job_steps (time_t update_time, uint32_t job_id,
			 uint32_t step_id,
			 job_step_info_response_msg_t **resp,
			 uint16_t show_flags)
{
	job_step_info_request_msg_t args;
	slurm_msg_t request, response;
	int err;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	args.last_update = update_time;
	args.job_id = job_id;
	args.step_id = step_id;
	args.show_flags = show_flags;
	request.msg_type = REQUEST_JOB_STEP_INFO;
	request.data = &args;

	if (slurm_send_recv_controller_msg(&request, &response) < 0)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_JOB_STEP_INFO) {
		/* caller takes ownership of the reply payload */
		*resp = (job_step_info_response_msg_t *) response.data;
		return SLURM_PROTOCOL_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		err = ((return_code_msg_t *) response.data)->return_code;
		slurm_free_return_code_msg(response.data);
		if (err)
			slurm_seterrno_ret(err);
		*resp = NULL;
		return SLURM_PROTOCOL_SUCCESS;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_load_partitions - fetch all partition configuration info from the
 *	controller if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - where to store the reply (NULL on RC reply)
 * IN show_flags - partition filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_partition_info_msg
 */
extern int slurm_load_partitions (time_t update_time,
				  partition_info_msg_t **resp,
				  uint16_t show_flags)
{
	part_info_request_msg_t args;
	slurm_msg_t request, response;
	int err;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	args.last_update = update_time;
	args.show_flags = show_flags;
	request.msg_type = REQUEST_PARTITION_INFO;
	request.data = &args;

	if (slurm_send_recv_controller_msg(&request, &response) < 0)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_PARTITION_INFO) {
		/* caller takes ownership of the reply payload */
		*resp = (partition_info_msg_t *) response.data;
		return SLURM_PROTOCOL_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		err = ((return_code_msg_t *) response.data)->return_code;
		slurm_free_return_code_msg(response.data);
		if (err)
			slurm_seterrno_ret(err);
		*resp = NULL;
		return SLURM_PROTOCOL_SUCCESS;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_load_node_single - fetch configuration information for one node
 * OUT resp - where to store the reply (NULL on RC reply)
 * IN node_name - name of the node for which information is requested
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node_single (node_info_msg_t **resp,
				   char *node_name, uint16_t show_flags)
{
	node_info_single_msg_t args;
	slurm_msg_t request, response;
	int err;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);

	args.node_name = node_name;
	args.show_flags = show_flags;
	request.msg_type = REQUEST_NODE_INFO_SINGLE;
	request.data = &args;

	if (slurm_send_recv_controller_msg(&request, &response) < 0)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_NODE_INFO) {
		/* caller takes ownership of the reply payload */
		*resp = (node_info_msg_t *) response.data;
		if (show_flags & SHOW_MIXED)
			_set_node_mixed(*resp);
		return SLURM_PROTOCOL_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		err = ((return_code_msg_t *) response.data)->return_code;
		slurm_free_return_code_msg(response.data);
		if (err)
			slurm_seterrno_ret(err);
		*resp = NULL;
		return SLURM_PROTOCOL_SUCCESS;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_allocation_lookup - retrieve info for an existing resource
 *	allocation without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information (NULL on RC reply)
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
				   resource_allocation_response_msg_t **info)
{
	job_alloc_info_msg_t args = {0};
	slurm_msg_t request, response;

	args.job_id = jobid;
	/* borrowed pointer; reset after the RPC so it cannot dangle */
	args.req_cluster = slurmctld_conf.cluster_name;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);
	request.msg_type = REQUEST_JOB_ALLOCATION_INFO;
	request.data = &args;

	if (slurm_send_recv_controller_msg(&request, &response,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;
	args.req_cluster = NULL;

	if (response.msg_type == RESPONSE_JOB_ALLOCATION_INFO) {
		/* caller takes ownership of the reply payload */
		*info = (resource_allocation_response_msg_t *) response.data;
		return SLURM_PROTOCOL_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		if (_handle_rc_msg(&response) < 0)
			return SLURM_ERROR;
		*info = NULL;
		return SLURM_PROTOCOL_SUCCESS;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/* * slurm_checkpoint_able - determine if the specified job step can presently * be checkpointed * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * OUT start_time - time at which checkpoint request was issued * RET 0 (can be checkpoined) or a slurm error code */ extern int slurm_checkpoint_able (uint32_t job_id, uint32_t step_id, time_t *start_time) { int rc; slurm_msg_t req_msg, resp_msg; checkpoint_msg_t ckp_req; checkpoint_resp_msg_t *resp; ckp_req.op = CHECK_ABLE; ckp_req.job_id = job_id; ckp_req.step_id = step_id; ckp_req.image_dir = NULL; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); req_msg.msg_type = REQUEST_CHECKPOINT; req_msg.data = &ckp_req; if (slurm_send_recv_controller_msg(&req_msg, &resp_msg, working_cluster_rec) < 0) return SLURM_ERROR; switch (resp_msg.msg_type) { case RESPONSE_CHECKPOINT: resp = (checkpoint_resp_msg_t *) resp_msg.data; *start_time = resp->event_time; slurm_free_checkpoint_resp_msg(resp_msg.data); rc = SLURM_SUCCESS; break; case RESPONSE_SLURM_RC: rc = _handle_rc_msg(&resp_msg); break; default: *start_time = (time_t) NULL; rc = SLURM_ERROR; } return rc; }
/* * slurm_job_node_ready - report if nodes are ready for job to execute now * IN job_id - slurm job id * RET: READY_* values as defined in slurm.h */ extern int slurm_job_node_ready(uint32_t job_id) { slurm_msg_t req, resp; job_id_msg_t msg; int rc; slurm_msg_t_init(&req); slurm_msg_t_init(&resp); bzero(&msg, sizeof(job_id_msg_t)); msg.job_id = job_id; req.msg_type = REQUEST_JOB_READY; req.data = &msg; if (slurm_send_recv_controller_msg(&req, &resp) < 0) return READY_JOB_ERROR; if (resp.msg_type == RESPONSE_JOB_READY) { rc = ((return_code_msg_t *) resp.data)->return_code; slurm_free_return_code_msg(resp.data); } else if (resp.msg_type == RESPONSE_SLURM_RC) { int job_rc = ((return_code_msg_t *) resp.data) -> return_code; if ((job_rc == ESLURM_INVALID_PARTITION_NAME) || (job_rc == ESLURM_INVALID_JOB_ID)) rc = READY_JOB_FATAL; else /* EAGAIN */ rc = READY_JOB_ERROR; slurm_free_return_code_msg(resp.data); } else if (resp.msg_type == RESPONSE_PROLOG_EXECUTING) { rc = READY_JOB_ERROR; } else { rc = READY_JOB_ERROR; } return rc; }
/*
 * slurm_load_job_user - issue RPC to get slurm information about all jobs
 *	to be run as the specified user
 * OUT job_info_msg_pptr - place to store a job configuration pointer
 *	(set to NULL when the controller replies with only a return code)
 * IN user_id - ID of user we want information for
 * IN show_flags - job filtering options
 * RET 0 or -1 on error
 * NOTE: free the response using slurm_free_job_info_msg
 */
extern int slurm_load_job_user (job_info_msg_t **job_info_msg_pptr,
				uint32_t user_id,
				uint16_t show_flags)
{
	int rc;
	slurm_msg_t resp_msg;
	slurm_msg_t req_msg;
	job_user_id_msg_t req;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req.show_flags = show_flags;
	req.user_id = user_id;
	req_msg.msg_type = REQUEST_JOB_USER_INFO;
	req_msg.data = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_JOB_INFO:
		/* ownership of resp_msg.data transfers to the caller */
		*job_info_msg_pptr = (job_info_msg_t *)resp_msg.data;
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		/*
		 * BUGFIX: on a zero return code we previously returned
		 * success without touching the output pointer, leaving it
		 * uninitialized in the caller; set it NULL like the other
		 * slurm_load_* functions do.
		 */
		*job_info_msg_pptr = NULL;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN job_id - job allocation identifier (or pack job ID)
 * IN pack_job_offset - pack job index (or NO_VAL if not pack job)
 * IN step_id - step allocation identifier (or NO_VAL for entire job)
 * OUT info - job allocation information including a credential for sbcast
 *	(NULL on RC reply)
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the "info" using slurm_free_sbcast_cred_msg
 */
extern int slurm_sbcast_lookup(uint32_t job_id, uint32_t pack_job_offset,
			       uint32_t step_id, job_sbcast_cred_msg_t **info)
{
	step_alloc_info_msg_t args;
	slurm_msg_t request, response;

	args.job_id = job_id;
	args.pack_job_offset = pack_job_offset;
	args.step_id = step_id;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&response);
	request.msg_type = REQUEST_JOB_SBCAST_CRED;
	request.data = &args;

	if (slurm_send_recv_controller_msg(&request, &response,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	if (response.msg_type == RESPONSE_JOB_SBCAST_CRED) {
		/* caller takes ownership of the reply payload */
		*info = (job_sbcast_cred_msg_t *) response.data;
		return SLURM_SUCCESS;
	}

	if (response.msg_type == RESPONSE_SLURM_RC) {
		if (_handle_rc_msg(&response) < 0)
			return SLURM_ERROR;
		*info = NULL;
		return SLURM_SUCCESS;
	}

	slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
}
/*
 * slurm_allocate_pack_job_blocking
 *	allocate resources for a list of job requests.  This call will block
 *	until the entire allocation is granted, or the specified timeout limit
 *	is reached.
 * IN job_req_list - List of resource allocation requests
 * IN timeout - amount of time, in seconds, to wait for a response before
 *	giving up.  A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *	the controller will put the job in the PENDING state.  If
 *	pending callback is not NULL, it will be called with the job_id
 *	of the pending job as the sole parameter.
 * RET List of allocation structures on success, NULL on error set errno to
 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *	with no allocation granted)
 * NOTE: free the response using list_destroy()
 */
List slurm_allocate_pack_job_blocking(List job_req_list, time_t timeout,
				      void(*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	List resp = NULL;
	char *local_hostname = NULL;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;
	ListIterator iter;
	bool immediate_flag = false;
	bool immediate_logged = false;
	uint32_t node_cnt = 0, job_id = 0;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/*
	 * set node name and session ID for this request
	 *
	 * NOTE(review): immediate_flag is still false here (it is only set
	 * inside the loop below), so this socket is always created, and
	 * local_hostname is still NULL when passed in -- looks intentional
	 * but worth confirming against _create_allocation_response_socket().
	 */
	if (!immediate_flag) {
		listen = _create_allocation_response_socket(local_hostname);
		if (listen == NULL)
			return NULL;
	}
	local_hostname = xshort_hostname();
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		if (req->alloc_sid == NO_VAL)
			req->alloc_sid = getsid(0);
		if (listen)
			req->alloc_resp_port = listen->port;
		if (!req->alloc_node) {
			if (local_hostname) {
				/* borrow our hostname; cleared before free */
				req->alloc_node = local_hostname;
			} else if (immediate_logged) {
				req->immediate = 1;
			} else {
				req->immediate = 1;
				error("Could not get local hostname, forcing "
				      "immediate allocation mode");
				immediate_logged = true;
			}
		}
		if (req->immediate)
			immediate_flag = true;
	}
	list_iterator_destroy(iter);

	req_msg.msg_type = REQUEST_JOB_PACK_ALLOCATION;
	req_msg.data = job_req_list;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_SOCKET_ERROR) {
		/* inner errnum intentionally shadows the outer accumulator */
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (listen)
			_destroy_allocation_response_socket(listen);
		/* un-borrow alloc_node before freeing the hostname */
		iter = list_iterator_create(job_req_list);
		while ((req = (job_desc_msg_t *)list_next(iter))) {
			if (req->alloc_node == local_hostname)
				req->alloc_node = NULL;
		}
		list_iterator_destroy(iter);
		xfree(local_hostname);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = SLURM_ERROR;
		}
		break;
	case RESPONSE_JOB_PACK_ALLOCATION:
		/* Yay, the controller has acknowledged our request!
		 * Test if we have an allocation yet? */
		resp = (List) resp_msg.data;
		_pack_alloc_test(resp, &node_cnt, &job_id);
		if (node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_PROTOCOL_SUCCESS;
		} else if (immediate_flag) {
			debug("Immediate allocation not granted");
		} else {
			/* no, we need to wait for a response */
			FREE_NULL_LIST(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
			_wait_for_allocation_response(job_id, listen,
						RESPONSE_JOB_PACK_ALLOCATION,
						timeout, (void **) &resp);
			/* If NULL, we didn't get the allocation in
			 * the time desired, so just free the job id */
			if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (listen)
		_destroy_allocation_response_socket(listen);
	/* un-borrow alloc_node before freeing the hostname */
	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *)list_next(iter))) {
		if (req->alloc_node == local_hostname)
			req->alloc_node = NULL;
	}
	list_iterator_destroy(iter);
	xfree(local_hostname);
	errno = errnum;
	return resp;
}
/*
 * We don't use the api here because it does things we aren't needing
 * like printing out information and not returning times.
 *
 * _job_will_run - ask the controller when/where req could run on the
 *	cluster identified by the global working_cluster_rec.
 * IN req - job description (treated as an implicit immediate request)
 * RET newly xmalloc'd local_cluster_rec_t (caller frees), or NULL on
 *	error with errno set.
 * NOTE: reads the globals working_cluster_rec and opt.verbose.
 */
local_cluster_rec_t *_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	int rc;
	char buf[64];		/* formatted start time */
	char *type = "processors";
	local_cluster_rec_t *local_cluster = NULL;

	/* req.immediate = true; implicit */
	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (rc < 0) {
		slurm_seterrno(SLURM_SOCKET_ERROR);
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno(rc);
		/* falls out of the switch and returns NULL local_cluster */
		break;
	case RESPONSE_JOB_WILL_RUN:
		/* on BlueGene clusters the unit is c-nodes, not processors */
		if (working_cluster_rec->flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		debug("Job %u to start at %s on cluster %s using %u %s on %s",
		      will_run_resp->job_id, buf, working_cluster_rec->name,
		      will_run_resp->proc_cnt, type,
		      will_run_resp->node_list);
		local_cluster = xmalloc(sizeof(local_cluster_rec_t));
		/* borrowed pointer to the global; not owned by the result */
		local_cluster->cluster_rec = working_cluster_rec;
		local_cluster->start_time = will_run_resp->start_time;
		if (will_run_resp->preemptee_job_id) {
			local_cluster->preempt_cnt =
				list_count(will_run_resp->preemptee_job_id);
			if (opt.verbose >= LOG_LEVEL_DEBUG) {
				ListIterator itr;
				uint32_t *job_id_ptr;
				char *job_list = NULL, *sep = "";
				itr = list_iterator_create(will_run_resp->
							   preemptee_job_id);
				while ((job_id_ptr = list_next(itr))) {
					if (job_list)
						sep = ",";
					xstrfmtcat(job_list, "%s%u", sep,
						   *job_id_ptr);
				}
				debug(" Preempts: %s", job_list);
				xfree(job_list);
			}
		}
		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		/* NOTE(review): resp_msg.data is not freed on this path --
		 * possible leak, confirm against the free helpers. */
		slurm_seterrno(SLURM_UNEXPECTED_MSG_ERROR);
		return NULL;
		break;
	}
	return local_cluster;
}
/*
 * slurm_allocate_resources_blocking
 *	allocate resources for a job request.  This call will block until
 *	the allocation is granted, or the specified timeout limit is reached.
 * IN user_req - description of resource allocation request
 * IN timeout - amount of time, in seconds, to wait for a response before
 *	giving up.  A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *	the controller will put the job in the PENDING state.  If
 *	pending callback is not NULL, it will be called with the job_id
 *	of the pending job as the sole parameter.
 * RET allocation structure on success, NULL on error set errno to
 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *	with no allocation granted)
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
resource_allocation_response_msg_t *
slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
				   time_t timeout,
				   void(*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resource_allocation_response_msg_t *resp = NULL;
	char *hostname = NULL;
	uint32_t job_id;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* make a copy of the user's job description struct so that we
	 * can make changes before contacting the controller
	 * (shallow copy: embedded pointers still reference user's data) */
	req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t));
	if (req == NULL)
		return NULL;
	memcpy(req, user_req, sizeof(job_desc_msg_t));

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	/* NOTE(review): the xstrdup()/xshort_hostname() result stored in
	 * req->alloc_node does not appear to be freed before xfree(req)
	 * below -- possible leak, confirm ownership. */
	if (user_req->alloc_node != NULL) {
		req->alloc_node = xstrdup(user_req->alloc_node);
	} else if ((hostname = xshort_hostname()) != NULL) {
		req->alloc_node = hostname;
	} else {
		error("Could not get local hostname,"
		      " forcing immediate allocation mode.");
		req->immediate = 1;
	}

	if (!req->immediate) {
		listen = _create_allocation_response_socket(hostname);
		if (listen == NULL) {
			xfree(req);
			return NULL;
		}
		req->alloc_resp_port = listen->port;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	if (rc == SLURM_SOCKET_ERROR) {
		/* inner errnum intentionally shadows the outer accumulator */
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (!req->immediate)
			_destroy_allocation_response_socket(listen);
		xfree(req);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = -1;
		}
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		/* Yay, the controller has acknowledged our request!
		   But did we really get an allocation yet? */
		resp = (resource_allocation_response_msg_t *) resp_msg.data;
		if (resp->node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_PROTOCOL_SUCCESS;
		} else if (!req->immediate) {
			if (resp->error_code != SLURM_SUCCESS)
				info("%s", slurm_strerror(resp->error_code));
			/* no, we need to wait for a response */
			job_id = resp->job_id;
			slurm_free_resource_allocation_response_msg(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
			resp = _wait_for_allocation_response(job_id, listen,
							     timeout);
			/* If NULL, we didn't get the allocation in
			   the time desired, so just free the job id */
			if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
		resp = NULL;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (!req->immediate)
		_destroy_allocation_response_socket(listen);
	xfree(req);
	errno = errnum;
	return resp;
}
/* * slurm_get_end_time - get the expected end time for a given slurm job * IN jobid - slurm job id * end_time_ptr - location in which to store scheduled end time for job * RET 0 or -1 on error */ extern int slurm_get_end_time(uint32_t jobid, time_t *end_time_ptr) { int rc; slurm_msg_t resp_msg; slurm_msg_t req_msg; job_alloc_info_msg_t job_msg; srun_timeout_msg_t *timeout_msg; time_t now = time(NULL); static uint32_t jobid_cache = 0; static uint32_t jobid_env = 0; static time_t endtime_cache = 0; static time_t last_test_time = 0; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); if (!end_time_ptr) slurm_seterrno_ret(EINVAL); if (jobid == 0) { if (jobid_env) { jobid = jobid_env; } else { char *env = getenv("SLURM_JOB_ID"); if (env) { jobid = (uint32_t) atol(env); jobid_env = jobid; } } if (jobid == 0) { slurm_seterrno(ESLURM_INVALID_JOB_ID); return SLURM_ERROR; } } /* Just use cached data if data less than 60 seconds old */ if ((jobid == jobid_cache) && (difftime(now, last_test_time) < 60)) { *end_time_ptr = endtime_cache; return SLURM_SUCCESS; } job_msg.job_id = jobid; req_msg.msg_type = REQUEST_JOB_END_TIME; req_msg.data = &job_msg; if (slurm_send_recv_controller_msg( &req_msg, &resp_msg) < 0) return SLURM_ERROR; switch (resp_msg.msg_type) { case SRUN_TIMEOUT: timeout_msg = (srun_timeout_msg_t *) resp_msg.data; last_test_time = time(NULL); jobid_cache = jobid; endtime_cache = timeout_msg->timeout; *end_time_ptr = endtime_cache; slurm_free_srun_timeout_msg(resp_msg.data); break; case RESPONSE_SLURM_RC: rc = ((return_code_msg_t *) resp_msg.data)->return_code; slurm_free_return_code_msg(resp_msg.data); if (endtime_cache) *end_time_ptr = endtime_cache; else if (rc) slurm_seterrno_ret(rc); break; default: if (endtime_cache) *end_time_ptr = endtime_cache; else slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR); break; } return SLURM_SUCCESS; }
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	char buf[64];
	bool host_set = false;
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";

	/* req.immediate = true; implicit */

	/* Temporarily point alloc_node at a stack buffer; it is cleared
	 * again below before this function returns. */
	if ((req->alloc_node == NULL) &&
	    (gethostname_short(buf, sizeof(buf)) == 0)) {
		req->alloc_node = buf;
		host_set = true;
	}

	slurm_msg_t_init(&req_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	/* Clear the hostname if it was set internally (memory is on the
	 * stack and must not escape this function). */
	if (host_set)
		req->alloc_node = NULL;

	if (rc < 0)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		break;
	case RESPONSE_JOB_WILL_RUN:
		if (cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s"
		     " on %s", will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep,
					   *job_id_ptr);
			}
			/* FIX: the iterator was previously leaked */
			list_iterator_destroy(itr);
			/* FIX: guard against an empty preemptee list, which
			 * would leave job_list NULL (undefined behavior when
			 * passed to "%s") */
			info("  Preempts: %s", job_list ? job_list : "");
			xfree(job_list);
		}
		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_allocate_resources_blocking
 *	allocate resources for a job request.  This call will block until
 *	the allocation is granted, or the specified timeout limit is reached.
 * IN req - description of resource allocation request
 * IN timeout - amount of time, in seconds, to wait for a response before
 *	giving up.
 *	A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *      the controller will put the job in the PENDING state.  If
 *	pending callback is not NULL, it will be called with the job_id
 *	of the pending job as the sole parameter.
 *
 * RET allocation structure on success, NULL on error set errno to
 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *      with no allocation granted)
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 * NOTE(review): an older definition of this same function appears earlier in
 *	this file -- confirm only one of the two is actually compiled.
 */
resource_allocation_response_msg_t *
slurm_allocate_resources_blocking (const job_desc_msg_t *user_req,
				   time_t timeout,
				   void(*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resource_allocation_response_msg_t *resp = NULL;
	uint32_t job_id;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;
	/* Distinguishes "job already completed" from a genuine timeout
	 * when the deferred wait returns NULL. */
	bool already_done = false;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* make a copy of the user's job description struct so that we
	 * can make changes before contacting the controller */
	req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t));
	if (req == NULL)
		return NULL;
	memcpy(req, user_req, sizeof(job_desc_msg_t));

	/*
	 * set Node and session id for this request
	 */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	/* Open the callback socket on which the controller will deliver
	 * the allocation once the job leaves the PENDING state. */
	if (!req->immediate) {
		listen = _create_allocation_response_socket();
		if (listen == NULL) {
			xfree(req);
			return NULL;
		}
		req->alloc_resp_port = listen->port;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_ERROR) {
		/* Save errno first: the cleanup calls below may clobber it. */
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (!req->immediate)
			_destroy_allocation_response_socket(listen);
		xfree(req);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = SLURM_ERROR;
		}
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		/* Yay, the controller has acknowledged our request!
		 * Test if we have an allocation yet? */
		resp = (resource_allocation_response_msg_t *) resp_msg.data;
		if (resp->node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_SUCCESS;
		} else if (!req->immediate) {
			if (resp->error_code != SLURM_SUCCESS)
				info("%s", slurm_strerror(resp->error_code));
			/* no, we need to wait for a response */

			/* print out any user messages before we wait. */
			print_multi_line_string(resp->job_submit_user_msg,
						-1, LOG_LEVEL_INFO);

			/* The acknowledgement message is freed here and
			 * replaced by the real allocation from the
			 * callback socket. */
			job_id = resp->job_id;
			slurm_free_resource_allocation_response_msg(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
			_wait_for_allocation_response(job_id, listen,
						RESPONSE_RESOURCE_ALLOCATION,
						timeout, (void **) &resp);
			/* If NULL, we didn't get the allocation in
			   the time desired, so just free the job id */
			if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
			if ((resp == NULL) && (errno == ESLURM_ALREADY_DONE))
				already_done = true;
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
		resp = NULL;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (!req->immediate)
		_destroy_allocation_response_socket(listen);
	xfree(req);
	/* Surface ESLURM_ALREADY_DONE to the caller if that is the only
	 * reason no allocation was returned. */
	if (!resp && already_done && (errnum == SLURM_SUCCESS))
		errnum = ESLURM_ALREADY_DONE;
	errno = errnum;
	return resp;
}