/*
 * slurm_sbcast_lookup - retrieve info for an existing resource allocation
 *	including a credential needed for sbcast
 * IN jobid - job allocation identifier
 * OUT info - job allocation information including a credential for sbcast;
 *	set to NULL when the controller replies with a non-failing return
 *	code instead of a credential; not written on any other path
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_sbcast_cred_msg
 */
int slurm_sbcast_lookup(uint32_t jobid, job_sbcast_cred_msg_t **info)
{
	/*
	 * Zero the whole request so that any member other than job_id is
	 * sent in a defined state rather than as stack garbage.
	 */
	job_alloc_info_msg_t req = {0};
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = jobid;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_SBCAST_CRED;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		/* Plain return code: either an error or "nothing to report" */
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_SBCAST_CRED:
		/* Hand the decoded payload to the caller */
		*info = (job_sbcast_cred_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_job_step_create - create a job step for a given job id
 * IN req - description of job step request
 * OUT resp - response to request; set to NULL when the controller replies
 *	with a non-failing return code instead of a step-create response
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_create_response_msg
 */
int slurm_job_step_create(job_step_create_request_msg_t *req,
			  job_step_create_response_msg_t **resp)
{
	slurm_msg_t request, reply;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);
	request.data     = req;
	request.msg_type = REQUEST_JOB_STEP_CREATE;

	if (slurm_send_recv_controller_msg(&request, &reply) < 0)
		return SLURM_ERROR;

	if (reply.msg_type == RESPONSE_SLURM_RC) {
		/* Bare return code from the controller */
		if (_handle_rc_msg(&reply) < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
	} else if (reply.msg_type == RESPONSE_JOB_STEP_CREATE) {
		/* Pass the decoded payload to the caller */
		*resp = (job_step_create_response_msg_t *) reply.data;
	} else {
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_allocation_lookup_lite - retrieve info for an existing resource
 *	allocation without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information; set to NULL when the controller
 *	replies with a non-failing return code instead of allocation info;
 *	not written on any other path
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int slurm_allocation_lookup_lite(uint32_t jobid,
				 resource_allocation_response_msg_t **info)
{
	/*
	 * Zero the whole request so that any member other than job_id is
	 * sent in a defined state rather than as stack garbage.
	 */
	job_alloc_info_msg_t req = {0};
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;

	req.job_id = jobid;
	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_ALLOCATION_INFO_LITE;
	req_msg.data     = &req;

	if (slurm_send_recv_controller_msg(&req_msg, &resp_msg) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		/* Plain return code: either an error or "nothing to report" */
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_ERROR;
		*info = NULL;
		break;
	case RESPONSE_JOB_ALLOCATION_INFO_LITE:
		/* Hand the decoded payload to the caller */
		*info = (resource_allocation_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * _job_will_run_cluster - send a REQUEST_JOB_WILL_RUN for the given job
 *	description to the controller of one cluster
 * IN req - description of the job request
 * OUT will_run_resp - set to the decoded response when the controller
 *	answers with RESPONSE_JOB_WILL_RUN; not written on any other path
 * IN cluster - cluster record handed through to
 *	slurm_send_recv_controller_msg()
 * RET SLURM_PROTOCOL_SUCCESS on success, SLURM_SOCKET_ERROR if the exchange
 *	with the controller fails, SLURM_PROTOCOL_ERROR if the controller
 *	returns a failing return code
 */
static int _job_will_run_cluster(job_desc_msg_t *req,
				 will_run_response_msg_t **will_run_resp,
				 slurmdb_cluster_rec_t *cluster)
{
	slurm_msg_t req_msg, resp_msg;
	int rc;

	/* req.immediate = true;    implicit */

	slurm_msg_t_init(&req_msg);
	/*
	 * Initialize the response as well, as the other request helpers in
	 * this file do, so it never holds indeterminate values.
	 */
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg, cluster);
	if (rc < 0)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		break;
	case RESPONSE_JOB_WILL_RUN:
		*will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN req - description of resource allocation request; alloc_sid is filled
 *	in when it is NO_VAL, and alloc_node is temporarily pointed at a
 *	local buffer for the duration of the call when it is NULL
 * OUT resp - response to request; set to NULL when the controller replies
 *	with a non-failing return code instead of an allocation
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int slurm_allocate_resources(job_desc_msg_t *req,
			     resource_allocation_response_msg_t **resp)
{
	char local_host[64];
	bool borrowed_host = false;
	slurm_msg_t request, reply;
	int send_rc;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);

	/* Fill in the session id and node name when the caller left them out */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if (req->alloc_node == NULL) {
		if (gethostname_short(local_host, sizeof(local_host)) == 0) {
			req->alloc_node = local_host;
			borrowed_host = true;
		}
	}

	request.data     = req;
	request.msg_type = REQUEST_RESOURCE_ALLOCATION;

	send_rc = slurm_send_recv_controller_msg(&request, &reply,
						 working_cluster_rec);

	/*
	 * local_host lives on this function's stack, so the caller's
	 * structure must not keep pointing at it after we return.
	 */
	if (borrowed_host)
		req->alloc_node = NULL;

	if (send_rc == SLURM_SOCKET_ERROR)
		return SLURM_SOCKET_ERROR;

	if (reply.msg_type == RESPONSE_SLURM_RC) {
		if (_handle_rc_msg(&reply) < 0)
			return SLURM_PROTOCOL_ERROR;
		*resp = NULL;
	} else if (reply.msg_type == RESPONSE_RESOURCE_ALLOCATION) {
		*resp = (resource_allocation_response_msg_t *) reply.data;
	} else {
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/* * slurm_checkpoint_error - gather error information for the last checkpoint * operation for some job step * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * OUT error_code - error number associated with the last checkpoint operation, * this value is dependent upon the checkpoint plugin used and may be * completely unrelated to slurm error codes, the highest value for all * complete calls is preserved * OUT error_msg - error message, preserved for highest error_code, value * must be freed by the caller to prevent memory leak * RET 0 or a slurm error code */ extern int slurm_checkpoint_error (uint32_t job_id, uint32_t step_id, uint32_t *error_code, char **error_msg) { int rc; slurm_msg_t msg; checkpoint_msg_t req; slurm_msg_t resp_msg; checkpoint_resp_msg_t *ckpt_resp; if ((error_code == NULL) || (error_msg == NULL)) return EINVAL; /* * Request message: */ req.op = CHECK_ERROR; req.job_id = job_id; req.step_id = step_id; req.image_dir = NULL; slurm_msg_t_init(&msg); slurm_msg_t_init(&resp_msg); msg.msg_type = REQUEST_CHECKPOINT; msg.data = &req; rc = slurm_send_recv_controller_msg(&msg, &resp_msg, working_cluster_rec); if (rc == SLURM_SOCKET_ERROR) return rc; switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: *error_code = 0; *error_msg = strdup(""); rc = _handle_rc_msg(&resp_msg); break; case RESPONSE_CHECKPOINT: ckpt_resp = (checkpoint_resp_msg_t *) resp_msg.data; *error_code = ckpt_resp->error_code; if (ckpt_resp->error_msg) *error_msg = strdup(ckpt_resp->error_msg); else *error_msg = strdup(""); slurm_free_checkpoint_resp_msg(ckpt_resp); rc = SLURM_SUCCESS; break; default: rc = SLURM_UNEXPECTED_MSG_ERROR; } return rc; }
/*
 * slurm_job_step_create - create a job step for a given job id
 * IN req - description of job step request
 * OUT resp - response to request; set to NULL when the controller replies
 *	with a non-failing return code instead of a step-create response
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: free the response using slurm_free_job_step_create_response_msg
 * NOTE: while the controller keeps answering with EAGAIN the request is
 *	re-sent after a sleep, with no upper bound on the number of attempts
 */
int slurm_job_step_create(job_step_create_request_msg_t *req,
			  job_step_create_response_msg_t **resp)
{
	slurm_msg_t request, reply;
	int backoff = 0, attempts = 0, rc;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);
	request.msg_type = REQUEST_JOB_STEP_CREATE;
	request.data     = req;

	for (;;) {
		if (slurm_send_recv_controller_msg(&request, &reply,
						   working_cluster_rec) < 0)
			return SLURM_ERROR;

		/* Anything other than a bare return code ends the retries */
		if (reply.msg_type != RESPONSE_SLURM_RC)
			break;

		rc = _handle_rc_msg(&reply);
		if (rc >= 0) {
			*resp = NULL;
			return SLURM_PROTOCOL_SUCCESS;
		}
		if (errno != EAGAIN)
			return SLURM_PROTOCOL_ERROR;

		/*
		 * Controller is busy. The delay is picked once, on the first
		 * retry, and reused for every later one; it is derived from
		 * the pid.
		 */
		if (attempts++ == 0) {
			verbose("Slurm is busy, step creation delayed");
			backoff = (getpid() % 10) + 10;	/* 10-19 secs */
		}
		sleep(backoff);
	}

	if (reply.msg_type == RESPONSE_JOB_STEP_CREATE) {
		*resp = (job_step_create_response_msg_t *) reply.data;
	} else {
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/*
 * slurm_allocate_resources - allocate resources for a job request
 * IN req - description of resource allocation request; alloc_sid is filled
 *	in with the caller's session id when it is NO_VAL
 * OUT resp - response to request; set to NULL when the controller replies
 *	with a non-failing return code instead of an allocation
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
int slurm_allocate_resources(job_desc_msg_t *req,
			     resource_allocation_response_msg_t **resp)
{
	slurm_msg_t request, reply;
	int send_rc;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);

	/* Default the session id to that of the calling process */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	request.data     = req;
	request.msg_type = REQUEST_RESOURCE_ALLOCATION;

	send_rc = slurm_send_recv_controller_msg(&request, &reply,
						 working_cluster_rec);
	if (send_rc == SLURM_ERROR)
		return SLURM_ERROR;

	if (reply.msg_type == RESPONSE_SLURM_RC) {
		if (_handle_rc_msg(&reply) < 0)
			return SLURM_ERROR;
		*resp = NULL;
	} else if (reply.msg_type == RESPONSE_RESOURCE_ALLOCATION) {
		*resp = (resource_allocation_response_msg_t *) reply.data;
	} else {
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_SUCCESS;
}
/* * slurm_checkpoint_able - determine if the specified job step can presently * be checkpointed * IN job_id - job on which to perform operation * IN step_id - job step on which to perform operation * OUT start_time - time at which checkpoint request was issued * RET 0 (can be checkpoined) or a slurm error code */ extern int slurm_checkpoint_able (uint32_t job_id, uint32_t step_id, time_t *start_time) { int rc; slurm_msg_t req_msg, resp_msg; checkpoint_msg_t ckp_req; checkpoint_resp_msg_t *resp; ckp_req.op = CHECK_ABLE; ckp_req.job_id = job_id; ckp_req.step_id = step_id; ckp_req.image_dir = NULL; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); req_msg.msg_type = REQUEST_CHECKPOINT; req_msg.data = &ckp_req; if (slurm_send_recv_controller_msg(&req_msg, &resp_msg, working_cluster_rec) < 0) return SLURM_ERROR; switch (resp_msg.msg_type) { case RESPONSE_CHECKPOINT: resp = (checkpoint_resp_msg_t *) resp_msg.data; *start_time = resp->event_time; slurm_free_checkpoint_resp_msg(resp_msg.data); rc = SLURM_SUCCESS; break; case RESPONSE_SLURM_RC: rc = _handle_rc_msg(&resp_msg); break; default: *start_time = (time_t) NULL; rc = SLURM_ERROR; } return rc; }
/*
 * slurm_allocation_lookup - retrieve info for an existing resource allocation
 *	without the addrs and such
 * IN jobid - job allocation identifier
 * OUT info - job allocation information; set to NULL when the controller
 *	replies with a non-failing return code instead of allocation info
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
extern int slurm_allocation_lookup(uint32_t jobid,
				   resource_allocation_response_msg_t **info)
{
	/* Members not named here are zero-initialized */
	job_alloc_info_msg_t lookup = {
		.job_id      = jobid,
		.req_cluster = slurmctld_conf.cluster_name,
	};
	slurm_msg_t request, reply;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);
	request.data     = &lookup;
	request.msg_type = REQUEST_JOB_ALLOCATION_INFO;

	if (slurm_send_recv_controller_msg(&request, &reply,
					   working_cluster_rec) < 0)
		return SLURM_ERROR;

	/* The cluster name is only borrowed from slurmctld_conf; drop it */
	lookup.req_cluster = NULL;

	if (reply.msg_type == RESPONSE_SLURM_RC) {
		if (_handle_rc_msg(&reply) < 0)
			return SLURM_ERROR;
		*info = NULL;
	} else if (reply.msg_type == RESPONSE_JOB_ALLOCATION_INFO) {
		*info = (resource_allocation_response_msg_t *) reply.data;
	} else {
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
	}

	return SLURM_SUCCESS;
}
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN req - description of resource allocation request; alloc_node is
 *	temporarily pointed at a local buffer for the duration of the call
 *	when it is NULL
 * RET 0 on success, otherwise return -1 and set errno to indicate the error
 * NOTE: on a RESPONSE_JOB_WILL_RUN reply the expected start time, resource
 *	count, node list and any preempted job ids are reported via info()
 *	and the response is freed here; nothing is returned to the caller
 */
int slurm_job_will_run (job_desc_msg_t *req)
{
	slurm_msg_t req_msg, resp_msg;
	will_run_response_msg_t *will_run_resp;
	char buf[64];
	bool host_set = false;
	int rc;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *type = "processors";

	/* req.immediate = true;    implicit */
	if ((req->alloc_node == NULL) &&
	    (gethostname_short(buf, sizeof(buf)) == 0)) {
		req->alloc_node = buf;
		host_set = true;
	}

	slurm_msg_t_init(&req_msg);
	/*
	 * Initialize the response as well, as the other request helpers in
	 * this file do, so it never holds indeterminate values.
	 */
	slurm_msg_t_init(&resp_msg);
	req_msg.msg_type = REQUEST_JOB_WILL_RUN;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg);

	/*
	 * buf is on this function's stack (and is reused below for the time
	 * string), so the caller's structure must stop pointing at it.
	 */
	if (host_set)
		req->alloc_node = NULL;

	if (rc < 0)
		return SLURM_SOCKET_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0)
			return SLURM_PROTOCOL_ERROR;
		break;
	case RESPONSE_JOB_WILL_RUN:
		if(cluster_flags & CLUSTER_FLAG_BG)
			type = "cnodes";
		will_run_resp = (will_run_response_msg_t *) resp_msg.data;
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		info("Job %u to start at %s using %u %s"
		     " on %s",
		     will_run_resp->job_id, buf,
		     will_run_resp->proc_cnt, type,
		     will_run_resp->node_list);
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";

			/* Build a comma separated list of preempted job ids */
			itr = list_iterator_create(will_run_resp->
						   preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			/* Release the iterator before the list itself is freed */
			list_iterator_destroy(itr);
			info(" Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_PROTOCOL_SUCCESS;
}
/* * slurm_allocate_resources_blocking * allocate resources for a job request. This call will block until * the allocation is granted, or the specified timeout limit is reached. * IN req - description of resource allocation request * IN timeout - amount of time, in seconds, to wait for a response before * giving up. * A timeout of zero will wait indefinitely. * IN pending_callback - If the allocation cannot be granted immediately, * the controller will put the job in the PENDING state. If * pending callback is not NULL, it will be called with the job_id * of the pending job as the sole parameter. * * RET allocation structure on success, NULL on error set errno to * indicate the error (errno will be ETIMEDOUT if the timeout is reached * with no allocation granted) * NOTE: free the response using slurm_free_resource_allocation_response_msg() */ resource_allocation_response_msg_t * slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, time_t timeout, void(*pending_callback)(uint32_t job_id)) { int rc; slurm_msg_t req_msg; slurm_msg_t resp_msg; resource_allocation_response_msg_t *resp = NULL; char *hostname = NULL; uint32_t job_id; job_desc_msg_t *req; listen_t *listen = NULL; int errnum = SLURM_SUCCESS; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); /* make a copy of the user's job description struct so that we * can make changes before contacting the controller */ req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t)); if (req == NULL) return NULL; memcpy(req, user_req, sizeof(job_desc_msg_t)); /* * set Node and session id for this request */ if (req->alloc_sid == NO_VAL) req->alloc_sid = getsid(0); if (user_req->alloc_node != NULL) { req->alloc_node = xstrdup(user_req->alloc_node); } else if ((hostname = xshort_hostname()) != NULL) { req->alloc_node = hostname; } else { error("Could not get local hostname," " forcing immediate allocation mode."); req->immediate = 1; } if (!req->immediate) { listen = 
_create_allocation_response_socket(hostname); if (listen == NULL) { xfree(req); return NULL; } req->alloc_resp_port = listen->port; } req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION; req_msg.data = req; rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg); if (rc == SLURM_SOCKET_ERROR) { int errnum = errno; destroy_forward(&req_msg.forward); destroy_forward(&resp_msg.forward); if (!req->immediate) _destroy_allocation_response_socket(listen); xfree(req); errno = errnum; return NULL; } switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) { /* will reach this when the allocation fails */ errnum = errno; } else { /* shouldn't get here */ errnum = -1; } break; case RESPONSE_RESOURCE_ALLOCATION: /* Yay, the controller has acknowledged our request! But did we really get an allocation yet? */ resp = (resource_allocation_response_msg_t *) resp_msg.data; if (resp->node_cnt > 0) { /* yes, allocation has been granted */ errno = SLURM_PROTOCOL_SUCCESS; } else if (!req->immediate) { if (resp->error_code != SLURM_SUCCESS) info("%s", slurm_strerror(resp->error_code)); /* no, we need to wait for a response */ job_id = resp->job_id; slurm_free_resource_allocation_response_msg(resp); if (pending_callback != NULL) pending_callback(job_id); resp = _wait_for_allocation_response(job_id, listen, timeout); /* If NULL, we didn't get the allocation in the time desired, so just free the job id */ if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) { errnum = errno; slurm_complete_job(job_id, -1); } } break; default: errnum = SLURM_UNEXPECTED_MSG_ERROR; resp = NULL; } destroy_forward(&req_msg.forward); destroy_forward(&resp_msg.forward); if (!req->immediate) _destroy_allocation_response_socket(listen); xfree(req); errno = errnum; return resp; }
/*
 * slurm_allocate_pack_job_blocking
 *	allocate resources for a list of job requests.  This call will block
 *	until the entire allocation is granted, or the specified timeout limit
 *	is reached.
 * IN job_req_list - List of resource allocation requests (job_desc_msg_t);
 *	alloc_sid, alloc_resp_port, alloc_node and immediate of its elements
 *	may be filled in; an alloc_node set here is reset to NULL before
 *	returning
 * IN timeout - amount of time, in seconds, to wait for a response before
 *	giving up.
 *	A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *	the controller will put the job in the PENDING state.  If
 *	pending callback is not NULL, it will be called with the job_id
 *	of the pending job as the sole parameter.
 *
 * RET List of allocation structures on success, NULL on error set errno to
 *	indicate the error (errno will be ETIMEDOUT if the timeout is reached
 *	with no allocation granted)
 * NOTE: free the response using list_destroy()
 */
List slurm_allocate_pack_job_blocking(List job_req_list, time_t timeout,
				      void(*pending_callback)(uint32_t job_id))
{
	slurm_msg_t request, reply;
	List alloc_list = NULL;
	char *my_host = NULL;
	job_desc_msg_t *desc;
	listen_t *resp_sock = NULL;
	ListIterator it;
	int saved_errno = SLURM_SUCCESS;
	int send_rc;
	bool want_immediate = false;
	bool hostname_warned = false;
	uint32_t node_cnt = 0, job_id = 0;

	slurm_msg_t_init(&request);
	slurm_msg_t_init(&reply);

	/*
	 * Open the socket on which the allocation response will arrive.
	 * want_immediate is still false and my_host still NULL at this point.
	 */
	if (!want_immediate) {
		resp_sock = _create_allocation_response_socket(my_host);
		if (resp_sock == NULL)
			return NULL;
	}

	my_host = xshort_hostname();

	/* Fill in session id, response port and node name of each request */
	it = list_iterator_create(job_req_list);
	while ((desc = (job_desc_msg_t *) list_next(it))) {
		if (desc->alloc_sid == NO_VAL)
			desc->alloc_sid = getsid(0);
		if (resp_sock)
			desc->alloc_resp_port = resp_sock->port;
		if (!desc->alloc_node) {
			if (my_host) {
				desc->alloc_node = my_host;
			} else {
				desc->immediate = 1;
				/* Report the missing hostname only once */
				if (!hostname_warned) {
					error("Could not get local hostname, forcing "
					      "immediate allocation mode");
					hostname_warned = true;
				}
			}
		}
		if (desc->immediate)
			want_immediate = true;
	}
	list_iterator_destroy(it);

	request.msg_type = REQUEST_JOB_PACK_ALLOCATION;
	request.data     = job_req_list;

	send_rc = slurm_send_recv_controller_msg(&request, &reply,
						 working_cluster_rec);
	if (send_rc == SLURM_SOCKET_ERROR) {
		/* Keep errno from the failed exchange across the cleanup */
		saved_errno = errno;
		goto cleanup;
	}

	if (reply.msg_type == RESPONSE_SLURM_RC) {
		if (_handle_rc_msg(&reply) < 0) {
			/* will reach this when the allocation fails */
			saved_errno = errno;
		} else {
			/* shouldn't get here */
			saved_errno = SLURM_ERROR;
		}
	} else if (reply.msg_type == RESPONSE_JOB_PACK_ALLOCATION) {
		/* Request acknowledged; check whether nodes were granted */
		alloc_list = (List) reply.data;
		_pack_alloc_test(alloc_list, &node_cnt, &job_id);
		if (node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_PROTOCOL_SUCCESS;
		} else if (want_immediate) {
			debug("Immediate allocation not granted");
		} else {
			/* no, we need to wait for a response */
			FREE_NULL_LIST(alloc_list);
			if (pending_callback != NULL)
				pending_callback(job_id);
			_wait_for_allocation_response(job_id, resp_sock,
						RESPONSE_JOB_PACK_ALLOCATION,
						timeout,
						(void **) &alloc_list);
			/* If NULL, we didn't get the allocation in
			 * the time desired, so just free the job id */
			if ((alloc_list == NULL) &&
			    (errno != ESLURM_ALREADY_DONE)) {
				saved_errno = errno;
				slurm_complete_job(job_id, -1);
			}
		}
	} else {
		saved_errno = SLURM_UNEXPECTED_MSG_ERROR;
	}

cleanup:
	destroy_forward(&request.forward);
	destroy_forward(&reply.forward);
	if (resp_sock)
		_destroy_allocation_response_socket(resp_sock);

	/* Detach the hostname lent to the requests before it is freed */
	it = list_iterator_create(job_req_list);
	while ((desc = (job_desc_msg_t *) list_next(it))) {
		if (desc->alloc_node == my_host)
			desc->alloc_node = NULL;
	}
	list_iterator_destroy(it);
	xfree(my_host);

	errno = saved_errno;
	return alloc_list;
}
/* * slurm_allocate_resources_blocking * allocate resources for a job request. This call will block until * the allocation is granted, or the specified timeout limit is reached. * IN req - description of resource allocation request * IN timeout - amount of time, in seconds, to wait for a response before * giving up. * A timeout of zero will wait indefinitely. * IN pending_callback - If the allocation cannot be granted immediately, * the controller will put the job in the PENDING state. If * pending callback is not NULL, it will be called with the job_id * of the pending job as the sole parameter. * * RET allocation structure on success, NULL on error set errno to * indicate the error (errno will be ETIMEDOUT if the timeout is reached * with no allocation granted) * NOTE: free the response using slurm_free_resource_allocation_response_msg() */ resource_allocation_response_msg_t * slurm_allocate_resources_blocking (const job_desc_msg_t *user_req, time_t timeout, void(*pending_callback)(uint32_t job_id)) { int rc; slurm_msg_t req_msg; slurm_msg_t resp_msg; resource_allocation_response_msg_t *resp = NULL; uint32_t job_id; job_desc_msg_t *req; listen_t *listen = NULL; int errnum = SLURM_SUCCESS; bool already_done = false; slurm_msg_t_init(&req_msg); slurm_msg_t_init(&resp_msg); /* make a copy of the user's job description struct so that we * can make changes before contacting the controller */ req = (job_desc_msg_t *)xmalloc(sizeof(job_desc_msg_t)); if (req == NULL) return NULL; memcpy(req, user_req, sizeof(job_desc_msg_t)); /* * set Node and session id for this request */ if (req->alloc_sid == NO_VAL) req->alloc_sid = getsid(0); if (!req->immediate) { listen = _create_allocation_response_socket(); if (listen == NULL) { xfree(req); return NULL; } req->alloc_resp_port = listen->port; } req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION; req_msg.data = req; rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg, working_cluster_rec); if (rc == SLURM_ERROR) { int errnum = 
errno; destroy_forward(&req_msg.forward); destroy_forward(&resp_msg.forward); if (!req->immediate) _destroy_allocation_response_socket(listen); xfree(req); errno = errnum; return NULL; } switch (resp_msg.msg_type) { case RESPONSE_SLURM_RC: if (_handle_rc_msg(&resp_msg) < 0) { /* will reach this when the allocation fails */ errnum = errno; } else { /* shouldn't get here */ errnum = SLURM_ERROR; } break; case RESPONSE_RESOURCE_ALLOCATION: /* Yay, the controller has acknowledged our request! * Test if we have an allocation yet? */ resp = (resource_allocation_response_msg_t *) resp_msg.data; if (resp->node_cnt > 0) { /* yes, allocation has been granted */ errno = SLURM_SUCCESS; } else if (!req->immediate) { if (resp->error_code != SLURM_SUCCESS) info("%s", slurm_strerror(resp->error_code)); /* no, we need to wait for a response */ /* print out any user messages before we wait. */ print_multi_line_string(resp->job_submit_user_msg, -1, LOG_LEVEL_INFO); job_id = resp->job_id; slurm_free_resource_allocation_response_msg(resp); if (pending_callback != NULL) pending_callback(job_id); _wait_for_allocation_response(job_id, listen, RESPONSE_RESOURCE_ALLOCATION, timeout, (void **) &resp); /* If NULL, we didn't get the allocation in the time desired, so just free the job id */ if ((resp == NULL) && (errno != ESLURM_ALREADY_DONE)) { errnum = errno; slurm_complete_job(job_id, -1); } if ((resp == NULL) && (errno == ESLURM_ALREADY_DONE)) already_done = true; } break; default: errnum = SLURM_UNEXPECTED_MSG_ERROR; resp = NULL; } destroy_forward(&req_msg.forward); destroy_forward(&resp_msg.forward); if (!req->immediate) _destroy_allocation_response_socket(listen); xfree(req); if (!resp && already_done && (errnum == SLURM_SUCCESS)) errnum = ESLURM_ALREADY_DONE; errno = errnum; return resp; }