/* Get total node count and lead job ID from RESPONSE_JOB_PACK_ALLOCATION */
static void _pack_alloc_test(List resp, uint32_t *node_cnt, uint32_t *job_id)
{
	resource_allocation_response_msg_t *alloc;
	uint32_t inx = 0, pack_node_cnt = 0, pack_job_id = 0;
	ListIterator iter;

	xassert(resp);
	iter = list_iterator_create(resp);
	while ((alloc = (resource_allocation_response_msg_t *)
			list_next(iter))) {
		pack_node_cnt += alloc->node_cnt;
		if (pack_job_id == 0)	/* first component is the lead job */
			pack_job_id = alloc->job_id;
		print_multi_line_string(alloc->job_submit_user_msg, inx,
					LOG_LEVEL_INFO);
		inx++;
	}
	list_iterator_destroy(iter);

	*job_id   = pack_job_id;
	*node_cnt = pack_node_cnt;
}
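/*
 * Illustrative sketch (not from the original source): how a caller might
 * summarize the List of per-component allocation responses received for a
 * heterogeneous ("pack") job. The wrapper function below is hypothetical;
 * only _pack_alloc_test() and its argument types come from this file.
 */
#if 0
static void _example_pack_alloc_summary(List resp_list)
{
	uint32_t total_nodes = 0, lead_job_id = 0;

	_pack_alloc_test(resp_list, &total_nodes, &lead_job_id);
	info("pack job %u spans %u nodes in total",
	     lead_job_id, total_nodes);
}
#endif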
/*
 * slurm_pack_job_will_run - determine if a heterogeneous job would execute
 *	immediately if submitted now
 * IN job_req_list - List of job_desc_msg_t structures describing the
 *	resource allocation request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
extern int slurm_pack_job_will_run(List job_req_list)
{
	job_desc_msg_t *req;
	will_run_response_msg_t *will_run_resp;
	char buf[64], *sep = "";
	int rc = SLURM_SUCCESS, inx = 0;
	ListIterator iter, itr;
	time_t first_start = (time_t) 0;
	uint32_t first_job_id = 0, tot_proc_count = 0, *job_id_ptr;
	hostset_t hs = NULL;
	char *job_list = NULL;

	if (!job_req_list || (list_count(job_req_list) == 0)) {
		error("No job descriptors input");
		return SLURM_ERROR;
	}

	iter = list_iterator_create(job_req_list);
	while ((req = (job_desc_msg_t *) list_next(iter))) {
		will_run_resp = NULL;
		rc = slurm_job_will_run2(req, &will_run_resp);
		if (will_run_resp)
			print_multi_line_string(
				will_run_resp->job_submit_user_msg,
				inx, LOG_LEVEL_INFO);
		if ((rc == SLURM_SUCCESS) && will_run_resp) {
			if (first_job_id == 0)
				first_job_id = will_run_resp->job_id;
			/*
			 * All components must start together, so report the
			 * latest projected start time of any component.
			 */
			if ((first_start == 0) ||
			    (first_start < will_run_resp->start_time))
				first_start = will_run_resp->start_time;
			tot_proc_count += will_run_resp->proc_cnt;
			if (hs)
				hostset_insert(hs, will_run_resp->node_list);
			else
				hs = hostset_create(will_run_resp->node_list);

			if (will_run_resp->preemptee_job_id) {
				itr = list_iterator_create(
					will_run_resp->preemptee_job_id);
				while ((job_id_ptr = list_next(itr))) {
					if (job_list)
						sep = ",";
					xstrfmtcat(job_list, "%s%u", sep,
						   *job_id_ptr);
				}
				list_iterator_destroy(itr);
			}
			slurm_free_will_run_response_msg(will_run_resp);
		}
		if (rc != SLURM_SUCCESS)
			break;
		inx++;
	}
	list_iterator_destroy(iter);

	if (rc == SLURM_SUCCESS) {
		char node_list[1028] = "";

		if (hs)
			hostset_ranged_string(hs, sizeof(node_list),
					      node_list);
		slurm_make_time_str(&first_start, buf, sizeof(buf));
		info("Job %u to start at %s using %u processors on %s",
		     first_job_id, buf, tot_proc_count, node_list);
		if (job_list)
			info("  Preempts: %s", job_list);
	}

	if (hs)
		hostset_destroy(hs);
	xfree(job_list);
	return rc;
}
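/*
 * Illustrative sketch (not from the original source): building a two
 * component request list and asking whether the heterogeneous job could
 * run now. The field values are arbitrary examples and the wrapper
 * function is hypothetical.
 */
#if 0
static int _example_pack_will_run(void)
{
	job_desc_msg_t comp1, comp2;
	List job_req_list;
	int rc;

	slurm_init_job_desc_msg(&comp1);
	comp1.min_nodes = 1;
	comp1.user_id   = getuid();
	comp1.group_id  = getgid();

	slurm_init_job_desc_msg(&comp2);
	comp2.min_nodes = 4;
	comp2.user_id   = getuid();
	comp2.group_id  = getgid();

	/* no destructor: the components live on the stack */
	job_req_list = list_create(NULL);
	list_append(job_req_list, &comp1);
	list_append(job_req_list, &comp2);

	rc = slurm_pack_job_will_run(job_req_list);
	FREE_NULL_LIST(job_req_list);
	return rc;
}
#endif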
/*
 * slurm_job_will_run - determine if a job would execute immediately if
 *	submitted now
 * IN job_desc_msg - description of resource allocation request
 * RET SLURM_SUCCESS on success, otherwise return SLURM_ERROR with errno set
 */
int slurm_job_will_run(job_desc_msg_t *req)
{
	will_run_response_msg_t *will_run_resp = NULL;
	char buf[64];
	int rc;
	char *cluster_name = NULL;
	void *ptr = NULL;

	if (working_cluster_rec)
		cluster_name = working_cluster_rec->name;
	else
		cluster_name = slurmctld_conf.cluster_name;
	if (!slurm_load_federation(&ptr) &&
	    cluster_in_federation(ptr, cluster_name))
		rc = _fed_job_will_run(req, &will_run_resp, ptr);
	else
		rc = slurm_job_will_run2(req, &will_run_resp);

	if (will_run_resp)
		print_multi_line_string(
			will_run_resp->job_submit_user_msg,
			-1, LOG_LEVEL_INFO);

	if ((rc == 0) && will_run_resp) {
		slurm_make_time_str(&will_run_resp->start_time,
				    buf, sizeof(buf));
		if (will_run_resp->part_name) {
			info("Job %u to start at %s using %u processors on nodes %s in partition %s",
			     will_run_resp->job_id, buf,
			     will_run_resp->proc_cnt,
			     will_run_resp->node_list,
			     will_run_resp->part_name);
		} else {
			/*
			 * Partition name not provided from slurmctld v17.11
			 * or earlier. Remove this in the future.
			 */
			info("Job %u to start at %s using %u processors on nodes %s",
			     will_run_resp->job_id, buf,
			     will_run_resp->proc_cnt,
			     will_run_resp->node_list);
		}
		if (will_run_resp->preemptee_job_id) {
			ListIterator itr;
			uint32_t *job_id_ptr;
			char *job_list = NULL, *sep = "";

			itr = list_iterator_create(
				will_run_resp->preemptee_job_id);
			while ((job_id_ptr = list_next(itr))) {
				if (job_list)
					sep = ",";
				xstrfmtcat(job_list, "%s%u", sep, *job_id_ptr);
			}
			list_iterator_destroy(itr);
			info("  Preempts: %s", job_list);
			xfree(job_list);
		}

		slurm_free_will_run_response_msg(will_run_resp);
	}
	if (ptr)
		slurm_destroy_federation_rec(ptr);

	return rc;
}
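/*
 * Illustrative sketch (not from the original source): a minimal caller of
 * slurm_job_will_run(). On success the function has already logged the
 * projected start time, node list and any preempted jobs via info(), so
 * the caller only needs the return code. The wrapper is hypothetical.
 */
#if 0
static int _example_will_run(void)
{
	job_desc_msg_t job_req;

	slurm_init_job_desc_msg(&job_req);
	job_req.min_nodes = 2;
	job_req.user_id   = getuid();
	job_req.group_id  = getgid();

	return slurm_job_will_run(&job_req);
}
#endif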
/*
 * slurm_allocate_resources_blocking
 *	allocate resources for a job request.  This call will block until
 *	the allocation is granted, or the specified timeout limit is reached.
 * IN req - description of resource allocation request
 * IN timeout - amount of time, in seconds, to wait for a response before
 *	giving up. A timeout of zero will wait indefinitely.
 * IN pending_callback - If the allocation cannot be granted immediately,
 *	the controller will put the job in the PENDING state. If
 *	pending_callback is not NULL, it will be called with the job_id
 *	of the pending job as the sole parameter.
 * RET allocation structure on success; on error returns NULL and sets errno
 *	to indicate the error (errno will be ETIMEDOUT if the timeout is
 *	reached with no allocation granted)
 * NOTE: free the response using slurm_free_resource_allocation_response_msg()
 */
resource_allocation_response_msg_t *
slurm_allocate_resources_blocking(const job_desc_msg_t *user_req,
				  time_t timeout,
				  void (*pending_callback)(uint32_t job_id))
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	resource_allocation_response_msg_t *resp = NULL;
	uint32_t job_id;
	job_desc_msg_t *req;
	listen_t *listen = NULL;
	int errnum = SLURM_SUCCESS;
	bool already_done = false;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* make a copy of the user's job description struct so that we
	 * can make changes before contacting the controller
	 * (xmalloc() aborts on failure, so no NULL check is needed) */
	req = xmalloc(sizeof(job_desc_msg_t));
	memcpy(req, user_req, sizeof(job_desc_msg_t));

	/* set node and session ID for this request */
	if (req->alloc_sid == NO_VAL)
		req->alloc_sid = getsid(0);

	if (!req->immediate) {
		listen = _create_allocation_response_socket();
		if (listen == NULL) {
			xfree(req);
			return NULL;
		}
		req->alloc_resp_port = listen->port;
	}

	req_msg.msg_type = REQUEST_RESOURCE_ALLOCATION;
	req_msg.data     = req;

	rc = slurm_send_recv_controller_msg(&req_msg, &resp_msg,
					    working_cluster_rec);

	if (rc == SLURM_ERROR) {
		int errnum = errno;
		destroy_forward(&req_msg.forward);
		destroy_forward(&resp_msg.forward);
		if (!req->immediate)
			_destroy_allocation_response_socket(listen);
		xfree(req);
		errno = errnum;
		return NULL;
	}

	switch (resp_msg.msg_type) {
	case RESPONSE_SLURM_RC:
		if (_handle_rc_msg(&resp_msg) < 0) {
			/* will reach this when the allocation fails */
			errnum = errno;
		} else {
			/* shouldn't get here */
			errnum = SLURM_ERROR;
		}
		break;
	case RESPONSE_RESOURCE_ALLOCATION:
		/* Yay, the controller has acknowledged our request!
		 * Test if we have an allocation yet? */
		resp = (resource_allocation_response_msg_t *) resp_msg.data;
		if (resp->node_cnt > 0) {
			/* yes, allocation has been granted */
			errno = SLURM_SUCCESS;
		} else if (!req->immediate) {
			if (resp->error_code != SLURM_SUCCESS)
				info("%s", slurm_strerror(resp->error_code));
			/* no, we need to wait for a response */

			/* print out any user messages before we wait */
			print_multi_line_string(resp->job_submit_user_msg,
						-1, LOG_LEVEL_INFO);

			job_id = resp->job_id;
			slurm_free_resource_allocation_response_msg(resp);
			if (pending_callback != NULL)
				pending_callback(job_id);
			_wait_for_allocation_response(job_id, listen,
						RESPONSE_RESOURCE_ALLOCATION,
						timeout, (void **) &resp);
			/* If NULL, we didn't get the allocation in the
			 * time desired, so just free the job id */
			if ((resp == NULL) &&
			    (errno != ESLURM_ALREADY_DONE)) {
				errnum = errno;
				slurm_complete_job(job_id, -1);
			}
			if ((resp == NULL) &&
			    (errno == ESLURM_ALREADY_DONE))
				already_done = true;
		}
		break;
	default:
		errnum = SLURM_UNEXPECTED_MSG_ERROR;
		resp = NULL;
	}

	destroy_forward(&req_msg.forward);
	destroy_forward(&resp_msg.forward);
	if (!req->immediate)
		_destroy_allocation_response_socket(listen);
	xfree(req);
	if (!resp && already_done && (errnum == SLURM_SUCCESS))
		errnum = ESLURM_ALREADY_DONE;
	errno = errnum;
	return resp;
}
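/*
 * Illustrative sketch (not from the original source): a blocking allocation
 * with a 60 second timeout and a pending callback, as described in the
 * header comment above. The wrapper and callback names are hypothetical;
 * per the NOTE, the response is freed with
 * slurm_free_resource_allocation_response_msg().
 */
#if 0
static void _example_pending_cb(uint32_t job_id)
{
	info("job %u is pending", job_id);
}

static void _example_blocking_alloc(void)
{
	job_desc_msg_t job_req;
	resource_allocation_response_msg_t *alloc;

	slurm_init_job_desc_msg(&job_req);
	job_req.min_nodes = 1;
	job_req.user_id   = getuid();
	job_req.group_id  = getgid();

	alloc = slurm_allocate_resources_blocking(&job_req, 60,
						  _example_pending_cb);
	if (!alloc) {
		if (errno == ETIMEDOUT)
			error("no allocation within 60 seconds");
		return;
	}
	info("granted job %u on %s", alloc->job_id, alloc->node_list);
	slurm_free_resource_allocation_response_msg(alloc);
}
#endif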
/*
 * Allocate nodes from the slurm controller -- retrying the attempt
 * if the controller appears to be down, and optionally waiting for
 * resources if none are currently available (see opt_local->immediate)
 *
 * Returns a pointer to a resource_allocation_response_msg which must
 * be freed with slurm_free_resource_allocation_response_msg()
 */
extern resource_allocation_response_msg_t *
	allocate_nodes(bool handle_signals, slurm_opt_t *opt_local)
{
	srun_opt_t *srun_opt = opt_local->srun_opt;
	resource_allocation_response_msg_t *resp = NULL;
	job_desc_msg_t *j;
	slurm_allocation_callbacks_t callbacks;
	int i;

	xassert(srun_opt);

	if (srun_opt->relative_set && srun_opt->relative)
		fatal("--relative option invalid for job allocation request");

	if ((j = _job_desc_msg_create_from_opts(opt_local)) == NULL)
		return NULL;

	if (opt_local->clusters &&
	    (slurmdb_get_first_avail_cluster(j, opt_local->clusters,
					     &working_cluster_rec)
	     != SLURM_SUCCESS)) {
		print_db_notok(opt_local->clusters, 0);
		return NULL;
	}

	j->origin_cluster = xstrdup(slurmctld_conf.cluster_name);

	/* Do not re-use existing job id when submitting new job
	 * from within a running job */
	if ((j->job_id != NO_VAL) && !opt_local->jobid_set) {
		info("WARNING: Creating SLURM job allocation from within "
		     "another allocation");
		info("WARNING: You are attempting to initiate a second job");
		j->job_id = NO_VAL;	/* let slurmctld set the job id */
	}

	callbacks.ping = _ping_handler;
	callbacks.timeout = _timeout_handler;
	callbacks.job_complete = _job_complete_handler;
	callbacks.job_suspend = NULL;
	callbacks.user_msg = _user_msg_handler;
	callbacks.node_fail = _node_fail_handler;

	/* create message thread to handle pings and such from slurmctld */
	msg_thr = slurm_allocation_msg_thr_create(&j->other_port, &callbacks);

	/* NOTE: Do not process signals in separate pthread. The signal will
	 * cause slurm_allocate_resources_blocking() to exit immediately. */
	if (handle_signals) {
		xsignal_unblock(sig_array);
		for (i = 0; sig_array[i]; i++)
			xsignal(sig_array[i], _signal_while_allocating);
	}

	while (!resp) {
		resp = slurm_allocate_resources_blocking(j,
							 opt_local->immediate,
							 _set_pending_job_id);
		if (destroy_job) {
			/* cancelled by signal */
			break;
		} else if (!resp && !_retry()) {
			break;
		}
	}

	if (resp)
		print_multi_line_string(resp->job_submit_user_msg, -1,
					LOG_LEVEL_INFO);

	if (resp && !destroy_job) {
		/*
		 * Allocation granted!
		 */
		pending_job_id = resp->job_id;

		/*
		 * These values could be changed while the job was
		 * pending so overwrite the request with what was
		 * allocated so we don't have issues when we use them
		 * in the step creation.
		 */
		opt_local->pn_min_memory = NO_VAL64;
		opt_local->mem_per_cpu = NO_VAL64;
		if (resp->pn_min_memory != NO_VAL64) {
			if (resp->pn_min_memory & MEM_PER_CPU) {
				opt_local->mem_per_cpu =
					(resp->pn_min_memory &
					 (~MEM_PER_CPU));
			} else {
				opt_local->pn_min_memory =
					resp->pn_min_memory;
			}
		}

#ifdef HAVE_BG
		uint32_t node_cnt = 0;
		select_g_select_jobinfo_get(resp->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &node_cnt);
		if ((node_cnt != 0) && (node_cnt != NO_VAL)) {
			opt_local->min_nodes = node_cnt;
			opt_local->max_nodes = node_cnt;
		} /* else we just use the original request */

		if (!_wait_bluegene_block_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the boot of the block.");
			goto relinquish;
		}
#else
		opt_local->min_nodes = resp->node_cnt;
		opt_local->max_nodes = resp->node_cnt;

		if (resp->working_cluster_rec)
			slurm_setup_remote_working_cluster(resp);

		if (!_wait_nodes_ready(resp)) {
			if (!destroy_job)
				error("Something is wrong with the boot of the nodes.");
			goto relinquish;
		}
#endif
	} else if (destroy_job) {
		goto relinquish;
	}

	if (handle_signals)
		xsignal_block(sig_array);

	job_desc_msg_destroy(j);

	return resp;

relinquish:
	if (resp) {
		if (!destroy_job)
			slurm_complete_job(resp->job_id, 1);
		slurm_free_resource_allocation_response_msg(resp);
	}
	exit(error_exit);
	return NULL;
}
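/*
 * Illustrative sketch (not from the original source): the MEM_PER_CPU flag
 * decoding performed above. The flag bit in pn_min_memory distinguishes
 * "memory per CPU" from "memory per node"; masking it off yields the
 * granted value in MB. Assumes <inttypes.h> for PRIu64; the function name
 * is hypothetical.
 */
#if 0
static void _example_decode_memory(uint64_t pn_min_memory)
{
	if (pn_min_memory == NO_VAL64)
		info("no memory limit returned");
	else if (pn_min_memory & MEM_PER_CPU)
		info("%"PRIu64" MB per CPU",
		     (uint64_t) (pn_min_memory & ~MEM_PER_CPU));
	else
		info("%"PRIu64" MB per node", pn_min_memory);
}
#endif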