/*
 * Wait for the nodes of a new allocation to become usable.
 *
 * Polls slurm_job_node_ready() every POLL_SLEEP seconds for at most
 * 5 * (suspend timeout + resume timeout) seconds.  pending_job_id is set
 * to the job id while waiting and cleared again before returning.
 *
 * IN alloc - allocation response; job_id and node_list are read
 * RET 1 if the job and its nodes are ready for the job to begin (also when
 *     power saving is disabled or opt.wait_all_nodes is zero), 0 otherwise
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	if (opt.wait_all_nodes == (uint16_t) NO_VAL)
		opt.wait_all_nodes = DEFAULT_WAIT_ALL_NODES;

	for (i = 0; cur_delay < max_delay; i++) {
		if (i) {
			/* Every pass after the first one sleeps before polling */
			if (i == 1)
				info("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		if (!opt.wait_all_nodes) {
			/* Caller did not ask to wait for the nodes */
			is_ready = 1;
			break;
		}

		rc = slurm_job_node_ready(alloc->job_id);
		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Transient failure.  Stop retrying once the
			 * allocation has been interrupted; otherwise an
			 * unresponsive controller would keep us polling
			 * until max_delay expires.
			 */
			if (allocation_interrupted)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (allocation_interrupted)
			break;
	}

	if (is_ready) {
		/* Only report readiness if we actually had to wait */
		if (i > 0)
			info("Nodes %s are ready for job", alloc->node_list);
	} else if (!allocation_interrupted) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: allocation interrupted; return 0 without an error message */

	pending_job_id = 0;

	return is_ready;
}
static int _wait_nodes_ready(uint32_t job_id) { int is_ready = SLURM_ERROR, i, rc = 0; int cur_delay = 0; int suspend_time, resume_time, max_delay; suspend_time = slurm_get_suspend_timeout(); resume_time = slurm_get_resume_timeout(); if ((suspend_time == 0) || (resume_time == 0)) return SLURM_SUCCESS; /* Power save mode disabled */ max_delay = suspend_time + resume_time; max_delay *= 5; /* Allow for ResumeRate support */ for (i=0; (cur_delay < max_delay); i++) { if (i) { if (i == 1) info("Waiting for nodes to boot"); sleep(POLL_SLEEP); cur_delay += POLL_SLEEP; } rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) /* job killed */ break; if (rc & READY_NODE_STATE) { /* job and node ready */ is_ready = SLURM_SUCCESS; break; } } if (is_ready == SLURM_SUCCESS) info("Nodes are ready for job %u", job_id); else if ((rc & READY_JOB_STATE) == 0) info("Job %u no longer running", job_id); else info("Problem running job %u", job_id); return is_ready; }
/*
 * Wait for the nodes of a new allocation to become usable.
 *
 * Polls slurm_job_node_ready() every POLL_SLEEP seconds for at most
 * 5 * (suspend timeout + resume timeout) seconds.  pending_job_id is set
 * to the job id while waiting and cleared again before returning.
 *
 * On success, an alias_list that still holds the placeholder "TBD" is
 * replaced with the value returned by slurm_allocation_lookup_lite().
 *
 * IN/OUT alloc - allocation response; job_id and node_list are read,
 *                alias_list may be replaced
 * RET 1 if the job and its nodes are ready for the job to begin (also when
 *     power saving is disabled), 0 otherwise
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; cur_delay < max_delay; i++) {
		if (i) {
			/* Every pass after the first one sleeps before polling */
			if (i == 1)
				verbose("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);
		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Transient failure.  Stop retrying once the job is
			 * being destroyed; otherwise an unresponsive
			 * controller would keep us polling until max_delay
			 * expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;

		/* Only report readiness if we actually had to wait */
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);

		if (alloc->alias_list && !strcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup_lite(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			/*
			 * Swap the alias lists so alloc keeps the looked-up
			 * value and the placeholder travels with resp, which
			 * is freed right after.
			 */
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: job is being destroyed; return 0 without an error message */

	pending_job_id = 0;

	return is_ready;
}
/* Test if a batch launch request should be defered * RET -1: abort the request, pending job cancelled * 0: execute the request now * 1: defer the request */ static int _batch_launch_defer(queued_request_t *queued_req_ptr) { agent_arg_t *agent_arg_ptr; batch_job_launch_msg_t *launch_msg_ptr; time_t now = time(NULL); struct job_record *job_ptr; int delay_time, nodes_ready = 0; agent_arg_ptr = queued_req_ptr->agent_arg_ptr; if (agent_arg_ptr->msg_type != REQUEST_BATCH_JOB_LAUNCH) return 0; if (difftime(now, queued_req_ptr->last_attempt) < 10) { /* Reduce overhead by only testing once every 10 secs */ return 1; } launch_msg_ptr = (batch_job_launch_msg_t *)agent_arg_ptr->msg_args; job_ptr = find_job_record(launch_msg_ptr->job_id); if ((job_ptr == NULL) || (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) { info("agent(batch_launch): removed pending request for " "cancelled job %u", launch_msg_ptr->job_id); return -1; /* job cancelled while waiting */ } if (job_ptr->wait_all_nodes) { (void) job_node_ready(launch_msg_ptr->job_id, &nodes_ready); } else { #ifdef HAVE_FRONT_END nodes_ready = 1; #else struct node_record *node_ptr; char *hostname; hostname = hostlist_deranged_string_xmalloc( agent_arg_ptr->hostlist); node_ptr = find_node_record(hostname); if (node_ptr == NULL) { error("agent(batch_launch) removed pending request for " "job %u, missing node %s", launch_msg_ptr->job_id, hostname); xfree(hostname); return -1; /* invalid request?? 
*/ } xfree(hostname); if (!IS_NODE_POWER_SAVE(node_ptr) && !IS_NODE_NO_RESPOND(node_ptr)) { nodes_ready = 1; } #endif } delay_time = difftime(now, job_ptr->start_time); if (nodes_ready) { /* ready to launch, adjust time limit for boot time */ if (delay_time && (job_ptr->time_limit != INFINITE) && (!wiki2_sched)) { info("Job %u launch delayed by %d secs, " "updating end_time", launch_msg_ptr->job_id, delay_time); job_ptr->end_time += delay_time; } queued_req_ptr->last_attempt = (time_t) 0; return 0; } if (queued_req_ptr->last_attempt == 0) { queued_req_ptr->first_attempt = now; queued_req_ptr->last_attempt = now; } else if (difftime(now, queued_req_ptr->first_attempt) >= slurm_get_resume_timeout()) { error("agent waited too long for nodes to respond, " "sending batch request anyway..."); if (delay_time && (job_ptr->time_limit != INFINITE) && (!wiki2_sched)) { info("Job %u launch delayed by %d secs, " "updating end_time", launch_msg_ptr->job_id, delay_time); job_ptr->end_time += delay_time; } queued_req_ptr->last_attempt = (time_t) 0; return 0; } queued_req_ptr->last_attempt = now; return 1; }
/*
 * Wait for the nodes of a new allocation to become usable.
 *
 * Polls slurm_job_node_ready() with a growing pause between polls (the
 * i-th pause lasts POLL_SLEEP * i seconds) until the accumulated pause
 * reaches 5 * (suspend timeout + resume timeout) seconds.  pending_job_id
 * is set to the job id while waiting and cleared again before returning.
 *
 * On success, an alias_list that still holds the placeholder "TBD" is
 * replaced with the value returned by slurm_allocation_lookup().  If the
 * job is found to have been killed, destroy_job is set.
 *
 * IN/OUT alloc - allocation response; job_id and node_list are read,
 *                alias_list may be replaced
 * RET 1 if the job and its nodes are ready for the job to begin (also when
 *     power saving is disabled), 0 otherwise
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	double cur_delay = 0;
	double cur_sleep = 0;
	int suspend_time, resume_time, max_delay;
	bool job_killed = false;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; cur_delay < max_delay; i++) {
		if (i) {
			/* Every pass after the first one sleeps before polling */
			cur_sleep = POLL_SLEEP * i;
			if (i == 1) {
				verbose("Waiting for nodes to boot (delay looping %d times @ %f secs x index)",
					max_delay, POLL_SLEEP);
			} else {
				debug("Waited %f sec and still waiting: next sleep for %f sec",
				      cur_delay, cur_sleep);
			}
			usleep(1000000 * cur_sleep);
			cur_delay += cur_sleep;
		}

		rc = slurm_job_node_ready(alloc->job_id);
		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Transient failure.  Stop retrying once the job is
			 * being destroyed; otherwise an unresponsive
			 * controller would keep us polling until max_delay
			 * expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0) {	/* job killed */
			job_killed = true;
			break;
		}
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;

		/* Only report readiness if we actually had to wait */
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);

		if (alloc->alias_list && !xstrcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			/*
			 * Swap the alias lists so alloc keeps the looked-up
			 * value and the placeholder travels with resp, which
			 * is freed right after.
			 */
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		if (job_killed) {
			error("Job allocation %u has been revoked",
			      alloc->job_id);
			destroy_job = true;
		} else {
			error("Nodes %s are still not ready", alloc->node_list);
		}
	}
	/* else: job is being destroyed; return 0 without an error message */

	pending_job_id = 0;

	return is_ready;
}