/*
 * Wait until the job and its nodes are ready for the job to begin.
 *
 * If either the suspend or the resume timeout is zero, power save mode is
 * disabled and the function reports "ready" immediately.  Otherwise the
 * controller is polled through slurm_job_node_ready() every POLL_SLEEP
 * seconds, for at most 5 * (suspend timeout + resume timeout) seconds.
 * When opt.wait_all_nodes is zero the allocation is reported ready without
 * polling at all.
 *
 * pending_job_id holds alloc->job_id while waiting and is reset to 0
 * before returning.
 *
 * IN alloc - allocation response; job_id and node_list are read
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	if (opt.wait_all_nodes == (uint16_t) NO_VAL)
		opt.wait_all_nodes = DEFAULT_WAIT_ALL_NODES;

	for (i = 0; (cur_delay < max_delay); i++) {
		if (i) {
			/* Only sleep between polls, never before the first */
			if (i == 1)
				info("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		if (!opt.wait_all_nodes) {
			/* Caller does not want to wait for the nodes */
			is_ready = 1;
			break;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor an interrupt on the retry path too;
			 * otherwise an unresponsive controller keeps us
			 * polling until max_delay expires.
			 */
			if (allocation_interrupted)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (allocation_interrupted)
			break;
	}

	if (is_ready) {
		/* Stay quiet if the nodes were ready on the first check */
		if (i > 0)
			info("Nodes %s are ready for job", alloc->node_list);
	} else if (!allocation_interrupted) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/*
	 * Otherwise: allocation interrupted or slurmctld not responding;
	 * is_ready is already 0 and nothing is logged.
	 */

	pending_job_id = 0;

	return is_ready;
}
static int _wait_nodes_ready(uint32_t job_id) { int is_ready = SLURM_ERROR, i, rc = 0; int cur_delay = 0; int suspend_time, resume_time, max_delay; suspend_time = slurm_get_suspend_timeout(); resume_time = slurm_get_resume_timeout(); if ((suspend_time == 0) || (resume_time == 0)) return SLURM_SUCCESS; /* Power save mode disabled */ max_delay = suspend_time + resume_time; max_delay *= 5; /* Allow for ResumeRate support */ for (i=0; (cur_delay < max_delay); i++) { if (i) { if (i == 1) info("Waiting for nodes to boot"); sleep(POLL_SLEEP); cur_delay += POLL_SLEEP; } rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) /* job killed */ break; if (rc & READY_NODE_STATE) { /* job and node ready */ is_ready = SLURM_SUCCESS; break; } } if (is_ready == SLURM_SUCCESS) info("Nodes are ready for job %u", job_id); else if ((rc & READY_JOB_STATE) == 0) info("Job %u no longer running", job_id); else info("Problem running job %u", job_id); return is_ready; }
/*
 * Wait until the job and its nodes are ready for the job to begin.
 *
 * If either the suspend or the resume timeout is zero, power save mode is
 * disabled and the function reports "ready" immediately.  Otherwise the
 * controller is polled through slurm_job_node_ready() every POLL_SLEEP
 * seconds, for at most 5 * (suspend timeout + resume timeout) seconds.
 *
 * On success, if alloc->alias_list is the placeholder "TBD", the
 * allocation is looked up again and alloc->alias_list is replaced by the
 * value from the fresh response.
 *
 * pending_job_id holds alloc->job_id while waiting and is reset to 0
 * before returning.
 *
 * IN/OUT alloc - allocation response; job_id and node_list are read,
 *                alias_list may be replaced
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; (cur_delay < max_delay); i++) {
		if (i) {
			/* Only sleep between polls, never before the first */
			if (i == 1)
				verbose("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor a job teardown request on the retry path
			 * too; otherwise an unresponsive controller keeps
			 * us polling until max_delay expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;

		/* Stay quiet if the nodes were ready on the first check */
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);
		if (alloc->alias_list && !strcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup_lite(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			/*
			 * Swap rather than copy: alloc takes the string
			 * from the fresh response, and the old placeholder
			 * is handed to resp (presumably released together
			 * with it by the free call below).
			 */
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/*
	 * Otherwise: allocation interrupted and slurmctld not responding;
	 * is_ready is already 0 and nothing is logged.
	 */

	pending_job_id = 0;

	return is_ready;
}
/*
 * Wait until the job and its nodes are ready for the job to begin.
 *
 * If either the suspend or the resume timeout is zero, power save mode is
 * disabled and the function reports "ready" immediately.  Otherwise the
 * controller is polled through slurm_job_node_ready() with a linearly
 * growing pause (POLL_SLEEP * iteration index seconds) until the
 * accumulated delay reaches 5 * (suspend timeout + resume timeout) seconds.
 *
 * On success, if alloc->alias_list is the placeholder "TBD", the
 * allocation is looked up again and alloc->alias_list is replaced by the
 * value from the fresh response.  If the job turns out to have been
 * killed while waiting, destroy_job is set to true.
 *
 * pending_job_id holds alloc->job_id while waiting and is reset to 0
 * before returning.
 *
 * IN/OUT alloc - allocation response; job_id and node_list are read,
 *                alias_list may be replaced
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	double cur_delay = 0;
	double cur_sleep = 0;
	int suspend_time, resume_time, max_delay;
	bool job_killed = false;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; cur_delay < max_delay; i++) {
		if (i) {
			/* Back off linearly: sleep longer on each iteration */
			cur_sleep = POLL_SLEEP * i;
			if (i == 1) {
				/*
				 * POLL_SLEEP is defined outside this block;
				 * the cast guarantees that a double reaches
				 * the %f conversion whatever its type is.
				 */
				verbose("Waiting for nodes to boot (delay looping %d times @ %f secs x index)",
					max_delay, (double) POLL_SLEEP);
			} else {
				debug("Waited %f sec and still waiting: next sleep for %f sec",
				      cur_delay, cur_sleep);
			}
			usleep(1000000 * cur_sleep);
			cur_delay += cur_sleep;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor a job teardown request on the retry path
			 * too; otherwise an unresponsive controller keeps
			 * us polling until max_delay expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0) {	/* job killed */
			job_killed = true;
			break;
		}
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;

		/* Stay quiet if the nodes were ready on the first check */
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);
		if (alloc->alias_list && !xstrcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			/*
			 * Swap rather than copy: alloc takes the string
			 * from the fresh response, and the old placeholder
			 * is handed to resp (presumably released together
			 * with it by the free call below).
			 */
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		if (job_killed) {
			error("Job allocation %u has been revoked",
			      alloc->job_id);
			destroy_job = true;
		} else {
			error("Nodes %s are still not ready", alloc->node_list);
		}
	}
	/*
	 * Otherwise: allocation interrupted and slurmctld not responding;
	 * is_ready is already 0 and nothing is logged.
	 */

	pending_job_id = 0;

	return is_ready;
}