/*
 * _wait_nodes_ready - wait for the nodes of an allocation to become usable
 *	by repeatedly calling slurm_job_node_ready().
 * IN alloc - allocation response; job_id and node_list are read
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 *
 * Returns 1 immediately when either the suspend or the resume timeout is
 * zero (power save mode disabled).  Otherwise pending_job_id is set to the
 * job id for the duration of the wait and reset to 0 before returning.
 * As a side effect opt.wait_all_nodes is replaced by DEFAULT_WAIT_ALL_NODES
 * when it still holds NO_VAL.
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	if (opt.wait_all_nodes == (uint16_t) NO_VAL)
		opt.wait_all_nodes = DEFAULT_WAIT_ALL_NODES;

	for (i = 0; (cur_delay < max_delay); i++) {
		if (i) {
			/* Only sleep between polls, never before the first */
			if (i == 1)
				info("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		if (opt.wait_all_nodes)
			rc = slurm_job_node_ready(alloc->job_id);
		else {
			/* Caller did not ask to wait for every node */
			is_ready = 1;
			break;
		}

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor an interrupt on the retry path as well;
			 * otherwise an interrupted wait would keep retrying
			 * until max_delay expires.
			 */
			if (allocation_interrupted)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (allocation_interrupted)
			break;
	}

	if (is_ready) {
		/* Stay quiet if the very first poll succeeded */
		if (i > 0)
			info("Nodes %s are ready for job", alloc->node_list);
	} else if (!allocation_interrupted) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: allocation interrupted, nothing to report; is_ready is 0 */

	pending_job_id = 0;

	return is_ready;
}
/*
 * _wait_bluegene_block_ready - wait for the block assigned to an allocation
 *	to become ready by repeatedly calling slurm_job_node_ready().
 * IN alloc - allocation response; job_id, node_cnt and select_jobinfo
 *	are read
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 *
 * The sleep before poll number i is POLL_SLEEP * i, so the interval grows
 * with each pass.  Slept time is only charged against the limit when
 * _blocks_dealloc() returns 0 or -1.
 */
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	char *block_id = NULL;
	double cur_delay = 0;
	double cur_sleep = 0;
	int max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
			(BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID,
				    &block_id);

	for (i = 0; cur_delay < max_delay; i++) {
		cur_sleep = POLL_SLEEP * i;
		if (i == 1) {
			debug("Waiting for block %s to become ready for job",
			      block_id);
		}
		if (i) {
			/*
			 * NOTE(review): the usleep() argument reaches
			 * 1,000,000 once cur_sleep >= 1 second; POSIX allows
			 * usleep() to reject such values - confirm the target
			 * platforms accept them.
			 */
			usleep(1000000 * cur_sleep);
			rc = _blocks_dealloc();
			if ((rc == 0) || (rc == -1))
				cur_delay += cur_sleep;
			debug2("still waiting");
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor destroy_job on the retry path as well;
			 * otherwise a cancelled wait would keep retrying
			 * until max_delay expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready)
		debug("Block %s is ready for job", block_id);
	else if (!destroy_job)
		error("Block %s still not ready", block_id);
	/* else: destroy_job set, nothing to report; is_ready is 0 */

	xfree(block_id);

	return is_ready;
}
/*
 * _wait_bluegene_block_ready - poll slurm_job_node_ready() until the block
 *	assigned to the allocation is usable or the time budget is spent.
 * IN alloc - allocation response; job_id, node_cnt and select_jobinfo
 *	are read
 * RET SLURM_SUCCESS once job and nodes are reported ready,
 *	SLURM_ERROR otherwise
 *
 * POLL_SLEEP is slept between polls; it is only added to the elapsed time
 * when _blocks_dealloc() returns 0 or -1.
 */
static int _wait_bluegene_block_ready(resource_allocation_response_msg_t *alloc)
{
	int status = SLURM_ERROR;
	int rc = 0;
	int pass;
	int waited = 0;
	char *block_id = NULL;
	int wait_limit = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT +
			 (BG_INCR_BLOCK_BOOT * alloc->node_cnt);

	select_g_select_jobinfo_get(alloc->select_jobinfo,
				    SELECT_JOBDATA_BLOCK_ID, &block_id);

	for (pass = 0; waited < wait_limit; pass++) {
		if (pass != 0) {
			if (pass == 1) {
				info("Waiting for block %s to become ready for "
				     "job", block_id);
			} else {
				debug("still waiting");
			}
			sleep(POLL_SLEEP);
			rc = _blocks_dealloc();
			if ((rc == 0) || (rc == -1))
				waited += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);
		if (rc == READY_JOB_FATAL)
			break;			/* fatal error, give up */

		/* READY_JOB_ERROR and EAGAIN fall through to the next pass */
		if ((rc != READY_JOB_ERROR) && (rc != EAGAIN)) {
			if (!(rc & READY_JOB_STATE))
				break;		/* job killed */
			if (rc & READY_NODE_STATE) {
				status = SLURM_SUCCESS;
				break;		/* job and node ready */
			}
		}
	}

	if (status == SLURM_SUCCESS) {
		info("Block %s is ready for job %u", block_id, alloc->job_id);
	} else if (rc & READY_JOB_STATE) {
		info("Problem running job %u", alloc->job_id);
	} else {
		info("Job %u no longer running", alloc->job_id);
	}

	xfree(block_id);

	return status;
}
/* returns 1 if job and nodes are ready for job to begin, 0 otherwise */ static int _wait_part_ready(uint32_t job_id) { int is_ready = 0, i, rc; max_delay = BG_FREE_PREVIOUS_BLOCK + BG_MIN_BLOCK_BOOT + (BG_INCR_BLOCK_BOOT * _get_job_size(job_id)); #if _DEBUG printf("Waiting for job %u to become ready.", job_id); #endif for (i=0; (cur_delay < max_delay); i++) { if (i) { sleep(POLL_SLEEP); rc = _partitions_dealloc(); if ((rc == 0) || (rc == -1)) cur_delay += POLL_SLEEP; #if _DEBUG printf("."); #endif } rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if (rc == READY_JOB_ERROR) /* error */ continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) { /* job killed */ /* return 1 so we don't get a prolog error */ is_ready = 1; break; } if (rc & READY_NODE_STATE) { /* job and node ready */ is_ready = 1; break; } } #if _DEBUG if (is_ready == 0) printf("\n"); else printf("\nJob %u is ready.\n", job_id); #endif if (is_ready == 0) fprintf(stderr, "Job %u is not ready.\n", job_id); return is_ready; }
static int _wait_nodes_ready(uint32_t job_id) { int is_ready = SLURM_ERROR, i, rc = 0; int cur_delay = 0; int suspend_time, resume_time, max_delay; suspend_time = slurm_get_suspend_timeout(); resume_time = slurm_get_resume_timeout(); if ((suspend_time == 0) || (resume_time == 0)) return SLURM_SUCCESS; /* Power save mode disabled */ max_delay = suspend_time + resume_time; max_delay *= 5; /* Allow for ResumeRate support */ for (i=0; (cur_delay < max_delay); i++) { if (i) { if (i == 1) info("Waiting for nodes to boot"); sleep(POLL_SLEEP); cur_delay += POLL_SLEEP; } rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) /* job killed */ break; if (rc & READY_NODE_STATE) { /* job and node ready */ is_ready = SLURM_SUCCESS; break; } } if (is_ready == SLURM_SUCCESS) info("Nodes are ready for job %u", job_id); else if ((rc & READY_JOB_STATE) == 0) info("Job %u no longer running", job_id); else info("Problem running job %u", job_id); return is_ready; }
static void _wait_part_not_ready(uint32_t job_id) { int is_ready = 1, i, rc; max_delay = MIN_DELAY + (INCR_DELAY * _get_job_size(job_id)); #if _DEBUG printf("Waiting for job %u to be not ready.", job_id); #endif for (i=0; (cur_delay < max_delay); i++) { if (i) { sleep(POLL_SLEEP); cur_delay += POLL_SLEEP; #if _DEBUG printf("."); #endif } rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if (rc == READY_JOB_ERROR) /* error */ continue; /* retry */ if ((rc & READY_NODE_STATE) == 0) { is_ready = 0; break; } } #if _DEBUG if (is_ready == 1) printf("\n"); else printf("\nJob %u is not ready.\n", job_id); #endif if (is_ready == 1) fprintf(stderr, "Job %u is still ready.\n", job_id); }
static void _wait_part_not_ready(uint32_t job_id) { int is_ready = 1, rc; #if _DEBUG printf("Waiting for job %u to be not ready.", job_id); #endif /* It has been decided waiting forever is a better solution than ending early and saying we are done when in reality the job is still running. So now we trust the slurmctld to tell us when we are done and never end until that happens. */ while (1) { rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if (rc == READY_JOB_ERROR) /* error */ continue; /* retry */ if ((rc & READY_NODE_STATE) == 0) { is_ready = 0; break; } sleep(POLL_SLEEP); #if _DEBUG printf("."); #endif } #if _DEBUG if (is_ready == 1) printf("\n"); else printf("\nJob %u is not ready.\n", job_id); #endif if (is_ready == 1) fprintf(stderr, "Job %u is still ready.\n", job_id); }
/*
 * _wait_nodes_ready - wait for the nodes of an allocation to become usable
 *	by repeatedly calling slurm_job_node_ready().
 * IN/OUT alloc - allocation response; job_id and node_list are read, and
 *	alias_list may be replaced (see below)
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 *
 * Returns 1 immediately when either the suspend or the resume timeout is
 * zero (power save mode disabled).  Otherwise pending_job_id is set to the
 * job id for the duration of the wait and reset to 0 before returning.
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	int cur_delay = 0;
	int suspend_time, resume_time, max_delay;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; (cur_delay < max_delay); i++) {
		if (i) {
			/* Only sleep between polls, never before the first */
			if (i == 1)
				verbose("Waiting for nodes to boot");
			else
				debug("still waiting");
			sleep(POLL_SLEEP);
			cur_delay += POLL_SLEEP;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor destroy_job on the retry path as well;
			 * otherwise a cancelled wait would keep retrying
			 * until max_delay expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0)	/* job killed */
			break;
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;

		/* Stay quiet if the very first poll succeeded */
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);

		/*
		 * alias_list still holds the "TBD" placeholder: look the
		 * allocation up again and swap in the alias_list of the new
		 * response.  The old string ends up in resp and is handed
		 * to the free routine together with it.
		 */
		if (alloc->alias_list && !strcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup_lite(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		error("Nodes %s are still not ready", alloc->node_list);
	}
	/* else: destroy_job set, nothing to report; is_ready is 0 */

	pending_job_id = 0;

	return is_ready;
}
/*
 * _wait_nodes_ready - wait for the nodes of an allocation to become usable
 *	by repeatedly calling slurm_job_node_ready().
 * IN/OUT alloc - allocation response; job_id and node_list are read, and
 *	alias_list may be replaced (see below)
 * RET 1 if job and nodes are ready for job to begin, 0 otherwise
 *
 * Returns 1 immediately when either the suspend or the resume timeout is
 * zero (power save mode disabled).  Otherwise pending_job_id is set to the
 * job id for the duration of the wait and reset to 0 before returning.
 * The sleep before poll number i is POLL_SLEEP * i, so the interval grows
 * with each pass.  If the job is found to have been killed, destroy_job is
 * set to true.
 */
static int _wait_nodes_ready(resource_allocation_response_msg_t *alloc)
{
	int is_ready = 0, i, rc;
	double cur_delay = 0;
	double cur_sleep = 0;
	int suspend_time, resume_time, max_delay;
	bool job_killed = false;

	suspend_time = slurm_get_suspend_timeout();
	resume_time  = slurm_get_resume_timeout();
	if ((suspend_time == 0) || (resume_time == 0))
		return 1;	/* Power save mode disabled */
	max_delay = suspend_time + resume_time;
	max_delay *= 5;		/* Allow for ResumeRate support */

	pending_job_id = alloc->job_id;

	for (i = 0; cur_delay < max_delay; i++) {
		if (i) {
			/* Only sleep between polls, never before the first */
			cur_sleep = POLL_SLEEP * i;
			if (i == 1) {
				verbose("Waiting for nodes to boot (delay looping %d times @ %f secs x index)",
					max_delay, POLL_SLEEP);
			} else {
				debug("Waited %f sec and still waiting: next sleep for %f sec",
				      cur_delay, cur_sleep);
			}
			/*
			 * NOTE(review): the usleep() argument reaches
			 * 1,000,000 once cur_sleep >= 1 second; POSIX allows
			 * usleep() to reject such values - confirm the target
			 * platforms accept them.
			 */
			usleep(1000000 * cur_sleep);
			cur_delay += cur_sleep;
		}

		rc = slurm_job_node_ready(alloc->job_id);

		if (rc == READY_JOB_FATAL)
			break;				/* fatal error */
		if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) {
			/*
			 * Honor destroy_job on the retry path as well;
			 * otherwise a cancelled wait would keep retrying
			 * until max_delay expires.
			 */
			if (destroy_job)
				break;
			continue;			/* retry */
		}
		if ((rc & READY_JOB_STATE) == 0) {	/* job killed */
			job_killed = true;
			break;
		}
		if (rc & READY_NODE_STATE) {		/* job and node ready */
			is_ready = 1;
			break;
		}
		if (destroy_job)
			break;
	}

	if (is_ready) {
		resource_allocation_response_msg_t *resp;
		char *tmp_str;

		/* Stay quiet if the very first poll succeeded */
		if (i > 0)
			verbose("Nodes %s are ready for job", alloc->node_list);

		/*
		 * alias_list still holds the "TBD" placeholder: look the
		 * allocation up again and swap in the alias_list of the new
		 * response.  The old string ends up in resp and is handed
		 * to the free routine together with it.
		 */
		if (alloc->alias_list && !xstrcmp(alloc->alias_list, "TBD") &&
		    (slurm_allocation_lookup(pending_job_id, &resp)
		     == SLURM_SUCCESS)) {
			tmp_str = alloc->alias_list;
			alloc->alias_list = resp->alias_list;
			resp->alias_list = tmp_str;
			slurm_free_resource_allocation_response_msg(resp);
		}
	} else if (!destroy_job) {
		if (job_killed) {
			error("Job allocation %u has been revoked",
			      alloc->job_id);
			destroy_job = true;
		} else {
			error("Nodes %s are still not ready",
			      alloc->node_list);
		}
	}
	/* else: destroy_job set, nothing to report; is_ready is 0 */

	pending_job_id = 0;

	return is_ready;
}