/* block_state_mutex must be unlocked before calling this. */ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start, bool slurmctld_locked, uint32_t job_state, bool preempted) { int rc; slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; /* Wait for the slurmd to begin the batch script, slurm_fail_job() is a no-op if issued prior to the script initiation do clean up just incase the fail job isn't ran. */ if (wait_for_start) sleep(2); if (!slurmctld_locked) lock_slurmctld(job_write_lock); rc = job_requeue(0, job_id, NULL, preempted, 0); if (rc == ESLURM_JOB_PENDING) { error("%s: Could not requeue pending job %u", __func__, job_id); } else if (rc != SLURM_SUCCESS) { error("%s: Could not requeue job %u, failing it: %s", __func__, job_id, slurm_strerror(rc)); job_fail(job_id, job_state); } if (!slurmctld_locked) unlock_slurmctld(job_write_lock); }
/* RET 0 on success, -1 on failure */ extern int job_requeue_wiki(char *cmd_ptr, int *err_code, char **err_msg) { char *arg_ptr, *tmp_char; uint32_t jobid; struct job_record *job_ptr; static char reply_msg[128]; int slurm_rc; /* Write lock on job and node info */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK }; arg_ptr = strstr(cmd_ptr, "ARG="); if (arg_ptr == NULL) { *err_code = -300; *err_msg = "REQUEUEJOB lacks ARG"; error("wiki: REQUEUEJOB lacks ARG"); return -1; } jobid = strtoul(arg_ptr+4, &tmp_char, 10); if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) { *err_code = -300; *err_msg = "Invalid ARG value"; error("wiki: REQUEUEJOB has invalid jobid"); return -1; } lock_slurmctld(job_write_lock); slurm_rc = job_requeue(0, jobid, NULL, false, 0); if (slurm_rc != SLURM_SUCCESS) { unlock_slurmctld(job_write_lock); *err_code = -700; *err_msg = slurm_strerror(slurm_rc); error("wiki: Failed to requeue job %u (%m)", jobid); return -1; } /* We need to clear the required node list here. * If the job was submitted with srun and a * required node list, it gets lost here. */ job_ptr = find_job_record(jobid); if (job_ptr && job_ptr->details) { xfree(job_ptr->details->req_nodes); FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); } info("wiki: requeued job %u", jobid); unlock_slurmctld(job_write_lock); snprintf(reply_msg, sizeof(reply_msg), "job %u requeued successfully", jobid); *err_msg = reply_msg; return 0; }
/* block_state_mutex must be unlocked before calling this. */ extern void bg_requeue_job(uint32_t job_id, bool wait_for_start) { int rc; slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK }; /* Wait for the slurmd to begin the batch script, slurm_fail_job() is a no-op if issued prior to the script initiation do clean up just incase the fail job isn't ran. */ if (wait_for_start) sleep(2); lock_slurmctld(job_write_lock); if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false))) { error("Couldn't requeue job %u, failing it: %s", job_id, slurm_strerror(rc)); job_fail(job_id); } unlock_slurmctld(job_write_lock); }
/* Drain preempt_job_list and act on each queued job ID according to its
 * configured preempt mode (suspend / cancel / checkpoint / requeue).
 * Any job whose mode-specific action fails is killed as a last resort.
 * Assumes the caller holds whatever locks the called job_* functions
 * require — TODO confirm against the calling agent thread. */
static void _preempt_job_dequeue(void)
{
	struct job_record *job_ptr;
	uint32_t job_id, *tmp_id;
	uint16_t preempt_mode;

	xassert(preempt_job_list);
	while ((tmp_id = list_pop(preempt_job_list))) {
		/* Default to failure so an unrecognized or unhandled
		 * preempt mode falls through to the SIGKILL path below. */
		int rc = SLURM_ERROR;
		job_id = *tmp_id;
		xfree(tmp_id);	/* list element was heap-allocated */

		if ((job_ptr = find_job_record(job_id)) == NULL) {
			error("_preempt_job_dequeue could not find job %u",
			      job_id);
			continue;
		}
		preempt_mode = slurm_job_preempt_mode(job_ptr);

		if (preempt_mode == PREEMPT_MODE_SUSPEND) {
			/* ESLURM_DISABLED is treated as success so a job
			 * that can't be suspended isn't killed below. */
			if ((rc = _suspend_job(job_id)) == ESLURM_DISABLED)
				rc = SLURM_SUCCESS;
		} else if (preempt_mode == PREEMPT_MODE_CANCEL) {
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been killed",
				     job_ptr->job_id);
			}
		} else if (preempt_mode == PREEMPT_MODE_CHECKPOINT) {
			checkpoint_msg_t ckpt_msg;
			/* First try checkpoint-and-requeue ... */
			memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
			ckpt_msg.op	   = CHECK_REQUEUE;
			ckpt_msg.job_id    = job_ptr->job_id;
			rc = job_checkpoint(&ckpt_msg, 0, -1,
					    (uint16_t)NO_VAL);
			if (rc == ESLURM_NOT_SUPPORTED) {
				/* ... then fall back to checkpoint-and-vacate
				 * if requeue isn't supported. */
				memset(&ckpt_msg, 0, sizeof(checkpoint_msg_t));
				ckpt_msg.op	   = CHECK_VACATE;
				ckpt_msg.job_id    = job_ptr->job_id;
				rc = job_checkpoint(&ckpt_msg, 0, -1,
						    (uint16_t)NO_VAL);
			}
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been checkpointed",
				     job_ptr->job_id);
			} else
				error("preempted job %u could not be "
				      "checkpointed: %s",
				      job_ptr->job_id, slurm_strerror(rc));
		} else if ((preempt_mode == PREEMPT_MODE_REQUEUE) &&
			   job_ptr->batch_flag && job_ptr->details &&
			   (job_ptr->details->requeue > 0)) {
			/* Only batch jobs marked as requeue-able qualify;
			 * all others keep rc == SLURM_ERROR and get killed. */
			rc = job_requeue(0, job_ptr->job_id, -1,
					 (uint16_t)NO_VAL, true, 0);
			if (rc == SLURM_SUCCESS) {
				info("preempted job %u has been requeued",
				     job_ptr->job_id);
			} else
				error("preempted job %u could not be "
				      "requeued: %s",
				      job_ptr->job_id, slurm_strerror(rc));
		}

		if (rc != SLURM_SUCCESS) {
			/* Last resort: the preferred preemption action
			 * failed (or didn't apply), so kill the job. */
			rc = job_signal(job_ptr->job_id, SIGKILL, 0, 0, true);
			if (rc == SLURM_SUCCESS)
				info("preempted job %u had to be killed",
				     job_ptr->job_id);
			else {
				info("preempted job %u kill failure %s",
				     job_ptr->job_id, slurm_strerror(rc));
			}
		}
	}
	return;
}
/* Check whether any other booted block overlapping bg_record is running a
 * job (or is in an error state), which would prevent using bg_record now.
 * block_list      - IN  - list of candidate blocks to scan
 * bg_record_itr   - IN  - iterator over block_list positioned at bg_record
 *                         (used to remove bg_record in dynamic layout mode)
 * bg_record       - IN  - the block being considered for use
 * overlap_check   - IN  - 0/1 selector for the LAYOUT_OVERLAP test below
 * overlapped_list - IN  - in test mode, collects blocks with the latest
 *                         ending overlapping job (may be NULL)
 * query_mode      - IN  - SELECT_IS_* query flags
 * RET 1 if bg_record is unusable because of an overlap, else 0 */
static int _check_for_booted_overlapping_blocks(
	List block_list, ListIterator bg_record_itr,
	bg_record_t *bg_record, int overlap_check,
	List overlapped_list, uint16_t query_mode)
{
	bg_record_t *found_record = NULL;
	ListIterator itr = NULL;
	int rc = 0;
	int overlap = 0;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* this test only is for actually picking a block not testing */
	if (is_test && bg_conf->layout_mode == LAYOUT_DYNAMIC)
		return rc;

	/* Make sure no other blocks are under this block
	 * are booted and running jobs */
	itr = list_iterator_create(block_list);
	while ((found_record = (bg_record_t *)list_next(itr)) != NULL) {
		if ((!found_record->bg_block_id)
		    || (bg_record == found_record)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("Don't need to look at myself %s %s",
				     bg_record->bg_block_id,
				     found_record->bg_block_id);
			continue;
		}

		slurm_mutex_lock(&block_state_mutex);
		overlap = blocks_overlap(bg_record, found_record);
		slurm_mutex_unlock(&block_state_mutex);

		if (!overlap)
			continue;
		overlap = 0;

		/* make the available time on this block (bg_record) the
		 * max of this found_record's job or the one already set
		 * if in overlapped_block_list, since we aren't setting
		 * job_running we don't have to remove them since the
		 * block_list should always be destroyed afterwards. */
		if (is_test && overlapped_list
		    && found_record->job_ptr
		    && bg_record->job_running == NO_JOB_RUNNING) {
			/* renamed from "itr" to avoid shadowing the outer
			 * iterator (-Wshadow) */
			ListIterator over_itr = list_iterator_create(
				overlapped_list);
			bg_record_t *tmp_rec = NULL;
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("found overlapping block %s "
				     "overlapped %s with job %u",
				     found_record->bg_block_id,
				     bg_record->bg_block_id,
				     found_record->job_ptr->job_id);
			while ((tmp_rec = list_next(over_itr))) {
				if (tmp_rec == bg_record)
					break;
			}
			list_iterator_destroy(over_itr);
			if (tmp_rec && tmp_rec->job_ptr->end_time
			    < found_record->job_ptr->end_time)
				tmp_rec->job_ptr = found_record->job_ptr;
			else if (!tmp_rec) {
				bg_record->job_ptr = found_record->job_ptr;
				list_append(overlapped_list, bg_record);
			}
		}

		/* We already know this block doesn't work right now so we
		 * will if there is another overlapping block that ends
		 * later */
		if (rc)
			continue;

		/* This test is here to check if the block we chose is not
		 * booted or if there is a block overlapping that we could
		 * avoid freeing if we choose something else */
		if (bg_conf->layout_mode == LAYOUT_OVERLAP
		    && ((overlap_check == 0 && bg_record->state
			 != BG_BLOCK_INITED)
			|| (overlap_check == 1 && found_record->state
			    != BG_BLOCK_FREE))) {
			if (!is_test) {
				rc = 1;
				break;
			}
		}

		if (((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		     || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
			  || SELECT_IS_MODE_RUN_NOW(query_mode))
			 && (bg_conf->layout_mode != LAYOUT_DYNAMIC)))
		    && ((found_record->job_running != NO_JOB_RUNNING)
			|| (found_record->state & BG_BLOCK_ERROR_FLAG))) {
			if ((found_record->job_running == BLOCK_ERROR_STATE)
			    || (found_record->state & BG_BLOCK_ERROR_FLAG))
				error("can't use %s, overlapping block %s "
				      "is in an error state.",
				      bg_record->bg_block_id,
				      found_record->bg_block_id);
			else if (bg_conf->slurm_debug_flags
				 & DEBUG_FLAG_BG_PICK)
				info("can't use %s, there is a job (%d) "
				     "running on an overlapping block %s",
				     bg_record->bg_block_id,
				     found_record->job_running,
				     found_record->bg_block_id);

			if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
				List tmp_list = list_create(NULL);
				/* this will remove and destroy the memory
				 * for bg_record */
				list_remove(bg_record_itr);
				slurm_mutex_lock(&block_state_mutex);
				if (bg_record->original) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("This was a copy %s",
						     bg_record->bg_block_id);
					found_record = bg_record->original;
				} else {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("looking for original");
					found_record = find_org_in_bg_list(
						bg_lists->main, bg_record);
				}

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("Removing unusable block "
					     "%s from the system.",
					     bg_record->bg_block_id);

				if (!found_record) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("This record %s wasn't "
						     "found in the "
						     "bg_lists->main, "
						     "no big deal, it "
						     "probably wasn't added",
						     bg_record->bg_block_id);
					found_record = bg_record;
				} else
					destroy_bg_record(bg_record);

				list_push(tmp_list, found_record);
				slurm_mutex_unlock(&block_state_mutex);

				/* We need to make sure if a job is running
				 * here to not call the regular method since
				 * we are inside the job write lock already. */
				if (found_record->job_ptr
				    && !IS_JOB_FINISHED(
					    found_record->job_ptr)) {
					int requeue_rc;
					info("Somehow block %s is being "
					     "freed, but appears to already "
					     "have a job %u(%u) running "
					     "on it.",
					     found_record->bg_block_id,
					     found_record->job_ptr->job_id,
					     found_record->job_running);
					/* BUG FIX: the return of
					 * job_requeue() was never captured;
					 * the error below logged the stale
					 * local rc (0 here, i.e. "No error")
					 * instead of the requeue failure. */
					requeue_rc = job_requeue(
						0,
						found_record->job_ptr->job_id,
						-1, (uint16_t)NO_VAL, false);
					if (requeue_rc) {
						error("Couldn't requeue "
						      "job %u, failing "
						      "it: %s",
						      found_record->
						      job_ptr->job_id,
						      slurm_strerror(
							      requeue_rc));
						job_fail(found_record->
							 job_ptr->job_id);
					}
				}

				free_block_list(NO_VAL, tmp_list, 0, 0);
				list_destroy(tmp_list);
			}
			rc = 1;

			if (!is_test)
				break;
		}
	}
	list_iterator_destroy(itr);

	return rc;
}