/* block_state_mutex must be unlocked before calling this. */
extern void bg_requeue_job(uint32_t job_id, bool wait_for_start,
			   bool slurmctld_locked, uint32_t job_state,
			   bool preempted)
{
	int rc;
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	/* Wait for the slurmd to begin the batch script.
	 * slurm_fail_job() is a no-op if issued prior to the script's
	 * initiation, so do the clean up here just in case the fail
	 * job isn't run. */
	if (wait_for_start)
		sleep(2);

	if (!slurmctld_locked)
		lock_slurmctld(job_write_lock);
	rc = job_requeue(0, job_id, NULL, preempted, 0);
	if (rc == ESLURM_JOB_PENDING) {
		error("%s: Could not requeue pending job %u",
		      __func__, job_id);
	} else if (rc != SLURM_SUCCESS) {
		error("%s: Could not requeue job %u, failing it: %s",
		      __func__, job_id, slurm_strerror(rc));
		job_fail(job_id, job_state);
	}
	if (!slurmctld_locked)
		unlock_slurmctld(job_write_lock);
}
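/*
 * Usage sketch (illustrative only, not part of the original source): a
 * caller that detects a block boot failure would typically drop
 * block_state_mutex first, then requeue the block's job.  The bg_record
 * variable and the JOB_BOOT_FAIL job_state value are assumptions made
 * for this example.
 *
 *	if (bg_record->job_running > NO_JOB_RUNNING) {
 *		uint32_t job_id = bg_record->job_running;
 *
 *		slurm_mutex_unlock(&block_state_mutex);
 *		bg_requeue_job(job_id, true, false, JOB_BOOT_FAIL, false);
 *		slurm_mutex_lock(&block_state_mutex);
 *	}
 */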
static int _check_for_booted_overlapping_blocks(
	List block_list, ListIterator bg_record_itr,
	bg_record_t *bg_record, int overlap_check,
	List overlapped_list, uint16_t query_mode)
{
	bg_record_t *found_record = NULL;
	ListIterator itr = NULL;
	int rc = 0;
	int overlap = 0;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* This test is only for actually picking a block, not testing. */
	if (is_test && bg_conf->layout_mode == LAYOUT_DYNAMIC)
		return rc;

	/* Make sure no other blocks under this block are booted
	 * and running jobs. */
	itr = list_iterator_create(block_list);
	while ((found_record = (bg_record_t *) list_next(itr)) != NULL) {
		if ((!found_record->bg_block_id)
		    || (bg_record == found_record)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("Don't need to look at myself %s %s",
				     bg_record->bg_block_id,
				     found_record->bg_block_id);
			continue;
		}

		slurm_mutex_lock(&block_state_mutex);
		overlap = blocks_overlap(bg_record, found_record);
		slurm_mutex_unlock(&block_state_mutex);

		if (overlap) {
			overlap = 0;
			/* Make the available time on this block
			 * (bg_record) the max of this found_record's
			 * job or the one already set if in
			 * overlapped_list.  Since we aren't setting
			 * job_running we don't have to remove them,
			 * since the block_list should always be
			 * destroyed afterwards. */
			if (is_test && overlapped_list
			    && found_record->job_ptr
			    && bg_record->job_running == NO_JOB_RUNNING) {
				ListIterator itr = list_iterator_create(
					overlapped_list);
				bg_record_t *tmp_rec = NULL;

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("found overlapping block %s "
					     "overlapped %s with job %u",
					     found_record->bg_block_id,
					     bg_record->bg_block_id,
					     found_record->job_ptr->job_id);

				while ((tmp_rec = list_next(itr))) {
					if (tmp_rec == bg_record)
						break;
				}
				list_iterator_destroy(itr);

				if (tmp_rec
				    && (tmp_rec->job_ptr->end_time
					< found_record->job_ptr->end_time))
					tmp_rec->job_ptr =
						found_record->job_ptr;
				else if (!tmp_rec) {
					bg_record->job_ptr =
						found_record->job_ptr;
					list_append(overlapped_list,
						    bg_record);
				}
			}

			/* We already know this block doesn't work
			 * right now, so just continue and see if
			 * another overlapping block ends later. */
			if (rc)
				continue;

			/* This test is here to check if the block we
			 * chose is not booted, or if there is a block
			 * overlapping that we could avoid freeing if
			 * we choose something else. */
			if ((bg_conf->layout_mode == LAYOUT_OVERLAP)
			    && ((overlap_check == 0
				 && bg_record->state != BG_BLOCK_INITED)
				|| (overlap_check == 1
				    && found_record->state
				       != BG_BLOCK_FREE))) {
				if (!is_test) {
					rc = 1;
					break;
				}
			}

			if (((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			     || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				  || SELECT_IS_MODE_RUN_NOW(query_mode))
				 && (bg_conf->layout_mode
				     != LAYOUT_DYNAMIC)))
			    && ((found_record->job_running
				 != NO_JOB_RUNNING)
				|| (found_record->state
				    & BG_BLOCK_ERROR_FLAG))) {
				if ((found_record->job_running
				     == BLOCK_ERROR_STATE)
				    || (found_record->state
					& BG_BLOCK_ERROR_FLAG))
					error("can't use %s, "
					      "overlapping block %s "
					      "is in an error state.",
					      bg_record->bg_block_id,
					      found_record->bg_block_id);
				else if (bg_conf->slurm_debug_flags
					 & DEBUG_FLAG_BG_PICK)
					info("can't use %s, there is "
					     "a job (%d) running on "
					     "an overlapping block %s",
					     bg_record->bg_block_id,
					     found_record->job_running,
					     found_record->bg_block_id);

				if (bg_conf->layout_mode
				    == LAYOUT_DYNAMIC) {
					List tmp_list = list_create(NULL);
					/* this will remove and
					 * destroy the memory for
					 * bg_record */
					list_remove(bg_record_itr);

					slurm_mutex_lock(
						&block_state_mutex);

					if (bg_record->original) {
						if (bg_conf->
						    slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This was a "
							     "copy %s",
							     bg_record->
							     bg_block_id);
						found_record =
							bg_record->original;
					} else {
						if (bg_conf->
						    slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("looking for "
							     "original");
						found_record =
							find_org_in_bg_list(
								bg_lists->
								main,
								bg_record);
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Removing unusable "
						     "block %s from "
						     "the system.",
						     bg_record->
						     bg_block_id);

					if (!found_record) {
						if (bg_conf->
						    slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This record "
							     "%s wasn't "
							     "found in "
							     "bg_lists->"
							     "main, no big "
							     "deal, it "
							     "probably "
							     "wasn't added",
							     bg_record->
							     bg_block_id);
						found_record = bg_record;
					} else
						destroy_bg_record(
							bg_record);

					list_push(tmp_list, found_record);
					slurm_mutex_unlock(
						&block_state_mutex);

					/* If a job is running here we
					 * must not call the regular
					 * method, since we are already
					 * inside the job write lock. */
					if (found_record->job_ptr
					    && !IS_JOB_FINISHED(
						    found_record->
						    job_ptr)) {
						int rq_rc;
						info("Somehow block %s "
						     "is being freed, "
						     "but appears to "
						     "already have a job "
						     "%u(%u) running "
						     "on it.",
						     found_record->
						     bg_block_id,
						     found_record->
						     job_ptr->job_id,
						     found_record->
						     job_running);
						/* Report the requeue
						 * call's own return
						 * code, not the outer
						 * rc flag. */
						rq_rc = job_requeue(
							0,
							found_record->
							job_ptr->job_id,
							-1,
							(uint16_t)NO_VAL,
							false);
						if (rq_rc) {
							error("Couldn't "
							      "requeue "
							      "job %u, "
							      "failing "
							      "it: %s",
							      found_record->
							      job_ptr->
							      job_id,
							      slurm_strerror(
								      rq_rc));
							job_fail(
								found_record->
								job_ptr->
								job_id);
						}
					}

					free_block_list(NO_VAL, tmp_list,
							0, 0);
					list_destroy(tmp_list);
				}
				rc = 1;

				if (!is_test)
					break;
			}
		}
	}
	list_iterator_destroy(itr);

	return rc;
}
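/*
 * Call sketch (hypothetical caller, not from the original source): a
 * block picker would walk its candidate list and skip any candidate
 * this function flags as unusable.  block_list, overlapped_list and
 * query_mode are assumed to be supplied by the selection code.
 *
 *	ListIterator cand_itr = list_iterator_create(block_list);
 *	bg_record_t *cand;
 *
 *	while ((cand = list_next(cand_itr))) {
 *		if (_check_for_booted_overlapping_blocks(
 *			    block_list, cand_itr, cand, 0,
 *			    overlapped_list, query_mode))
 *			continue;	(lost to an overlapping block)
 *		... candidate is usable, try to allocate it ...
 *	}
 *	list_iterator_destroy(cand_itr);
 */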
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block ownership
 * changes were queued.
 */
extern int sync_jobs(List job_list)
{
	ListIterator itr;
	struct job_record *job_ptr = NULL;
	List block_list = NULL;
	static bool run_already = false;
	bg_record_t *bg_record = NULL;

	/* Execute only on initial startup.  We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Ensure that all running jobs own the specified block */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		bg_action_t *bg_action_ptr = NULL;
		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;

		/* Reset for this job's lookup so a match from a prior
		 * iteration can't leak through. */
		bg_record = NULL;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		if (IS_JOB_COMPLETING(job_ptr))
			bg_action_ptr->op = TERM_OP;
		else
			bg_action_ptr->op = START_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
		} else if (!(bg_record = find_bg_record_in_list(
				     bg_lists->main,
				     bg_action_ptr->bg_block_id))) {
			error("Kill job %u belongs to defunct bgblock %s",
			      job_ptr->job_id, bg_action_ptr->bg_block_id);
		}

		if (!bg_record) {
			/* Don't use slurm_fail_job, locks are
			 * already in place. */
			job_fail(job_ptr->job_id);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}
		/* _sync_agent will destroy the bg_action_ptr */
		_sync_agent(bg_action_ptr, bg_record);
	}
	list_iterator_destroy(itr);

	block_list = list_create(destroy_bg_record);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		bg_record_t *rm_record;
		if (bg_record->job_ptr
		    || (bg_record->job_list
			&& list_count(bg_record->job_list)))
			continue;
		rm_record = xmalloc(sizeof(bg_record_t));
		rm_record->magic = BLOCK_MAGIC;
		rm_record->bg_block_id = xstrdup(bg_record->bg_block_id);
		rm_record->mp_str = xstrdup(bg_record->mp_str);
		list_append(block_list, rm_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	/* Ensure that all other blocks are free of users */
	if (block_list) {
		itr = list_iterator_create(block_list);
		while ((bg_record = list_next(itr))) {
			info("Queue clearing of users of BG block %s",
			     bg_record->bg_block_id);
			term_jobs_on_block(bg_record->bg_block_id);
		}
		list_iterator_destroy(itr);
		list_destroy(block_list);
	} else {
		/* this should never happen; vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
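/*
 * Wiring sketch (assumed, not confirmed by this file): sync_jobs() is
 * meant to run once at slurmctld startup with the job write lock held,
 * e.g. from the select plugin's job-init entry point:
 *
 *	extern int select_p_job_init(List job_list)
 *	{
 *		return sync_jobs(job_list);
 *	}
 *
 * Later invocations are no-ops because of the static run_already guard.
 */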
/*
 * This could potentially take the node lock in the slurmctld with
 * slurm_drain_node or slurm_fail_job, so if slurmctld_locked is set we
 * will call the underlying functions without taking the locks again.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error("down_sub_node_blocks: invalid node specified '%s'",
		      mp_name);
		return EINVAL;
	}

	/* This is here as a sanity check to make sure we don't core on
	   these bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start + io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start + io_cnt,
		      bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));
	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start + io_cnt, bg_conf->ionodes_per_mp);

	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start + io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);
		}

		/* If we are running in Dynamic mode and the block is
		   smaller than the create size, just continue on. */
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record
		    || (smallest_bg_record->cnode_cnt
			> bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");

		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller than or equal to
		   a midplane we will just mark it in an error state,
		   as opposed to draining the node. */
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt
			< bg_conf->mp_cnode_cnt)) {
			if (smallest_bg_record->state
			    & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this "
		      "nodecard.  Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(
					mp_name, reason,
					slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   put_block_in_error_state */
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* Set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes are set (not a small
		   block), set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* We should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made. */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		switch (smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap))
			   == -1)
			/* Set the start to be the same as the start of
			   the ionode_bitmap.  If no ionodes are set
			   (not a small block), set io_start = 0. */
			io_start = 0;
	} else {
		switch (create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			break;
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			if (!node_already_down(mp_name)) {
				if (slurmctld_locked)
					drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* Since we don't have a block in this midplane,
		   we need to start at the beginning. */
		io_start = 0;
		/* We also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}

	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively, this is the only way to handle this case. */
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* Here we know the error block doesn't exist,
			   so just set the state here. */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;
}
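/*
 * Usage sketch (illustrative; nc_idx is a hypothetical nodecard index):
 * a hardware poller that sees nodecard nc_idx fail on midplane mp_name
 * would translate it to an ionode offset via bg_conf->io_ratio (ionodes
 * per nodecard) and call down_nodecard() without slurmctld locks held.
 *
 *	bitoff_t io_start = nc_idx * bg_conf->io_ratio;
 *
 *	if (down_nodecard(mp_name, io_start, false) != SLURM_SUCCESS)
 *		error("failed to down nodecard %d on %s",
 *		      nc_idx, mp_name);
 */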