/*
 * Reconcile a block record's cached state with a newly reported state.
 *
 * bg_record      - block whose state is being updated (record is mutated)
 * state          - newly observed state (without the error flag)
 * kill_job_list  - list to which jobs needing termination are pushed
 *
 * Returns 1 if the recorded state changed, 0 if it was already current.
 *
 * NOTE(review): callers appear to hold block_state_mutex — this function
 * temporarily drops and retakes it around requeue_and_error(); confirm
 * the caller contract before reuse.
 */
extern int bg_status_update_block_state(bg_record_t *bg_record,
					uint16_t state,
					List kill_job_list)
{
	bool skipped_dealloc = false;
	kill_job_struct_t *freeit = NULL;
	int updated = 0;
	/* Strip the error flag so state comparisons see the base state. */
	uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

	if (real_state == state)
		return 0;

	debug("state of Block %s was %s and now is %s",
	      bg_record->bg_block_id,
	      bg_block_state_string(bg_record->state),
	      bg_block_state_string(state));

	/* check to make sure block went through freeing correctly */
	if ((real_state != BG_BLOCK_TERM
	     && !(bg_record->state & BG_BLOCK_ERROR_FLAG))
	    && state == BG_BLOCK_FREE)
		skipped_dealloc = 1;
	else if ((real_state == BG_BLOCK_INITED)
		 && (state == BG_BLOCK_BOOTING)) {
		/* This means the user did a reboot through mpirun but
		   we missed the state change */
		debug("Block %s skipped rebooting, "
		      "but it really is. "
		      "Setting target_name back to %s",
		      bg_record->bg_block_id,
		      bg_record->user_name);
		xfree(bg_record->target_name);
		bg_record->target_name = xstrdup(bg_record->user_name);
	} else if ((real_state == BG_BLOCK_TERM)
		   && (state == BG_BLOCK_BOOTING))
		/* This is a funky state IBM says isn't a bug, but all
		   their documentation says this doesn't happen, but
		   IBM says oh yeah, you weren't really suppose to
		   notice that. So we will just skip this state and
		   act like this didn't happen. */
		goto nochange_state;

	real_state = state;
	/* Preserve a sticky error flag across the state change. */
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		state |= BG_BLOCK_ERROR_FLAG;
	bg_record->state = state;

	if (real_state == BG_BLOCK_TERM || skipped_dealloc)
		_block_is_deallocating(bg_record, kill_job_list);
	else if (real_state == BG_BLOCK_BOOTING) {
		debug("Setting bootflag for %s", bg_record->bg_block_id);
		bg_record->boot_state = 1;
	} else if (real_state == BG_BLOCK_FREE) {
		/* Reclaim cpus if a job was still marked running here. */
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (real_state & BG_BLOCK_ERROR_FLAG) {
		if (bg_record->boot_state)
			error("Block %s in an error state while booting.",
			      bg_record->bg_block_id);
		else
			error("Block %s in an error state.",
			      bg_record->bg_block_id);
		remove_from_bg_list(bg_lists->booted, bg_record);
		trigger_block_error();
	} else if (real_state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	}
	updated = 1;

nochange_state:

	/* check the boot state */
	debug3("boot state for block %s is %d",
	       bg_record->bg_block_id, bg_record->boot_state);
	if (bg_record->boot_state) {
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* If we get an error on boot that
			 * means it is a transparent L3 error
			 * and should be trying to fix
			 * itself.  If this is the case we
			 * just hang out waiting for the state
			 * to go to free where we will try to
			 * boot again below.
			 */
			return updated;
		}
		switch (real_state) {
		case BG_BLOCK_BOOTING:
			debug3("checking to make sure user %s "
			       "is the user.",
			       bg_record->target_name);
			if (update_block_user(bg_record, 0) == 1)
				last_bg_update = time(NULL);
			if (bg_record->job_ptr) {
				bg_record->job_ptr->job_state |=
					JOB_CONFIGURING;
				last_job_update = time(NULL);
			}
			break;
		case BG_BLOCK_FREE:
			/* Block freed mid-boot: retry the boot until the
			 * retry budget is exhausted, then requeue the job. */
			if (bg_record->boot_count < RETRY_BOOT_COUNT) {
				bridge_block_boot(bg_record);
				/* Re-check magic: bridge_block_boot may have
				 * invalidated the record. */
				if (bg_record->magic == BLOCK_MAGIC) {
					debug("boot count for block %s is %d",
					      bg_record->bg_block_id,
					      bg_record->boot_count);
					bg_record->boot_count++;
				}
			} else {
				char *reason = (char *)
					"status_check: Boot fails ";
				error("Couldn't boot Block %s for user %s",
				      bg_record->bg_block_id,
				      bg_record->target_name);
				/* requeue_and_error must run unlocked. */
				slurm_mutex_unlock(&block_state_mutex);
				requeue_and_error(bg_record, reason);
				slurm_mutex_lock(&block_state_mutex);
				bg_record->boot_state = 0;
				bg_record->boot_count = 0;
				if (remove_from_bg_list(
					    bg_lists->job_running, bg_record)
				    == SLURM_SUCCESS)
					num_unused_cpus += bg_record->cpu_cnt;
				remove_from_bg_list(bg_lists->booted,
						    bg_record);
			}
			break;
		case BG_BLOCK_INITED:
			debug("block %s is ready.",
			      bg_record->bg_block_id);
			if (bg_record->job_ptr) {
				bg_record->job_ptr->job_state &=
					(~JOB_CONFIGURING);
				last_job_update = time(NULL);
			}
			/* boot flags are reset here */
			if (kill_job_list
			    && set_block_user(bg_record) == SLURM_ERROR) {
				/* Ownership of freeit passes to
				 * kill_job_list. */
				freeit = (kill_job_struct_t *)
					xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = bg_record->job_running;
				list_push(kill_job_list, freeit);
			}
			break;
		case BG_BLOCK_TERM:
			debug2("Block %s is in a deallocating state "
			       "during a boot. Doing nothing until "
			       "free state.",
			       bg_record->bg_block_id);
			break;
		case BG_BLOCK_REBOOTING:
			debug2("Block %s is rebooting.",
			       bg_record->bg_block_id);
			break;
		default:
			debug("Hey the state of block "
			      "%s is %d(%s) doing nothing.",
			      bg_record->bg_block_id,
			      real_state,
			      bg_block_state_string(bg_record->state));
			break;
		}
	}

	return updated;
}
/* Perform job initiation work: free overlapping blocks, apply any image /
 * connection-type changes (rebooting the block if needed), boot the block,
 * and hand it to the job's user.  Takes and releases block_state_mutex
 * internally; requeues or fails the job on any unrecoverable error. */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);
		return;
	}

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		// bg_reset_block(bg_record); should already happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	if (bg_record->state == BG_BLOCK_TERM) {
		debug("Block is in Deallocating state, waiting for free.");
		/* It doesn't appear state of a small block
		   (conn_type) is held on a BGP system so if we to
		   reset it so, just set the reboot flag and handle it
		   later in that code. */
		bg_action_ptr->reboot = 1;
	}

	/* Collect every block overlapping ours that must be freed
	 * before we can boot; bail out if one is busy with a job. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		/* NOTE(review): the !found_record test is redundant —
		 * the loop condition already guarantees non-NULL. */
		if ((!found_record) || (bg_record == found_record))
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		if (found_record->job_ptr) {
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld. "
			      "This should never happen.",
			      bg_action_ptr->job_ptr->job_id,
			      bg_record->bg_block_id,
			      found_record->job_ptr->job_id,
			      found_record->bg_block_id,
			      found_record->job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		list_destroy(delete_list);
		bg_reset_block(bg_record);
		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}
	slurm_mutex_unlock(&block_state_mutex);

	/* Free the overlapping blocks with the mutex released —
	 * free_block_list does its own locking. */
	rc = free_block_list(bg_action_ptr->job_ptr->job_id,
			     delete_list, 0, 1);
	list_destroy(delete_list);
	if (rc != SLURM_SUCCESS) {
		error("Problem with deallocating blocks to run job %u "
		      "on block %s", bg_action_ptr->job_ptr->job_id,
		      bg_action_ptr->bg_block_id);
		if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr))
			bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Failure will unlock block_state_mutex so no need to unlock
	   before return. */
	if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
		return;

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		// bg_reset_block(bg_record); should already happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* rc becomes 1 if any image / conn_type differs from the
	 * block's current configuration (forces a free + modify). */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	    && strcasecmp(bg_action_ptr->blrtsimage,
			  bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	    && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		debug3("changing small block mode from %s to %s",
		       conn_type_string(bg_record->conn_type[0]),
		       conn_type_string(bg_action_ptr->conn_type[0]));
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	    && strcasecmp(bg_action_ptr->linuximage,
			  bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	    && strcasecmp(bg_action_ptr->ramdiskimage,
			  bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	    && strcasecmp(bg_action_ptr->mloaderimage,
			  bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't loose this
		 * block since bg_free_block will unlock
		 * block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));
#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can
			   be set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));
#endif
		bg_record->modifying = 0;
	}

	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state. This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Emulated system: no real hardware, so mark the
		 * block booted immediately. */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	/* bg_record->boot_count = 0; */
	xfree(bg_record->target_name);
	bg_record->target_name =
		uid_to_string(bg_action_ptr->job_ptr->user_id);
	debug("setting the target_name for Block %s to %s",
	      bg_record->bg_block_id, bg_record->target_name);

	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is ready.", bg_record->bg_block_id);
		set_user_rc = set_block_user(bg_record);
		if (bg_action_ptr->job_ptr) {
			bg_action_ptr->job_ptr->job_state &=
				(~JOB_CONFIGURING);
			last_job_update = time(NULL);
		}
	}
	slurm_mutex_unlock(&block_state_mutex);

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* wait for the slurmd to begin the batch script,
		   slurm_fail_job() is a no-op if issued prior to
		   the script initiation do clean up just incase the
		   fail job isn't ran */
		(void) slurm_fail_job(bg_record->job_running);
		slurm_mutex_lock(&block_state_mutex);
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
		slurm_mutex_unlock(&block_state_mutex);
	}
}
/* Perform job initiation work: free overlapping blocks, wait for any
 * concurrent free of this block to finish, apply image / connection-type
 * changes (rebooting the block if needed), boot the block, and sync its
 * users.  Takes and releases block_state_mutex internally; requeues or
 * fails the job on any unrecoverable error.
 *
 * FIX: the !bg_record branch formerly executed "bg_record->modifying = 0"
 * — a guaranteed NULL-pointer dereference on exactly the path where the
 * record was not found.  The statement is removed; there is no record to
 * reset when the lookup fails. */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;
	/* Cache the job id: the job_ptr may be invalidated once locks
	 * are dropped. */
	uint32_t req_job_id = bg_action_ptr->job_ptr->job_id;
	bool block_inited = 0;
	bool delete_it = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		/* No record to reset modifying on — dereferencing it
		 * here was a NULL-pointer bug. */
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(req_job_id, 1, 0, JOB_BOOT_FAIL, false);
		return;
	}

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		bg_record->modifying = 0;
		// bg_reset_block(bg_record); should already happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	if ((bg_record->state == BG_BLOCK_TERM) || bg_record->free_cnt) {
		/* It doesn't appear state of a small block
		   (conn_type) is held on a BGP system so if we to
		   reset it so, just set the reboot flag and handle it
		   later in that code. */
		bg_action_ptr->reboot = 1;
	}

	/* Collect every block overlapping ours that must be freed
	 * before we can boot; bail out if one is busy with a job. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		if (bg_record == found_record)
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		if (found_record->job_ptr
		    || (found_record->job_list
			&& list_count(found_record->job_list))) {
			struct job_record *job_ptr = found_record->job_ptr;
			if (!found_record->job_ptr)
				job_ptr = find_job_in_bg_record(
					found_record, NO_VAL);
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld. "
			      "This should never happen.",
			      req_job_id,
			      bg_record->bg_block_id,
			      job_ptr->job_id,
			      found_record->bg_block_id,
			      job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		FREE_NULL_LIST(delete_list);

		bg_reset_block(bg_record, bg_action_ptr->job_ptr);

		bg_record->modifying = 0;
		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(req_job_id, 0, 0, JOB_BOOT_FAIL, false);
		return;
	}

	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		delete_it = 1;
	free_block_list(req_job_id, delete_list, delete_it, 1);
	FREE_NULL_LIST(delete_list);

	while (1) {
		slurm_mutex_lock(&block_state_mutex);
		/* Failure will unlock block_state_mutex so no need to
		   unlock before return.  No need to reset modifying
		   here if the block doesn't exist. */
		if (!_make_sure_block_still_exists(bg_action_ptr,
						   bg_record)) {
			error("Problem with deallocating blocks to run job %u "
			      "on block %s", req_job_id,
			      bg_action_ptr->bg_block_id);
			return;
		}
		/* If another thread is freeing this block we need to
		   wait until it is done or we will get into a state
		   where this job will be killed. */
		if (!bg_record->free_cnt)
			break;
		debug("Waiting for block %s to free for job %u. "
		      "%d thread(s) trying to free it",
		      bg_record->bg_block_id, req_job_id,
		      bg_record->free_cnt);
		slurm_mutex_unlock(&block_state_mutex);
		sleep(1);
	}
	/* This was set in the start_job function to close the above
	   window where a job could be mistakenly requeued if another
	   thread is trying to free this block as we are trying to run
	   on it, which is fine since we will reboot it later. */
	bg_record->modifying = 0;

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		// bg_reset_block(bg_record); should already happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      req_job_id);
		return;
	}

	if (bg_record->job_list
	    && (bg_action_ptr->job_ptr->total_cpus != bg_record->cpu_cnt)
	    && (list_count(bg_record->job_list) != 1)) {
		/* We don't allow modification of a block or reboot of
		   a block if we are running multiple jobs on the
		   block. */
		debug2("no reboot");
		goto no_reboot;
	}

	/* rc becomes 1 if any image / conn_type differs from the
	 * block's current configuration (forces a free + modify). */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	    && xstrcasecmp(bg_action_ptr->blrtsimage,
			   bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	    && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) {
			char *req_conn_type =
				conn_type_string_full(bg_action_ptr->
						      conn_type);
			char *conn_type =
				conn_type_string_full(bg_record->conn_type);
			debug3("changing small block mode from %s to %s",
			       conn_type, req_conn_type);
			xfree(req_conn_type);
			xfree(conn_type);
		}
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	    && xstrcasecmp(bg_action_ptr->linuximage,
			   bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	    && xstrcasecmp(bg_action_ptr->ramdiskimage,
			   bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	    && xstrcasecmp(bg_action_ptr->mloaderimage,
			   bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't loose this
		 * block since bg_free_block will unlock
		 * block_state_mutex. */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));
#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can
			   be set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));
#endif
		bg_record->modifying = 0;
	}

no_reboot:
	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state. This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Emulated system: no real hardware, so mark the
		 * block booted immediately. */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation. */
	/* bg_record->boot_count = 0; */
	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is already ready.", bg_record->bg_block_id);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		set_user_rc = bridge_block_sync_users(bg_record);
		block_inited = 1;
	}
	slurm_mutex_unlock(&block_state_mutex);

	/* This lock needs to happen after the block_state_mutex to
	   avoid deadlock. */
	if (block_inited && bg_action_ptr->job_ptr) {
		slurmctld_lock_t job_write_lock = {
			NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
		lock_slurmctld(job_write_lock);
		bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		unlock_slurmctld(job_write_lock);
	}

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* wait for the slurmd to begin the batch script,
		   slurm_fail_job() is a no-op if issued prior to the
		   script initiation do clean up just in case the fail
		   job isn't ran */
		(void) slurm_fail_job(req_job_id, JOB_BOOT_FAIL);
	}
}
/*
 * Reconcile a block record's cached state with a newly reported state,
 * propagating JOB_CONFIGURING to every job on the block (single job_ptr
 * or multi-job job_list).
 *
 * bg_record      - block whose state is being updated (record is mutated)
 * state          - newly observed state (without the error flag)
 * kill_job_list  - list to which jobs needing termination are pushed
 *
 * Returns 1 if the recorded state changed, 0 if it was already current.
 *
 * NOTE(review): this function drops and retakes both block_state_mutex
 * and the slurmctld job_read_lock around requeue_and_error(); it assumes
 * the caller holds both — confirm against the call sites.  job_read_lock
 * is presumably a file-scope slurmctld_lock_t — verify.
 */
extern int bg_status_update_block_state(bg_record_t *bg_record,
					uint16_t state,
					List kill_job_list)
{
	bool skipped_dealloc = false;
	kill_job_struct_t *freeit = NULL;
	int updated = 0;
	/* Strip the error flag so state comparisons see the base state. */
	uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

	if (real_state == state)
		return 0;

	debug("state of Block %s was %s and now is %s",
	      bg_record->bg_block_id,
	      bg_block_state_string(bg_record->state),
	      bg_block_state_string(state));

	/* check to make sure block went through freeing correctly */
	if ((real_state != BG_BLOCK_TERM
	     && !(bg_record->state & BG_BLOCK_ERROR_FLAG))
	    && state == BG_BLOCK_FREE)
		skipped_dealloc = 1;
	else if ((real_state == BG_BLOCK_INITED)
		 && (state == BG_BLOCK_BOOTING)) {
		/* This means the user did a reboot through mpirun but
		   we missed the state change */
		debug("Block %s skipped rebooting, "
		      "but it really is.",
		      bg_record->bg_block_id);
	} else if ((real_state == BG_BLOCK_TERM)
		   && (state == BG_BLOCK_BOOTING))
		/* This is a funky state IBM says isn't a bug, but all
		   their documentation says this doesn't happen, but
		   IBM says oh yeah, you weren't really suppose to
		   notice that. So we will just skip this state and
		   act like this didn't happen. */
		goto nochange_state;

	real_state = state;
	/* Preserve a sticky error flag across the state change. */
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		state |= BG_BLOCK_ERROR_FLAG;
	bg_record->state = state;

	if (real_state == BG_BLOCK_TERM || skipped_dealloc)
		_block_is_deallocating(bg_record, kill_job_list);
	else if (real_state == BG_BLOCK_BOOTING) {
		debug("Setting bootflag for %s", bg_record->bg_block_id);
		bg_record->boot_state = 1;
	} else if (real_state == BG_BLOCK_FREE) {
		/* Make sure block is cleaned up.  If there are
		 * running jobs on the block this happens when they
		 * are cleaned off. */
		if (bg_record->job_running == NO_JOB_RUNNING
		    && (!bg_record->job_list
			|| !list_count(bg_record->job_list)))
			bg_reset_block(bg_record, NULL);
		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (real_state & BG_BLOCK_ERROR_FLAG) {
		if (bg_record->boot_state)
			error("Block %s in an error state while booting.",
			      bg_record->bg_block_id);
		else
			error("Block %s in an error state.",
			      bg_record->bg_block_id);
		remove_from_bg_list(bg_lists->booted, bg_record);
		trigger_block_error();
	} else if (real_state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	}
	updated = 1;
	last_bg_update = time(NULL);

nochange_state:

	/* check the boot state */
	debug3("boot state for block %s is %d",
	       bg_record->bg_block_id, bg_record->boot_state);
	if (bg_record->boot_state) {
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* If we get an error on boot that
			 * means it is a transparent L3 error
			 * and should be trying to fix
			 * itself.  If this is the case we
			 * just hang out waiting for the state
			 * to go to free where we will try to
			 * boot again below.
			 */
			return updated;
		}
		switch (real_state) {
		case BG_BLOCK_BOOTING:
			/* Mark every job on the block configuring,
			 * whether the block runs one job (job_ptr) or
			 * several (job_list). */
			if (bg_record->job_ptr
			    && !IS_JOB_CONFIGURING(bg_record->job_ptr)) {
				debug3("Setting job %u on block %s "
				       "to configuring",
				       bg_record->job_ptr->job_id,
				       bg_record->bg_block_id);
				bg_record->job_ptr->job_state |=
					JOB_CONFIGURING;
				last_job_update = time(NULL);
			} else if (bg_record->job_list
				   && list_count(bg_record->job_list)) {
				struct job_record *job_ptr;
				ListIterator job_itr = list_iterator_create(
					bg_record->job_list);
				while ((job_ptr = list_next(job_itr))) {
					/* Drop stale entries whose job
					 * record was purged. */
					if (job_ptr->magic != JOB_MAGIC) {
						error("bg_status_update_"
						      "block_state: 1 "
						      "bad magic found when "
						      "looking at block %s",
						      bg_record->
						      bg_block_id);
						list_delete_item(job_itr);
						continue;
					}
					job_ptr->job_state |=
						JOB_CONFIGURING;
				}
				list_iterator_destroy(job_itr);
				last_job_update = time(NULL);
			}
			break;
		case BG_BLOCK_FREE:
			/* Block freed mid-boot: retry the boot until the
			 * retry budget is exhausted, then requeue. */
			if (bg_record->boot_count < RETRY_BOOT_COUNT) {
				bridge_block_boot(bg_record);
				/* Re-check magic: bridge_block_boot may
				 * have invalidated the record. */
				if (bg_record->magic == BLOCK_MAGIC) {
					debug("boot count for block %s is %d",
					      bg_record->bg_block_id,
					      bg_record->boot_count);
					bg_record->boot_count++;
				}
			} else {
				char *reason = (char *)
					"status_check: Boot fails ";

				error("Couldn't boot Block %s",
				      bg_record->bg_block_id);

				/* We can't push on the kill_job_list
				   here since we have to put this block
				   in an error and that means the
				   killing has to take place before the
				   erroring of the block. */
				slurm_mutex_unlock(&block_state_mutex);
				unlock_slurmctld(job_read_lock);
				requeue_and_error(bg_record, reason);
				lock_slurmctld(job_read_lock);
				slurm_mutex_lock(&block_state_mutex);

				bg_record->boot_state = 0;
				bg_record->boot_count = 0;

				remove_from_bg_list(bg_lists->booted,
						    bg_record);
			}
			break;
		case BG_BLOCK_INITED:
			debug("block %s is ready.", bg_record->bg_block_id);
			/* Clear JOB_CONFIGURING on every job of the
			 * block (single job_ptr or multi-job list). */
			if (bg_record->job_ptr
			    && IS_JOB_CONFIGURING(bg_record->job_ptr)) {
				bg_record->job_ptr->job_state &=
					(~JOB_CONFIGURING);
				last_job_update = time(NULL);
			} else if (bg_record->job_list
				   && list_count(bg_record->job_list)) {
				struct job_record *job_ptr;
				ListIterator job_itr = list_iterator_create(
					bg_record->job_list);
				while ((job_ptr = list_next(job_itr))) {
					/* Drop stale entries whose job
					 * record was purged. */
					if (job_ptr->magic != JOB_MAGIC) {
						error("bg_status_update_"
						      "block_state: 2 "
						      "bad magic found when "
						      "looking at block %s",
						      bg_record->
						      bg_block_id);
						list_delete_item(job_itr);
						continue;
					}
					job_ptr->job_state &=
						(~JOB_CONFIGURING);
				}
				list_iterator_destroy(job_itr);
				last_job_update = time(NULL);
			}
			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (kill_job_list
			    && bridge_block_sync_users(bg_record)
			    == SLURM_ERROR) {
				/* Ownership of freeit passes to
				 * kill_job_list. */
				freeit = (kill_job_struct_t *)
					xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = bg_record->job_running;
				list_push(kill_job_list, freeit);
			}
			break;
		case BG_BLOCK_TERM:
			debug2("Block %s is in a deallocating state "
			       "during a boot. Doing nothing until "
			       "free state.",
			       bg_record->bg_block_id);
			break;
		case BG_BLOCK_REBOOTING:
			debug2("Block %s is rebooting.",
			       bg_record->bg_block_id);
			break;
		default:
			debug("Hey the state of block "
			      "%s is %d(%s) doing nothing.",
			      bg_record->bg_block_id,
			      real_state,
			      bg_block_state_string(bg_record->state));
			break;
		}
	}

	return updated;
}