/*
 * Collect the switches whose midplane (BP) ID matches mpid.
 *
 * IN  my_bg        - bridge handle the switch list is read from
 * IN  mpid         - midplane ID to look for (compared case-insensitively)
 * OUT coord_switch - filled, in iteration order, with each matching switch
 * RET SLURM_SUCCESS once SYSTEM_DIMENSIONS matching switches were stored,
 *     SLURM_ERROR if the list was exhausted first.
 *
 * The total number of switches is read from the bridge on the first call
 * and cached in a function-static variable for every later call.
 */
static int _get_switches_by_mpid(
	my_bluegene_t* my_bg, const char *mpid,
	rm_switch_t *coord_switch[SYSTEM_DIMENSIONS])
{
	static int switch_num = 0;
	rm_switch_t *sw = NULL;
	char *sw_mpid = NULL;
	int matched = 0;
	int idx, rc;

	if (switch_num == 0) {
		rc = bridge_get_data(my_bg, RM_SwitchNum, &switch_num);
		if (rc != SLURM_SUCCESS) {
			fatal("bridge_get_data: RM_SwitchNum: %s",
			      bg_err_str(rc));
			return SLURM_ERROR;
		}
	}

	for (idx = 0; idx < switch_num; idx++) {
		int complete = 0;

		/* The bridge iterator needs "first" once, then "next". */
		if (idx == 0) {
			rc = bridge_get_data(my_bg, RM_FirstSwitch, &sw);
			if (rc != SLURM_SUCCESS)
				fatal("bridge_get_data"
				      "(RM_FirstSwitch): %s",
				      bg_err_str(rc));
		} else {
			rc = bridge_get_data(my_bg, RM_NextSwitch, &sw);
			if (rc != SLURM_SUCCESS)
				fatal("bridge_get_data"
				      "(RM_NextSwitch): %s",
				      bg_err_str(rc));
		}

		rc = bridge_get_data(sw, RM_SwitchBPID, &sw_mpid);
		if (rc != SLURM_SUCCESS)
			fatal("bridge_get_data: RM_SwitchBPID: %s",
			      bg_err_str(rc));

		if (sw_mpid == NULL) {
			error("No BP ID was returned from database");
			continue;
		}

		if (strcasecmp(mpid, sw_mpid) == 0) {
			coord_switch[matched] = sw;
			matched++;
			complete = (matched == SYSTEM_DIMENSIONS);
		}

		/* The ID string is released with free() on every path. */
		free(sw_mpid);
		if (complete)
			return SLURM_SUCCESS;
	}

	return SLURM_ERROR;
}
/*
 * Walk every midplane (BP) reported by the bridge and hand each one to
 * _test_down_nodecards() so nodes that are not UP in MMCS get DRAINED in
 * SLURM.
 *
 * IN my_bg - bridge handle the midplane list is read from
 *
 * A failure to read the midplane count ends the scan; a failure to fetch an
 * individual midplane is logged and that iteration is skipped.
 */
static void _test_down_nodes(my_bluegene_t *my_bg)
{
	rm_BP_t *midplane;
	int total, idx, rc;

	debug2("Running _test_down_nodes");

	rc = bridge_get_data(my_bg, RM_BPNum, &total);
	if (rc != SLURM_SUCCESS) {
		error("bridge_get_data(RM_BPNum): %s", bg_err_str(rc));
		return;
	}

	for (idx = 0; idx < total; idx++) {
		if (idx == 0) {
			rc = bridge_get_data(my_bg, RM_FirstBP, &midplane);
			if (rc != SLURM_SUCCESS) {
				error("bridge_get_data(RM_FirstBP): %s",
				      bg_err_str(rc));
				continue;
			}
		} else {
			rc = bridge_get_data(my_bg, RM_NextBP, &midplane);
			if (rc != SLURM_SUCCESS) {
				error("bridge_get_data(RM_NextBP): %s",
				      bg_err_str(rc));
				continue;
			}
		}

		_test_down_nodecards(midplane, 0);
	}
}
/*
 * Look for switches that are not UP in MMCS. For each one found, the owning
 * midplane is passed to _configure_node_down().
 *
 * IN my_bg - bridge handle the switch list is read from
 *
 * Any bridge query failure is logged; a failed count query ends the scan,
 * a failed per-switch query skips only that switch.
 */
static void _test_down_switches(my_bluegene_t *my_bg)
{
	rm_switch_state_t state;
	rm_switch_t *sw;
	rm_bp_id_t owner_id;
	int total, idx, rc;

	debug2("Running _test_down_switches");

	rc = bridge_get_data(my_bg, RM_SwitchNum, &total);
	if (rc != SLURM_SUCCESS) {
		error("bridge_get_data(RM_SwitchNum): %s", bg_err_str(rc));
		return;
	}

	for (idx = 0; idx < total; idx++) {
		if (idx == 0) {
			rc = bridge_get_data(my_bg, RM_FirstSwitch, &sw);
			if (rc != SLURM_SUCCESS) {
				error("bridge_get_data(RM_FirstSwitch): %s",
				      bg_err_str(rc));
				continue;
			}
		} else {
			rc = bridge_get_data(my_bg, RM_NextSwitch, &sw);
			if (rc != SLURM_SUCCESS) {
				error("bridge_get_data(RM_NextSwitch): %s",
				      bg_err_str(rc));
				continue;
			}
		}

		rc = bridge_get_data(sw, RM_SwitchState, &state);
		if (rc != SLURM_SUCCESS) {
			error("bridge_get_data(RM_SwitchState): %s",
			      bg_err_str(rc));
			continue;
		}

		/* Healthy switches need no action. */
		if (state == RM_SWITCH_UP)
			continue;

		rc = bridge_get_data(sw, RM_SwitchBPID, &owner_id);
		if (rc != SLURM_SUCCESS) {
			error("bridge_get_data(RM_SwitchBPID): %s",
			      bg_err_str(rc));
			continue;
		}

		if (owner_id) {
			_configure_node_down(owner_id, my_bg);
			/* The ID returned by the bridge is released here. */
			free(owner_id);
		} else {
			error("No BPID was returned from database");
		}
	}
}
/*
 * Find the midplane located at the given X/Y/Z coordinates.
 *
 * IN  my_bg      - bridge handle the midplane list is read from
 * IN  curr_coord - coordinates to match, indexed by X, Y and Z
 * OUT mp         - receives each midplane as the list is walked; on success
 *                  it is the midplane at curr_coord
 * RET SLURM_SUCCESS when a midplane at curr_coord is found, SLURM_ERROR when
 *     none matches or a bridge query fails.
 *
 * The midplane count is queried once and cached in a function-static
 * variable.
 */
static int _get_mp_by_location(my_bluegene_t* my_bg,
			       uint16_t* curr_coord, rm_BP_t** mp)
{
	static int mp_num = 0;
	rm_location_t where;
	int idx, rc;

	if (mp_num == 0) {
		rc = bridge_get_data(my_bg, RM_BPNum, &mp_num);
		if (rc != SLURM_SUCCESS) {
			fatal("bridge_get_data: RM_BPNum: %s", bg_err_str(rc));
			return SLURM_ERROR;
		}
	}

	for (idx = 0; idx < mp_num; idx++) {
		if (idx == 0) {
			rc = bridge_get_data(my_bg, RM_FirstBP, mp);
			if (rc != SLURM_SUCCESS) {
				fatal("bridge_get_data: RM_FirstBP: %s",
				      bg_err_str(rc));
				return SLURM_ERROR;
			}
		} else {
			rc = bridge_get_data(my_bg, RM_NextBP, mp);
			if (rc != SLURM_SUCCESS) {
				fatal("bridge_get_data: RM_NextBP: %s",
				      bg_err_str(rc));
				return SLURM_ERROR;
			}
		}

		rc = bridge_get_data(*mp, RM_BPLoc, &where);
		if (rc != SLURM_SUCCESS) {
			fatal("bridge_get_data: RM_BPLoc: %s", bg_err_str(rc));
			return SLURM_ERROR;
		}

		/* Keep looking unless all three coordinates agree. */
		if ((where.X != curr_coord[X]) ||
		    (where.Y != curr_coord[Y]) ||
		    (where.Z != curr_coord[Z]))
			continue;

		return SLURM_SUCCESS;
	}

	return SLURM_ERROR;
}
/*
 * Build the SLURM node name for a midplane from its location.
 *
 * IN bp_ptr - midplane whose RM_BPLoc is queried
 * RET a string produced by xstrdup_printf() (node prefix followed by one
 *     alpha_num character per coordinate), or NULL.
 *
 * errno is cleared to SLURM_SUCCESS on entry and set to SLURM_ERROR only when
 * the location query fails. A NULL return with errno still SLURM_SUCCESS
 * therefore means the midplane lies outside DIM_SIZE.
 */
static char *_get_bp_node_name(rm_BP_t *bp_ptr)
{
	rm_location_t where;
	int in_system;
	int rc;

	errno = SLURM_SUCCESS;

	rc = bridge_get_data(bp_ptr, RM_BPLoc, &where);
	if (rc != SLURM_SUCCESS) {
		error("bridge_get_data(RM_BPLoc): %s", bg_err_str(rc));
		errno = SLURM_ERROR;
		return NULL;
	}

	/* make sure we have this midplane in the system */
	in_system = (where.X < DIM_SIZE[X]) &&
		    (where.Y < DIM_SIZE[Y]) &&
		    (where.Z < DIM_SIZE[Z]);

	if (in_system)
		return xstrdup_printf("%s%c%c%c",
				      bg_conf->slurm_node_prefix,
				      alpha_num[where.X],
				      alpha_num[where.Y],
				      alpha_num[where.Z]);

	/* NOTE(review): alpha_num is indexed here with coordinates already
	 * known to exceed DIM_SIZE - confirm alpha_num is long enough for
	 * any value the bridge can report. */
	debug4("node %s%c%c%c isn't configured",
	       bg_conf->slurm_node_prefix,
	       alpha_num[where.X],
	       alpha_num[where.Y],
	       alpha_num[where.Z]);
	return NULL;
}
/*
 * Make a block's recorded user follow bg_record->target_name.
 *
 * All changes to the bg_list target_name must be done before this function
 * is called. slurm_conf_lock() must also be called before calling this
 * function, with slurm_conf_unlock() afterwards.
 *
 * IN/OUT bg_record - block to update; user_name and user_uid may be rewritten
 * IN     set       - when non-zero, first remove all users from the block
 *                    through the bridge and, if none were there, add
 *                    target_name (unless it is the SLURM user)
 * RET  1 when user_name was switched to target_name,
 *      0 when user_name already matched target_name,
 *     -1 on any failure (missing target_name, bridge error, unknown user).
 */
extern int update_block_user(bg_record_t *bg_record, int set)
{
	uid_t new_uid;
	int bridge_rc = 0;

	if (!bg_record->target_name) {
		error("Must set target_name to run update_block_user.");
		return -1;
	}

	/* A missing user_name is reported, then defaulted to the SLURM user. */
	if (!bg_record->user_name) {
		error("No user_name");
		bg_record->user_name = xstrdup(bg_conf->slurm_user_name);
	}

	if (set) {
		bridge_rc = bridge_block_remove_all_users(
			bg_record, bg_record->target_name);

		if (bridge_rc == REMOVE_USER_ERR) {
			error("1 Something happened removing "
			      "users from block %s",
			      bg_record->bg_block_id);
			return -1;
		}

		if ((bridge_rc == REMOVE_USER_NONE) &&
		    (strcmp(bg_record->target_name,
			    bg_conf->slurm_user_name) != 0)) {
			info("Adding user %s to Block %s",
			     bg_record->target_name,
			     bg_record->bg_block_id);

			bridge_rc = bridge_block_add_user(
				bg_record, bg_record->target_name);
			if (bridge_rc != SLURM_SUCCESS) {
				error("bridge_add_block_user"
				      "(%s,%s): %s",
				      bg_record->bg_block_id,
				      bg_record->target_name,
				      bg_err_str(bridge_rc));
				return -1;
			}
		}
	}

	/* Nothing to record when the names already agree. */
	if (strcmp(bg_record->target_name, bg_record->user_name) == 0)
		return 0;

	xfree(bg_record->user_name);
	bg_record->user_name = xstrdup(bg_record->target_name);

	if (uid_from_string(bg_record->user_name, &new_uid) < 0) {
		error("No such user: %s", bg_record->user_name);
		return -1;
	}

	bg_record->user_uid = new_uid;
	return 1;
}
/*
 * Search MMCS for failed switches and nodes. Failed resources are DRAINED in
 * SLURM. This relies upon rm_get_BG(), which is slow (10+ seconds), so run
 * this test infrequently.
 *
 * The whole body is compiled only when HAVE_BG_FILES is defined; otherwise
 * the function does nothing.
 */
static void _test_mmcs_failures(void)
{
#if defined HAVE_BG_FILES
	my_bluegene_t *machine;
	int rc;

	rc = bridge_get_bg(&machine);
	if (rc != SLURM_SUCCESS) {
		error("bridge_get_BG(): %s", bg_err_str(rc));
		return;
	}

	_test_down_switches(machine);
	_test_down_nodes(machine);

	rc = bridge_free_bg(machine);
	if (rc != SLURM_SUCCESS)
		error("bridge_free_BG(): %s", bg_err_str(rc));
#endif
}
/*
 * Free (deallocate) a block, optionally waiting for it to reach a free or
 * error state.
 *
 * IN bg_record - block to free
 * IN wait      - when false the retry loop is left after the first pass;
 *                when true the loop sleeps FREE_SLEEP_INTERVAL between
 *                passes, for at most MAX_FREE_RETRIES passes
 * IN locked    - true when the caller already holds block_state_mutex; when
 *                false this function takes and releases it itself
 * RET SLURM_ERROR when bg_record is NULL or when the retries ran out with
 *     the block neither free nor in an error state (the block is then
 *     pushed to an error state through select_g_update_block());
 *     SLURM_SUCCESS otherwise, including when the record's magic no longer
 *     matches BLOCK_MAGIC.
 *
 * NOTE: block_state_mutex is released and re-acquired around every sleep and
 * around select_g_update_block(), even when the caller passed locked = true.
 */
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int count = 0;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	while (count < MAX_FREE_RETRIES) {
		/* block was removed */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}
		/* Reset these here so we don't try to reboot it
		   when the state goes to free.
		*/
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		/* Here we don't need to check if the block is still
		 * in existence since this function can't be called on
		 * the same block twice.  It may have
		 * already been removed at this point also.
		 */
#ifdef HAVE_BG_FILES
		/* Real hardware: ask the bridge to free the block unless it
		 * is already free or terminating. */
		if (bg_record->state != BG_BLOCK_FREE
		    && bg_record->state != BG_BLOCK_TERM) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);
			rc = bridge_block_free(bg_record);
			if (rc != SLURM_SUCCESS) {
				if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
					/* Nothing left to free: treat the
					 * block as free and stop retrying. */
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/* If the state is error and
					   we get an incompatible
					   state back here, it means
					   we set it ourselves so
					   break out.
					*/
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if (bg_record->state != BG_BLOCK_FREE
					    && bg_record->state
					    != BG_BLOCK_TERM)
						bg_record->state =
							BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* Emulated system: fake a free, since nothing has put the
		 * block in a deallocating state before this.
		 */
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* This will set the state to ERROR(Free)
			 * just in case the state was ERROR(SOMETHING ELSE) */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		} else if (!wait || (count >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		/* Done when not waiting, when the block is free, or (except
		 * on BGL) when it is flagged as being in error. */
		if (!wait || (bg_record->state == BG_BLOCK_FREE)
#ifndef HAVE_BGL
		    || (bg_record->state & BG_BLOCK_ERROR_FLAG)
#endif
			) {
			break;
		}
		/* If we were locked outside of this we need to unlock
		   to not cause deadlock on this mutex until we are
		   done.
		*/
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		count++;
		slurm_mutex_lock(&block_state_mutex);
	}

	/* Whatever the bridge returned above, the result reported to the
	 * caller is decided only by the final block state below. */
	rc = SLURM_SUCCESS;
	if ((bg_record->state == BG_BLOCK_FREE)
	    || (bg_record->state & BG_BLOCK_ERROR_FLAG))
		remove_from_bg_list(bg_lists->booted, bg_record);
	else if (count >= MAX_FREE_RETRIES) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		/* The mutex is dropped around select_g_update_block(). */
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}
	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
/*
 * Perform job initiation work for one queued block action.
 *
 * IN/OUT bg_action_ptr - describes the job, the block it should run on and
 *                        the images/connection type requested; its reboot
 *                        flag may be set here
 *
 * Outline:
 *  1. Look the block up in bg_lists->main. If it is missing, requeue the job.
 *  2. Collect every overlapping block. If one of them has a job, requeue.
 *     Otherwise free the overlapping blocks.
 *  3. Wait until no other thread is freeing the target block.
 *  4. If the requested images / connection type differ from the block's, or
 *     a reboot was requested, free the block and push the new settings
 *     through the bridge.
 *  5. Boot the block if it is free; if it is already initialized, sync its
 *     users and clear JOB_CONFIGURING on the job.
 *
 * block_state_mutex is taken and released several times inside this
 * function. Every return path leaves it unlocked (the
 * _make_sure_block_still_exists() failure path relies on that helper having
 * unlocked it, per the comment at the call site).
 */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;
	uint32_t req_job_id = bg_action_ptr->job_ptr->job_id;
	bool block_inited = 0;
	bool delete_it = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		/* There is no record here, so there is no "modifying" flag
		 * to reset. The previous code wrote through the NULL
		 * pointer on this path. */
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(req_job_id, 1, 0, JOB_BOOT_FAIL, false);
		return;
	}

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		bg_record->modifying = 0;
		/* bg_reset_block() should already have happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	if ((bg_record->state == BG_BLOCK_TERM) || bg_record->free_cnt) {
		/* It doesn't appear state of a small block
		   (conn_type) is held on a BGP system so
		   if we to reset it so, just set the reboot flag and
		   handle it later in that code. */
		bg_action_ptr->reboot = 1;
	}

	/* Gather every block overlapping the target so it can be freed. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		if (bg_record == found_record)
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		if (found_record->job_ptr
		    || (found_record->job_list
			&& list_count(found_record->job_list))) {
			struct job_record *job_ptr = found_record->job_ptr;
			if (!found_record->job_ptr)
				job_ptr = find_job_in_bg_record(
					found_record, NO_VAL);
			/* NOTE(review): job_ptr is dereferenced without a
			 * NULL check - confirm find_job_in_bg_record() with
			 * NO_VAL cannot return NULL for a non-empty
			 * job_list. */
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld. "
			      "This should never happen.",
			      req_job_id,
			      bg_record->bg_block_id,
			      job_ptr->job_id,
			      found_record->bg_block_id,
			      (long) job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		FREE_NULL_LIST(delete_list);

		bg_reset_block(bg_record, bg_action_ptr->job_ptr);

		bg_record->modifying = 0;
		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(req_job_id, 0, 0, JOB_BOOT_FAIL, false);
		return;
	}

	slurm_mutex_unlock(&block_state_mutex);

	/* With a dynamic layout the overlapping blocks are asked to be
	 * deleted, not just freed. */
	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		delete_it = 1;
	free_block_list(req_job_id, delete_list, delete_it, 1);
	FREE_NULL_LIST(delete_list);

	while (1) {
		slurm_mutex_lock(&block_state_mutex);
		/* Failure will unlock block_state_mutex so no need to
		   unlock before return.  No need to reset modifying
		   here if the block doesn't exist.
		*/
		if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) {
			error("Problem with deallocating blocks to run job %u "
			      "on block %s", req_job_id,
			      bg_action_ptr->bg_block_id);
			return;
		}
		/* If another thread is freeing this block we need to
		   wait until it is done or we will get into a state
		   where this job will be killed.
		*/
		if (!bg_record->free_cnt)
			break;
		debug("Waiting for block %s to free for job %u. "
		      "%d thread(s) trying to free it",
		      bg_record->bg_block_id, req_job_id,
		      bg_record->free_cnt);
		slurm_mutex_unlock(&block_state_mutex);
		sleep(1);
	}
	/* This was set in the start_job function to close the above
	   window where a job could be mistakenly requeued if another
	   thread is trying to free this block as we are trying to run
	   on it, which is fine since we will reboot it later.
	*/
	bg_record->modifying = 0;

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		/* bg_reset_block() should already have happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      req_job_id);
		return;
	}

	if (bg_record->job_list
	    && (bg_action_ptr->job_ptr->total_cpus != bg_record->cpu_cnt)
	    && (list_count(bg_record->job_list) != 1)) {
		/* We don't allow modification of a block or reboot of
		   a block if we are running multiple jobs on the
		   block.
		*/
		debug2("no reboot");
		goto no_reboot;
	}

	/* From here on rc != 0 means "a setting changed, reboot needed". */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	    && xstrcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	    && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) {
			char *req_conn_type =
				conn_type_string_full(bg_action_ptr->conn_type);
			char *conn_type =
				conn_type_string_full(bg_record->conn_type);
			debug3("changing small block mode from %s to %s",
			       conn_type, req_conn_type);
			xfree(req_conn_type);
			xfree(conn_type);
		}
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here
		 */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	    && xstrcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	    && xstrcasecmp(bg_action_ptr->ramdiskimage,
			   bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	    && xstrcasecmp(bg_action_ptr->mloaderimage,
			   bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't loose this
		 * block since bg_free_block will unlock block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));

#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can be
			   set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));

#endif
		bg_record->modifying = 0;
	}

no_reboot:
	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state. This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Emulated system: treat a booting block as booted. */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	/* bg_record->boot_count = 0; */
	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is already ready.", bg_record->bg_block_id);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		set_user_rc = bridge_block_sync_users(bg_record);
		block_inited = 1;
	}
	slurm_mutex_unlock(&block_state_mutex);

	/* This lock needs to happen after the block_state_mutex to
	   avoid deadlock.
	*/
	if (block_inited && bg_action_ptr->job_ptr) {
		slurmctld_lock_t job_write_lock = {
			NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
		lock_slurmctld(job_write_lock);
		bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		unlock_slurmctld(job_write_lock);
	}

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* wait for the slurmd to begin
		   the batch script, slurm_fail_job()
		   is a no-op if issued prior
		   to the script initiation do clean up just
		   in case the fail job isn't ran */
		(void) slurm_fail_job(req_job_id, JOB_BOOT_FAIL);
	}
}
/*
 * Finish the bookkeeping for a block after an attempt to free it.
 *
 * block_state_mutex should be locked before calling this. It is released
 * and re-acquired around select_g_update_block() on the error-state path.
 *
 * IN bg_record - block that was being freed; its free_cnt is decremented
 * IN restore   - when true the block is only taken off the booted and
 *                job_running lists; when false it is also removed from the
 *                main list and from the bridge database, and the record is
 *                destroyed
 * RET SLURM_ERROR if the record's magic does not match BLOCK_MAGIC,
 *     SLURM_SUCCESS in every other case.
 */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int remove_rc = SLURM_SUCCESS;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	}

	if (bg_record->modifying) {
		info("%d others are modifing this block %s",
		     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Other threads are still freeing this block: leave it to them. */
	if (bg_record->free_cnt != 0) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	if (!((bg_record->state & BG_BLOCK_ERROR_FLAG)
	      || (bg_record->state == BG_BLOCK_FREE))) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);
		return SLURM_SUCCESS;
	}

	/* A bit of a sanity check to make sure blocks are being
	   removed out of all the lists.
	*/
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS)
		num_unused_cpus += bg_record->cpu_cnt;

	if (restore)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record)
	    != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	remove_rc = bridge_block_remove(bg_record);
	if (remove_rc == SLURM_SUCCESS) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: done %s(%p)",
			     bg_record->bg_block_id, bg_record);
	} else if (remove_rc == BG_ERROR_BLOCK_NOT_FOUND) {
		debug("_post_block_free: block %s is not found",
		      bg_record->bg_block_id);
	} else {
		error("_post_block_free: "
		      "bridge_block_remove(%s): %s",
		      bg_record->bg_block_id,
		      bg_err_str(remove_rc));
	}

	/* The record is destroyed whatever the bridge reported. */
	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
/*
 * Perform job initiation work for one queued block action.
 *
 * IN/OUT bg_action_ptr - describes the job, the block it should run on and
 *                        the images/connection type requested; its reboot
 *                        flag may be set here
 *
 * Steps: find the block in bg_lists->main; free every overlapping block
 * (requeueing the job if one of them has a job); apply any image or
 * connection-type change, freeing the block first; boot the block if it is
 * free; finally record the job's user as target_name and, when the block is
 * already initialized, set the block user and clear JOB_CONFIGURING.
 *
 * block_state_mutex is taken and released several times inside this
 * function.
 */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);
		return;
	}

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		/* bg_reset_block(bg_record) should already have happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}
	if (bg_record->state == BG_BLOCK_TERM) {
		debug("Block is in Deallocating state, waiting for free.");
		/* It doesn't appear state of a small block
		   (conn_type) is held on a BGP system so
		   if we to reset it so, just set the reboot flag and
		   handle it later in that code. */
		bg_action_ptr->reboot = 1;
	}

	/* Gather every block overlapping the target so it can be freed. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		/* The NULL test is redundant with the loop condition. */
		if ((!found_record) || (bg_record == found_record))
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		if (found_record->job_ptr) {
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld. "
			      "This should never happen.",
			      bg_action_ptr->job_ptr->job_id,
			      bg_record->bg_block_id,
			      found_record->job_ptr->job_id,
			      found_record->bg_block_id,
			      found_record->job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		list_destroy(delete_list);

		bg_reset_block(bg_record);

		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	slurm_mutex_unlock(&block_state_mutex);

	rc = free_block_list(bg_action_ptr->job_ptr->job_id, delete_list, 0, 1);
	list_destroy(delete_list);
	if (rc != SLURM_SUCCESS) {
		error("Problem with deallocating blocks to run job %u "
		      "on block %s", bg_action_ptr->job_ptr->job_id,
		      bg_action_ptr->bg_block_id);
		/* Requeue only while the job is still configuring. */
		if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr))
			bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Failure will unlock block_state_mutex so no need to unlock before
	   return. */
	if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
		return;

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		/* bg_reset_block(bg_record) should already have happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* From here on rc != 0 means "a setting changed, reboot needed". */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	    && strcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	    && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		debug3("changing small block mode from %s to %s",
		       conn_type_string(bg_record->conn_type[0]),
		       conn_type_string(bg_action_ptr->conn_type[0]));
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here
		 */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	    && strcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	    && strcasecmp(bg_action_ptr->ramdiskimage,
			  bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	    && strcasecmp(bg_action_ptr->mloaderimage,
			  bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't loose this
		 * block since bg_free_block will unlock block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

		/* Push the (possibly updated) images to the bridge. Each
		 * failure is logged and the remaining settings are still
		 * attempted. */
#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));

#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			/* Stays NULL for any conn_type not listed below. */
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can be
			   set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));

#endif
		bg_record->modifying = 0;
	}

	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state. This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Emulated system: treat a booting block as booted. */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	/* bg_record->boot_count = 0; */
	xfree(bg_record->target_name);
	bg_record->target_name =
		uid_to_string(bg_action_ptr->job_ptr->user_id);
	debug("setting the target_name for Block %s to %s",
	      bg_record->bg_block_id, bg_record->target_name);

	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is ready.", bg_record->bg_block_id);
		set_user_rc = set_block_user(bg_record);
		/* NOTE(review): job_ptr was already dereferenced
		 * unconditionally above, so this NULL test cannot protect
		 * anything - confirm whether it can be NULL at all. */
		if (bg_action_ptr->job_ptr) {
			bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
			last_job_update = time(NULL);
		}
	}
	slurm_mutex_unlock(&block_state_mutex);

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* wait for the slurmd to begin
		   the batch script, slurm_fail_job()
		   is a no-op if issued prior
		   to the script initiation do clean up just
		   in case the fail job isn't ran */
		/* NOTE(review): bg_record is read here after
		 * block_state_mutex was released - confirm the record
		 * cannot be destroyed in that window. */
		(void) slurm_fail_job(bg_record->job_running);
		slurm_mutex_lock(&block_state_mutex);
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
		slurm_mutex_unlock(&block_state_mutex);
	}
}
/*
 * Free (deallocate) a block, optionally waiting until it is free.
 *
 * IN bg_record - block to free.  NULL is rejected with SLURM_ERROR.
 * IN wait - if false only one pass through the loop is made; if true the
 *	loop runs for up to MAX_FREE_RETRIES passes with a
 *	sleep(FREE_SLEEP_INTERVAL) between passes.
 * IN locked - true when the caller already holds block_state_mutex.  The
 *	mutex is dropped while sleeping and around select_g_update_block(),
 *	and it is only released on return when locked is false.
 * RET SLURM_ERROR if bg_record is NULL, or if all MAX_FREE_RETRIES passes
 *	were used and the block is neither free nor flagged in error (the
 *	block is then put into the error state).  SLURM_SUCCESS otherwise,
 *	including when the record's magic no longer matches and when wait
 *	is false and the block has not reached the free state yet.
 */
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int count = 0;	/* number of sleep/retry cycles completed */

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	while (count < MAX_FREE_RETRIES) {
		/* block was removed */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}
		/* Reset these here so we don't try to reboot it
		   when the state goes to free.
		*/
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		/* Here we don't need to check if the block is still
		 * in existence since this function can't be called on
		 * the same block twice.  It may have already been
		 * removed at this point also.
		 */
#ifdef HAVE_BG_FILES
		/* Only ask the bridge to free blocks that are neither
		 * free nor already terminating. */
		if (bg_record->state != BG_BLOCK_FREE
		    && bg_record->state != BG_BLOCK_TERM) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);
			rc = bridge_block_free(bg_record);
			if (rc != SLURM_SUCCESS) {
				if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
					/* Nothing left to free: treat the
					 * block as free and stop retrying. */
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/* If the state is error and
					   we get an incompatible
					   state back here, it means
					   we set it ourselves so
					   break out.
					*/
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					/* Record locally that the block is
					 * on its way down. */
					if (bg_record->state != BG_BLOCK_FREE
					    && bg_record->state
					    != BG_BLOCK_TERM)
						bg_record->state =
							BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* Fake a free: no bridge call is made in this build, the
		 * state is simply stepped here.  A waiting caller sees
		 * the block in the terminating state until count reaches
		 * 3, then free. */
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* This will set the state to ERROR(Free)
			 * just in case the state was ERROR(SOMETHING ELSE) */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		} else if (!wait || (count >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		/* Done when the caller does not want to wait, or the block
		 * is free (or, except on BGL, flagged in error). */
		if (!wait || (bg_record->state == BG_BLOCK_FREE)
#ifndef HAVE_BGL
		    || (bg_record->state & BG_BLOCK_ERROR_FLAG)
#endif
			) {
			break;
		}
		/* If we were locked outside of this we need to unlock
		   to not cause deadlock on this mutex until we are
		   done.
		*/
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		count++;
		slurm_mutex_lock(&block_state_mutex);
	}

	/* Whatever the last bridge call returned is not reported; the
	 * result is decided from the final state below. */
	rc = SLURM_SUCCESS;
	if ((bg_record->state == BG_BLOCK_FREE)
	    || (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
		if (bg_record->err_ratio
		    && (bg_record->state == BG_BLOCK_FREE)) {
			/* Sometimes the realtime server can report
			   software errors on cnodes even though the
			   block is free.  If this is the case we need
			   to manually clear them.
			*/
			ba_mp_t *found_ba_mp;
			ListIterator itr =
				list_iterator_create(bg_record->ba_mp_list);
			debug("Block %s is free, but has %u cnodes in error. "
			      "This can happen if a large block goes into "
			      "error and then is freed and the state of "
			      "the block changes before the "
			      "database informs all the cnodes are back to "
			      "normal. This is no big deal.",
			      bg_record->bg_block_id,
			      bg_record->cnode_err_cnt);
			while ((found_ba_mp = list_next(itr))) {
				if (!found_ba_mp->used)
					continue;
				/* Allocate the bitmap on demand, then clear
				 * every bit in it. */
				if (!found_ba_mp->cnode_err_bitmap)
					found_ba_mp->cnode_err_bitmap =
						bit_alloc(
							bg_conf->mp_cnode_cnt);
				bit_nclear(found_ba_mp->cnode_err_bitmap, 0,
					   bit_size(found_ba_mp->
						    cnode_err_bitmap)-1);
			}
			list_iterator_destroy(itr);
			bg_record->cnode_err_cnt = 0;
			bg_record->err_ratio = 0;
		}

		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (count >= MAX_FREE_RETRIES) {
		/* Something isn't right, go mark this one in an error
		   state.
		*/
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		/* select_g_update_block() is called with the mutex
		 * released; it is taken again right after. */
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}
	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
/*
 * Ask the bridge for the current settings of every block in
 * bg_lists->main and bring the local records up to date.
 *
 * Takes the slurmctld job read lock and block_state_mutex for the
 * duration of the scan and releases both before processing
 * kill_job_list.
 *
 * RET 1 if any record was changed, -1 if a bridge_get_data() query failed
 *	and no change was recorded, 0 otherwise.  Always 0 when built
 *	without HAVE_BG_FILES or when bg_lists->main is not set.
 */
static int _do_block_poll(void)
{
	int updated = 0;
#if defined HAVE_BG_FILES
	int rc;
	rm_partition_t *block_ptr = NULL;
#ifdef HAVE_BGL
	rm_partition_mode_t node_use;
#endif
	rm_partition_state_t state;
	char *name = NULL;
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL;

	if (!bg_lists->main)
		return updated;

	lock_slurmctld(job_read_lock);
	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
		if (bg_record->magic != BLOCK_MAGIC) {
			/* block is gone */
			list_remove(itr);
			continue;
		} else if (!bg_record->bg_block_id)
			continue;

		name = bg_record->bg_block_id;
		if ((rc = bridge_get_block_info(name, &block_ptr))
		    != SLURM_SUCCESS) {
			/* In dynamic layout mode two failures are handled
			 * quietly: inconsistent data skips the block, and
			 * a block the bridge no longer knows is dropped
			 * from the list and destroyed. */
			if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
				switch(rc) {
				case BG_ERROR_INCONSISTENT_DATA:
					debug2("got inconsistent data when "
					       "querying block %s", name);
					continue;
					break;
				case BG_ERROR_BLOCK_NOT_FOUND:
					debug("block %s not found, removing "
					      "from slurm", name);
					list_remove(itr);
					destroy_bg_record(bg_record);
					continue;
					break;
				default:
					break;
				}
			}

			/* If the call was busy, stop polling altogether
			   (the remaining blocks are not visited).  It
			   usually means something like rm_get_BG was
			   called, which can be a very long call.
			*/
			if (rc == EBUSY) {
				debug5("lock was busy, aborting");
				break;
			}

			error("bridge_get_block_info(%s): %s",
			      name,
			      bg_err_str(rc));
			continue;
		}

#ifdef HAVE_BGL
		/* Pick up a changed node use mode. */
		if ((rc = bridge_get_data(block_ptr, RM_PartitionMode,
					  &node_use))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionMode): %s",
			      bg_err_str(rc));
			/* Only report the failure if nothing has been
			 * updated so far. */
			if (!updated)
				updated = -1;
			goto next_block;
		} else if (bg_record->node_use != node_use) {
			debug("node_use of Block %s was %d "
			      "and now is %d",
			      bg_record->bg_block_id,
			      bg_record->node_use,
			      node_use);
			bg_record->node_use = node_use;
			updated = 1;
		}
#else
		/* For blocks smaller than a midplane (or when a midplane
		 * holds exactly one nodecard worth of cnodes) read the
		 * partition options and map their first character onto
		 * the connection type. */
		if ((bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)
		    || (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)) {
			char *mode = NULL;
			uint16_t conn_type = SELECT_SMALL;
			if ((rc = bridge_get_data(block_ptr,
						  RM_PartitionOptions,
						  &mode))
			    != SLURM_SUCCESS) {
				error("bridge_get_data(RM_PartitionOptions): "
				      "%s", bg_err_str(rc));
				if (!updated)
					updated = -1;
				goto next_block;
			} else if (mode) {
				switch(mode[0]) {
				case 's':
					conn_type = SELECT_HTC_S;
					break;
				case 'd':
					conn_type = SELECT_HTC_D;
					break;
				case 'v':
					conn_type = SELECT_HTC_V;
					break;
				case 'l':
					conn_type = SELECT_HTC_L;
					break;
				default:
					conn_type = SELECT_SMALL;
					break;
				}
				/* The option string is released here with
				 * free() once it has been decoded. */
				free(mode);
			}

			if (bg_record->conn_type[0] != conn_type) {
				debug("mode of small Block %s was %u "
				      "and now is %u",
				      bg_record->bg_block_id,
				      bg_record->conn_type[0],
				      conn_type);
				bg_record->conn_type[0] = conn_type;
				updated = 1;
			}
		}
#endif
		if ((rc = bridge_get_data(block_ptr, RM_PartitionState,
					  &state))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionState): %s",
			      bg_err_str(rc));
			if (!updated)
				updated = -1;
			goto next_block;
		} else if (bg_status_update_block_state(
				   bg_record, state, kill_job_list) == 1)
			updated = 1;

	next_block:
		/* Every block fetched with bridge_get_block_info() is
		 * released again, on success and on failure alike. */
		if ((rc = bridge_free_block(block_ptr))
		    != SLURM_SUCCESS) {
			error("bridge_free_block(): %s",
			      bg_err_str(rc));
		}
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);
	unlock_slurmctld(job_read_lock);

	/* Called only after both locks have been released. */
	bg_status_process_kill_job_list(kill_job_list, JOB_FAILED, 0);

#endif
	return updated;
}
/*
 * Check the state of every nodecard on one midplane.
 *
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_node, so if slurmctld_locked is set we will call the
 * draining function without locking the lock again.
 *
 * IN bp_ptr - midplane to inspect
 * IN slurmctld_locked - handed unchanged to _test_nodecard_state()
 * RET SLURM_SUCCESS when every nodecard check returned SLURM_SUCCESS,
 *	SLURM_ERROR when a bridge query failed, no midplane ID or node
 *	name could be obtained, or at least one nodecard check returned
 *	an error.
 */
static int _test_down_nodecards(rm_BP_t *bp_ptr, bool slurmctld_locked)
{
	rm_bp_id_t bp_id = NULL;
	int num = 0;
	int marked_down = 0;
	int i = 0;
	int rc = SLURM_SUCCESS;
	rm_nodecard_list_t *ncard_list = NULL;
	rm_nodecard_t *ncard = NULL;
	char *node_name = NULL;

	if ((rc = bridge_get_data(bp_ptr, RM_BPID, &bp_id))
	    != SLURM_SUCCESS) {
		error("bridge_get_data(RM_BPID): %s", bg_err_str(rc));
		return SLURM_ERROR;
	}

	/* A successful call can still hand back no ID; the other
	 * users of RM_BPID in this file guard against that too.
	 * Without this check a NULL would be passed to
	 * bridge_get_nodecards() and to a %s format. */
	if (!bp_id) {
		error("No BPID was returned from database");
		return SLURM_ERROR;
	}

	if ((rc = bridge_get_nodecards(bp_id, &ncard_list))
	    != SLURM_SUCCESS) {
		error("bridge_get_nodecards(%s): %d",
		      bp_id, rc);
		rc = SLURM_ERROR;
		goto clean_up;
	}

	/* The node_name will only be NULL if this system doesn't
	   really have the node.
	*/
	if (!(node_name = _get_bp_node_name(bp_ptr))) {
		rc = SLURM_ERROR;
		goto clean_up;
	}

	if ((rc = bridge_get_data(ncard_list, RM_NodeCardListSize, &num))
	    != SLURM_SUCCESS) {
		error("bridge_get_data(RM_NodeCardListSize): %s",
		      bg_err_str(rc));
		rc = SLURM_ERROR;
		goto clean_up;
	}

	for (i = 0; i < num; i++) {
		if (i) {
			if ((rc = bridge_get_data(ncard_list,
						  RM_NodeCardListNext,
						  &ncard))
			    != SLURM_SUCCESS) {
				error("bridge_get_data"
				      "(RM_NodeCardListNext): %s",
				      bg_err_str(rc));
				rc = SLURM_ERROR;
				goto clean_up;
			}
		} else {
			if ((rc = bridge_get_data(ncard_list,
						  RM_NodeCardListFirst,
						  &ncard))
			    != SLURM_SUCCESS) {
				error("bridge_get_data"
				      "(RM_NodeCardListFirst): %s",
				      bg_err_str(rc));
				rc = SLURM_ERROR;
				goto clean_up;
			}
		}

		/* Keep going after a bad nodecard so that all of them
		 * get examined; only remember that one was found. */
		if (_test_nodecard_state(ncard, i, node_name,
					 slurmctld_locked)
		    != SLURM_SUCCESS)
			marked_down++;
	}

	/* Blocks are deliberately not brought back into service from
	   here once their nodecards look fine again.  An earlier,
	   insufficiently tested attempt at doing so has been dropped;
	   if it is ever reintroduced it should be controlled by an
	   option in bluegene.conf so a site can decide whether it
	   happens automatically.
	*/

clean_up:
	if (ncard_list)
		bridge_free_nodecard_list(ncard_list);
	xfree(node_name);
	free(bp_id);

	/* If we marked any nodecard down we need to state it here */
	if ((rc == SLURM_SUCCESS) && marked_down)
		rc = SLURM_ERROR;

	return rc;
}
/*
 * Look at one nodecard and, if it is not UP, take the matching ionodes
 * out of service through down_nodecard().
 *
 * To fake a nodecard down do this on the service node.
 *   db2 "update bg{l|p}nodecard set status = 'E' where location =
 *   'Rxx-Mx-Nx' and status='A'"
 * Reverse the A, and E to bring it back up.
 *
 * IN ncard - nodecard to check
 * IN nc_id - position of the nodecard in its list (only used on BGL)
 * IN node_name - SLURM name of the midplane holding the nodecard
 * IN slurmctld_locked - passed through to down_nodecard()
 * RET SLURM_SUCCESS if the nodecard is UP or lies outside of the
 *	configured ionodes, SLURM_ERROR if it is not UP or the bridge
 *	could not be queried.
 */
static int _test_nodecard_state(rm_nodecard_t *ncard, int nc_id,
				char *node_name, bool slurmctld_locked)
{
	rm_nodecard_id_t card_id = NULL;
	rm_nodecard_state_t card_state;
	int first_ionode = 0;
	int status;

	status = bridge_get_data(ncard, RM_NodeCardState, &card_state);
	if (status != SLURM_SUCCESS) {
		error("bridge_get_data(RM_NodeCardState): %s",
		      bg_err_str(status));
		return SLURM_ERROR;
	}

	/* Healthy nodecards need no further attention. */
	if (card_state == RM_NODECARD_UP)
		return SLURM_SUCCESS;

	status = bridge_get_data(ncard, RM_NodeCardID, &card_id);
	if (status != SLURM_SUCCESS) {
		error("bridge_get_data(RM_NodeCardID): %s",
		      bg_err_str(status));
		return SLURM_ERROR;
	}

	if (!card_id) {
		error("We didn't get an RM_NodeCardID but rc was SLURM_SUCCESS?");
		return SLURM_ERROR;
	}

#ifdef HAVE_BGL
	status = bridge_get_data(ncard, RM_NodeCardQuarter, &first_ionode);
	if (status != SLURM_SUCCESS) {
		error("bridge_get_data(CardQuarter): %s",
		      bg_err_str(status));
		free(card_id);
		return SLURM_ERROR;
	}
	first_ionode *= bg_conf->quarter_ionode_cnt;
	first_ionode += bg_conf->nodecard_ionode_cnt * (nc_id%4);
#else
	/* The number following the first character of the nodecard
	   id tells us where its ionodes start.
	*/
	first_ionode = atoi((char*)card_id+1);
	first_ionode *= bg_conf->io_ratio;
#endif

	if (first_ionode >= bg_conf->ionodes_per_mp) {
		/* On small systems with less than a midplane the
		   database may see the nodecards there but in missing
		   state.  Those are only worth a low level debug
		   message; anything else outside of the configured
		   range is reported as an error.
		*/
		if (card_state == RM_NODECARD_MISSING) {
			debug3("Nodecard %s is missing",
			       card_id);
		} else {
			error("We don't have the system configured "
			      "for this nodecard %s, we only have "
			      "%d ionodes and this starts at %d",
			      card_id, bg_conf->ionodes_per_mp,
			      first_ionode);
		}
		status = SLURM_SUCCESS;
	} else {
		/* Each nodecard is handled on its own to make sure we
		   don't create holes in the system.
		*/
		if (down_nodecard(node_name, first_ionode,
				  slurmctld_locked, NULL)
		    == SLURM_SUCCESS) {
			debug("nodecard %s on %s is in an error state",
			      card_id, node_name);
		} else
			debug2("nodecard %s on %s is in an error state, "
			       "but error was returned when trying to make it so",
			       card_id, node_name);

		/* Any nodecard of the system that isn't up is reported
		   back as an error.
		*/
		status = SLURM_ERROR;
	}

	free(card_id);
	return status;
}
/*
 * Translate the internal wiring of one switch into bridge connections
 * and attach them to the bridge switch.
 *
 * Source ports 1, 2 and 4 are examined; a used wire whose target differs
 * from its source becomes one connection to port 0, 3 or 5.
 *
 * IN/OUT curr_switch - bridge switch receiving the connections
 * IN ba_switch - internal description of the switch wiring
 * RET SLURM_SUCCESS if at least one connection was added, SLURM_ERROR if
 *	the switch has no usable connection or the bridge refused the data.
 */
static int _add_switch_conns(rm_switch_t* curr_switch,
			     ba_switch_t *ba_switch)
{
	int firstconnect=1;

	/* max number of connections in a switch */
	int num_connections = 3;
	ba_connection_t *ba_conn = NULL;
	rm_connection_t conn[num_connections];
	rm_connection_t *conn_ptr = NULL;
	int i, rc;
	int source = 0;
	bool target_ok;
	/* The list only borrows pointers into conn[], hence no
	 * destructor function. */
	List conn_list = list_create(NULL);

	/* we have to figure out how many connections we have and then
	   go through the loop again to actually add them */
	for (i = 0; i < num_connections; i++) {
		/* set the source port(-) to check */
		switch(i) {
		case 0:
			source = 1;
			conn[i].p1 = RM_PORT_S1;
			break;
		case 1:
			source = 2;
			conn[i].p1 = RM_PORT_S2;
			break;
		case 2:
			source = 4;
			conn[i].p1 = RM_PORT_S4;
			break;
		default:
			error("we are to far into the switch connections");
			break;
		}

		ba_conn = &ba_switch->int_wire[source];
		if (!ba_conn->used || (ba_conn->port_tar == source))
			continue;

		target_ok = true;
		switch(ba_conn->port_tar) {
		case 0:
			conn[i].p2 = RM_PORT_S0;
			break;
		case 3:
			conn[i].p2 = RM_PORT_S3;
			break;
		case 5:
			conn[i].p2 = RM_PORT_S5;
			break;
		default:
			error("we are trying to connection %d -> %d "
			      "which can't happen",
			      source, ba_conn->port_tar);
			target_ok = false;
			break;
		}

		/* Never queue a connection whose target port was not
		 * filled in: conn[i].p2 would be an indeterminate
		 * value by the time it reached the bridge. */
		if (!target_ok)
			continue;

		conn[i].part_state = BG_BLOCK_INITED;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
			info("adding %d -> %d", source, ba_conn->port_tar);
		list_push(conn_list, &conn[i]);
	}

	i = list_count(conn_list);
	if (!i) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
			info("we got a switch with no connections");
		list_destroy(conn_list);
		return SLURM_ERROR;
	}

	if ((rc = bridge_set_data(curr_switch, RM_SwitchConnNum, &i))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data: RM_SwitchConnNum: %s",
		      bg_err_str(rc));
		/* Only reached if fatal() returns; release the list
		 * like the other error paths do. */
		list_destroy(conn_list);
		return SLURM_ERROR;
	}

	/* Now we can add them to the mix */
	while ((conn_ptr = list_pop(conn_list))) {
		if (firstconnect) {
			if ((rc = bridge_set_data(
				     curr_switch,
				     RM_SwitchFirstConnection,
				     conn_ptr))
			    != SLURM_SUCCESS) {
				fatal("bridge_set_data"
				      "(RM_SwitchFirstConnection): "
				      "%s",
				      bg_err_str(rc));
				list_destroy(conn_list);
				return SLURM_ERROR;
			}
			firstconnect=0;
		} else {
			if ((rc = bridge_set_data(
				     curr_switch,
				     RM_SwitchNextConnection,
				     conn_ptr))
			    != SLURM_SUCCESS) {
				fatal("bridge_set_data"
				      "(RM_SwitchNextConnection): %s",
				      bg_err_str(rc));
				list_destroy(conn_list);
				return SLURM_ERROR;
			}
		}
	}

	list_destroy(conn_list);

	return SLURM_SUCCESS;
}
#if defined HAVE_BG_FILES && !defined HAVE_BGL
/*
 * Build a temporary nodecard that holds a single ionode, used for
 * blocks that are smaller than one nodecard.
 *
 * IN nc_char - ID of the real nodecard the new one stands for
 * IN ionode_card - 0 to use ionode "J00", non-zero to use "J01"
 * OUT ncard_pptr - set to the new nodecard on success; the caller
 *	releases it with bridge_free_nodecard()
 * RET SLURM_SUCCESS or SLURM_ERROR (on error everything created here
 *	has been released again and *ncard_pptr is untouched)
 */
static int _new_sub_nodecard(rm_nodecard_id_t nc_char, int ionode_card,
			     rm_nodecard_t **ncard_pptr)
{
	rm_nodecard_t *new_ncard = NULL;
	rm_ionode_t *ionode = NULL;
	char *ionode_id = ionode_card ? "J01" : "J00";
	int ionode_cnt = 1;
	int rc;

	if ((rc = bridge_new_nodecard(&new_ncard)) != SLURM_SUCCESS) {
		error("bridge_new_nodecard(): %s", bg_err_str(rc));
		return SLURM_ERROR;
	}

	if ((rc = bridge_set_data(new_ncard, RM_NodeCardID, nc_char))
	    != SLURM_SUCCESS) {
		error("bridge_set_data("
		      "RM_NodeCardID): %s", bg_err_str(rc));
		goto fail;
	}

	if ((rc = bridge_set_data(new_ncard, RM_NodeCardIONodeNum,
				  &ionode_cnt))
	    != SLURM_SUCCESS) {
		error("bridge_set_data("
		      "RM_NodeCardIONodeNum): %s", bg_err_str(rc));
		goto fail;
	}

	if ((rc = bridge_new_ionode(&ionode)) != SLURM_SUCCESS) {
		error("bridge_new_ionode(): %s", bg_err_str(rc));
		ionode = NULL;
		goto fail;
	}

	if ((rc = bridge_set_data(ionode, RM_IONodeID, ionode_id))
	    != SLURM_SUCCESS) {
		error("bridge_set_data("
		      "RM_IONodeID): %s", bg_err_str(rc));
		goto fail;
	}

	if ((rc = bridge_set_data(new_ncard, RM_NodeCardFirstIONode,
				  ionode))
	    != SLURM_SUCCESS) {
		error("bridge_set_data("
		      "RM_NodeCardFirstIONode): %s", bg_err_str(rc));
		goto fail;
	}

	/* The ionode is released once it has been handed to the
	 * nodecard; do not try a second time if that fails. */
	rc = bridge_free_ionode(ionode);
	ionode = NULL;
	if (rc != SLURM_SUCCESS) {
		error("bridge_free_ionode(): %s", bg_err_str(rc));
		goto fail;
	}

	*ncard_pptr = new_ncard;
	return SLURM_SUCCESS;

fail:
	if (ionode)
		(void) bridge_free_ionode(ionode);
	(void) bridge_free_nodecard(new_ncard);
	return SLURM_ERROR;
}
#endif

/*
 * Describe a small (sub-midplane) block to the bridge: mark the
 * partition as small, set its single midplane and add the nodecards
 * selected by bg_record->ionode_bitmap.
 *
 * IN/OUT bg_record - block being created; must cover exactly one
 *	midplane and have an ionode_bitmap
 * RET SLURM_SUCCESS or SLURM_ERROR.  Without HAVE_BG_FILES only the
 *	midplane count is validated.
 */
extern int configure_small_block(bg_record_t *bg_record)
{
	int rc = SLURM_SUCCESS;
#if defined HAVE_BG_FILES
	bool small = true;
	ba_mp_t* ba_node = NULL;
	rm_BP_t *curr_mp = NULL;
	rm_bp_id_t mp_id = NULL;
#ifndef HAVE_BGL
	rm_nodecard_id_t nc_char = NULL;
#endif
	int nc_id = 0;
	int num_ncards = 0, sub_nodecard = 0, ionode_card = 0, nc_count = 0;
	rm_nodecard_t *ncard = NULL;
	rm_nodecard_list_t *ncard_list = NULL;
	int num = 0, i;
	int free_rc;
	int nc_total = (int)bg_conf->mp_nodecard_cnt;
	int use_nc[bg_conf->mp_nodecard_cnt];
	double nc_pos = 0;
#endif
	xassert(bg_record->ionode_bitmap);
	if (bg_record->mp_count != 1) {
		error("Requesting small block with %d mps, needs to be 1.",
		      bg_record->mp_count);
		return SLURM_ERROR;
	}

#if defined HAVE_BG_FILES
	/* set that we are doing a small block */
	if ((rc = bridge_set_data(bg_record->bg_block, RM_PartitionSmall,
				  &small))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data(RM_PartitionSmall): %s",
		      bg_err_str(rc));
	}

	/* A block smaller than one nodecard still needs one nodecard,
	 * but only a single ionode of it. */
	num_ncards = bg_record->cnode_cnt/bg_conf->nodecard_cnode_cnt;
	if (num_ncards < 1) {
		num_ncards = 1;
		sub_nodecard = 1;
	}
	memset(use_nc, 0, sizeof(use_nc));

	/* find out how many nodecards to get for each ionode */
	for (i = 0; i < bg_conf->ionodes_per_mp; i++) {
		if (bit_test(bg_record->ionode_bitmap, i)) {
			if (bg_conf->nc_ratio > 1) {
				int j;
				for (j = 0; j < bg_conf->nc_ratio; j++) {
					int pos = (int)nc_pos + j;
					if (pos < nc_total)
						use_nc[pos] = 1;
				}
			} else {
				if ((int)nc_pos < nc_total)
					use_nc[(int)nc_pos] = 1;
				if (i%2)
					ionode_card = 1;
			}
		}
		nc_pos += bg_conf->nc_ratio;
	}

	if ((rc = bridge_set_data(bg_record->bg_block,
				  RM_PartitionNodeCardNum,
				  &num_ncards))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data: RM_PartitionNodeCardNum: %s",
		      bg_err_str(rc));
	}

	ba_node = list_peek(bg_record->ba_mp_list);
	if (!ba_node) {
		error("Block %s has no midplane in its list",
		      bg_record->bg_block_id);
		return SLURM_ERROR;
	}

	if (_get_mp_by_location(bg, ba_node->coord, &curr_mp)
	    == SLURM_ERROR) {
		fatal("_get_mp_by_location()");
	}

	/* Set the one MP */
	if ((rc = bridge_set_data(bg_record->bg_block,
				  RM_PartitionBPNum,
				  &bg_record->mp_count))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data: RM_PartitionBPNum: %s",
		      bg_err_str(rc));
		return SLURM_ERROR;
	}
	if ((rc = bridge_set_data(bg_record->bg_block,
				  RM_PartitionFirstBP,
				  curr_mp))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data("
		      "RM_PartitionFirstBP): %s",
		      bg_err_str(rc));
		return SLURM_ERROR;
	}

	/* find the mp_id of the mp to get the small32 */
	if ((rc = bridge_get_data(curr_mp, RM_BPID, &mp_id))
	    != SLURM_SUCCESS) {
		error("bridge_get_data(RM_BPID): %s", bg_err_str(rc));
		return SLURM_ERROR;
	}

	if (!mp_id) {
		error("No MP ID was returned from database");
		return SLURM_ERROR;
	}

	if ((rc = bridge_get_nodecards(mp_id, &ncard_list))
	    != SLURM_SUCCESS) {
		error("bridge_get_nodecards(%s): %d",
		      mp_id, rc);
		free(mp_id);
		return SLURM_ERROR;
	}
	free(mp_id);

	/* From here on ncard_list is owned by this function, so every
	 * failure leaves through cleanup. */
	if ((rc = bridge_get_data(ncard_list, RM_NodeCardListSize, &num))
	    != SLURM_SUCCESS) {
		error("bridge_get_data(RM_NodeCardListSize): %s",
		      bg_err_str(rc));
		rc = SLURM_ERROR;
		goto cleanup;
	}
	if (num_ncards > num) {
		error("You requested more (%d > %d) nodecards "
		      "than are available on this block %s",
		      num_ncards, num, bg_record->mp_str);
	}

	for (i = 0; i < num; i++) {
		if (i) {
			if ((rc = bridge_get_data(ncard_list,
						  RM_NodeCardListNext,
						  &ncard))
			    != SLURM_SUCCESS) {
				error("bridge_get_data"
				      "(RM_NodeCardListNext): %s",
				      bg_err_str(rc));
				rc = SLURM_ERROR;
				goto cleanup;
			}
		} else {
			if ((rc = bridge_get_data(ncard_list,
						  RM_NodeCardListFirst,
						  &ncard))
			    != SLURM_SUCCESS) {
				error("bridge_get_data"
				      "(RM_NodeCardListFirst): %s",
				      bg_err_str(rc));
				rc = SLURM_ERROR;
				goto cleanup;
			}
		}

#ifdef HAVE_BGL
		/* on BG/L we assume the order never changes when the
		   system is up.  This could change when a reboot of
		   the system happens, but that should be rare.
		*/
		nc_id = i;
#else
		if ((rc = bridge_get_data(ncard,
					  RM_NodeCardID,
					  &nc_char))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_NodeCardID): %s",
			      bg_err_str(rc));
			rc = SLURM_ERROR;
			goto cleanup;
		}

		if (!nc_char) {
			error("No NodeCard ID was returned from database");
			rc = SLURM_ERROR;
			goto cleanup;
		}

		nc_id = atoi((char*)nc_char+1);
#endif
		/* Skip nodecards that were not asked for.  The index is
		 * range checked first: it comes from the database and
		 * must never be used to read outside of use_nc[]. */
		if ((nc_id < 0) || (nc_id >= nc_total) || !use_nc[nc_id]) {
#ifndef HAVE_BGL
			free(nc_char);
			nc_char = NULL;
#endif
			continue;
		}

#ifndef HAVE_BGL
		/* For a sub-nodecard block a private nodecard holding
		 * just the one ionode replaces the listed one. */
		if (sub_nodecard
		    && (_new_sub_nodecard(nc_char, ionode_card, &ncard)
			!= SLURM_SUCCESS)) {
			rc = SLURM_ERROR;
			goto cleanup;
		}
		free(nc_char);
		nc_char = NULL;
#endif

		if (nc_count) {
			if ((rc = bridge_set_data(bg_record->bg_block,
						  RM_PartitionNextNodeCard,
						  ncard))
			    != SLURM_SUCCESS) {
				error("bridge_set_data("
				      "RM_PartitionNextNodeCard): %s",
				      bg_err_str(rc));
			}
		} else {
			if ((rc = bridge_set_data(bg_record->bg_block,
						  RM_PartitionFirstNodeCard,
						  ncard))
			    != SLURM_SUCCESS) {
				error("bridge_set_data("
				      "RM_PartitionFirstNodeCard): %s",
				      bg_err_str(rc));
			}
		}

#ifndef HAVE_BGL
		/* The private nodecard is released whether or not the
		 * partition accepted it. */
		if (sub_nodecard) {
			if ((free_rc = bridge_free_nodecard(ncard))
			    != SLURM_SUCCESS) {
				error("bridge_free_nodecard(): %s",
				      bg_err_str(free_rc));
				if (rc == SLURM_SUCCESS)
					rc = free_rc;
			}
		}
#endif
		if (rc != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
			goto cleanup;
		}

		nc_count++;
		if (nc_count == num_ncards)
			break;
	}

cleanup:
#ifndef HAVE_BGL
	free(nc_char);
#endif
	/* Use a separate variable here: reusing rc would wipe out an
	 * error recorded on the way to this label. */
	if (ncard_list
	    && ((free_rc = bridge_free_nodecard_list(ncard_list))
		!= SLURM_SUCCESS)) {
		error("bridge_free_nodecard_list(): %s",
		      bg_err_str(free_rc));
		return SLURM_ERROR;
	}

#endif
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
		info("making the small block");
	if (rc != SLURM_ERROR)
		rc = SLURM_SUCCESS;
	return rc;
}
/*
 * Find the specified BlueGene node ID and drain it from SLURM.
 *
 * Walks the midplanes of my_bg until the one named bp_id is found.  If
 * the bridge still reports that midplane as UP and SLURM does not
 * already consider it down, the node is drained.
 *
 * IN bp_id - ID of the midplane to look for
 * IN my_bg - bridge handle of the whole machine
 */
static void _configure_node_down(rm_bp_id_t bp_id, my_bluegene_t *my_bg)
{
	int mp_total, idx, status;
	rm_bp_id_t this_id;
	rm_BP_t *mp_ptr;
	rm_location_t loc;
	rm_BP_state_t mp_state;
	char node_name[128];
	bool same_mp;

	status = bridge_get_data(my_bg, RM_BPNum, &mp_total);
	if (status != SLURM_SUCCESS) {
		error("bridge_get_data(RM_BPNum): %s", bg_err_str(status));
		mp_total = 0;
	}

	for (idx = 0; idx < mp_total; idx++) {
		/* The first midplane and all following ones are fetched
		 * with different bridge requests. */
		if (idx == 0) {
			status = bridge_get_data(my_bg, RM_FirstBP, &mp_ptr);
			if (status != SLURM_SUCCESS) {
				error("bridge_get_data(RM_FirstBP): %s",
				      bg_err_str(status));
				continue;
			}
		} else {
			status = bridge_get_data(my_bg, RM_NextBP, &mp_ptr);
			if (status != SLURM_SUCCESS) {
				error("bridge_get_data(RM_NextBP): %s",
				      bg_err_str(status));
				continue;
			}
		}

		status = bridge_get_data(mp_ptr, RM_BPID, &this_id);
		if (status != SLURM_SUCCESS) {
			error("bridge_get_data(RM_BPID): %s",
			      bg_err_str(status));
			continue;
		}

		if (!this_id) {
			error("No BPID was returned from database");
			continue;
		}

		/* The ID is only needed for the comparison. */
		same_mp = (strcmp(bp_id, this_id) == 0);
		free(this_id);
		if (!same_mp)
			continue;	/* different midplane */

		status = bridge_get_data(mp_ptr, RM_BPState, &mp_state);
		if (status != SLURM_SUCCESS) {
			error("bridge_get_data(RM_BPState): %s",
			      bg_err_str(status));
			continue;
		}
		if (mp_state != RM_BP_UP)	/* already down */
			continue;

		status = bridge_get_data(mp_ptr, RM_BPLoc, &loc);
		if (status != SLURM_SUCCESS) {
			error("bridge_get_data(RM_BPLoc): %s",
			      bg_err_str(status));
			continue;
		}

		/* make sure we have this midplane in the system */
		if (loc.X >= DIM_SIZE[X]
		    || loc.Y >= DIM_SIZE[Y]
		    || loc.Z >= DIM_SIZE[Z]) {
			debug4("node %s%c%c%c isn't configured",
			       bg_conf->slurm_node_prefix,
			       alpha_num[loc.X],
			       alpha_num[loc.Y],
			       alpha_num[loc.Z]);
			continue;
		}

		snprintf(node_name, sizeof(node_name), "%s%c%c%c",
			 bg_conf->slurm_node_prefix,
			 alpha_num[loc.X],
			 alpha_num[loc.Y],
			 alpha_num[loc.Z]);

		/* Drain the node unless SLURM already has it down; in
		 * both cases the search is over. */
		if (!node_already_down(node_name)) {
			error("switch for node %s is bad", node_name);
			slurm_drain_nodes(node_name,
					  "select_bluegene: MMCS switch not UP",
					  slurm_get_slurm_user_id());
		}
		break;
	}
}
/**
 * Count the midplanes and switches a block uses and, when built with
 * HAVE_BG_FILES, add each used midplane and the wiring of its switches
 * to the bridge description of the block.
 *
 * IN/OUT bg_record - block being created; mp_count and switch_count are
 *	recomputed from ba_mp_list
 * RET SLURM_ERROR if the block has no ba_mp_list or a midplane could not
 *	be located, otherwise SLURM_SUCCESS.  A midplane whose ID or
 *	switches can not be obtained is reported and skipped without
 *	failing the call.
 */
extern int configure_block_switches(bg_record_t * bg_record)
{
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	ba_mp_t *ba_node = NULL;
#if defined HAVE_BG_FILES
	char *mpid = NULL;
	int first_mp=1;
	int first_switch=1;
	int i = 0;
	rm_BP_t *curr_mp = NULL;
	rm_switch_t *coord_switch[SYSTEM_DIMENSIONS];
#endif
	if (!bg_record->ba_mp_list) {
		error("There was no block_list given, can't create block");
		return SLURM_ERROR;
	}

	bg_record->switch_count = 0;
	bg_record->mp_count = 0;

	/* First pass: only count what is going to be used. */
	itr = list_iterator_create(bg_record->ba_mp_list);
	while ((ba_node = list_next(itr))) {
		if (ba_node->used) {
			bg_record->mp_count++;
		}
		bg_record->switch_count += _used_switches(ba_node);
	}
#if defined HAVE_BG_FILES
	if ((rc = bridge_set_data(bg_record->bg_block,
				  RM_PartitionBPNum,
				  &bg_record->mp_count))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data: RM_PartitionBPNum: %s",
		      bg_err_str(rc));
		rc = SLURM_ERROR;
		goto cleanup;
	}
	if ((rc = bridge_set_data(bg_record->bg_block,
				  RM_PartitionSwitchNum,
				  &bg_record->switch_count))
	    != SLURM_SUCCESS) {
		fatal("bridge_set_data: RM_PartitionSwitchNum: %s",
		      bg_err_str(rc));
		rc = SLURM_ERROR;
		goto cleanup;
	}
#endif
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
		info("MP count %d", bg_record->mp_count);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
		info("switch count %d", bg_record->switch_count);

	/* Second pass: hand the midplanes and switches to the bridge.
	 * The iterator is released exactly once, at the end of the
	 * function. */
	list_iterator_reset(itr);
	while ((ba_node = list_next(itr))) {
#if defined HAVE_BG_FILES
		if (_get_mp_by_location(bg, ba_node->coord, &curr_mp)
		    == SLURM_ERROR) {
			rc = SLURM_ERROR;
			goto cleanup;
		}
#endif
		if (!ba_node->used) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
				info("%c%c%c is a passthrough, "
				     "not including in request",
				     alpha_num[ba_node->coord[X]],
				     alpha_num[ba_node->coord[Y]],
				     alpha_num[ba_node->coord[Z]]);
		} else {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_WIRES)
				info("using node %c%c%c",
				     alpha_num[ba_node->coord[X]],
				     alpha_num[ba_node->coord[Y]],
				     alpha_num[ba_node->coord[Z]]);
#if defined HAVE_BG_FILES
			if (first_mp){
				if ((rc = bridge_set_data(bg_record->bg_block,
							  RM_PartitionFirstBP,
							  curr_mp))
				    != SLURM_SUCCESS) {
					fatal("bridge_set_data("
					      "RM_PartitionFirstBP): %s",
					      bg_err_str(rc));
				}
				first_mp = 0;
			} else {
				if ((rc = bridge_set_data(bg_record->bg_block,
							  RM_PartitionNextBP,
							  curr_mp))
				    != SLURM_SUCCESS) {
					fatal("bridge_set_data"
					      "(RM_PartitionNextBP): %s",
					      bg_err_str(rc));
				}
			}
#endif
		}
#if defined HAVE_BG_FILES
		/* Passthrough midplanes still contribute switches, so
		 * the switch lookup runs for used and unused ones. */
		if ((rc = bridge_get_data(curr_mp, RM_BPID, &mpid))
		    != SLURM_SUCCESS) {
			fatal("bridge_get_data: RM_BPID: %s",
			      bg_err_str(rc));
		}

		if (!mpid) {
			error("No BP ID was returned from database");
			continue;
		}
		if (_get_switches_by_mpid(bg, mpid, coord_switch)
		    != SLURM_SUCCESS) {
			error("Didn't get all the switches for mp %s", mpid);
			free(mpid);
			continue;
		}
		free(mpid);

		for (i = 0; i < SYSTEM_DIMENSIONS; i++) {
			/* A switch without any connection is simply
			 * left out of the request. */
			if (_add_switch_conns(coord_switch[i],
					      &ba_node->axis_switch[i])
			    == SLURM_SUCCESS) {
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_WIRES)
					info("adding switch dim %d", i);
				if (first_switch){
					if ((rc = bridge_set_data(
						     bg_record->bg_block,
						     RM_PartitionFirstSwitch,
						     coord_switch[i]))
					    != SLURM_SUCCESS) {
						fatal("bridge_set_data("
						      "RM_PartitionFirst"
						      "Switch): %s",
						      bg_err_str(rc));
					}

					first_switch = 0;
				} else {
					if ((rc = bridge_set_data(
						     bg_record->bg_block,
						     RM_PartitionNextSwitch,
						     coord_switch[i]))
					    != SLURM_SUCCESS) {
						fatal("bridge_set_data("
						      "RM_PartitionNext"
						      "Switch): %s",
						      bg_err_str(rc));
					}
				}
			}
		}
#endif
	}
	rc = SLURM_SUCCESS;
#if defined HAVE_BG_FILES
cleanup:
#endif
	/* Release the iterator on the normal path and on every error
	 * path that jumps to cleanup. */
	list_iterator_destroy(itr);
	return rc;
}
/*
 * Refresh the state of every block in block_list from the bridge.
 *
 * This needs to have block_state_mutex locked before hand.
 *
 * IN/OUT block_list - list of bg_record_t whose state is refreshed
 * RET 1 if the last change seen was a state update, -1 if the last event
 *	was a failed RM_PartitionState query, 0 if nothing changed (always
 *	0 when built without HAVE_BG_FILES).
 */
extern int bridge_status_update_block_list_state(List block_list)
{
	int updated = 0;
#if defined HAVE_BG_FILES
	int status;
	rm_partition_t *bridge_block = NULL;
	rm_partition_state_t state;
	uint16_t known_state;
	char *block_id = NULL;
	bg_record_t *bg_record = NULL;
	ListIterator block_itr = list_iterator_create(block_list);

	while ((bg_record = (bg_record_t *) list_next(block_itr)) != NULL) {
		if (bg_record->magic != BLOCK_MAGIC) {
			/* block is gone */
			list_remove(block_itr);
			continue;
		}
		if (!bg_record->bg_block_id)
			continue;

		block_id = bg_record->bg_block_id;
		/* State as we know it, without the error flag. */
		known_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

		status = bridge_get_block_info(block_id, &bridge_block);
		if (status != SLURM_SUCCESS) {
			if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
				if (status == BG_ERROR_INCONSISTENT_DATA) {
					debug2("got inconsistent data when "
					       "querying block %s", block_id);
					continue;
				} else if (status
					   == BG_ERROR_BLOCK_NOT_FOUND) {
					debug("block %s not found, removing "
					      "from slurm", block_id);
					/* Just set to free,
					   everything will be cleaned
					   up outside this.
					*/
					bg_record->state = BG_BLOCK_FREE;
					continue;
				}
			}

			/* A busy bridge ends the scan right here.  It
			   usually means something like rm_get_BG was
			   called which can be a very long call.
			*/
			if (status == EBUSY) {
				debug5("lock was busy, aborting");
				break;
			}

			error("bridge_get_block_info(%s): %s",
			      block_id,
			      bg_err_str(status));
			continue;
		}

		status = bridge_get_data(bridge_block, RM_PartitionState,
					 &state);
		if (status != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionState): %s",
			      bg_err_str(status));
			updated = -1;
		} else if (known_state != state) {
			debug("freeing state of Block %s was %d and now is %d",
			      bg_record->bg_block_id,
			      bg_record->state,
			      state);

			/* Carry an existing error flag over to the
			 * new state. */
			if (bg_record->state & BG_BLOCK_ERROR_FLAG)
				state |= BG_BLOCK_ERROR_FLAG;
			bg_record->state = state;
			updated = 1;
		}

		/* The bridge copy of the block is always given back. */
		status = bridge_free_block(bridge_block);
		if (status != SLURM_SUCCESS) {
			error("bridge_free_block(): %s",
			      bg_err_str(status));
		}
	}
	list_iterator_destroy(block_itr);
#endif
	return updated;
}
/*
 * Finish the job once a block has been freed: either keep the record
 * around or remove the block from the lists and from the database.
 *
 * block_state_mutex should be locked before calling this.  It is
 * released and taken again around select_g_update_block().
 *
 * IN bg_record - block that was being freed
 * IN restore - true to keep the record instead of removing it (not
 *	honored for blocks marked for destruction that span more than
 *	one midplane, nor for blocks without midplanes)
 * RET SLURM_ERROR if the record's magic does not match, SLURM_SUCCESS in
 *	every other case.
 */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	bool keep_record = restore;
	int remove_rc;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	}

	if (bg_record->modifying) {
		info("others are modifing this block %s, don't clear it up",
		     bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Somebody else still has a free pending: let the last one
	 * through do the work. */
	if (bg_record->free_cnt) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Even if the block is already in error state we need to do
	   this to avoid any overlapping blocks that may have been
	   created due to bad hardware.
	*/
	if ((bg_record->state & (~BG_BLOCK_ERROR_FLAG)) != BG_BLOCK_FREE) {
		/* Something isn't right, go mark this one in an error
		   state.
		*/
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);

		/* The record is only touched again if it is still in
		 * the main list after the mutex was released. */
		if (block_ptr_exist_in_list(bg_lists->main, bg_record))
			bg_record->destroy = 0;
		return SLURM_SUCCESS;
	}

	/* The reason restore is used on the entire list is if this
	 * was for a bunch of small blocks.  If the record is marked to
	 * be destroyed and it is bigger than 1 midplane destroy it
	 * even if restore is true.
	 */
	if (keep_record && bg_record->destroy && (bg_record->mp_count > 1))
		keep_record = false;

	/* If we are here we are done with the destroy so just reset it. */
	bg_record->destroy = 0;

	/* A bit of a sanity check to make sure blocks are being
	   removed out of all the lists.
	*/
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS) {
		debug2("_post_block_free: we are freeing block %s and "
		       "it was in the job_running list. This can happen if a "
		       "block is removed while waiting for mmcs to finish "
		       "removing the job from the block.",
		       bg_record->bg_block_id);
		num_unused_cpus += bg_record->cpu_cnt;
	}

	/* If we don't have any mp_counts force block removal */
	if (keep_record && bg_record->mp_count)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	remove_rc = bridge_block_remove(bg_record);
	if (remove_rc == SLURM_SUCCESS) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: done %s(%p)",
			     bg_record->bg_block_id, bg_record);
	} else if (remove_rc == BG_ERROR_BLOCK_NOT_FOUND) {
		debug("_post_block_free: block %s is not found",
		      bg_record->bg_block_id);
	} else {
		error("_post_block_free: "
		      "bridge_block_remove(%s): %s",
		      bg_record->bg_block_id,
		      bg_err_str(remove_rc));
	}

	/* The record is destroyed whatever the bridge answered. */
	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
/*
 * Check the hardware backing a block for nodecards that are not usable.
 *
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_node, so if slurmctld_locked is set we will call the
 * draining function without locking the lock again.
 *
 * For a small block only the block's own nodecard list is tested; for any
 * other block every midplane of the block is handed to
 * _test_down_nodecards().  The scan stops at the first failure.
 *
 * IN bg_block_id      - id of the block to check; NULL means the block was
 *                       never really created and there is nothing to check
 * IN slurmctld_locked - passed through to the nodecard test helpers
 * RET SLURM_SUCCESS if nothing bad was found (always, when built without
 *     HAVE_BG_FILES), SLURM_ERROR on a bridge failure or a bad nodecard
 */
extern int bridge_block_check_mp_states(char *bg_block_id,
					bool slurmctld_locked)
{
	int rc = SLURM_SUCCESS;
#if defined HAVE_BG_FILES
	rm_partition_t *block_ptr = NULL;
	rm_BP_t *bp_ptr = NULL;
	int cnt = 0;
	int i = 0;
	int free_rc;
	bool small = false;

	/* If no bg_record->bg_block_id we don't need to check this
	   since this block isn't really created.
	*/
	if (!bg_block_id)
		return SLURM_SUCCESS;

	if ((rc = bridge_get_block(bg_block_id, &block_ptr))
	    != SLURM_SUCCESS) {
		error("Block %s doesn't exist.", bg_block_id);
		rc = SLURM_ERROR;
		/* Nothing was fetched, so skip the free in cleanup. */
		goto done;
	}

	if ((rc = bridge_get_data(block_ptr, RM_PartitionSmall, &small))
	    != SLURM_SUCCESS) {
		error("bridge_get_data(RM_PartitionSmall): %s",
		      bg_err_str(rc));
		rc = SLURM_ERROR;
		goto cleanup;
	}

	if (small) {
		rm_nodecard_t *ncard = NULL;
		char *node_name = NULL;

		/* If this is a small block we can just check the
		   nodecard list of the block.
		*/
		if ((rc = bridge_get_data(block_ptr,
					  RM_PartitionNodeCardNum, &cnt))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionNodeCardNum): %s",
			      bg_err_str(rc));
			rc = SLURM_ERROR;
			goto cleanup;
		}

		if ((rc = bridge_get_data(block_ptr,
					  RM_PartitionFirstBP, &bp_ptr))
		    != SLURM_SUCCESS) {
			error("bridge_get_data(RM_PartitionFirstBP): %s",
			      bg_err_str(rc));
			rc = SLURM_ERROR;
			goto cleanup;
		}

		/* NOTE(review): the helper is expected to leave its status
		 * in errno when it returns NULL -- confirm against
		 * _get_bp_node_name(). */
		if (!(node_name = _get_bp_node_name(bp_ptr))) {
			rc = errno;
			goto cleanup;
		}

		for (i = 0; i < cnt; i++) {
			int nc_id = 0;

			if (i) {
				if ((rc = bridge_get_data(
					     block_ptr,
					     RM_PartitionNextNodeCard,
					     &ncard))
				    != SLURM_SUCCESS) {
					error("bridge_get_data("
					      "RM_PartitionNextNodeCard): %s",
					      bg_err_str(rc));
					rc = SLURM_ERROR;
					break;
				}
			} else {
				if ((rc = bridge_get_data(
					     block_ptr,
					     RM_PartitionFirstNodeCard,
					     &ncard))
				    != SLURM_SUCCESS) {
					error("bridge_get_data("
					      "RM_PartitionFirstNodeCard): %s",
					      bg_err_str(rc));
					rc = SLURM_ERROR;
					break;
				}
			}
#ifdef HAVE_BGL
			bridge_find_nodecard_num(block_ptr, ncard, &nc_id);
#endif
			/* If we find any nodecards in an error state
			   just break here since we are seeing if we
			   can run.  If any nodecard is down this can't
			   happen.
			*/
			if (_test_nodecard_state(ncard, nc_id, node_name,
						 slurmctld_locked)
			    != SLURM_SUCCESS) {
				rc = SLURM_ERROR;
				break;
			}
		}
		xfree(node_name);
		goto cleanup;
	}

	/* If this isn't a small block we have to check the list of
	   nodecards on each midplane.
	*/
	if ((rc = bridge_get_data(block_ptr, RM_PartitionBPNum, &cnt))
	    != SLURM_SUCCESS) {
		error("bridge_get_data(RM_PartitionBPNum): %s",
		      bg_err_str(rc));
		rc = SLURM_ERROR;
		goto cleanup;
	}

	for (i = 0; i < cnt; i++) {
		if (i) {
			if ((rc = bridge_get_data(block_ptr,
						  RM_PartitionNextBP,
						  &bp_ptr))
			    != SLURM_SUCCESS) {
				error("bridge_get_data("
				      "RM_PartitionNextBP): %s",
				      bg_err_str(rc));
				rc = SLURM_ERROR;
				break;
			}
		} else {
			if ((rc = bridge_get_data(block_ptr,
						  RM_PartitionFirstBP,
						  &bp_ptr))
			    != SLURM_SUCCESS) {
				error("bridge_get_data("
				      "RM_PartitionFirstBP): %s",
				      bg_err_str(rc));
				rc = SLURM_ERROR;
				break;
			}
		}

		/* If we find any nodecards in an error state just
		   break here since we are seeing if we can run.  If
		   any nodecard is down this can't happen.
		*/
		if (_test_down_nodecards(bp_ptr, slurmctld_locked)
		    != SLURM_SUCCESS) {
			rc = SLURM_ERROR;
			break;
		}
	}

cleanup:
	/* Log, but do not propagate, a failure to release the block so the
	 * result of the state check in rc is what the caller gets back. */
	if ((free_rc = bridge_free_block(block_ptr)) != SLURM_SUCCESS)
		error("bridge_free_block(): %s", bg_err_str(free_rc));
done:
#endif
	return rc;
}