/*
 * Finish the teardown of a block after a free attempt on it.
 *
 * block_state_mutex should be locked before calling this.  The mutex is
 * released and re-acquired around select_g_update_block() when the block
 * has to be put into an error state.
 *
 * bg_record - block being freed; its free_cnt is decremented here and the
 *             record may be destroyed before this function returns.
 * restore   - when true the record is normally left in bg_lists->main
 *             instead of being removed and destroyed (see the exceptions
 *             in the body).
 *
 * Returns SLURM_ERROR if the record's magic is not BLOCK_MAGIC, otherwise
 * SLURM_SUCCESS.
 */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	bool keep_record = restore;
	int remove_rc;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	/* More decrements than free requests: this should never happen. */
	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	}

	if (bg_record->modifying) {
		info("others are modifing this block %s, don't clear it up",
		     bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Only the last caller interested in this block cleans it up. */
	if (bg_record->free_cnt != 0) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/*
	 * This is done even when the block is already in an error state, to
	 * avoid any overlapping blocks that may have been created due to
	 * bad hardware.
	 */
	if ((bg_record->state & (~BG_BLOCK_ERROR_FLAG)) != BG_BLOCK_FREE) {
		/* The block never reached the free state: flag it as bad. */
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		/* The update is issued with block_state_mutex released. */
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);

		/*
		 * Touch the record again only if it is still present in the
		 * main list after the mutex was dropped.
		 */
		if (block_ptr_exist_in_list(bg_lists->main, bg_record))
			bg_record->destroy = 0;
		return SLURM_SUCCESS;
	}

	/*
	 * restore is applied to a whole list because the request may have
	 * been for a bunch of small blocks.  A record that is marked to be
	 * destroyed and is bigger than 1 midplane is destroyed even if
	 * restore is true.
	 */
	if (keep_record && bg_record->destroy && (bg_record->mp_count > 1))
		keep_record = false;

	/* The destroy request has been handled at this point, so reset it. */
	bg_record->destroy = 0;

	/* Sanity check: make sure the block is out of all the other lists. */
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS) {
		debug2("_post_block_free: we are freeing block %s and "
		       "it was in the job_running list. This can happen if a "
		       "block is removed while waiting for mmcs to finish "
		       "removing the job from the block.",
		       bg_record->bg_block_id);
		num_unused_cpus += bg_record->cpu_cnt;
	}

	/* A record without any mp_count is removed even when restoring. */
	if (keep_record && bg_record->mp_count)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/*
		 * This should only happen if called from bg_job_place.c
		 * where the block was never added to the list.
		 */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	remove_rc = bridge_block_remove(bg_record);
	if (remove_rc == SLURM_SUCCESS) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: done %s(%p)",
			     bg_record->bg_block_id, bg_record);
	} else if (remove_rc == BG_ERROR_BLOCK_NOT_FOUND) {
		debug("_post_block_free: block %s is not found",
		      bg_record->bg_block_id);
	} else {
		error("_post_block_free: "
		      "bridge_block_remove(%s): %s",
		      bg_record->bg_block_id, bg_err_str(remove_rc));
	}

	/* The record is destroyed whatever bridge_block_remove() returned. */
	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
/*
 * Ask for a block to be freed and, optionally, wait until it is.
 *
 * bg_record - block to free.
 * wait      - when true, keep retrying (sleeping FREE_SLEEP_INTERVAL
 *             between attempts, up to MAX_FREE_RETRIES attempts) until
 *             the block reaches the free state or an error state.
 * locked    - true when the caller already holds block_state_mutex; when
 *             false the mutex is taken and released here.  Either way the
 *             mutex is dropped while sleeping between attempts and while
 *             select_g_update_block() is called.
 *
 * Returns SLURM_ERROR when bg_record is NULL or when all retries were used
 * without the block becoming free (the block is then put in an error
 * state); SLURM_SUCCESS otherwise, including when the record turns out to
 * have been removed while waiting.
 */
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int attempts;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	for (attempts = 0; attempts < MAX_FREE_RETRIES; attempts++) {
		/* The block was removed while the mutex was not held. */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}

		/*
		 * Reset these here so we don't try to reboot the block when
		 * the state goes to free.
		 */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;

		/*
		 * Here we don't need to check if the block is still in
		 * existence since this function can't be called on the same
		 * block twice.  It may have already been removed at this
		 * point also.
		 */
#ifdef HAVE_BG_FILES
		if ((bg_record->state != BG_BLOCK_FREE) &&
		    (bg_record->state != BG_BLOCK_TERM)) {
			int bridge_rc;

			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);

			bridge_rc = bridge_block_free(bg_record);
			if (bridge_rc != SLURM_SUCCESS) {
				if (bridge_rc == BG_ERROR_BLOCK_NOT_FOUND) {
					/* Nothing left to free. */
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (bridge_rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(bridge_rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (bridge_rc
					   == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/*
					 * If the state is error and we get
					 * an incompatible state back here,
					 * it means we set it ourselves, so
					 * stop retrying.
					 */
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(bridge_rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if ((bg_record->state
					     != BG_BLOCK_FREE) &&
					    (bg_record->state
					     != BG_BLOCK_TERM))
						bg_record->state =
							BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(bridge_rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* No bridge available: fake the free by walking the state. */
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/*
			 * This sets the state to ERROR(Free) just in case
			 * the state was ERROR(something else).
			 */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		}
		if (!wait || (attempts >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		/* Stop when not asked to wait or once the block is free. */
		if (!wait || (bg_record->state == BG_BLOCK_FREE))
			break;
#ifndef HAVE_BGL
		/* An error state also ends the wait. */
		if (bg_record->state & BG_BLOCK_ERROR_FLAG)
			break;
#endif

		/*
		 * Even if the mutex was locked outside of this function it
		 * has to be released while sleeping so we do not cause a
		 * deadlock on it until we are done.
		 */
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		slurm_mutex_lock(&block_state_mutex);
	}

	if ((bg_record->state == BG_BLOCK_FREE) ||
	    (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
		if (bg_record->err_ratio &&
		    (bg_record->state == BG_BLOCK_FREE)) {
			/*
			 * Sometimes the realtime server can report software
			 * errors on cnodes even though the block is free.
			 * If this is the case we need to manually clear
			 * them.
			 */
			ba_mp_t *mp;
			ListIterator mp_itr =
				list_iterator_create(bg_record->ba_mp_list);

			debug("Block %s is free, but has %u cnodes in error. "
			      "This can happen if a large block goes into "
			      "error and then is freed and the state of "
			      "the block changes before the "
			      "database informs all the cnodes are back to "
			      "normal. This is no big deal.",
			      bg_record->bg_block_id,
			      bg_record->cnode_err_cnt);

			for (mp = list_next(mp_itr); mp;
			     mp = list_next(mp_itr)) {
				if (!mp->used)
					continue;
				/* Allocate the bitmap on first use. */
				if (!mp->cnode_err_bitmap)
					mp->cnode_err_bitmap = bit_alloc(
						bg_conf->mp_cnode_cnt);
				/* Clear every bit of the error bitmap. */
				bit_nclear(mp->cnode_err_bitmap, 0,
					   bit_size(mp->cnode_err_bitmap)-1);
			}
			list_iterator_destroy(mp_itr);

			bg_record->cnode_err_cnt = 0;
			bg_record->err_ratio = 0;
		}
		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (attempts >= MAX_FREE_RETRIES) {
		/* Every retry was used up: mark the block in an error state. */
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		/* The update is issued with block_state_mutex released. */
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
/*
 * Finish the teardown of a block after a free attempt on it.
 *
 * block_state_mutex should be locked before calling this.  The mutex is
 * released and re-acquired around select_g_update_block() when the block
 * has to be put into an error state.
 *
 * bg_record - block being freed; its free_cnt is decremented here and the
 *             record may be destroyed before this function returns.
 * restore   - when true the record is left in bg_lists->main instead of
 *             being removed and destroyed.
 *
 * Returns SLURM_ERROR if the record's magic is not BLOCK_MAGIC, otherwise
 * SLURM_SUCCESS.
 */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int remove_rc;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	/* More decrements than free requests: this should never happen. */
	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	}

	if (bg_record->modifying) {
		info("%d others are modifing this block %s",
		     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Only the last caller interested in this block cleans it up. */
	if (bg_record->free_cnt != 0) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Neither flagged as in error nor free: the free did not work. */
	if (((bg_record->state & BG_BLOCK_ERROR_FLAG) == 0) &&
	    (bg_record->state != BG_BLOCK_FREE)) {
		/* Something isn't right, go mark this one in an error state. */
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		/* The update is issued with block_state_mutex released. */
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);
		return SLURM_SUCCESS;
	}

	/* Sanity check: make sure the block is out of all the other lists. */
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS)
		num_unused_cpus += bg_record->cpu_cnt;

	/* When restoring, the record stays in the main list untouched. */
	if (restore)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/*
		 * This should only happen if called from bg_job_place.c
		 * where the block was never added to the list.
		 */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	remove_rc = bridge_block_remove(bg_record);
	if (remove_rc == SLURM_SUCCESS) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: done %s(%p)",
			     bg_record->bg_block_id, bg_record);
	} else if (remove_rc == BG_ERROR_BLOCK_NOT_FOUND) {
		debug("_post_block_free: block %s is not found",
		      bg_record->bg_block_id);
	} else {
		error("_post_block_free: "
		      "bridge_block_remove(%s): %s",
		      bg_record->bg_block_id, bg_err_str(remove_rc));
	}

	/* The record is destroyed whatever bridge_block_remove() returned. */
	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
/*
 * Ask for a block to be freed and, optionally, wait until it is.
 *
 * bg_record - block to free.
 * wait      - when true, keep retrying (sleeping FREE_SLEEP_INTERVAL
 *             between attempts, up to MAX_FREE_RETRIES attempts) until
 *             the block reaches the free state or an error state.
 * locked    - true when the caller already holds block_state_mutex; when
 *             false the mutex is taken and released here.  Either way the
 *             mutex is dropped while sleeping between attempts and while
 *             select_g_update_block() is called.
 *
 * Returns SLURM_ERROR when bg_record is NULL or when all retries were used
 * without the block becoming free (the block is then put in an error
 * state); SLURM_SUCCESS otherwise, including when the record turns out to
 * have been removed while waiting.
 */
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int attempts;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	for (attempts = 0; attempts < MAX_FREE_RETRIES; attempts++) {
		/* The block was removed while the mutex was not held. */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}

		/*
		 * Reset these here so we don't try to reboot the block when
		 * the state goes to free.
		 */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;

		/*
		 * Here we don't need to check if the block is still in
		 * existence since this function can't be called on the same
		 * block twice.  It may have already been removed at this
		 * point also.
		 */
#ifdef HAVE_BG_FILES
		if ((bg_record->state != BG_BLOCK_FREE) &&
		    (bg_record->state != BG_BLOCK_TERM)) {
			int bridge_rc;

			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);

			bridge_rc = bridge_block_free(bg_record);
			if (bridge_rc != SLURM_SUCCESS) {
				if (bridge_rc == BG_ERROR_BLOCK_NOT_FOUND) {
					/* Nothing left to free. */
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (bridge_rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(bridge_rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (bridge_rc
					   == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/*
					 * If the state is error and we get
					 * an incompatible state back here,
					 * it means we set it ourselves, so
					 * stop retrying.
					 */
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(bridge_rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if ((bg_record->state
					     != BG_BLOCK_FREE) &&
					    (bg_record->state
					     != BG_BLOCK_TERM))
						bg_record->state =
							BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(bridge_rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* No bridge available: fake the free by walking the state. */
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/*
			 * This sets the state to ERROR(Free) just in case
			 * the state was ERROR(something else).
			 */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		}
		if (!wait || (attempts >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		/* Stop when not asked to wait or once the block is free. */
		if (!wait || (bg_record->state == BG_BLOCK_FREE))
			break;
#ifndef HAVE_BGL
		/* An error state also ends the wait. */
		if (bg_record->state & BG_BLOCK_ERROR_FLAG)
			break;
#endif

		/*
		 * Even if the mutex was locked outside of this function it
		 * has to be released while sleeping so we do not cause a
		 * deadlock on it until we are done.
		 */
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		slurm_mutex_lock(&block_state_mutex);
	}

	if ((bg_record->state == BG_BLOCK_FREE) ||
	    (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
		/* Free or in error: it no longer belongs in the booted list. */
		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (attempts >= MAX_FREE_RETRIES) {
		/* Every retry was used up: mark the block in an error state. */
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		/* The update is issued with block_state_mutex released. */
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
/*
 * Ask the user to confirm a state change on a block and, if confirmed,
 * send the request with slurm_update_block().
 *
 * dialog  - dialog to add the buttons and the question to; when NULL a
 *           modal dialog titled with `type` is created here and destroyed
 *           before returning.
 * blockid - id of the block to update.
 * type    - requested action, compared case-insensitively against
 *           "Error", "Put block in error state", "Recreate block",
 *           "Remove block" and "Resume block"; anything else is treated
 *           as a request to put the block in a free state.
 *
 * The outcome of the update is shown to the user with display_edit_note().
 * Always returns SLURM_SUCCESS, even when the update failed or the user
 * cancelled.
 */
extern int update_state_block(GtkDialog *dialog, const char *blockid, const char *type)
{
	char text[100];
	update_block_msg_t request;
	GtkWidget *yes_button = NULL;
	GtkWidget *question = NULL;
	int created_here = 0;
	int response = 0;

	if (!dialog) {
		/* No dialog supplied by the caller: build a temporary one. */
		dialog = GTK_DIALOG(
			gtk_dialog_new_with_buttons(
				type,
				GTK_WINDOW(main_window),
				GTK_DIALOG_MODAL
				| GTK_DIALOG_DESTROY_WITH_PARENT,
				NULL));
		created_here = 1;
	}

	slurm_init_update_block_msg(&request);
	request.bg_block_id = (char *)blockid;

	/* "Yes" is made the default widget of the dialog. */
	yes_button = gtk_dialog_add_button(dialog,
					   GTK_STOCK_YES, GTK_RESPONSE_OK);
	gtk_window_set_default(GTK_WINDOW(dialog), yes_button);
	gtk_dialog_add_button(dialog,
			      GTK_STOCK_CANCEL, GTK_RESPONSE_CANCEL);

	/* Pick the state to request and the matching question. */
	if (!xstrcasecmp("Error", type)
	    || !xstrcasecmp("Put block in error state", type)) {
		request.state = BG_BLOCK_ERROR_FLAG;
		snprintf(text, sizeof(text),
			 "Are you sure you want to put block %s "
			 "in an error state?",
			 blockid);
	} else if (!xstrcasecmp("Recreate block", type)) {
		request.state = BG_BLOCK_BOOTING;
		snprintf(text, sizeof(text),
			 "Are you sure you want to recreate block %s?",
			 blockid);
	} else if (!xstrcasecmp("Remove block", type)) {
		request.state = BG_BLOCK_NAV;
		snprintf(text, sizeof(text),
			 "Are you sure you want to remove block %s?",
			 blockid);
	} else if (!xstrcasecmp("Resume block", type)) {
		request.state = BG_BLOCK_TERM;
		snprintf(text, sizeof(text),
			 "Are you sure you want to resume block %s?",
			 blockid);
	} else {
		request.state = BG_BLOCK_FREE;
		snprintf(text, sizeof(text),
			 "Are you sure you want to put block %s "
			 "in a free state?",
			 blockid);
	}

	question = gtk_label_new(text);
	gtk_box_pack_start(GTK_BOX(dialog->vbox), question, FALSE, FALSE, 0);
	gtk_widget_show_all(GTK_WIDGET(dialog));

	response = gtk_dialog_run(dialog);
	if (response == GTK_RESPONSE_OK) {
		/* The text buffer is reused for the result note. */
		if (slurm_update_block(&request) != SLURM_SUCCESS) {
			snprintf(text, sizeof(text),
				 "Problem updating block %s.",
				 blockid);
		} else {
			snprintf(text, sizeof(text),
				 "Block %s updated successfully",
				 blockid);
		}
		display_edit_note(text);
	}

	/* Only a dialog created in this function is destroyed here. */
	if (created_here)
		gtk_widget_destroy(GTK_WIDGET(dialog));

	return SLURM_SUCCESS;
}