Пример #1
0
/* block_state_mutex should be locked before calling this */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int rc = SLURM_SUCCESS;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;
	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	} else if (bg_record->modifying) {
		info("others are modifing this block %s, don't clear it up",
		     bg_record->bg_block_id);
		return SLURM_SUCCESS;
	} else if (bg_record->free_cnt) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Even if the block is already in error state we need to do this to
	   avoid any overlapping blocks that may have been created due
	   to bad hardware.
	*/
	if ((bg_record->state & (~BG_BLOCK_ERROR_FLAG)) != BG_BLOCK_FREE) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		if (block_ptr_exist_in_list(bg_lists->main, bg_record))
			bg_record->destroy = 0;
		return SLURM_SUCCESS;
	}

	/* The reason restore is used on the entire list is if this
	 * was for a bunch of small blocks.  If we record is marked to
	 * be destroyed and it is bigger than 1 midplane destroy it
	 * even if restore is true.
	 */
	 if (restore && bg_record->destroy && (bg_record->mp_count > 1))
		restore = false;

	/* If we are here we are done with the destroy so just reset it. */
	bg_record->destroy = 0;

	/* A bit of a sanity check to make sure blocks are being
	   removed out of all the lists.
	*/
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS) {
		debug2("_post_block_free: we are freeing block %s and "
		       "it was in the job_running list.  This can happen if a "
		       "block is removed while waiting for mmcs to finish "
		       "removing the job from the block.",
		       bg_record->bg_block_id);
		num_unused_cpus += bg_record->cpu_cnt;
	}

	/* If we don't have any mp_counts force block removal */
	if (restore && bg_record->mp_count)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	rc = bridge_block_remove(bg_record);
	if (rc != SLURM_SUCCESS) {
		if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
			debug("_post_block_free: block %s is not found",
			      bg_record->bg_block_id);
		} else {
			error("_post_block_free: "
			      "bridge_block_remove(%s): %s",
			      bg_record->bg_block_id,
			      bg_err_str(rc));
		}
	} else if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: done %s(%p)",
		     bg_record->bg_block_id, bg_record);

	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
Пример #2
0
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int count = 0;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	while (count < MAX_FREE_RETRIES) {
		/* block was removed */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}
		/* Reset these here so we don't try to reboot it
		   when the state goes to free.
		*/
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		/* Here we don't need to check if the block is still
		 * in exsistance since this function can't be called on
		 * the same block twice.  It may
		 * had already been removed at this point also.
		 */
#ifdef HAVE_BG_FILES
		if (bg_record->state != BG_BLOCK_FREE
		    && bg_record->state != BG_BLOCK_TERM) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);
			rc = bridge_block_free(bg_record);
			if (rc != SLURM_SUCCESS) {
				if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/* If the state is error and
					   we get an incompatible
					   state back here, it means
					   we set it ourselves so
					   break out.
					*/
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if (bg_record->state != BG_BLOCK_FREE
					    && bg_record->state
					    != BG_BLOCK_TERM)
					bg_record->state = BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* Fake a free since we are n deallocating
		   state before this.
		*/
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* This will set the state to ERROR(Free)
			 * just incase the state was ERROR(SOMETHING ELSE) */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		} else if (!wait || (count >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		if (!wait || (bg_record->state == BG_BLOCK_FREE)
#ifndef HAVE_BGL
		    ||  (bg_record->state & BG_BLOCK_ERROR_FLAG)
#endif
			) {
			break;
		}
		/* If we were locked outside of this we need to unlock
		   to not cause deadlock on this mutex until we are
		   done.
		*/
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		count++;
		slurm_mutex_lock(&block_state_mutex);
	}

	rc = SLURM_SUCCESS;
	if ((bg_record->state == BG_BLOCK_FREE)
	    || (bg_record->state & BG_BLOCK_ERROR_FLAG)) {

		if (bg_record->err_ratio
		    && (bg_record->state == BG_BLOCK_FREE)) {
			/* Sometime the realtime server can report
			   software error on cnodes even though the
			   block is free.  If this is the case we need
			   to manually clear them.
			*/
			ba_mp_t *found_ba_mp;
			ListIterator itr =
				list_iterator_create(bg_record->ba_mp_list);
			debug("Block %s is free, but has %u cnodes in error.  "
			      "This can happen if a large block goes into "
			      "error and then is freed and the state of "
			      "the block changes before the "
			      "database informs all the cnodes are back to "
			      "normal.  This is no big deal.",
			      bg_record->bg_block_id, bg_record->cnode_err_cnt);
			while ((found_ba_mp = list_next(itr))) {
				if (!found_ba_mp->used)
					continue;

				if (!found_ba_mp->cnode_err_bitmap)
					found_ba_mp->cnode_err_bitmap =
						bit_alloc(
							bg_conf->mp_cnode_cnt);

				bit_nclear(found_ba_mp->cnode_err_bitmap, 0,
					   bit_size(found_ba_mp->
						    cnode_err_bitmap)-1);
			}
			list_iterator_destroy(itr);
			bg_record->cnode_err_cnt = 0;
			bg_record->err_ratio = 0;
		}

		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (count >= MAX_FREE_RETRIES) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}
	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
Пример #3
0
/* block_state_mutex should be locked before calling this */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int rc = SLURM_SUCCESS;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	} else if (bg_record->modifying) {
		info("%d others are modifing this block %s",
		     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	} else if (bg_record->free_cnt) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	if (!(bg_record->state & BG_BLOCK_ERROR_FLAG)
	    && (bg_record->state != BG_BLOCK_FREE)) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		return SLURM_SUCCESS;
	}

	/* A bit of a sanity check to make sure blocks are being
	   removed out of all the lists.
	*/
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS)
		num_unused_cpus += bg_record->cpu_cnt;

	if (restore)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	rc = bridge_block_remove(bg_record);
	if (rc != SLURM_SUCCESS) {
		if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
			debug("_post_block_free: block %s is not found",
			      bg_record->bg_block_id);
		} else {
			error("_post_block_free: "
			      "bridge_block_remove(%s): %s",
			      bg_record->bg_block_id,
			      bg_err_str(rc));
		}
	} else if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: done %s(%p)",
		     bg_record->bg_block_id, bg_record);

	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
Пример #4
0
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int count = 0;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	while (count < MAX_FREE_RETRIES) {
		/* block was removed */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}
		/* Reset these here so we don't try to reboot it
		   when the state goes to free.
		*/
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		/* Here we don't need to check if the block is still
		 * in exsistance since this function can't be called on
		 * the same block twice.  It may
		 * had already been removed at this point also.
		 */
#ifdef HAVE_BG_FILES
		if (bg_record->state != BG_BLOCK_FREE
		    && bg_record->state != BG_BLOCK_TERM) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);
			rc = bridge_block_free(bg_record);
			if (rc != SLURM_SUCCESS) {
				if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/* If the state is error and
					   we get an incompatible
					   state back here, it means
					   we set it ourselves so
					   break out.
					*/
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if (bg_record->state != BG_BLOCK_FREE
					    && bg_record->state
					    != BG_BLOCK_TERM)
					bg_record->state = BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* Fake a free since we are n deallocating
		   state before this.
		*/
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* This will set the state to ERROR(Free)
			 * just incase the state was ERROR(SOMETHING ELSE) */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		} else if (!wait || (count >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		if (!wait || (bg_record->state == BG_BLOCK_FREE)
#ifndef HAVE_BGL
		    ||  (bg_record->state & BG_BLOCK_ERROR_FLAG)
#endif
			) {
			break;
		}
		/* If we were locked outside of this we need to unlock
		   to not cause deadlock on this mutex until we are
		   done.
		*/
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		count++;
		slurm_mutex_lock(&block_state_mutex);
	}

	rc = SLURM_SUCCESS;
	if ((bg_record->state == BG_BLOCK_FREE)
	    || (bg_record->state & BG_BLOCK_ERROR_FLAG))
		remove_from_bg_list(bg_lists->booted, bg_record);
	else if (count >= MAX_FREE_RETRIES) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}
	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
Пример #5
0
extern int update_state_block(GtkDialog *dialog,
			      const char *blockid, const char *type)
{
	int i = 0;
	int rc = SLURM_SUCCESS;
	char tmp_char[100];
	update_block_msg_t block_msg;
	GtkWidget *label = NULL;
	int no_dialog = 0;

	if (!dialog) {
		dialog = GTK_DIALOG(
			gtk_dialog_new_with_buttons(
				type,
				GTK_WINDOW(main_window),
				GTK_DIALOG_MODAL
				| GTK_DIALOG_DESTROY_WITH_PARENT,
				NULL));
		no_dialog = 1;
	}
	slurm_init_update_block_msg(&block_msg);
	block_msg.bg_block_id = (char *)blockid;

	label = gtk_dialog_add_button(dialog,
				      GTK_STOCK_YES, GTK_RESPONSE_OK);
	gtk_window_set_default(GTK_WINDOW(dialog), label);
	gtk_dialog_add_button(dialog,
			      GTK_STOCK_CANCEL, GTK_RESPONSE_CANCEL);

	if (!xstrcasecmp("Error", type) ||
	    !xstrcasecmp("Put block in error state", type)) {
		snprintf(tmp_char, sizeof(tmp_char),
			 "Are you sure you want to put block %s "
			 "in an error state?",
			 blockid);
		block_msg.state = BG_BLOCK_ERROR_FLAG;
	} else if (!xstrcasecmp("Recreate block", type)) {
		snprintf(tmp_char, sizeof(tmp_char),
			 "Are you sure you want to recreate block %s?",
			 blockid);
		block_msg.state = BG_BLOCK_BOOTING;
	} else if (!xstrcasecmp("Remove block", type)) {
		snprintf(tmp_char, sizeof(tmp_char),
			 "Are you sure you want to remove block %s?",
			 blockid);
		block_msg.state = BG_BLOCK_NAV;
	} else if (!xstrcasecmp("Resume block", type)) {
		snprintf(tmp_char, sizeof(tmp_char),
			 "Are you sure you want to resume block %s?",
			 blockid);
		block_msg.state = BG_BLOCK_TERM;
	} else {
		snprintf(tmp_char, sizeof(tmp_char),
			 "Are you sure you want to put block %s "
			 "in a free state?",
			 blockid);
		block_msg.state = BG_BLOCK_FREE;
	}

	label = gtk_label_new(tmp_char);

	gtk_box_pack_start(GTK_BOX(dialog->vbox), label, FALSE, FALSE, 0);

	gtk_widget_show_all(GTK_WIDGET(dialog));
	i = gtk_dialog_run(dialog);
	if (i == GTK_RESPONSE_OK) {
		if (slurm_update_block(&block_msg)
		    == SLURM_SUCCESS) {
			snprintf(tmp_char, sizeof(tmp_char),
				 "Block %s updated successfully",
				 blockid);
		} else {
			snprintf(tmp_char, sizeof(tmp_char),
				 "Problem updating block %s.",
				 blockid);
		}
		display_edit_note(tmp_char);
	}

	if (no_dialog)
		gtk_widget_destroy(GTK_WIDGET(dialog));
	return rc;
}