Exemplo n.º 1
0
static void _update_block_record(sview_block_info_t *block_ptr,
				 GtkTreeStore *treestore)
{
	char cnode_cnt[20], cnode_cnt2[20];
	char *tmp_char = NULL, *tmp_char2 = NULL, *tmp_char3 = NULL;

	convert_num_unit((float)block_ptr->cnode_cnt, cnode_cnt,
			 sizeof(cnode_cnt), UNIT_NONE, NO_VAL,
			 working_sview_config.convert_flags);
	if (cluster_flags & CLUSTER_FLAG_BGQ) {
		convert_num_unit((float)block_ptr->cnode_err_cnt, cnode_cnt2,
				 sizeof(cnode_cnt), UNIT_NONE, NO_VAL,
				 working_sview_config.convert_flags);
		tmp_char3 = xstrdup_printf("%s/%s", cnode_cnt, cnode_cnt2);
	} else
		tmp_char3 = cnode_cnt;

	tmp_char = conn_type_string_full(block_ptr->bg_conn_type);
	tmp_char2 = _set_running_job_str(block_ptr->job_list, 0);
	/* Combining these records provides a slight performance improvement */
	gtk_tree_store_set(treestore, &block_ptr->iter_ptr,
			   SORTID_BLOCK,        block_ptr->bg_block_name,
			   SORTID_COLOR,
				sview_colors[block_ptr->color_inx],
			   SORTID_COLOR_INX,    block_ptr->color_inx,
			   SORTID_CONN,		tmp_char,
			   SORTID_IMAGEMLOADER, block_ptr->imagemloader,
			   SORTID_JOB,          tmp_char2,
			   SORTID_NODE_INX,     block_ptr->mp_inx,
			   SORTID_NODE_CNT,     tmp_char3,
			   SORTID_NODELIST,     block_ptr->mp_str,
			   SORTID_PARTITION,    block_ptr->slurm_part_name,
			   SORTID_REASON,       block_ptr->reason,
			   SORTID_SMALL_BLOCK,  block_ptr->small_block,
			   SORTID_STATE,
				bg_block_state_string(block_ptr->state),
			   SORTID_UPDATED,      1,
			   -1);
	xfree(tmp_char);
	xfree(tmp_char2);
	if (cluster_flags & CLUSTER_FLAG_BGQ)
		xfree(tmp_char3);

	if (cluster_flags & CLUSTER_FLAG_BGP) {
		gtk_tree_store_set(treestore, &block_ptr->iter_ptr,
				   SORTID_IMAGERAMDISK, block_ptr->imageramdisk,
				   SORTID_IMAGELINUX,   block_ptr->imagelinux,
				   -1);
	} else if (cluster_flags & CLUSTER_FLAG_BGL) {
		gtk_tree_store_set(treestore, &block_ptr->iter_ptr,
				   SORTID_IMAGERAMDISK, block_ptr->imageramdisk,
				   SORTID_IMAGELINUX,   block_ptr->imagelinux,
				   SORTID_IMAGEBLRTS,   block_ptr->imageblrts,
				   SORTID_USE,
					node_use_string(block_ptr->bg_node_use),
				   -1);
	}

	return;
}
Exemplo n.º 2
0
/*
 * _bg_report - download and print current bgblock state information
 */
static int _bg_report(block_info_msg_t *block_ptr)
{
	int i;

	if (!block_ptr) {
		slurm_perror("No block_ptr given");
		return SLURM_ERROR;
	}

	if (!params.no_header)
		printf("BG_BLOCK         MIDPLANES       STATE    CONNECTION USE\n");
/*                      1234567890123456 123456789012 12345678 1234567890 12345+ */
/*                      RMP_22Apr1544018 bg[123x456]  READY    TORUS      COPROCESSOR */

	for (i=0; i<block_ptr->record_count; i++) {
		char *conn_str = conn_type_string_full(
			block_ptr->block_array[i].conn_type);
		printf("%-16.16s %-15.15s %-8.8s %-10.10s %s\n",
		       block_ptr->block_array[i].bg_block_id,
		       block_ptr->block_array[i].mp_str,
		       bg_block_state_string(
			       block_ptr->block_array[i].state),
		       conn_str,
		       node_use_string(
			       block_ptr->block_array[i].node_use));
		xfree(conn_str);
	}

	return SLURM_SUCCESS;
}
Exemplo n.º 3
0
static void _update_block_record(sview_block_info_t *block_ptr,
				 GtkTreeStore *treestore, GtkTreeIter *iter)
{
	char job_running[20], cnode_cnt[20];

	if (block_ptr->job_running > NO_JOB_RUNNING)
		snprintf(job_running, sizeof(job_running),
			 "%d", block_ptr->job_running);
	else
		snprintf(job_running, sizeof(job_running), "-");

	convert_num_unit((float)block_ptr->cnode_cnt, cnode_cnt, sizeof(cnode_cnt),
			 UNIT_NONE);

	/* Combining these records provides a slight performance improvement */
	gtk_tree_store_set(treestore, iter,
			   SORTID_BLOCK,        block_ptr->bg_block_name,
			   SORTID_COLOR,
				sview_colors[block_ptr->color_inx],
			   SORTID_COLOR_INX,    block_ptr->color_inx,
			   SORTID_CONN,
				conn_type_string(block_ptr->bg_conn_type),
			   SORTID_IMAGERAMDISK, block_ptr->imageramdisk,
			   SORTID_IMAGELINUX,   block_ptr->imagelinux,
			   SORTID_IMAGEMLOADER, block_ptr->imagemloader,
			   SORTID_JOB,          job_running,
			   SORTID_NODE_INX,     block_ptr->bp_inx,
			   SORTID_MP_STR,        cnode_cnt,
			   SORTID_NODELIST,     block_ptr->mp_str,
			   SORTID_PARTITION,    block_ptr->slurm_part_name,
			   SORTID_SMALL_BLOCK,  block_ptr->small_block,
			   SORTID_STATE,
				bg_block_state_string(block_ptr->state),
			   SORTID_USER,         block_ptr->bg_user_name,
			   SORTID_UPDATED,      1,
			   -1);

	if (cluster_flags & CLUSTER_FLAG_BGL) {
		gtk_tree_store_set(treestore, iter,
				   SORTID_IMAGEBLRTS,   block_ptr->imageblrts,
				   SORTID_USE,
					node_use_string(block_ptr->bg_node_use),
				   -1);
	}

	return;
}
Exemplo n.º 4
0
/* Creates a tree model containing the completions */
void _search_entry(sview_search_info_t *sview_search_info)
{
	int id = 0;
	char title[100];
	ListIterator itr = NULL;
	popup_info_t *popup_win = NULL;
	GError *error = NULL;
	char *upper = NULL, *lower = NULL;
	char *type;

	if (cluster_flags & CLUSTER_FLAG_BG)
		type = "Midplane";
	else
		type = "Node";

	if (sview_search_info->int_data == NO_VAL &&
	    (!sview_search_info->gchar_data
	     || !strlen(sview_search_info->gchar_data))) {
		g_print("nothing given to search for.\n");
		return;
	}

	switch(sview_search_info->search_type) {
	case SEARCH_JOB_STATE:
		id = JOB_PAGE;
		upper = job_state_string(sview_search_info->int_data);
		lower = str_tolower(upper);
		snprintf(title, 100, "Job(s) in the %s state", lower);
		xfree(lower);
		break;
	case SEARCH_JOB_ID:
		id = JOB_PAGE;
		snprintf(title, 100, "Job %s info",
			 sview_search_info->gchar_data);
		break;
	case SEARCH_JOB_USER:
		id = JOB_PAGE;
		snprintf(title, 100, "Job(s) info for user %s",
			 sview_search_info->gchar_data);
		break;
	case SEARCH_BLOCK_STATE:
		id = BLOCK_PAGE;
		upper = bg_block_state_string(sview_search_info->int_data);
		lower = str_tolower(upper);
		snprintf(title, 100, "BG Block(s) in the %s state", lower);
		xfree(lower);
		break;
	case SEARCH_BLOCK_NAME:
		id = BLOCK_PAGE;
		snprintf(title, 100, "Block %s info",
			 sview_search_info->gchar_data);
		break;
	case SEARCH_BLOCK_SIZE:
		id = BLOCK_PAGE;
		sview_search_info->int_data =
			revert_num_unit(sview_search_info->gchar_data);
		if (sview_search_info->int_data == -1)
			return;
		snprintf(title, 100, "Block(s) of size %d cnodes",
			 sview_search_info->int_data);
		break;
	case SEARCH_PARTITION_NAME:
		id = PART_PAGE;
		snprintf(title, 100, "Partition %s info",
			 sview_search_info->gchar_data);
		break;
	case SEARCH_PARTITION_STATE:
		id = PART_PAGE;
		if (sview_search_info->int_data)
			snprintf(title, 100, "Partition(s) that are up");
		else
			snprintf(title, 100, "Partition(s) that are down");
		break;
	case SEARCH_NODE_NAME:
		id = NODE_PAGE;
		snprintf(title, 100, "%s(s) %s info",
			 type, sview_search_info->gchar_data);
		break;
	case SEARCH_NODE_STATE:
		id = NODE_PAGE;
		upper = node_state_string(sview_search_info->int_data);
		lower = str_tolower(upper);
		snprintf(title, 100, "%s(s) in the %s state",
			 type, lower);
		xfree(lower);

		break;
	case SEARCH_RESERVATION_NAME:
		id = RESV_PAGE;
		snprintf(title, 100, "Reservation %s info",
			 sview_search_info->gchar_data);
		break;
	default:
		g_print("unknown search type %d.\n",
			sview_search_info->search_type);

		return;
	}

	itr = list_iterator_create(popup_list);
	while ((popup_win = list_next(itr))) {
		if (popup_win->spec_info)
			if (!strcmp(popup_win->spec_info->title, title)) {
				break;
			}
	}
	list_iterator_destroy(itr);

	if (!popup_win) {
		popup_win = create_popup_info(id, id, title);
	} else {
		gtk_window_present(GTK_WINDOW(popup_win->popup));
		return;
	}
	memcpy(popup_win->spec_info->search_info, sview_search_info,
	       sizeof(sview_search_info_t));

	if (!g_thread_create((gpointer)popup_thr, popup_win, FALSE, &error))
	{
		g_printerr ("Failed to create main popup thread: %s\n",
			    error->message);
		return;
	}
	return;
}
Exemplo n.º 5
0
extern int bg_status_update_block_state(bg_record_t *bg_record,
					uint16_t state,
					List kill_job_list)
{
	bool skipped_dealloc = false;
	kill_job_struct_t *freeit = NULL;
	int updated = 0;
	uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

	if (real_state == state)
		return 0;

	debug("state of Block %s was %s and now is %s",
	      bg_record->bg_block_id,
	      bg_block_state_string(bg_record->state),
	      bg_block_state_string(state));

	/*
	  check to make sure block went
	  through freeing correctly
	*/
	if ((real_state != BG_BLOCK_TERM
	     && !(bg_record->state & BG_BLOCK_ERROR_FLAG))
	    && state == BG_BLOCK_FREE)
		skipped_dealloc = 1;
	else if ((real_state == BG_BLOCK_INITED)
		 && (state == BG_BLOCK_BOOTING)) {
		/* This means the user did a reboot through
		   mpirun but we missed the state
		   change */
		debug("Block %s skipped rebooting, "
		      "but it really is.  "
		      "Setting target_name back to %s",
		      bg_record->bg_block_id,
		      bg_record->user_name);
		xfree(bg_record->target_name);
		bg_record->target_name = xstrdup(bg_record->user_name);
	} else if ((real_state == BG_BLOCK_TERM)
		   && (state == BG_BLOCK_BOOTING))
		/* This is a funky state IBM says
		   isn't a bug, but all their
		   documentation says this doesn't
		   happen, but IBM says oh yeah, you
		   weren't really suppose to notice
		   that. So we will just skip this
		   state and act like this didn't happen. */
		goto nochange_state;
	real_state = state;
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		state |= BG_BLOCK_ERROR_FLAG;

	bg_record->state = state;

	if (real_state == BG_BLOCK_TERM || skipped_dealloc)
		_block_is_deallocating(bg_record, kill_job_list);
	else if (real_state == BG_BLOCK_BOOTING) {
		debug("Setting bootflag for %s", bg_record->bg_block_id);
		bg_record->boot_state = 1;
	} else if (real_state == BG_BLOCK_FREE) {
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
		remove_from_bg_list(bg_lists->booted,
				    bg_record);
	} else if (real_state & BG_BLOCK_ERROR_FLAG) {
		if (bg_record->boot_state)
			error("Block %s in an error state while booting.",
			      bg_record->bg_block_id);
		else
			error("Block %s in an error state.",
			      bg_record->bg_block_id);
		remove_from_bg_list(bg_lists->booted, bg_record);
		trigger_block_error();
	} else if (real_state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	}
	updated = 1;
nochange_state:

	/* check the boot state */
	debug3("boot state for block %s is %d",
	       bg_record->bg_block_id, bg_record->boot_state);
	if (bg_record->boot_state) {
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* If we get an error on boot that
			 * means it is a transparent L3 error
			 * and should be trying to fix
			 * itself.  If this is the case we
			 * just hang out waiting for the state
			 * to go to free where we will try to
			 * boot again below.
			 */
			return updated;
		}

		switch (real_state) {
		case BG_BLOCK_BOOTING:
			debug3("checking to make sure user %s "
			       "is the user.",
			       bg_record->target_name);

			if (update_block_user(bg_record, 0) == 1)
				last_bg_update = time(NULL);
			if (bg_record->job_ptr) {
				bg_record->job_ptr->job_state |=
					JOB_CONFIGURING;
				last_job_update = time(NULL);
			}
			break;
		case BG_BLOCK_FREE:
			if (bg_record->boot_count < RETRY_BOOT_COUNT) {
				bridge_block_boot(bg_record);

				if (bg_record->magic == BLOCK_MAGIC) {
					debug("boot count for block %s is %d",
					      bg_record->bg_block_id,
					      bg_record->boot_count);
					bg_record->boot_count++;
				}
			} else {
				char *reason = (char *)
					"status_check: Boot fails ";

				error("Couldn't boot Block %s for user %s",
				      bg_record->bg_block_id,
				      bg_record->target_name);

				slurm_mutex_unlock(&block_state_mutex);
				requeue_and_error(bg_record, reason);
				slurm_mutex_lock(&block_state_mutex);

				bg_record->boot_state = 0;
				bg_record->boot_count = 0;
				if (remove_from_bg_list(
					    bg_lists->job_running, bg_record)
				    == SLURM_SUCCESS)
					num_unused_cpus += bg_record->cpu_cnt;

				remove_from_bg_list(bg_lists->booted,
						    bg_record);
			}
			break;
		case BG_BLOCK_INITED:
			debug("block %s is ready.",
			      bg_record->bg_block_id);
			if (bg_record->job_ptr) {
				bg_record->job_ptr->job_state &=
					(~JOB_CONFIGURING);
				last_job_update = time(NULL);
			}
			/* boot flags are reset here */
			if (kill_job_list &&
			    set_block_user(bg_record) == SLURM_ERROR) {
				freeit = (kill_job_struct_t *)
					xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = bg_record->job_running;
				list_push(kill_job_list, freeit);
			}
			break;
		case BG_BLOCK_TERM:
			debug2("Block %s is in a deallocating state "
			       "during a boot.  Doing nothing until "
			       "free state.",
			       bg_record->bg_block_id);
			break;
		case BG_BLOCK_REBOOTING:
			debug2("Block %s is rebooting.",
			       bg_record->bg_block_id);
			break;
		default:
			debug("Hey the state of block "
			      "%s is %d(%s) doing nothing.",
			      bg_record->bg_block_id,
			      real_state,
			      bg_block_state_string(bg_record->state));
			break;
		}
	}

	return updated;
}
Exemplo n.º 6
0
extern int select_nodeinfo_set_all(void)
{
	ListIterator itr = NULL;
	struct node_record *node_ptr = NULL;
	int i=0;
	bg_record_t *bg_record = NULL;
	static time_t last_set_all = 0;
	ba_mp_t *ba_mp;
	node_subgrp_t *subgrp = NULL;
	int bit_count;

	//uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (!blocks_are_created)
		return SLURM_NO_CHANGE_IN_DATA;

	if (!g_bitmap_size) {
		/* if (cluster_flags & CLUSTER_FLAG_BGQ) */
		/* 	g_bitmap_size = bg_conf->mp_cnode_cnt; */
		/* else */
			g_bitmap_size = bg_conf->ionodes_per_mp;
	}

	/* only set this once when the last_bg_update is newer than
	   the last time we set things up. */
	if (last_set_all && (last_bg_update-1 < last_set_all)) {
		debug2("Node select info for set all hasn't "
		       "changed since %ld",
		       last_set_all);
		return SLURM_NO_CHANGE_IN_DATA;
	}
	last_set_all = last_bg_update;

	/* set this here so we know things have changed */
	last_node_update = time(NULL);

	slurm_mutex_lock(&block_state_mutex);
	for (i=0; i<node_record_count; i++) {
		select_nodeinfo_t *nodeinfo;

		node_ptr = &(node_record_table_ptr[i]);
		xassert(node_ptr->select_nodeinfo);
		nodeinfo = node_ptr->select_nodeinfo->data;
		xassert(nodeinfo);
		xassert(nodeinfo->subgrp_list);
		list_flush(nodeinfo->subgrp_list);
		if (nodeinfo->bitmap_size != g_bitmap_size)
			nodeinfo->bitmap_size = g_bitmap_size;
	}

	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		enum node_states state = NODE_STATE_UNKNOWN;
		select_nodeinfo_t *nodeinfo;
		bitstr_t *bitmap;
		ListIterator itr2 = NULL;

		/* Only mark unidle blocks */
		if (bg_record->job_list && list_count(bg_record->job_list)) {
			struct job_record *job_ptr;
			select_jobinfo_t *jobinfo;
			ListIterator itr =
				list_iterator_create(bg_record->job_list);
			ba_mp = list_peek(bg_record->ba_mp_list);
			node_ptr = &(node_record_table_ptr[ba_mp->index]);
			xassert(node_ptr->select_nodeinfo);
			nodeinfo = node_ptr->select_nodeinfo->data;
			xassert(nodeinfo);
			xassert(nodeinfo->subgrp_list);
			if (ba_mp->cnode_err_bitmap
			    && (bit_count =
				bit_set_count(ba_mp->cnode_err_bitmap))) {
				subgrp = _find_subgrp(nodeinfo->subgrp_list,
						      NODE_STATE_ERROR,
						      g_bitmap_size);
				/* FIXME: the subgrp->bitmap isn't set here. */
				subgrp->cnode_cnt += bit_count;
			}

			subgrp = _find_subgrp(nodeinfo->subgrp_list,
					      NODE_STATE_ALLOCATED,
					      g_bitmap_size);
			while ((job_ptr = list_next(itr))) {
				jobinfo = job_ptr->select_jobinfo->data;
				/* FIXME: the subgrp->bitmap isn't set here. */
				subgrp->cnode_cnt += jobinfo->cnode_cnt;
			}
			list_iterator_destroy(itr);
			continue;
		} else if (bg_record->job_running == NO_JOB_RUNNING)
			continue;

		if (bg_record->state & BG_BLOCK_ERROR_FLAG)
			state = NODE_STATE_ERROR;
		else if (bg_record->job_running > NO_JOB_RUNNING) {
			/* we don't need to set the allocated here
			 * since the whole midplane is allocated */
			if (bg_record->conn_type[0] < SELECT_SMALL)
				continue;
			state = NODE_STATE_ALLOCATED;
		} else {
			error("not sure why we got here with block %s %s",
			      bg_record->bg_block_id,
			      bg_block_state_string(bg_record->state));
			continue;
		}
		/* if ((cluster_flags & CLUSTER_FLAG_BGQ) */
		/*     && (state != NODE_STATE_ERROR)) */
		/* 	bitmap = bg_record->cnodes_used_bitmap; */
		/* else */
		bitmap = bg_record->ionode_bitmap;

		itr2 = list_iterator_create(bg_record->ba_mp_list);
		while ((ba_mp = list_next(itr2))) {
			if (!ba_mp->used)
				continue;

			node_ptr = &(node_record_table_ptr[ba_mp->index]);

			xassert(node_ptr->select_nodeinfo);
			nodeinfo = node_ptr->select_nodeinfo->data;
			xassert(nodeinfo);
			xassert(nodeinfo->subgrp_list);

			if (ba_mp->cnode_err_bitmap
			    && (state == NODE_STATE_ALLOCATED)
			    && (bit_count =
				bit_set_count(ba_mp->cnode_err_bitmap))) {
				subgrp = _find_subgrp(nodeinfo->subgrp_list,
						      NODE_STATE_ERROR,
						      g_bitmap_size);
				/* FIXME: the subgrp->bitmap isn't set here. */
				subgrp->cnode_cnt += bit_count;
			}

			subgrp = _find_subgrp(nodeinfo->subgrp_list,
					      state, g_bitmap_size);

			if (subgrp->cnode_cnt < bg_conf->mp_cnode_cnt) {
				/* if (cluster_flags & CLUSTER_FLAG_BGQ) { */
				/* 	bit_or(subgrp->bitmap, bitmap); */
				/* 	subgrp->cnode_cnt += */
				/* 		bit_set_count(bitmap); */
				/* } else */ if (bg_record->cnode_cnt
					   < bg_conf->mp_cnode_cnt) {
					bit_or(subgrp->bitmap, bitmap);
					subgrp->cnode_cnt +=
						bg_record->cnode_cnt;
				} else {
					bit_nset(subgrp->bitmap,
						 0,
						 (g_bitmap_size-1));
					subgrp->cnode_cnt =
						bg_conf->mp_cnode_cnt;
				}
			}
		}
		list_iterator_destroy(itr2);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	return SLURM_SUCCESS;
}
Exemplo n.º 7
0
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int count = 0;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	while (count < MAX_FREE_RETRIES) {
		/* block was removed */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}
		/* Reset these here so we don't try to reboot it
		   when the state goes to free.
		*/
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		/* Here we don't need to check if the block is still
		 * in exsistance since this function can't be called on
		 * the same block twice.  It may
		 * had already been removed at this point also.
		 */
#ifdef HAVE_BG_FILES
		if (bg_record->state != BG_BLOCK_FREE
		    && bg_record->state != BG_BLOCK_TERM) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);
			rc = bridge_block_free(bg_record);
			if (rc != SLURM_SUCCESS) {
				if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/* If the state is error and
					   we get an incompatible
					   state back here, it means
					   we set it ourselves so
					   break out.
					*/
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if (bg_record->state != BG_BLOCK_FREE
					    && bg_record->state
					    != BG_BLOCK_TERM)
					bg_record->state = BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* Fake a free since we are n deallocating
		   state before this.
		*/
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* This will set the state to ERROR(Free)
			 * just incase the state was ERROR(SOMETHING ELSE) */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		} else if (!wait || (count >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		if (!wait || (bg_record->state == BG_BLOCK_FREE)
#ifndef HAVE_BGL
		    ||  (bg_record->state & BG_BLOCK_ERROR_FLAG)
#endif
			) {
			break;
		}
		/* If we were locked outside of this we need to unlock
		   to not cause deadlock on this mutex until we are
		   done.
		*/
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		count++;
		slurm_mutex_lock(&block_state_mutex);
	}

	rc = SLURM_SUCCESS;
	if ((bg_record->state == BG_BLOCK_FREE)
	    || (bg_record->state & BG_BLOCK_ERROR_FLAG))
		remove_from_bg_list(bg_lists->booted, bg_record);
	else if (count >= MAX_FREE_RETRIES) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}
	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
static int _print_text_part(partition_info_t *part_ptr,
			    db2_block_info_t *db2_info_ptr)
{
	int printed = 0;
	int tempxcord;
	int prefixlen;
	int i = 0;
	int width = 0;
	char *nodes = NULL, time_buf[20], *conn_str = NULL;
	char tmp_cnt[8];
	char tmp_char[8];

	if (params.cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->total_nodes, tmp_cnt,
				 sizeof(tmp_cnt), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
	else
		snprintf(tmp_cnt, sizeof(tmp_cnt), "%u", part_ptr->total_nodes);

	if (!params.commandline) {
		mvwprintw(text_win,
			  main_ycord,
			  main_xcord, "%c",
			  part_ptr->flags);
		main_xcord += 4;

		if (part_ptr->name) {
			mvwprintw(text_win,
				  main_ycord,
				  main_xcord, "%.9s",
				  part_ptr->name);
			main_xcord += 10;
			if (params.display != BGPART) {
				char *tmp_state;
				if (part_ptr->state_up == PARTITION_INACTIVE)
					tmp_state = "inact";
				else if (part_ptr->state_up == PARTITION_UP)
					tmp_state = "up";
				else if (part_ptr->state_up == PARTITION_DOWN)
					tmp_state = "down";
				else if (part_ptr->state_up == PARTITION_DRAIN)
					tmp_state = "drain";
				else
					tmp_state = "unk";
				mvwprintw(text_win, main_ycord, main_xcord,
					  tmp_state);
				main_xcord += 7;

				if (part_ptr->max_time == INFINITE)
					snprintf(time_buf, sizeof(time_buf),
						 "infinite");
				else {
					secs2time_str((part_ptr->max_time
						       * 60),
						      time_buf,
						      sizeof(time_buf));
				}

				width = strlen(time_buf);
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord + (9 - width),
					  "%s",
					  time_buf);
				main_xcord += 11;
			}
		} else
			main_xcord += 10;

		if (params.display == BGPART) {
			if (db2_info_ptr) {
				char *job_running = _set_running_job_str(
					db2_info_ptr->job_list, 1);
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "%.16s",
					  db2_info_ptr->bg_block_name);
				main_xcord += 18;
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "%.7s",
					  bg_block_state_string(
						  db2_info_ptr->state));
				main_xcord += 8;

				snprintf(tmp_char, sizeof(tmp_char),
					 "%s", job_running);
				xfree(job_running);

				mvwprintw(text_win,
					  main_ycord,
					  main_xcord,
					  "%.8s", tmp_char);
				main_xcord += 8;

				conn_str = conn_type_string_full(
					db2_info_ptr->bg_conn_type);
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "%.7s",
					  conn_str);
				xfree(conn_str);
				main_xcord += 8;

			} else {
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "?");
				main_xcord += 18;
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "?");
				main_xcord += 8;
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "?");
				main_xcord += 8;
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "?");
				main_xcord += 9;
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "?");
				main_xcord += 7;
				mvwprintw(text_win,
					  main_ycord,
					  main_xcord, "?");
				main_xcord += 10;
			}
		}
		mvwprintw(text_win,
			  main_ycord,
			  main_xcord, "%5s", tmp_cnt);

		main_xcord += 7;

		tempxcord = main_xcord;

		if (params.display == BGPART)
			nodes = part_ptr->allow_groups;
		else
			nodes = part_ptr->nodes;
		i = 0;
		prefixlen = i;
		while (nodes && nodes[i]) {
			width = getmaxx(text_win) - 1 - main_xcord;

			if (!prefixlen && (nodes[i] == '[') &&
			    (nodes[i - 1] == ','))
				prefixlen = i + 1;

			if (nodes[i - 1] == ',' && (width - 12) <= 0) {
				main_ycord++;
				main_xcord = tempxcord + prefixlen;
			} else if (main_xcord >= getmaxx(text_win)) {
				main_ycord++;
				main_xcord = tempxcord + prefixlen;
			}

			if ((printed = mvwaddch(text_win,
						main_ycord,
						main_xcord,
						nodes[i])) < 0)
				return printed;
			main_xcord++;

			i++;
		}
		if ((params.display == BGPART) && db2_info_ptr &&
		    (db2_info_ptr->ionode_str)) {
			mvwprintw(text_win,
				  main_ycord,
				  main_xcord, "[%s]",
				  db2_info_ptr->ionode_str);
		}

		main_xcord = 1;
		main_ycord++;
	} else {
		if (part_ptr->name) {
			printf("%9.9s ", part_ptr->name);

			if (params.display != BGPART) {
				if (part_ptr->state_up == PARTITION_INACTIVE)
					printf(" inact ");
				else if (part_ptr->state_up == PARTITION_UP)
					printf("   up ");
				else if (part_ptr->state_up == PARTITION_DOWN)
					printf(" down ");
				else if (part_ptr->state_up == PARTITION_DRAIN)
					printf(" drain ");
				else
					printf(" unk ");

				if (part_ptr->max_time == INFINITE)
					snprintf(time_buf, sizeof(time_buf),
						 "infinite");
				else {
					secs2time_str((part_ptr->max_time
						       * 60),
						      time_buf,
						      sizeof(time_buf));
				}

				printf("%9.9s ", time_buf);
			}
		}

		if (params.display == BGPART) {
			if (db2_info_ptr) {
				char *job_running = _set_running_job_str(
					db2_info_ptr->job_list, 1);
				printf("%16.16s ",
				       db2_info_ptr->bg_block_name);
				printf("%-7.7s ",
				       bg_block_state_string(
					       db2_info_ptr->state));

				printf("%8.8s ", job_running);
				xfree(job_running);

				conn_str = conn_type_string_full(
					db2_info_ptr->bg_conn_type);
				printf("%8.8s ", conn_str);
				xfree(conn_str);
			}
		}

		printf("%5s ", tmp_cnt);

		if (params.display == BGPART)
			nodes = part_ptr->allow_groups;
		else
			nodes = part_ptr->nodes;

		if ((params.display == BGPART) && db2_info_ptr &&
		    (db2_info_ptr->ionode_str)) {
			printf("%s[%s]\n", nodes, db2_info_ptr->ionode_str);
		} else
			printf("%s\n",nodes);
	}
	return printed;
}
Exemplo n.º 9
0
/*
 * slurm_sprint_block_info - output information about a specific Bluegene
 *	block based upon message as loaded using slurm_load_block
 * IN block_ptr - an individual partition information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
char *slurm_sprint_block_info(
	block_info_t * block_ptr, int one_liner)
{
	int j;
	char tmp1[16], tmp2[16], *tmp_char = NULL;
	char *out = NULL;
	char *line_end = "\n   ";
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (one_liner)
		line_end = " ";

	/****** Line 1 ******/
	convert_num_unit((float)block_ptr->cnode_cnt, tmp1, sizeof(tmp1),
			 UNIT_NONE, NO_VAL, CONVERT_NUM_UNIT_EXACT);
	if (cluster_flags & CLUSTER_FLAG_BGQ) {
		convert_num_unit((float)block_ptr->cnode_err_cnt, tmp2,
				 sizeof(tmp2), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		tmp_char = xstrdup_printf("%s/%s", tmp1, tmp2);
	} else
		tmp_char = tmp1;

	out = xstrdup_printf("BlockName=%s TotalNodes=%s State=%s%s",
			     block_ptr->bg_block_id, tmp_char,
			     bg_block_state_string(block_ptr->state),
			     line_end);
	if (cluster_flags & CLUSTER_FLAG_BGQ)
		xfree(tmp_char);
	/****** Line 2 ******/
	j = 0;
	if (block_ptr->job_list)
		j = list_count(block_ptr->job_list);

	if (!j)
		xstrcat(out, "JobRunning=NONE ");
	else if (j == 1) {
		block_job_info_t *block_job = list_peek(block_ptr->job_list);
		xstrfmtcat(out, "JobRunning=%u ", block_job->job_id);
	} else
		xstrcat(out, "JobRunning=Multiple ");

	tmp_char = conn_type_string_full(block_ptr->conn_type);
	xstrfmtcat(out, "ConnType=%s", tmp_char);
	xfree(tmp_char);
	if (cluster_flags & CLUSTER_FLAG_BGL)
		xstrfmtcat(out, " NodeUse=%s",
			   node_use_string(block_ptr->node_use));

	xstrcat(out, line_end);

	/****** Line 3 ******/
	if (block_ptr->ionode_str)
		xstrfmtcat(out, "MidPlanes=%s[%s] MPIndices=",
			   block_ptr->mp_str, block_ptr->ionode_str);
	else
		xstrfmtcat(out, "MidPlanes=%s MPIndices=",
			   block_ptr->mp_str);
	for (j = 0;
	     (block_ptr->mp_inx && (block_ptr->mp_inx[j] != -1));
	     j+=2) {
		if (j > 0)
			xstrcat(out, ",");
		xstrfmtcat(out, "%d-%d", block_ptr->mp_inx[j],
			   block_ptr->mp_inx[j+1]);
	}
	xstrcat(out, line_end);

	/****** Line 4 ******/
	xstrfmtcat(out, "MloaderImage=%s%s",
		   block_ptr->mloaderimage, line_end);

	if (cluster_flags & CLUSTER_FLAG_BGL) {
		/****** Line 5 ******/
		xstrfmtcat(out, "BlrtsImage=%s%s", block_ptr->blrtsimage,
			   line_end);
		/****** Line 6 ******/
		xstrfmtcat(out, "LinuxImage=%s%s", block_ptr->linuximage,
			   line_end);
		/****** Line 7 ******/
		xstrfmtcat(out, "RamdiskImage=%s", block_ptr->ramdiskimage);
	} else if (cluster_flags & CLUSTER_FLAG_BGP) {
		/****** Line 5 ******/
		xstrfmtcat(out, "CnloadImage=%s%s", block_ptr->linuximage,
			   line_end);
		/****** Line 6 ******/
		xstrfmtcat(out, "IoloadImage=%s", block_ptr->ramdiskimage);
	}

	if (block_ptr->reason)
		xstrfmtcat(out, "Reason=%s%s",
			   block_ptr->reason, line_end);

	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
Exemplo n.º 10
0
/*
 * create_dynamic_block - create new block(s) to be used for a new
 * job allocation.
 * RET - a list of created block(s) or NULL on failure errno is set.
 */
extern List create_dynamic_block(List block_list,
				 select_ba_request_t *request,
				 List my_block_list,
				 bool track_down_nodes)
{
	int rc = SLURM_SUCCESS;

	ListIterator itr, itr2;
	bg_record_t *bg_record = NULL, *found_record = NULL;
	List results = NULL;
	List new_blocks = NULL;
	bitstr_t *my_bitmap = NULL;
	select_ba_request_t blockreq;
	int cnodes = request->procs / bg_conf->cpu_ratio;
	uint16_t start_geo[SYSTEM_DIMENSIONS];

	if (cnodes < bg_conf->smallest_block) {
		error("Can't create this size %d "
		      "on this system ionodes_per_mp is %d",
		      request->procs,
		      bg_conf->ionodes_per_mp);
		goto finished;
	}
	memset(&blockreq, 0, sizeof(select_ba_request_t));
	memcpy(start_geo, request->geometry, sizeof(start_geo));

	/* We need to lock this just incase a blocks_overlap is called
	   which will in turn reset and set the system as it sees fit.
	*/
	slurm_mutex_lock(&block_state_mutex);
	if (my_block_list) {
		reset_ba_system(track_down_nodes);
		itr = list_iterator_create(my_block_list);
		while ((bg_record = list_next(itr))) {
			if (bg_record->magic != BLOCK_MAGIC) {
				/* This should never happen since we
				   only call this on copies of blocks
				   and we check on this during the
				   copy.
				*/
				error("create_dynamic_block: "
				      "got a block with bad magic?");
				continue;
			}
			if (bg_record->free_cnt) {
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK) {
					int dim;
					char start_geo[SYSTEM_DIMENSIONS+1];
					char geo[SYSTEM_DIMENSIONS+1];
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++) {
						start_geo[dim] = alpha_num[
							bg_record->start[dim]];
						geo[dim] = alpha_num[
							bg_record->geo[dim]];
					}
					start_geo[dim] = '\0';
					geo[dim] = '\0';
					info("not adding %s(%s) %s %s %s %u "
					     "(free_cnt)",
					     bg_record->bg_block_id,
					     bg_record->mp_str,
					     bg_block_state_string(
						     bg_record->state),
					     start_geo,
					     geo,
					     bg_record->cnode_cnt);
				}
				continue;
			}

			if (!my_bitmap) {
				my_bitmap =
					bit_alloc(bit_size(bg_record->bitmap));
			}

			if (!bit_super_set(bg_record->bitmap, my_bitmap)) {
				bit_or(my_bitmap, bg_record->bitmap);

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK) {
					int dim;
					char start_geo[SYSTEM_DIMENSIONS+1];
					char geo[SYSTEM_DIMENSIONS+1];
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++) {
						start_geo[dim] = alpha_num[
							bg_record->start[dim]];
						geo[dim] = alpha_num[
							bg_record->geo[dim]];
					}
					start_geo[dim] = '\0';
					geo[dim] = '\0';
					info("adding %s(%s) %s %s %s %u",
					     bg_record->bg_block_id,
					     bg_record->mp_str,
					     bg_block_state_string(
						     bg_record->state),
					     start_geo, geo,
					     bg_record->cnode_cnt);
				}
				if (check_and_set_mp_list(
					    bg_record->ba_mp_list)
				    == SLURM_ERROR) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("something happened in "
						     "the load of %s",
						     bg_record->bg_block_id);
					list_iterator_destroy(itr);
					FREE_NULL_BITMAP(my_bitmap);
					rc = SLURM_ERROR;
					goto finished;
				}
			} else {
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK) {
					int dim;
					char start_geo[SYSTEM_DIMENSIONS+1];
					char geo[SYSTEM_DIMENSIONS+1];
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++) {
						start_geo[dim] = alpha_num[
							bg_record->start[dim]];
						geo[dim] = alpha_num[
							bg_record->geo[dim]];
					}
					start_geo[dim] = '\0';
					geo[dim] = '\0';
					info("not adding %s(%s) %s %s %s %u ",
					     bg_record->bg_block_id,
					     bg_record->mp_str,
					     bg_block_state_string(
						     bg_record->state),
					     start_geo,
					     geo,
					     bg_record->cnode_cnt);
				}
				/* just so we don't look at it later */
				bg_record->free_cnt = -1;
			}
		}
		list_iterator_destroy(itr);
		FREE_NULL_BITMAP(my_bitmap);
	} else {
		reset_ba_system(false);
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("No list was given");
	}

	if (request->avail_mp_bitmap)
		ba_set_removable_mps(request->avail_mp_bitmap, 1);

	if (request->size==1 && cnodes < bg_conf->mp_cnode_cnt) {
		switch(cnodes) {
#ifdef HAVE_BGL
		case 32:
			blockreq.small32 = 4;
			blockreq.small128 = 3;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
#else
		case 16:
			blockreq.small16 = 2;
			blockreq.small32 = 1;
			blockreq.small64 = 1;
			blockreq.small128 = 1;
			blockreq.small256 = 1;
			break;
		case 32:
			blockreq.small32 = 2;
			blockreq.small64 = 1;
			blockreq.small128 = 1;
			blockreq.small256 = 1;
			break;
		case 64:
			blockreq.small64 = 2;
			blockreq.small128 = 1;
			blockreq.small256 = 1;
			break;
		case 128:
			blockreq.small128 = 2;
			blockreq.small256 = 1;
			break;
		case 256:
			blockreq.small256 = 2;
			break;
#endif
		default:
			error("This size %d is unknown on this system", cnodes);
			goto finished;
			break;
		}

		/* Sort the list so the small blocks are in the order
		 * of ionodes. */
		list_sort(block_list, (ListCmpF)bg_record_cmpf_inc);
		request->conn_type[0] = SELECT_SMALL;
		new_blocks = list_create(destroy_bg_record);
		/* check only blocks that are free and small */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    true, true)
		    == SLURM_SUCCESS)
			goto finished;

		/* check only blocks that are free and any size */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    true, false)
		    == SLURM_SUCCESS)
			goto finished;

		/* check usable blocks that are small with any state */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    false, true)
		    == SLURM_SUCCESS)
			goto finished;

		/* check all usable blocks */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    false, false)
		    == SLURM_SUCCESS)
			goto finished;

		/* Re-sort the list back to the original order. */
		list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc);
		list_destroy(new_blocks);
		new_blocks = NULL;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("small block not able to be placed inside others");
	}

	if (request->conn_type[0] == SELECT_NAV)
		request->conn_type[0] = SELECT_TORUS;

	//debug("going to create %d", request->size);
	if (!new_ba_request(request)) {
		if (request->geometry[0] != (uint16_t)NO_VAL) {
			char *geo = give_geo(request->geometry);
			error("Problems with request for size %d geo %s",
			      request->size, geo);
			xfree(geo);
		} else {
			error("Problems with request for size %d.  "
			      "No geo given.",
			      request->size);
		}
		rc = ESLURM_INTERCONNECT_FAILURE;
		goto finished;
	}

	/* try on free midplanes */
	rc = SLURM_SUCCESS;
	if (results)
		list_flush(results);
	else {
#ifdef HAVE_BGQ
		results = list_create(destroy_ba_mp);
#else
		results = list_create(NULL);
#endif
	}

	rc = allocate_block(request, results);
	/* This could be changed in allocate_block so set it back up */
	memcpy(request->geometry, start_geo, sizeof(start_geo));

	if (rc) {
		rc = SLURM_SUCCESS;
		goto setup_records;
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("allocate failure for size %d base "
		     "partitions of free midplanes",
		     request->size);
	rc = SLURM_ERROR;

	if (!list_count(my_block_list) || !my_block_list)
		goto finished;

	/*Try to put block starting in the smallest of the exisiting blocks*/
	itr = list_iterator_create(my_block_list);
	itr2 = list_iterator_create(my_block_list);
	while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
		bool is_small = 0;
		/* never check a block with a job running */
		if (bg_record->free_cnt
		    || bg_record->job_running != NO_JOB_RUNNING)
			continue;

		/* Here we are only looking for the first
		   block on the midplane.  So either the count
		   is greater or equal than
		   bg_conf->mp_cnode_cnt or the first bit is
		   set in the ionode_bitmap.
		*/
		if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) {
			bool found = 0;
			if (bit_ffs(bg_record->ionode_bitmap) != 0)
				continue;
			/* Check to see if we have other blocks in
			   this midplane that have jobs running.
			*/
			while ((found_record = list_next(itr2))) {
				if (!found_record->free_cnt
				    && (found_record->job_running
					!= NO_JOB_RUNNING)
				    && bit_overlap(bg_record->bitmap,
						   found_record->bitmap)) {
					found = 1;
					break;
				}
			}
			list_iterator_reset(itr2);
			if (found)
				continue;
			is_small = 1;
		}

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("removing %s(%s) for request %d",
			     bg_record->bg_block_id,
			     bg_record->mp_str, request->size);

		remove_block(bg_record->ba_mp_list, is_small);
		rc = SLURM_SUCCESS;
		if (results)
			list_flush(results);
		else {
#ifdef HAVE_BGQ
			results = list_create(destroy_ba_mp);
#else
			results = list_create(NULL);
#endif
		}

		rc = allocate_block(request, results);
		/* This could be changed in allocate_block so set it back up */
		memcpy(request->geometry, start_geo, sizeof(start_geo));
		if (rc) {
			rc = SLURM_SUCCESS;
			break;
		}

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("allocate failure for size %d base partitions",
			     request->size);
		rc = SLURM_ERROR;
	}
	list_iterator_destroy(itr);
	list_iterator_destroy(itr2);

setup_records:
	if (rc == SLURM_SUCCESS) {
		/*set up bg_record(s) here */
		new_blocks = list_create(destroy_bg_record);

		blockreq.save_name = request->save_name;
#ifdef HAVE_BGL
		blockreq.blrtsimage = request->blrtsimage;
#endif
		blockreq.linuximage = request->linuximage;
		blockreq.mloaderimage = request->mloaderimage;
		blockreq.ramdiskimage = request->ramdiskimage;
		memcpy(blockreq.conn_type, request->conn_type,
		       sizeof(blockreq.conn_type));

		add_bg_record(new_blocks, &results, &blockreq, 0, 0);
	}

finished:
	if (request->avail_mp_bitmap
	    && (bit_ffc(request->avail_mp_bitmap) == -1))
		ba_reset_all_removed_mps();
	slurm_mutex_unlock(&block_state_mutex);

	/* reset the ones we mucked with */
	itr = list_iterator_create(my_block_list);
	while ((bg_record = (bg_record_t *) list_next(itr))) {
		if (bg_record->free_cnt == -1)
			bg_record->free_cnt = 0;
	}
	list_iterator_destroy(itr);


	xfree(request->save_name);

	if (results)
		list_destroy(results);
	errno = rc;
	return new_blocks;
}
Exemplo n.º 11
0
/* block_state_mutex should be locked before calling this */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int rc = SLURM_SUCCESS;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	} else if (bg_record->modifying) {
		info("%d others are modifing this block %s",
		     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	} else if (bg_record->free_cnt) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	if (!(bg_record->state & BG_BLOCK_ERROR_FLAG)
	    && (bg_record->state != BG_BLOCK_FREE)) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		return SLURM_SUCCESS;
	}

	/* A bit of a sanity check to make sure blocks are being
	   removed out of all the lists.
	*/
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS)
		num_unused_cpus += bg_record->cpu_cnt;

	if (restore)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	rc = bridge_block_remove(bg_record);
	if (rc != SLURM_SUCCESS) {
		if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
			debug("_post_block_free: block %s is not found",
			      bg_record->bg_block_id);
		} else {
			error("_post_block_free: "
			      "bridge_block_remove(%s): %s",
			      bg_record->bg_block_id,
			      bg_err_str(rc));
		}
	} else if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: done %s(%p)",
		     bg_record->bg_block_id, bg_record);

	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
Exemplo n.º 12
0
extern int bg_status_update_block_state(bg_record_t *bg_record,
					uint16_t state,
					List kill_job_list)
{
	bool skipped_dealloc = false;
	kill_job_struct_t *freeit = NULL;
	int updated = 0;
	uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

	if (real_state == state)
		return 0;

	debug("state of Block %s was %s and now is %s",
	      bg_record->bg_block_id,
	      bg_block_state_string(bg_record->state),
	      bg_block_state_string(state));

	/*
	  check to make sure block went
	  through freeing correctly
	*/
	if ((real_state != BG_BLOCK_TERM
	     && !(bg_record->state & BG_BLOCK_ERROR_FLAG))
	    && state == BG_BLOCK_FREE)
		skipped_dealloc = 1;
	else if ((real_state == BG_BLOCK_INITED)
		 && (state == BG_BLOCK_BOOTING)) {
		/* This means the user did a reboot through
		   mpirun but we missed the state
		   change */
		debug("Block %s skipped rebooting, "
		      "but it really is.",
		      bg_record->bg_block_id);
	} else if ((real_state == BG_BLOCK_TERM)
		   && (state == BG_BLOCK_BOOTING))
		/* This is a funky state IBM says
		   isn't a bug, but all their
		   documentation says this doesn't
		   happen, but IBM says oh yeah, you
		   weren't really suppose to notice
		   that. So we will just skip this
		   state and act like this didn't happen. */
		goto nochange_state;
	real_state = state;
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		state |= BG_BLOCK_ERROR_FLAG;

	bg_record->state = state;

	if (real_state == BG_BLOCK_TERM || skipped_dealloc)
		_block_is_deallocating(bg_record, kill_job_list);
	else if (real_state == BG_BLOCK_BOOTING) {
		debug("Setting bootflag for %s", bg_record->bg_block_id);
		bg_record->boot_state = 1;
	} else if (real_state == BG_BLOCK_FREE) {
		/* Make sure block is cleaned up.  If there are
		 * running jobs on the block this happens when they
		 * are cleaned off. */
		if (bg_record->job_running == NO_JOB_RUNNING
		    && (!bg_record->job_list
			|| !list_count(bg_record->job_list)))
			bg_reset_block(bg_record, NULL);
		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (real_state & BG_BLOCK_ERROR_FLAG) {
		if (bg_record->boot_state)
			error("Block %s in an error state while booting.",
			      bg_record->bg_block_id);
		else
			error("Block %s in an error state.",
			      bg_record->bg_block_id);
		remove_from_bg_list(bg_lists->booted, bg_record);
		trigger_block_error();
	} else if (real_state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	}
	updated = 1;
	last_bg_update = time(NULL);
nochange_state:

	/* check the boot state */
	debug3("boot state for block %s is %d",
	       bg_record->bg_block_id, bg_record->boot_state);
	if (bg_record->boot_state) {
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* If we get an error on boot that
			 * means it is a transparent L3 error
			 * and should be trying to fix
			 * itself.  If this is the case we
			 * just hang out waiting for the state
			 * to go to free where we will try to
			 * boot again below.
			 */
			return updated;
		}

		switch (real_state) {
		case BG_BLOCK_BOOTING:
			if (bg_record->job_ptr
			    && !IS_JOB_CONFIGURING(bg_record->job_ptr)) {
				debug3("Setting job %u on block %s "
				       "to configuring",
				       bg_record->job_ptr->job_id,
				       bg_record->bg_block_id);
				bg_record->job_ptr->job_state |=
					JOB_CONFIGURING;
				last_job_update = time(NULL);
			} else if (bg_record->job_list
				   && list_count(bg_record->job_list)) {
				struct job_record *job_ptr;
				ListIterator job_itr =
					list_iterator_create(
						bg_record->job_list);
				while ((job_ptr = list_next(job_itr))) {
					if (job_ptr->magic != JOB_MAGIC) {
						error("bg_status_update_"
						      "block_state: 1 "
						      "bad magic found when "
						      "looking at block %s",
						      bg_record->bg_block_id);
						list_delete_item(job_itr);
						continue;
					}
					job_ptr->job_state |= JOB_CONFIGURING;
				}
				list_iterator_destroy(job_itr);
				last_job_update = time(NULL);
			}
			break;
		case BG_BLOCK_FREE:
			if (bg_record->boot_count < RETRY_BOOT_COUNT) {
				bridge_block_boot(bg_record);

				if (bg_record->magic == BLOCK_MAGIC) {
					debug("boot count for block %s is %d",
					      bg_record->bg_block_id,
					      bg_record->boot_count);
					bg_record->boot_count++;
				}
			} else {
				char *reason = (char *)
					"status_check: Boot fails ";

				error("Couldn't boot Block %s",
				      bg_record->bg_block_id);

				/* We can't push on the kill_job_list
				   here since we have to put this
				   block in an error and that means
				   the killing has to take place
				   before the erroring of the block.
				*/
				slurm_mutex_unlock(&block_state_mutex);
				unlock_slurmctld(job_read_lock);
				requeue_and_error(bg_record, reason);
				lock_slurmctld(job_read_lock);
				slurm_mutex_lock(&block_state_mutex);

				bg_record->boot_state = 0;
				bg_record->boot_count = 0;

				remove_from_bg_list(bg_lists->booted,
						    bg_record);
			}
			break;
		case BG_BLOCK_INITED:
			debug("block %s is ready.",
			      bg_record->bg_block_id);
			if (bg_record->job_ptr
			    && IS_JOB_CONFIGURING(bg_record->job_ptr)) {
				bg_record->job_ptr->job_state &=
					(~JOB_CONFIGURING);
				last_job_update = time(NULL);
			} else if (bg_record->job_list
				   && list_count(bg_record->job_list)) {
				struct job_record *job_ptr;
				ListIterator job_itr =
					list_iterator_create(
						bg_record->job_list);
				while ((job_ptr = list_next(job_itr))) {
					if (job_ptr->magic != JOB_MAGIC) {
						error("bg_status_update_"
						      "block_state: 2 "
						      "bad magic found when "
						      "looking at block %s",
						      bg_record->bg_block_id);
						list_delete_item(job_itr);
						continue;
					}
					job_ptr->job_state &=
						(~JOB_CONFIGURING);
				}
				list_iterator_destroy(job_itr);
				last_job_update = time(NULL);
			}

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (kill_job_list &&
			    bridge_block_sync_users(bg_record)
			    == SLURM_ERROR) {
				freeit = (kill_job_struct_t *)
					xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = bg_record->job_running;
				list_push(kill_job_list, freeit);
			}
			break;
		case BG_BLOCK_TERM:
			debug2("Block %s is in a deallocating state "
			       "during a boot.  Doing nothing until "
			       "free state.",
			       bg_record->bg_block_id);
			break;
		case BG_BLOCK_REBOOTING:
			debug2("Block %s is rebooting.",
			       bg_record->bg_block_id);
			break;
		default:
			debug("Hey the state of block "
			      "%s is %d(%s) doing nothing.",
			      bg_record->bg_block_id,
			      real_state,
			      bg_block_state_string(bg_record->state));
			break;
		}
	}

	return updated;
}
Exemplo n.º 13
0
static void _layout_block_record(GtkTreeView *treeview,
				 sview_block_info_t *block_ptr,
				 int update)
{
	char tmp_cnt[18], tmp_cnt2[18];
	char *tmp_char = NULL;
	GtkTreeIter iter;
	GtkTreeStore *treestore =
		GTK_TREE_STORE(gtk_tree_view_get_model(treeview));

	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_NODELIST),
				   block_ptr->mp_str);
	tmp_char = conn_type_string_full(block_ptr->bg_conn_type);
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_CONN),
				   tmp_char);
	xfree(tmp_char);

	if (cluster_flags & CLUSTER_FLAG_BGQ) {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEMLOADER),
					   block_ptr->imagemloader);
	} else if (cluster_flags & CLUSTER_FLAG_BGP) {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGELINUX),
					   block_ptr->imagelinux);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGERAMDISK),
					   block_ptr->imageramdisk);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEMLOADER),
					   block_ptr->imagemloader);
	} else if (cluster_flags & CLUSTER_FLAG_BGL) {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEBLRTS),
					   block_ptr->imageblrts);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGELINUX),
					   block_ptr->imagelinux);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEMLOADER),
					   block_ptr->imagemloader);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGERAMDISK),
					   block_ptr->imageramdisk);
	}

	tmp_char = _set_running_job_str(block_ptr->job_list, 0);

	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_JOB),
				   tmp_char);
	xfree(tmp_char);
	if (cluster_flags & CLUSTER_FLAG_BGL) {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_USE),
					   node_use_string(
						   block_ptr->bg_node_use));
	} convert_num_unit((float)block_ptr->cnode_cnt, tmp_cnt,
			   sizeof(tmp_cnt), UNIT_NONE, NO_VAL,
			   working_sview_config.convert_flags);
	if (cluster_flags & CLUSTER_FLAG_BGQ) {
		convert_num_unit((float)block_ptr->cnode_err_cnt, tmp_cnt2,
				 sizeof(tmp_cnt2), UNIT_NONE, NO_VAL,
				 working_sview_config.convert_flags);
		tmp_char = xstrdup_printf("%s/%s", tmp_cnt, tmp_cnt2);
	} else
		tmp_char = tmp_cnt;
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_NODE_CNT),
				   tmp_char);
	if (cluster_flags & CLUSTER_FLAG_BGQ)
		xfree(tmp_char);
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_PARTITION),
				   block_ptr->slurm_part_name);
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_STATE),
				   bg_block_state_string(block_ptr->state));
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_REASON),
				   block_ptr->reason);
}
Exemplo n.º 14
0
/*
 * slurm_sprint_block_info - output information about a specific Bluegene
 *	block based upon message as loaded using slurm_load_block
 * IN block_ptr - an individual partition information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
char *slurm_sprint_block_info(
	block_info_t * block_ptr, int one_liner)
{
	int j;
	char tmp1[16], *tmp_char = NULL;
	char *out = NULL;
	char *line_end = "\n   ";
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (one_liner)
		line_end = " ";

	/****** Line 1 ******/
	convert_num_unit((float)block_ptr->cnode_cnt, tmp1, sizeof(tmp1),
			 UNIT_NONE);

	out = xstrdup_printf("BlockName=%s TotalNodes=%s State=%s%s",
			     block_ptr->bg_block_id, tmp1,
			     bg_block_state_string(block_ptr->state),
			     line_end);

	/****** Line 2 ******/
	if (block_ptr->job_running > NO_JOB_RUNNING)
		xstrfmtcat(out, "JobRunning=%u ", block_ptr->job_running);
	else
		xstrcat(out, "JobRunning=NONE ");
	tmp_char = conn_type_string_full(block_ptr->conn_type);
	xstrfmtcat(out, "User=%s ConnType=%s",
		   block_ptr->owner_name, tmp_char);
	xfree(tmp_char);
	if(cluster_flags & CLUSTER_FLAG_BGL)
		xstrfmtcat(out, " NodeUse=%s",
			   node_use_string(block_ptr->node_use));

	xstrcat(out, line_end);

	/****** Line 3 ******/
	if(block_ptr->ionode_str)
		xstrfmtcat(out, "MidPlanes=%s[%s] MPIndices=",
			   block_ptr->mp_str, block_ptr->ionode_str);
	else
		xstrfmtcat(out, "MidPlanes=%s MPIndices=",
			   block_ptr->mp_str);
	for (j = 0;
	     (block_ptr->mp_inx && (block_ptr->mp_inx[j] != -1));
	     j+=2) {
		if (j > 0)
			xstrcat(out, ",");
		xstrfmtcat(out, "%d-%d", block_ptr->mp_inx[j],
			   block_ptr->mp_inx[j+1]);
	}
	xstrcat(out, line_end);

	/****** Line 4 ******/
	xstrfmtcat(out, "MloaderImage=%s%s",
		   block_ptr->mloaderimage, line_end);

	if (cluster_flags & CLUSTER_FLAG_BGL) {
		/****** Line 5 ******/
		xstrfmtcat(out, "BlrtsImage=%s%s", block_ptr->blrtsimage,
			   line_end);
		/****** Line 6 ******/
		xstrfmtcat(out, "LinuxImage=%s%s", block_ptr->linuximage,
			   line_end);
		/****** Line 7 ******/
		xstrfmtcat(out, "RamdiskImage=%s", block_ptr->ramdiskimage);
	} else if (cluster_flags & CLUSTER_FLAG_BGP) {
		/****** Line 5 ******/
		xstrfmtcat(out, "CnloadImage=%s%s", block_ptr->linuximage,
			   line_end);
		/****** Line 6 ******/
		xstrfmtcat(out, "IoloadImage=%s", block_ptr->ramdiskimage);
	}

	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
Exemplo n.º 15
0
extern int bg_free_block(bg_record_t *bg_record, bool wait, bool locked)
{
	int rc = SLURM_SUCCESS;
	int count = 0;

	if (!bg_record) {
		error("bg_free_block: there was no bg_record");
		return SLURM_ERROR;
	}

	if (!locked)
		slurm_mutex_lock(&block_state_mutex);

	while (count < MAX_FREE_RETRIES) {
		/* block was removed */
		if (bg_record->magic != BLOCK_MAGIC) {
			error("block was removed while freeing it here");
			xassert(0);
			if (!locked)
				slurm_mutex_unlock(&block_state_mutex);
			return SLURM_SUCCESS;
		}
		/* Reset these here so we don't try to reboot it
		   when the state goes to free.
		*/
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		/* Here we don't need to check if the block is still
		 * in exsistance since this function can't be called on
		 * the same block twice.  It may
		 * had already been removed at this point also.
		 */
#ifdef HAVE_BG_FILES
		if (bg_record->state != BG_BLOCK_FREE
		    && bg_record->state != BG_BLOCK_TERM) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
				info("bridge_destroy %s",
				     bg_record->bg_block_id);
			rc = bridge_block_free(bg_record);
			if (rc != SLURM_SUCCESS) {
				if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
					debug("block %s is not found",
					      bg_record->bg_block_id);
					bg_record->state = BG_BLOCK_FREE;
					break;
				} else if (rc == BG_ERROR_FREE) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
				} else if (rc == BG_ERROR_INVALID_STATE) {
#ifndef HAVE_BGL
					/* If the state is error and
					   we get an incompatible
					   state back here, it means
					   we set it ourselves so
					   break out.
					*/
					if (bg_record->state
					    & BG_BLOCK_ERROR_FLAG)
						break;
#endif
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_SELECT_TYPE)
						info("bridge_block_free"
						     "(%s): %s State = %s",
						     bg_record->bg_block_id,
						     bg_err_str(rc),
						     bg_block_state_string(
							     bg_record->state));
#ifdef HAVE_BGQ
					if (bg_record->state != BG_BLOCK_FREE
					    && bg_record->state
					    != BG_BLOCK_TERM)
					bg_record->state = BG_BLOCK_TERM;
#endif
				} else {
					error("bridge_block_free"
					      "(%s): %s State = %s",
					      bg_record->bg_block_id,
					      bg_err_str(rc),
					      bg_block_state_string(
						      bg_record->state));
				}
			}
		}
#else
		/* Fake a free since we are n deallocating
		   state before this.
		*/
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* This will set the state to ERROR(Free)
			 * just incase the state was ERROR(SOMETHING ELSE) */
			bg_record->state = BG_BLOCK_ERROR_FLAG;
			break;
		} else if (!wait || (count >= 3))
			bg_record->state = BG_BLOCK_FREE;
		else if (bg_record->state != BG_BLOCK_FREE)
			bg_record->state = BG_BLOCK_TERM;
#endif

		if (!wait || (bg_record->state == BG_BLOCK_FREE)
#ifndef HAVE_BGL
		    ||  (bg_record->state & BG_BLOCK_ERROR_FLAG)
#endif
			) {
			break;
		}
		/* If we were locked outside of this we need to unlock
		   to not cause deadlock on this mutex until we are
		   done.
		*/
		slurm_mutex_unlock(&block_state_mutex);
		sleep(FREE_SLEEP_INTERVAL);
		count++;
		slurm_mutex_lock(&block_state_mutex);
	}

	rc = SLURM_SUCCESS;
	if ((bg_record->state == BG_BLOCK_FREE)
	    || (bg_record->state & BG_BLOCK_ERROR_FLAG)) {

		if (bg_record->err_ratio
		    && (bg_record->state == BG_BLOCK_FREE)) {
			/* Sometime the realtime server can report
			   software error on cnodes even though the
			   block is free.  If this is the case we need
			   to manually clear them.
			*/
			ba_mp_t *found_ba_mp;
			ListIterator itr =
				list_iterator_create(bg_record->ba_mp_list);
			debug("Block %s is free, but has %u cnodes in error.  "
			      "This can happen if a large block goes into "
			      "error and then is freed and the state of "
			      "the block changes before the "
			      "database informs all the cnodes are back to "
			      "normal.  This is no big deal.",
			      bg_record->bg_block_id, bg_record->cnode_err_cnt);
			while ((found_ba_mp = list_next(itr))) {
				if (!found_ba_mp->used)
					continue;

				if (!found_ba_mp->cnode_err_bitmap)
					found_ba_mp->cnode_err_bitmap =
						bit_alloc(
							bg_conf->mp_cnode_cnt);

				bit_nclear(found_ba_mp->cnode_err_bitmap, 0,
					   bit_size(found_ba_mp->
						    cnode_err_bitmap)-1);
			}
			list_iterator_destroy(itr);
			bg_record->cnode_err_cnt = 0;
			bg_record->err_ratio = 0;
		}

		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (count >= MAX_FREE_RETRIES) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("bg_free_block: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		rc = SLURM_ERROR;
	}
	if (!locked)
		slurm_mutex_unlock(&block_state_mutex);

	return rc;
}
Exemplo n.º 16
0
/* block_state_mutex should be locked before calling this */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int rc = SLURM_SUCCESS;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;
	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	} else if (bg_record->modifying) {
		info("others are modifing this block %s, don't clear it up",
		     bg_record->bg_block_id);
		return SLURM_SUCCESS;
	} else if (bg_record->free_cnt) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	/* Even if the block is already in error state we need to do this to
	   avoid any overlapping blocks that may have been created due
	   to bad hardware.
	*/
	if ((bg_record->state & (~BG_BLOCK_ERROR_FLAG)) != BG_BLOCK_FREE) {
		/* Something isn't right, go mark this one in an error
		   state. */
		update_block_msg_t block_msg;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		slurm_init_update_block_msg(&block_msg);
		block_msg.bg_block_id = bg_record->bg_block_id;
		block_msg.state = BG_BLOCK_ERROR_FLAG;
		block_msg.reason = "Block would not deallocate";
		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&block_msg);
		slurm_mutex_lock(&block_state_mutex);
		if (block_ptr_exist_in_list(bg_lists->main, bg_record))
			bg_record->destroy = 0;
		return SLURM_SUCCESS;
	}

	/* The reason restore is used on the entire list is if this
	 * was for a bunch of small blocks.  If we record is marked to
	 * be destroyed and it is bigger than 1 midplane destroy it
	 * even if restore is true.
	 */
	 if (restore && bg_record->destroy && (bg_record->mp_count > 1))
		restore = false;

	/* If we are here we are done with the destroy so just reset it. */
	bg_record->destroy = 0;

	/* A bit of a sanity check to make sure blocks are being
	   removed out of all the lists.
	*/
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS) {
		debug2("_post_block_free: we are freeing block %s and "
		       "it was in the job_running list.  This can happen if a "
		       "block is removed while waiting for mmcs to finish "
		       "removing the job from the block.",
		       bg_record->bg_block_id);
		num_unused_cpus += bg_record->cpu_cnt;
	}

	/* If we don't have any mp_counts force block removal */
	if (restore && bg_record->mp_count)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);

	rc = bridge_block_remove(bg_record);
	if (rc != SLURM_SUCCESS) {
		if (rc == BG_ERROR_BLOCK_NOT_FOUND) {
			debug("_post_block_free: block %s is not found",
			      bg_record->bg_block_id);
		} else {
			error("_post_block_free: "
			      "bridge_block_remove(%s): %s",
			      bg_record->bg_block_id,
			      bg_err_str(rc));
		}
	} else if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: done %s(%p)",
		     bg_record->bg_block_id, bg_record);

	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
Exemplo n.º 17
0
static void _layout_block_record(GtkTreeView *treeview,
				 sview_block_info_t *block_ptr,
				 int update)
{
	char tmp_cnt[18];
	GtkTreeIter iter;
	GtkTreeStore *treestore =
		GTK_TREE_STORE(gtk_tree_view_get_model(treeview));

	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_NODELIST),
				   block_ptr->mp_str);

	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_CONN),
				   conn_type_string(
					   block_ptr->bg_conn_type));
	if (cluster_flags & CLUSTER_FLAG_BGL) {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEBLRTS),
					   block_ptr->imageblrts);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGELINUX),
					   block_ptr->imagelinux);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEMLOADER),
					   block_ptr->imagemloader);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGERAMDISK),
					   block_ptr->imageramdisk);
	} else {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGELINUX),
					   block_ptr->imagelinux);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGERAMDISK),
					   block_ptr->imageramdisk);
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_IMAGEMLOADER),
					   block_ptr->imagemloader);
	}

	if (block_ptr->job_running > NO_JOB_RUNNING)
		snprintf(tmp_cnt, sizeof(tmp_cnt),
			 "%d", block_ptr->job_running);
	else
		snprintf(tmp_cnt, sizeof(tmp_cnt), "-");

	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_JOB),
				   tmp_cnt);
	if (cluster_flags & CLUSTER_FLAG_BGL) {
		add_display_treestore_line(update, treestore, &iter,
					   find_col_name(display_data_block,
							 SORTID_USE),
					   node_use_string(
						   block_ptr->bg_node_use));
	}
	convert_num_unit((float)block_ptr->cnode_cnt, tmp_cnt, sizeof(tmp_cnt),
			 UNIT_NONE);
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_MP_STR),
				   tmp_cnt);

	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_PARTITION),
				   block_ptr->slurm_part_name);
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_STATE),
				   bg_block_state_string(block_ptr->state));
	add_display_treestore_line(update, treestore, &iter,
				   find_col_name(display_data_block,
						 SORTID_USER),
				   block_ptr->bg_user_name);
}