示例#1
0
文件: bg_core.c 项目: fafik23/slurm
/* block_state_mutex must be unlocked before calling this. */
extern void bg_requeue_job(uint32_t job_id, bool wait_for_start,
			   bool slurmctld_locked, uint32_t job_state,
			   bool preempted)
{
	int rc;
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	/* Wait for the slurmd to begin the batch script, slurm_fail_job()
	   is a no-op if issued prior to the script initiation do
	   clean up just incase the fail job isn't ran. */
	if (wait_for_start)
		sleep(2);

	if (!slurmctld_locked)
		lock_slurmctld(job_write_lock);
	rc = job_requeue(0, job_id, NULL, preempted, 0);
	if (rc == ESLURM_JOB_PENDING) {
		error("%s: Could not requeue pending job %u", __func__, job_id);
	} else if (rc != SLURM_SUCCESS) {
		error("%s: Could not requeue job %u, failing it: %s",
		      __func__, job_id, slurm_strerror(rc));
		job_fail(job_id, job_state);
	}
	if (!slurmctld_locked)
		unlock_slurmctld(job_write_lock);
}
示例#2
0
/* block_state_mutex must be unlocked before calling this. */
extern void bg_requeue_job(uint32_t job_id, bool wait_for_start)
{
	int rc;
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK };

	/* Wait for the slurmd to begin the batch script, slurm_fail_job()
	   is a no-op if issued prior to the script initiation do
	   clean up just incase the fail job isn't ran. */
	if (wait_for_start)
		sleep(2);

	lock_slurmctld(job_write_lock);
	if ((rc = job_requeue(0, job_id, -1, (uint16_t)NO_VAL, false))) {
		error("Couldn't requeue job %u, failing it: %s",
		      job_id, slurm_strerror(rc));
		job_fail(job_id);
	}
	unlock_slurmctld(job_write_lock);
}
示例#3
0
static int _check_for_booted_overlapping_blocks(
	List block_list, ListIterator bg_record_itr,
	bg_record_t *bg_record, int overlap_check, List overlapped_list,
	uint16_t query_mode)
{
	bg_record_t *found_record = NULL;
	ListIterator itr = NULL;
	int rc = 0;
	int overlap = 0;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* this test only is for actually picking a block not testing */
	if (is_test && bg_conf->layout_mode == LAYOUT_DYNAMIC)
		return rc;

	/* Make sure no other blocks are under this block
	   are booted and running jobs
	*/
	itr = list_iterator_create(block_list);
	while ((found_record = (bg_record_t*)list_next(itr)) != NULL) {
		if ((!found_record->bg_block_id)
		    || (bg_record == found_record)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("Don't need to look at myself %s %s",
				     bg_record->bg_block_id,
				     found_record->bg_block_id);
			continue;
		}

		slurm_mutex_lock(&block_state_mutex);
		overlap = blocks_overlap(bg_record, found_record);
		slurm_mutex_unlock(&block_state_mutex);

		if (overlap) {
			overlap = 0;
			/* make the available time on this block
			 * (bg_record) the max of this found_record's job
			 * or the one already set if in overlapped_block_list
			 * since we aren't setting job_running we
			 * don't have to remove them since the
			 * block_list should always be destroyed afterwards.
			 */
			if (is_test && overlapped_list
			    && found_record->job_ptr
			    && bg_record->job_running == NO_JOB_RUNNING) {
				ListIterator itr = list_iterator_create(
					overlapped_list);
				bg_record_t *tmp_rec = NULL;

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("found overlapping block %s "
					     "overlapped %s with job %u",
					     found_record->bg_block_id,
					     bg_record->bg_block_id,
					     found_record->job_ptr->job_id);

				while ((tmp_rec = list_next(itr))) {
					if (tmp_rec == bg_record)
						break;
				}
				list_iterator_destroy(itr);
				if (tmp_rec && tmp_rec->job_ptr->end_time
				    < found_record->job_ptr->end_time)
					tmp_rec->job_ptr =
						found_record->job_ptr;
				else if (!tmp_rec) {
					bg_record->job_ptr =
						found_record->job_ptr;
					list_append(overlapped_list,
						    bg_record);
				}
			}
			/* We already know this block doesn't work
			 * right now so we will if there is another
			 * overlapping block that ends later
			 */
			if (rc)
				continue;
			/* This test is here to check if the block we
			 * chose is not booted or if there is a block
			 * overlapping that we could avoid freeing if
			 * we choose something else
			 */
			if (bg_conf->layout_mode == LAYOUT_OVERLAP
			    && ((overlap_check == 0 && bg_record->state
				 != BG_BLOCK_INITED)
				|| (overlap_check == 1 && found_record->state
				    != BG_BLOCK_FREE))) {

				if (!is_test) {
					rc = 1;
					break;
				}
			}

			if (((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			     || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				  || SELECT_IS_MODE_RUN_NOW(query_mode))
				 && (bg_conf->layout_mode != LAYOUT_DYNAMIC)))
			    && ((found_record->job_running != NO_JOB_RUNNING)
				|| (found_record->state
				    & BG_BLOCK_ERROR_FLAG))) {
				if ((found_record->job_running
				     == BLOCK_ERROR_STATE)
				    || (found_record->state
					& BG_BLOCK_ERROR_FLAG))
					error("can't use %s, "
					      "overlapping block %s "
					      "is in an error state.",
					      bg_record->bg_block_id,
					      found_record->bg_block_id);
				else if (bg_conf->slurm_debug_flags
					 & DEBUG_FLAG_BG_PICK)
					info("can't use %s, there is "
					     "a job (%d) running on "
					     "an overlapping "
					     "block %s",
					     bg_record->bg_block_id,
					     found_record->job_running,
					     found_record->bg_block_id);

				if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
					List tmp_list = list_create(NULL);
					/* this will remove and
					 * destroy the memory for
					 * bg_record
					 */
					list_remove(bg_record_itr);
					slurm_mutex_lock(&block_state_mutex);

					if (bg_record->original) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This was a "
							     "copy %s",
							     bg_record->
							     bg_block_id);
						found_record =
							bg_record->original;
					} else {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("looking for "
							     "original");
						found_record =
							find_org_in_bg_list(
								bg_lists->main,
								bg_record);
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Removing unusable block "
						     "%s from the system.",
						     bg_record->bg_block_id);

					if (!found_record) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This record %s "
							     "wasn't found in "
							     "the "
							     "bg_lists->main, "
							     "no big deal, it "
							     "probably wasn't "
							     "added",
							     bg_record->
							     bg_block_id);
						found_record = bg_record;
					} else
						destroy_bg_record(bg_record);

					list_push(tmp_list, found_record);
					slurm_mutex_unlock(&block_state_mutex);

					/* We need to make sure if a
					   job is running here to not
					   call the regular method since
					   we are inside the job write
					   lock already.
					*/
					if (found_record->job_ptr
					    && !IS_JOB_FINISHED(
						    found_record->job_ptr)) {
						info("Somehow block %s "
						     "is being freed, but "
						     "appears to already have "
						     "a job %u(%u) running "
						     "on it.",
						     found_record->bg_block_id,
						     found_record->
						     job_ptr->job_id,
						     found_record->job_running);
						if (job_requeue(0,
								found_record->
								job_ptr->job_id,
								-1,
								(uint16_t)
								NO_VAL,
								false)) {
							error("Couldn't "
							      "requeue job %u, "
							      "failing it: %s",
							      found_record->
							      job_ptr->job_id,
							      slurm_strerror(
								      rc));
							job_fail(found_record->
								 job_ptr->
								 job_id);
						}
					}

					free_block_list(NO_VAL, tmp_list, 0, 0);
					list_destroy(tmp_list);
				}
				rc = 1;

				if (!is_test)
					break;
			}
		}
	}
	list_iterator_destroy(itr);

	return rc;
}
示例#4
0
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block usership
 * changes were queued
 */
extern int sync_jobs(List job_list)
{
	ListIterator itr;
	struct job_record  *job_ptr = NULL;
	List block_list = NULL;
	static bool run_already = false;
	bg_record_t *bg_record = NULL;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}
	slurm_mutex_lock(&block_state_mutex);
	/* Insure that all running jobs own the specified block */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		bg_action_t *bg_action_ptr = NULL;
		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		if (IS_JOB_COMPLETING(job_ptr))
			bg_action_ptr->op = TERM_OP;
		else
			bg_action_ptr->op = START_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
		} else if (!(bg_record = find_bg_record_in_list(
				     bg_lists->main,
				     bg_action_ptr->bg_block_id))) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
		}

		if (!bg_record) {
			/* Don't use slurm_fail_job, locks are already
			   in place.
			*/
			job_fail(job_ptr->job_id);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}
		/* _sync_agent will destroy the bg_action_ptr */
		_sync_agent(bg_action_ptr, bg_record);
	}
	list_iterator_destroy(itr);

	block_list = list_create(destroy_bg_record);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		bg_record_t *rm_record;
		if (bg_record->job_ptr
		    || (bg_record->job_list
			&& list_count(bg_record->job_list)))
			continue;
		rm_record = xmalloc(sizeof(bg_record_t));
		rm_record->magic = BLOCK_MAGIC;
		rm_record->bg_block_id = xstrdup(bg_record->bg_block_id);
		rm_record->mp_str = xstrdup(bg_record->mp_str);
		list_append(block_list, rm_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	/* Insure that all other blocks are free of users */
	if (block_list) {
		itr = list_iterator_create(block_list);
		while ((bg_record = list_next(itr))) {
			info("Queue clearing of users of BG block %s",
			     bg_record->bg_block_id);
			term_jobs_on_block(bg_record->bg_block_id);
		}
		list_iterator_destroy(itr);
		list_destroy(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
示例#5
0
/*
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_node, or slurm_fail_job so if slurmctld_locked is called we
 * will call the functions without locking the locks again.
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* this is here for sanity check to make sure we don't core on
	   these bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);

		}
		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on.
		*/
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record ||
		    (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node.
		*/
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){
			if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this nodecard.  "
		      "Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(mp_name, reason,
						  slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state
		*/
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		switch(smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap.  If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		switch(create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			if (!node_already_down(mp_name)) {
				char *reason = "select_bluegene: nodecard down";
				if (slurmctld_locked)
					drain_nodes(mp_name, reason,
						    slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
			break;
		default:
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}


	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively this is the only want to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	if (delete_list) {
		slurm_mutex_unlock(&block_state_mutex);
		free_block_list(NO_VAL, delete_list, 0, 0);
		list_destroy(delete_list);
	}
	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;

}