Esempio n. 1
0
/*
 * Build the list of jobs that would have to be preempted for bg_record
 * to be used.
 *
 * query_mode   - IN: select mode flags; tested with SELECT_IS_MODE_RUN_NOW().
 * bg_record    - IN: block the queried job wants to run on.
 * preempt_jobs - IN: jobs that are allowed to be preempted.
 * RET a new list (created with no delete function) holding the job
 *	pointers, taken from preempt_jobs, that run on blocks overlapping
 *	bg_record.  NULL is returned when, in run-now mode, an overlapping
 *	block runs a job that is not in preempt_jobs.
 *
 * block_state_mutex is held while bg_lists->main is walked.
 */
static List _get_preemptables(uint16_t query_mode, bg_record_t *bg_record,
			      List preempt_jobs)
{
	List victims;
	ListIterator cand_itr;
	ListIterator block_itr;
	bg_record_t *other;

	xassert(bg_record);
	xassert(preempt_jobs);

	victims = list_create(NULL);
	slurm_mutex_lock(&block_state_mutex);
	cand_itr = list_iterator_create(preempt_jobs);
	block_itr = list_iterator_create(bg_lists->main);

	while ((other = list_next(block_itr)) != NULL) {
		struct job_record *cand;

		/* Only named blocks that currently run a job matter. */
		if (!other->job_ptr || !other->bg_block_id)
			continue;
		/* Ignore ourselves and anything that does not overlap. */
		if ((other == bg_record) || !blocks_overlap(bg_record, other))
			continue;

		/* Look for the running job among the preemptable ones;
		 * cand ends up NULL when it is not there.
		 */
		do {
			cand = list_next(cand_itr);
		} while (cand && (cand != other->job_ptr));

		if (cand) {
			list_push(victims, cand);
		} else if (SELECT_IS_MODE_RUN_NOW(query_mode)) {
			error("Job %u running on block %s "
			      "wasn't in the preempt list, but needs to be "
			      "preempted for queried job to run on block %s",
			      other->job_ptr->job_id,
			      other->bg_block_id,
			      bg_record->bg_block_id);
			list_destroy(victims);
			victims = NULL;
			break;
		}

		/* Start the next search from the head of preempt_jobs. */
		list_iterator_reset(cand_itr);
	}

	list_iterator_destroy(block_itr);
	list_iterator_destroy(cand_itr);
	slurm_mutex_unlock(&block_state_mutex);

	return victims;
}
Esempio n. 2
0
/*
 * Perform job initiation work for the block named in bg_action_ptr.
 *
 * Steps, in order:
 *   1. Look the block up in bg_lists->main; requeue the job if it is gone.
 *   2. Collect every overlapping block so it can be freed; if one of them
 *      is running a job, reset the block and requeue instead.
 *   3. Free the overlapping blocks, then wait until no other thread is
 *      freeing the target block.
 *   4. Apply any requested image / conn_type changes, freeing the block
 *      first when something changed or a reboot was requested.
 *   5. Boot the block if it is free, and clear JOB_CONFIGURING on the job
 *      when the block is already inited.
 *
 * bg_action_ptr - IN: the job (job_ptr), the target block id (bg_block_id)
 *	and the requested images, conn_type and reboot flag.  The reboot
 *	flag may be set here.
 *
 * block_state_mutex is taken and released several times inside; it is not
 * held on return.
 */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;
	uint32_t req_job_id = bg_action_ptr->job_ptr->job_id;
	bool block_inited = 0;
	bool delete_it = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		/* There is no record, so there is no modifying flag to
		 * clear.  bg_record must not be dereferenced here.
		 */
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(req_job_id, 1, 0, JOB_BOOT_FAIL, false);
		return;
	}

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		bg_record->modifying = 0;
		/* bg_reset_block(bg_record) should have already happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	if ((bg_record->state == BG_BLOCK_TERM) || bg_record->free_cnt) {
		/* It doesn't appear the state of a small block
		   (conn_type) is held on a BGP system, so instead of
		   resetting it here just set the reboot flag and
		   handle it later in the reboot code. */
		bg_action_ptr->reboot = 1;
	}

	/* Gather every block overlapping ours; they must be freed first. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		if (bg_record == found_record)
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		if (found_record->job_ptr
		    || (found_record->job_list
			&& list_count(found_record->job_list))) {
			struct job_record *job_ptr = found_record->job_ptr;
			/* NOTE(review): assumes find_job_in_bg_record()
			 * with NO_VAL returns a job when job_list is
			 * non-empty -- confirm, job_ptr is dereferenced
			 * below.
			 */
			if (!found_record->job_ptr)
				job_ptr = find_job_in_bg_record(
					found_record, NO_VAL);
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld.  "
			      "This should never happen.",
			      req_job_id,
			      bg_record->bg_block_id,
			      job_ptr->job_id,
			      found_record->bg_block_id,
			      job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		FREE_NULL_LIST(delete_list);

		bg_reset_block(bg_record, bg_action_ptr->job_ptr);

		bg_record->modifying = 0;
		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(req_job_id, 0, 0, JOB_BOOT_FAIL, false);
		return;
	}

	slurm_mutex_unlock(&block_state_mutex);

	/* In dynamic layout the overlapping blocks are deleted, not just
	 * freed.
	 */
	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		delete_it = 1;
	free_block_list(req_job_id, delete_list, delete_it, 1);
	FREE_NULL_LIST(delete_list);

	while (1) {
		slurm_mutex_lock(&block_state_mutex);
		/* Failure will unlock block_state_mutex so no need to
		   unlock before return.  No need to reset modifying
		   here if the block doesn't exist.
		*/
		if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) {
			error("Problem with deallocating blocks to run job %u "
			      "on block %s", req_job_id,
			      bg_action_ptr->bg_block_id);
			return;
		}
		/* If another thread is freeing this block we need to
		   wait until it is done or we will get into a state
		   where this job will be killed.
		*/
		if (!bg_record->free_cnt)
			break;
		debug("Waiting for block %s to free for job %u.  "
		      "%d thread(s) trying to free it",
		      bg_record->bg_block_id, req_job_id,
		      bg_record->free_cnt);
		slurm_mutex_unlock(&block_state_mutex);
		sleep(1);
	}
	/* This was set in the start_job function to close the above
	   window where a job could be mistakenly requeued if another
	   thread is trying to free this block as we are trying to run
	   on it, which is fine since we will reboot it later.
	*/
	bg_record->modifying = 0;

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		/* bg_reset_block(bg_record) should have already happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      req_job_id);
		return;
	}

	if (bg_record->job_list
	    && (bg_action_ptr->job_ptr->total_cpus != bg_record->cpu_cnt)
	    && (list_count(bg_record->job_list) != 1)) {
		/* We don't allow modification of a block or reboot of
		   a block if we are running multiple jobs on the
		   block.
		*/
		debug2("no reboot");
		goto no_reboot;
	}

	/* From here rc == 1 means "something changed, block needs a free
	 * and re-boot".
	 */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	   && xstrcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	   && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) {
			char *req_conn_type =
				conn_type_string_full(bg_action_ptr->conn_type);
			char *conn_type =
				conn_type_string_full(bg_record->conn_type);
			debug3("changing small block mode from %s to %s",
			       conn_type, req_conn_type);
			xfree(req_conn_type);
			xfree(conn_type);
		}
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here
		 */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	   && xstrcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	   && xstrcasecmp(bg_action_ptr->ramdiskimage,
			 bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	   && xstrcasecmp(bg_action_ptr->mloaderimage,
			 bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't lose this
		 * block since bg_free_block will unlock block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));

#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			/* NOTE(review): conn_type stays NULL for values
			 * not listed in the switch -- confirm
			 * bridge_block_modify() accepts that.
			 */
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can be
			   set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));

#endif
		bg_record->modifying = 0;
	}

no_reboot:
	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state.  This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Without HAVE_BG_FILES the block is marked inited right
		 * away.
		 */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}


	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is already ready.", bg_record->bg_block_id);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		set_user_rc = bridge_block_sync_users(bg_record);
		block_inited = 1;
	}
	slurm_mutex_unlock(&block_state_mutex);

	/* This lock needs to happen after the block_state_mutex to
	   avoid deadlock.
	*/
	if (block_inited && bg_action_ptr->job_ptr) {
		slurmctld_lock_t job_write_lock = {
			NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
		lock_slurmctld(job_write_lock);
		bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		unlock_slurmctld(job_write_lock);
	}

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* Wait for the slurmd to begin the batch script:
		   slurm_fail_job() is a no-op if issued prior to the
		   script initiation.  Do the clean up just in case
		   the fail job isn't run. */
		(void) slurm_fail_job(req_job_id, JOB_BOOT_FAIL);
	}
}
Esempio n. 3
0
/*
 * Walk block_list looking for blocks that overlap bg_record and decide
 * whether they make bg_record unusable.
 *
 * block_list      - IN: blocks (bg_record_t *) to compare against.
 * bg_record_itr   - IN: iterator handed to list_remove() in dynamic layout
 *	mode; presumably the caller's iterator currently positioned on
 *	bg_record -- confirm against caller.
 * bg_record       - IN: candidate block.  In dynamic layout mode it may be
 *	destroyed here when an original record is found for it.
 * overlap_check   - IN: in LAYOUT_OVERLAP mode, 0 tests that bg_record is
 *	inited and 1 tests that the overlapping block is free.
 * overlapped_list - IN/OUT: may be NULL.  In test mode bg_record is
 *	appended with job_ptr set to the overlapping block's job.
 * query_mode      - IN: select mode flags.
 * RET 0 when nothing prevents using bg_record, 1 otherwise.
 *
 * block_state_mutex is taken briefly around blocks_overlap() and around
 * the dynamic-layout record lookup; it is not held on return.
 */
static int _check_for_booted_overlapping_blocks(
	List block_list, ListIterator bg_record_itr,
	bg_record_t *bg_record, int overlap_check, List overlapped_list,
	uint16_t query_mode)
{
	bg_record_t *found_record = NULL;
	ListIterator itr = NULL;
	int rc = 0;
	int overlap = 0;
	bool is_test = SELECT_IS_TEST(query_mode);

	/* this test only is for actually picking a block not testing */
	if (is_test && bg_conf->layout_mode == LAYOUT_DYNAMIC)
		return rc;

	/* Make sure no other blocks under this block
	   are booted and running jobs
	*/
	itr = list_iterator_create(block_list);
	while ((found_record = (bg_record_t*)list_next(itr)) != NULL) {
		if ((!found_record->bg_block_id)
		    || (bg_record == found_record)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("Don't need to look at myself %s %s",
				     bg_record->bg_block_id,
				     found_record->bg_block_id);
			continue;
		}

		slurm_mutex_lock(&block_state_mutex);
		overlap = blocks_overlap(bg_record, found_record);
		slurm_mutex_unlock(&block_state_mutex);

		if (overlap) {
			overlap = 0;
			/* make the available time on this block
			 * (bg_record) the max of this found_record's job
			 * or the one already set if in overlapped_block_list
			 * since we aren't setting job_running we
			 * don't have to remove them since the
			 * block_list should always be destroyed afterwards.
			 */
			if (is_test && overlapped_list
			    && found_record->job_ptr
			    && bg_record->job_running == NO_JOB_RUNNING) {
				/* Separate iterator: must not disturb the
				 * outer walk over block_list.
				 */
				ListIterator ov_itr = list_iterator_create(
					overlapped_list);
				bg_record_t *tmp_rec = NULL;

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("found overlapping block %s "
					     "overlapped %s with job %u",
					     found_record->bg_block_id,
					     bg_record->bg_block_id,
					     found_record->job_ptr->job_id);

				while ((tmp_rec = list_next(ov_itr))) {
					if (tmp_rec == bg_record)
						break;
				}
				list_iterator_destroy(ov_itr);
				if (tmp_rec && tmp_rec->job_ptr->end_time
				    < found_record->job_ptr->end_time)
					tmp_rec->job_ptr =
						found_record->job_ptr;
				else if (!tmp_rec) {
					bg_record->job_ptr =
						found_record->job_ptr;
					list_append(overlapped_list,
						    bg_record);
				}
			}
			/* We already know this block doesn't work
			 * right now, so we only keep looking to see if
			 * there is another overlapping block that ends
			 * later.
			 */
			if (rc)
				continue;
			/* This test is here to check if the block we
			 * chose is not booted or if there is a block
			 * overlapping that we could avoid freeing if
			 * we choose something else
			 */
			if (bg_conf->layout_mode == LAYOUT_OVERLAP
			    && ((overlap_check == 0 && bg_record->state
				 != BG_BLOCK_INITED)
				|| (overlap_check == 1 && found_record->state
				    != BG_BLOCK_FREE))) {

				if (!is_test) {
					rc = 1;
					break;
				}
			}

			if (((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			     || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				  || SELECT_IS_MODE_RUN_NOW(query_mode))
				 && (bg_conf->layout_mode != LAYOUT_DYNAMIC)))
			    && ((found_record->job_running != NO_JOB_RUNNING)
				|| (found_record->state
				    & BG_BLOCK_ERROR_FLAG))) {
				if ((found_record->job_running
				     == BLOCK_ERROR_STATE)
				    || (found_record->state
					& BG_BLOCK_ERROR_FLAG))
					error("can't use %s, "
					      "overlapping block %s "
					      "is in an error state.",
					      bg_record->bg_block_id,
					      found_record->bg_block_id);
				else if (bg_conf->slurm_debug_flags
					 & DEBUG_FLAG_BG_PICK)
					info("can't use %s, there is "
					     "a job (%d) running on "
					     "an overlapping "
					     "block %s",
					     bg_record->bg_block_id,
					     found_record->job_running,
					     found_record->bg_block_id);

				if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
					List tmp_list = list_create(NULL);
					/* Take bg_record out of the
					 * caller's list; it is destroyed
					 * below when an original record
					 * exists for it.
					 */
					list_remove(bg_record_itr);
					slurm_mutex_lock(&block_state_mutex);

					/* From here on found_record is
					 * reused to mean "the record to
					 * free".
					 */
					if (bg_record->original) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This was a "
							     "copy %s",
							     bg_record->
							     bg_block_id);
						found_record =
							bg_record->original;
					} else {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("looking for "
							     "original");
						found_record =
							find_org_in_bg_list(
								bg_lists->main,
								bg_record);
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Removing unusable block "
						     "%s from the system.",
						     bg_record->bg_block_id);

					if (!found_record) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("This record %s "
							     "wasn't found in "
							     "the "
							     "bg_lists->main, "
							     "no big deal, it "
							     "probably wasn't "
							     "added",
							     bg_record->
							     bg_block_id);
						found_record = bg_record;
					} else
						destroy_bg_record(bg_record);

					list_push(tmp_list, found_record);
					slurm_mutex_unlock(&block_state_mutex);

					/* We need to make sure if a
					   job is running here to not
					   call the regular method since
					   we are inside the job write
					   lock already.
					*/
					if (found_record->job_ptr
					    && !IS_JOB_FINISHED(
						    found_record->job_ptr)) {
						int requeue_rc;

						info("Somehow block %s "
						     "is being freed, but "
						     "appears to already have "
						     "a job %u(%u) running "
						     "on it.",
						     found_record->bg_block_id,
						     found_record->
						     job_ptr->job_id,
						     found_record->job_running);
						/* Keep the requeue result so
						 * the message reports the
						 * real failure; rc is always
						 * 0 at this point.
						 */
						requeue_rc = job_requeue(
							0,
							found_record->
							job_ptr->job_id,
							-1,
							(uint16_t)
							NO_VAL,
							false);
						if (requeue_rc) {
							error("Couldn't "
							      "requeue job %u, "
							      "failing it: %s",
							      found_record->
							      job_ptr->job_id,
							      slurm_strerror(
								      requeue_rc));
							job_fail(found_record->
								 job_ptr->
								 job_id);
						}
					}

					free_block_list(NO_VAL, tmp_list, 0, 0);
					list_destroy(tmp_list);
				}
				rc = 1;

				if (!is_test)
					break;
			}
		}
	}
	list_iterator_destroy(itr);

	return rc;
}
Esempio n. 4
0
/*
 * Perform job initiation work for the block named in bg_action_ptr.
 *
 * Looks the block up in bg_lists->main, frees every overlapping block,
 * applies requested image / conn_type changes (freeing the block first
 * when something changed or a reboot was requested), boots the block if
 * it is free, and sets the block user when the block is inited.
 *
 * bg_action_ptr - IN: the job (job_ptr), the target block id (bg_block_id)
 *	and the requested images, conn_type and reboot flag.  The reboot
 *	flag may be set here.
 *
 * block_state_mutex is taken and released several times inside; it is not
 * held on return.
 */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);
		return;
	}

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		/* bg_reset_block(bg_record) should have already happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}
	if (bg_record->state == BG_BLOCK_TERM) {
		debug("Block is in Deallocating state, waiting for free.");
		/* It doesn't appear the state of a small block
		   (conn_type) is held on a BGP system, so instead of
		   resetting it here just set the reboot flag and
		   handle it later in the reboot code. */
		bg_action_ptr->reboot = 1;
	}

	/* Gather every block overlapping ours; they must be freed first. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		/* NOTE(review): the !found_record test can never be true
		 * here, the loop condition already rejected NULL.
		 */
		if ((!found_record) || (bg_record == found_record))
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		/* An overlapping block with a job on it: give up and
		 * requeue our job below.
		 */
		if (found_record->job_ptr) {
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld.  "
			      "This should never happen.",
			      bg_action_ptr->job_ptr->job_id,
			      bg_record->bg_block_id,
			      found_record->job_ptr->job_id,
			      found_record->bg_block_id,
			      found_record->job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		list_destroy(delete_list);

		bg_reset_block(bg_record);

		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	slurm_mutex_unlock(&block_state_mutex);

	/* Free the overlapping blocks with block_state_mutex released. */
	rc = free_block_list(bg_action_ptr->job_ptr->job_id, delete_list, 0, 1);
	list_destroy(delete_list);
	if (rc != SLURM_SUCCESS) {
		error("Problem with deallocating blocks to run job %u "
		      "on block %s", bg_action_ptr->job_ptr->job_id,
		      bg_action_ptr->bg_block_id);
		if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr))
			bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Failure will unlock block_state_mutex so no need to unlock
	   before return.
	*/
	if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
		return;

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		/* bg_reset_block(bg_record) should have already happened */
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* From here rc == 1 means "something changed, block needs a free
	 * and re-boot".
	 */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	   && strcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	   && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		debug3("changing small block mode from %s to %s",
		       conn_type_string(bg_record->conn_type[0]),
		       conn_type_string(bg_action_ptr->conn_type[0]));
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here
		 */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	/* NOTE(review): strcasecmp() is handed bg_record's image strings
	 * directly -- confirm they cannot be NULL here.
	 */
	if (bg_action_ptr->linuximage
	   && strcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	   && strcasecmp(bg_action_ptr->ramdiskimage,
			 bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	   && strcasecmp(bg_action_ptr->mloaderimage,
			 bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't lose this
		 * block since bg_free_block will unlock block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		/* Failures of the modify calls below are only logged. */
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));

#elif defined HAVE_BGP
		/* Failures of the modify calls below are only logged. */
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			/* NOTE(review): conn_type stays NULL for values
			 * not listed in the switch -- confirm
			 * bridge_block_modify() accepts that.
			 */
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can be
			   set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));

#endif
		bg_record->modifying = 0;
	}

	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state.  This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Without HAVE_BG_FILES the block is marked inited right
		 * away.
		 */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}


	if (bg_record->job_running <= NO_JOB_RUNNING) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	/* Record the job's user name as the block's target user. */
	xfree(bg_record->target_name);
	bg_record->target_name = uid_to_string(bg_action_ptr->job_ptr->user_id);
	debug("setting the target_name for Block %s to %s",
	      bg_record->bg_block_id, bg_record->target_name);

	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is ready.", bg_record->bg_block_id);
		set_user_rc = set_block_user(bg_record);
		/* NOTE(review): job_state is changed while holding only
		 * block_state_mutex -- confirm no job write lock is
		 * required here.
		 */
		if (bg_action_ptr->job_ptr) {
			bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
			last_job_update = time(NULL);
		}
	}
	slurm_mutex_unlock(&block_state_mutex);

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* Wait for the slurmd to begin the batch script:
		   slurm_fail_job() is a no-op if issued prior to the
		   script initiation.  Do the clean up just in case
		   the fail job isn't run. */
		/* NOTE(review): bg_record->job_running is read here after
		 * block_state_mutex was released -- confirm this is safe.
		 */
		(void) slurm_fail_job(bg_record->job_running);
		slurm_mutex_lock(&block_state_mutex);
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;

		slurm_mutex_unlock(&block_state_mutex);
	}
}
Esempio n. 5
0
/*
 * React to a nodecard going down on midplane mp_name.
 *
 * A temporary record describing the failed nodecard (one midplane, the
 * ionode range io_start..io_start+io_cnt) is built and compared against
 * every block in bg_lists->main.  Jobs running on overlapping blocks are
 * failed.  What happens next depends on the layout mode:
 *
 *  - Non-dynamic layout: the smallest overlapping block is put into an
 *    error state if it is smaller than a midplane, otherwise the whole
 *    node is drained.
 *  - Dynamic layout: a small-block request is built, the resulting
 *    records are created through the bridge and added to bg_lists->main,
 *    every existing block overlapping a new record is freed, and the new
 *    record(s) overlapping the failed ionodes are put into an error state.
 *
 * This could potentially lock the node lock in the slurmctld with
 * slurm_drain_nodes, or slurm_fail_job, so when slurmctld_locked is set
 * the in-process variants (drain_nodes / job_fail) are called instead so
 * the locks are not taken again.
 *
 * IN mp_name          - name of the midplane holding the nodecard
 * IN io_start         - first ionode of the failed nodecard
 * IN slurmctld_locked - true if the caller already holds the slurmctld
 *                       locks
 * RET EINVAL if the node is unknown or the ionode range is outside the
 *     configured ionodes per midplane, SLURM_NO_CHANGE_IN_DATA if the
 *     selected block is already flagged in error, SLURM_ERROR if the
 *     list of too-small blocks turned out empty, otherwise SLURM_SUCCESS
 *     or the result of put_block_in_error_state().
 */
extern int down_nodecard(char *mp_name, bitoff_t io_start,
			 bool slurmctld_locked)
{
	List requests = NULL;
	List delete_list = NULL;
	ListIterator itr = NULL;
	bg_record_t *bg_record = NULL, *found_record = NULL, tmp_record;
	bg_record_t *smallest_bg_record = NULL;
	struct node_record *node_ptr = NULL;
	int mp_bit = 0;
	/* computed once on the first call and reused afterwards */
	static int io_cnt = NO_VAL;
	static int create_size = NO_VAL;
	static select_ba_request_t blockreq;
	int rc = SLURM_SUCCESS;
	char *reason = "select_bluegene: nodecard down";

	xassert(mp_name);

	if (io_cnt == NO_VAL) {
		io_cnt = 1;
		/* Translate 1 nodecard count to ionode count */
		if ((io_cnt *= bg_conf->io_ratio))
			io_cnt--;

		/* make sure we create something that is able to be
		   created */
		if (bg_conf->smallest_block < bg_conf->nodecard_cnode_cnt)
			create_size = bg_conf->nodecard_cnode_cnt;
		else
			create_size = bg_conf->smallest_block;
	}

	node_ptr = find_node_record(mp_name);
	if (!node_ptr) {
		error ("down_sub_node_blocks: invalid node specified '%s'",
		       mp_name);
		return EINVAL;
	}

	/* this is here for sanity check to make sure we don't core on
	   these bits when we set them below. */
	if (io_start >= bg_conf->ionodes_per_mp
	    || (io_start+io_cnt) >= bg_conf->ionodes_per_mp) {
		debug("io %d-%d not configured on this "
		      "system, only %d ionodes per midplane",
		      io_start, io_start+io_cnt, bg_conf->ionodes_per_mp);
		return EINVAL;
	}
	/* index of the midplane in the node table */
	mp_bit = (node_ptr - node_record_table_ptr);

	memset(&blockreq, 0, sizeof(select_ba_request_t));

	blockreq.conn_type[0] = SELECT_SMALL;
	blockreq.save_name = mp_name;

	debug3("here setting node %d of %d and ionodes %d-%d of %d",
	       mp_bit, node_record_count, io_start,
	       io_start+io_cnt, bg_conf->ionodes_per_mp);

	/* Build a throw-away record describing just the failed nodecard
	   so blocks_overlap() can be used against real blocks.  Its
	   bitmaps are released at the cleanup label. */
	memset(&tmp_record, 0, sizeof(bg_record_t));
	tmp_record.mp_count = 1;
	tmp_record.cnode_cnt = bg_conf->nodecard_cnode_cnt;
	tmp_record.mp_bitmap = bit_alloc(node_record_count);
	bit_set(tmp_record.mp_bitmap, mp_bit);

	tmp_record.ionode_bitmap = bit_alloc(bg_conf->ionodes_per_mp);
	bit_nset(tmp_record.ionode_bitmap, io_start, io_start+io_cnt);

	slurm_mutex_lock(&block_state_mutex);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		if (!bit_test(bg_record->mp_bitmap, mp_bit))
			continue;

		if (!blocks_overlap(bg_record, &tmp_record))
			continue;

		/* any job on an overlapping block is failed */
		if (bg_record->job_running > NO_JOB_RUNNING) {
			if (slurmctld_locked)
				job_fail(bg_record->job_running);
			else
				slurm_fail_job(bg_record->job_running);

		}
		/* If Running Dynamic mode and the block is
		   smaller than the create size just continue on.
		*/
		if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
		    && (bg_record->cnode_cnt < create_size)) {
			if (!delete_list)
				delete_list = list_create(NULL);
			list_append(delete_list, bg_record);
			continue;
		}

		/* keep track of the smallest size that is at least
		   the size of create_size. */
		if (!smallest_bg_record ||
		    (smallest_bg_record->cnode_cnt > bg_record->cnode_cnt))
			smallest_bg_record = bg_record;
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode != LAYOUT_DYNAMIC) {
		debug3("running non-dynamic mode");
		/* This should never happen, but just in case... */
		if (delete_list)
			list_destroy(delete_list);

		/* If we found a block that is smaller or equal to a
		   midplane we will just mark it in an error state as
		   opposed to draining the node.
		*/
		if (smallest_bg_record
		    && (smallest_bg_record->cnode_cnt < bg_conf->mp_cnode_cnt)){
			if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
				rc = SLURM_NO_CHANGE_IN_DATA;
				goto cleanup;
			}

			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		debug("No block under 1 midplane available for this nodecard.  "
		      "Draining the whole node.");
		if (!node_already_down(mp_name)) {
			if (slurmctld_locked)
				drain_nodes(mp_name, reason,
					    slurm_get_slurm_user_id());
			else
				slurm_drain_nodes(mp_name, reason,
						  slurm_get_slurm_user_id());
		}
		rc = SLURM_SUCCESS;
		goto cleanup;
	}

	/* below is only for Dynamic mode */

	if (delete_list) {
		/* Overlapping blocks smaller than create_size exist:
		   merge their ionode bitmaps to find where the
		   replacement block has to start. */
		int cnt_set = 0;
		bitstr_t *iobitmap = bit_alloc(bg_conf->ionodes_per_mp);
		/* don't lock here since it is handled inside
		   the put_block_in_error_state
		*/
		itr = list_iterator_create(delete_list);
		while ((bg_record = list_next(itr))) {
			debug2("combining smaller than nodecard "
			       "dynamic block %s",
			       bg_record->bg_block_id);
			/* poll until the job failed above is gone */
			while (bg_record->job_running > NO_JOB_RUNNING)
				sleep(1);

			bit_or(iobitmap, bg_record->ionode_bitmap);
			cnt_set++;
		}
		list_iterator_destroy(itr);
		list_destroy(delete_list);
		if (!cnt_set) {
			FREE_NULL_BITMAP(iobitmap);
			rc = SLURM_ERROR;
			goto cleanup;
		}
		/* set the start to be the same as the start of the
		   ionode_bitmap.  If no ionodes set (not a small
		   block) set io_start = 0. */
		if ((io_start = bit_ffs(iobitmap)) == -1) {
			io_start = 0;
			if (create_size > bg_conf->nodecard_cnode_cnt)
				blockreq.small128 = 4;
			else
				blockreq.small32 = 16;
		} else if (create_size <= bg_conf->nodecard_cnode_cnt)
			blockreq.small32 = 1;
		else
			/* this should never happen */
			blockreq.small128 = 1;

		FREE_NULL_BITMAP(iobitmap);
	} else if (smallest_bg_record) {
		/* An overlapping block of at least create_size exists:
		   either flag it directly or split it up. */
		debug2("smallest dynamic block is %s",
		       smallest_bg_record->bg_block_id);
		if (smallest_bg_record->state & BG_BLOCK_ERROR_FLAG) {
			rc = SLURM_NO_CHANGE_IN_DATA;
			goto cleanup;
		}

		/* poll until the job failed above is gone */
		while (smallest_bg_record->job_running > NO_JOB_RUNNING)
			sleep(1);

		if (smallest_bg_record->cnode_cnt == create_size) {
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}

		if (create_size > smallest_bg_record->cnode_cnt) {
			/* we should never get here.  This means we
			 * have a create_size that is bigger than a
			 * block that is already made.
			 */
			rc = put_block_in_error_state(
				smallest_bg_record, reason);
			goto cleanup;
		}
		debug3("node count is %d", smallest_bg_record->cnode_cnt);
		/* number of 32-cnode pieces needed to cover the block */
		switch(smallest_bg_record->cnode_cnt) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small32 = 2;
			break;
		case 256:
			blockreq.small32 = 8;
			break;
#endif
		case 128:
			blockreq.small32 = 4;
			break;
		case 512:
		default:
			blockreq.small32 = 16;
			break;
		}

		if (create_size != bg_conf->nodecard_cnode_cnt) {
			/* convert the 32-cnode count into 128-cnode
			   blocks and start at the first ionode */
			blockreq.small128 = blockreq.small32 / 4;
			blockreq.small32 = 0;
			io_start = 0;
		} else if ((io_start =
			    bit_ffs(smallest_bg_record->ionode_bitmap)) == -1)
			/* set the start to be the same as the start of the
			   ionode_bitmap.  If no ionodes set (not a small
			   block) set io_start = 0. */
			io_start = 0;
	} else {
		/* No block overlaps the nodecard: request enough blocks
		   of create_size to fill the midplane. */
		switch(create_size) {
#ifndef HAVE_BGL
		case 64:
			blockreq.small64 = 8;
			break;
		case 256:
			blockreq.small256 = 2;
			/* was missing: without this break the request
			   also asked for 16 32-cnode blocks */
			break;
#endif
		case 32:
			blockreq.small32 = 16;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
		case 512:
			/* create size is a whole midplane, so there is
			   nothing smaller to carve out: drain the node */
			if (!node_already_down(mp_name)) {
				if (slurmctld_locked)
					drain_nodes(mp_name, reason,
						    slurm_get_slurm_user_id());
				else
					slurm_drain_nodes(
						mp_name, reason,
						slurm_get_slurm_user_id());
			}
			rc = SLURM_SUCCESS;
			goto cleanup;
		default:
			/* NOTE(review): execution continues with an
			   empty small-block request after this error;
			   confirm that is intended. */
			error("Unknown create size of %d", create_size);
			break;
		}
		/* since we don't have a block in this midplane
		   we need to start at the beginning. */
		io_start = 0;
		/* we also need a bg_block to pretend to be the
		   smallest block that takes up the entire midplane. */
	}


	/* Here we need to add blocks that take up nodecards on this
	   midplane.  Since Slurm only keeps track of midplanes
	   natively this is the only way to handle this case.
	*/
	requests = list_create(destroy_bg_record);
	add_bg_record(requests, NULL, &blockreq, 1, io_start);

	slurm_mutex_lock(&block_state_mutex);
	delete_list = list_create(NULL);
	while ((bg_record = list_pop(requests))) {
		/* pull every existing block that overlaps the new
		   record out of the main list so it can be freed */
		itr = list_iterator_create(bg_lists->main);
		while ((found_record = list_next(itr))) {
			if (!blocks_overlap(bg_record, found_record))
				continue;
			list_push(delete_list, found_record);
			list_remove(itr);
		}
		list_iterator_destroy(itr);

		/* we need to add this record since it doesn't exist */
		if (bridge_block_create(bg_record) == SLURM_ERROR) {
			destroy_bg_record(bg_record);
			error("down_sub_node_blocks: "
			      "unable to configure block in api");
			continue;
		}

		debug("adding block %s to fill in small blocks "
		      "around bad nodecards",
		      bg_record->bg_block_id);
		print_bg_record(bg_record);
		list_append(bg_lists->main, bg_record);
		if (bit_overlap(bg_record->ionode_bitmap,
				tmp_record.ionode_bitmap)) {
			/* here we know the error block doesn't exist
			   so just set the state here */
			slurm_mutex_unlock(&block_state_mutex);
			rc = put_block_in_error_state(bg_record, reason);
			slurm_mutex_lock(&block_state_mutex);
		}
	}
	list_destroy(requests);

	/* delete_list was created unconditionally above, so the mutex is
	   always dropped here; free_block_list() is called with
	   block_state_mutex released, as in the original code. */
	slurm_mutex_unlock(&block_state_mutex);
	free_block_list(NO_VAL, delete_list, 0, 0);
	list_destroy(delete_list);

	slurm_mutex_lock(&block_state_mutex);
	sort_bg_record_inc_size(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);
	last_bg_update = time(NULL);

cleanup:
	FREE_NULL_BITMAP(tmp_record.mp_bitmap);
	FREE_NULL_BITMAP(tmp_record.ionode_bitmap);

	return rc;

}