예제 #1
0
파일: bg_job_run.c 프로젝트: HPCNow/slurm
/*
 * Sync a job onto its block: record the job against the block, account
 * for its cpus, and make sure the block sits in the job_running and
 * booted lists.  If the block is already initialized the users are
 * synced right away, otherwise the action is handed to _block_op().
 *
 * block_state_mutex needs to be locked before coming in.  It is
 * released and re-acquired around slurm_fail_job().
 */
static void _sync_agent(bg_action_t *bg_action_ptr, bg_record_t *bg_record)
{
	struct job_record *job = bg_action_ptr->job_ptr;

	debug3("Queue sync of job %u in BG block %s ending at %ld",
	       job->job_id, bg_action_ptr->bg_block_id, job->end_time);

	last_bg_update = time(NULL);

	ba_sync_job_to_block(bg_record, job);

	set_select_jobinfo(job->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_PTR, bg_record);

	num_unused_cpus -= job->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	if (bg_record->state != BG_BLOCK_INITED) {
		if (bg_record->state == BG_BLOCK_BOOTING) {
			debug("Block %s is booting, job ok",
			      bg_action_ptr->bg_block_id);
		} else {
			error("Block %s isn't ready and isn't "
			      "being configured! Starting job again.",
			      bg_action_ptr->bg_block_id);
		}
		/* _block_op() is responsible for destroying
		 * bg_action_ptr from here on. */
		_block_op(bg_action_ptr);
		return;
	}

	/* The block is up, so the job is no longer configuring. */
	job->job_state &= (~JOB_CONFIGURING);
	last_job_update = time(NULL);

	/* Just in case reset the boot flags */
	bg_record->boot_state = 0;
	bg_record->boot_count = 0;

	if (bridge_block_sync_users(bg_record) == SLURM_ERROR) {
		/* slurm_fail_job() is called without block_state_mutex */
		slurm_mutex_unlock(&block_state_mutex);
		(void) slurm_fail_job(job->job_id, JOB_BOOT_FAIL);
		slurm_mutex_lock(&block_state_mutex);
	}
	_destroy_bg_action(bg_action_ptr);
}
예제 #2
0
/*
 * Put a block back into service when no job is running on it: clear
 * the error flag, fix up the job_running and booted lists, drop the
 * stored reason and refresh accounting.
 *
 * block_state_mutex should be locked before calling.
 * RET SLURM_SUCCESS in every case
 */
extern int resume_block(bg_record_t *bg_record)
{
	xassert(bg_record);

	/* A job is still attached to the block, nothing to resume. */
	if (bg_record->job_running > NO_JOB_RUNNING)
		return SLURM_SUCCESS;

	if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
		bg_record->state &= (~BG_BLOCK_ERROR_FLAG);
		info("Block %s put back into service after "
		     "being in an error state.",
		     bg_record->bg_block_id);
	}

	/* Only give the cpus back if the block really was listed as
	 * running a job. */
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS) {
		num_unused_cpus += bg_record->cpu_cnt;
	}

	/* The booted list should hold the block only while it is
	 * initialized. */
	if (bg_record->state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	} else {
		remove_from_bg_list(bg_lists->booted, bg_record);
	}

	bg_record->job_running = NO_JOB_RUNNING;
	xfree(bg_record->reason);

	last_bg_update = time(NULL);
	_set_block_nodes_accounting(bg_record, NULL);

	return SLURM_SUCCESS;
}
예제 #3
0
/*
 * Confirm the block a job is being started on is still around.
 * This should only be called in _start_agent.
 *
 * block_state_mutex should be locked before calling this function.
 * When the block is gone the mutex is unlocked here before the job is
 * requeued, so the caller must not unlock it again in that case.
 *
 * RET 1 if the block exists, 0 if not (and the job is requeued).
 */
static int _make_sure_block_still_exists(bg_action_t *bg_action_ptr,
					 bg_record_t *bg_record)
{
	/* Something could have happened to the block in the meantime,
	 * so check both the magic and the main list. */
	if ((bg_record->magic == BLOCK_MAGIC)
	    && block_ptr_exist_in_list(bg_lists->main, bg_record))
		return 1;

	slurm_mutex_unlock(&block_state_mutex);
	debug("The block %s disappeared while starting "
	      "job %u requeueing if possible.",
	      bg_action_ptr->bg_block_id,
	      bg_action_ptr->job_ptr->job_id);
	bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);

	return 0;
}
예제 #4
0
/*
 * Requeue whatever job is running on the block, then put the block in
 * an error state with the given reason, provided the block is still in
 * bg_lists->main.
 *
 * block_state_mutex must be unlocked before calling this; it is taken
 * here only for the list lookup.
 */
extern void requeue_and_error(bg_record_t *bg_record, char *reason)
{
	int still_listed;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("requeue_and_error: magic was bad");
		return;
	}

	if (bg_record->job_running > NO_JOB_RUNNING)
		bg_requeue_job(bg_record->job_running, 0);

	slurm_mutex_lock(&block_state_mutex);
	still_listed = block_ptr_exist_in_list(bg_lists->main, bg_record);
	slurm_mutex_unlock(&block_state_mutex);

	if (!still_listed) {
		error("requeue_and_error: block disappeared");
		return;
	}

	put_block_in_error_state(bg_record, reason);
}
예제 #5
0
파일: bg_core.c 프로젝트: fafik23/slurm
/*
 * Bookkeeping done once a free attempt on a block has finished.
 *
 * bg_record IN - block that was being freed
 * restore IN - if true (and the block has midplanes) the block is left
 *	in bg_lists->main instead of being removed and destroyed.  A
 *	block marked for destroy that spans more than one midplane is
 *	destroyed regardless.
 * RET SLURM_ERROR if the record's magic is bad, else SLURM_SUCCESS
 *
 * block_state_mutex should be locked before calling this.  It is
 * released and re-acquired around select_g_update_block().
 */
static int _post_block_free(bg_record_t *bg_record, bool restore)
{
	int remove_rc;

	if (bg_record->magic != BLOCK_MAGIC) {
		error("block already destroyed %p", bg_record);
		xassert(0);
		return SLURM_ERROR;
	}

	bg_record->free_cnt--;

	/* Each of the following cases means we are not the one who
	 * should finish tearing the block down. */
	if (bg_record->free_cnt == -1) {
		info("we got a negative 1 here for %s",
		     bg_record->bg_block_id);
		xassert(0);
		return SLURM_SUCCESS;
	}

	if (bg_record->modifying) {
		info("others are modifing this block %s, don't clear it up",
		     bg_record->bg_block_id);
		return SLURM_SUCCESS;
	}

	if (bg_record->free_cnt) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("%d others are trying to destroy this block %s",
			     bg_record->free_cnt, bg_record->bg_block_id);
		}
		return SLURM_SUCCESS;
	}

	/* Even a block already in an error state goes through this
	 * check, to avoid any overlapping blocks that may have been
	 * created due to bad hardware. */
	if ((bg_record->state & (~BG_BLOCK_ERROR_FLAG)) != BG_BLOCK_FREE) {
		/* The block never reached the free state, so mark it
		 * in error rather than removing it. */
		update_block_msg_t err_msg;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("_post_block_free: block %s is not in state "
			     "free (%s), putting it in error state.",
			     bg_record->bg_block_id,
			     bg_block_state_string(bg_record->state));
		}

		slurm_init_update_block_msg(&err_msg);
		err_msg.bg_block_id = bg_record->bg_block_id;
		err_msg.state = BG_BLOCK_ERROR_FLAG;
		err_msg.reason = "Block would not deallocate";

		slurm_mutex_unlock(&block_state_mutex);
		select_g_update_block(&err_msg);
		slurm_mutex_lock(&block_state_mutex);

		/* The lock was dropped, so only touch the record if it
		 * is still in the main list. */
		if (block_ptr_exist_in_list(bg_lists->main, bg_record))
			bg_record->destroy = 0;

		return SLURM_SUCCESS;
	}

	/* restore is normally applied to a whole list, e.g. a bunch of
	 * small blocks.  A record marked to be destroyed that is bigger
	 * than 1 midplane is destroyed even if restore is true. */
	if (restore && bg_record->destroy && (bg_record->mp_count > 1))
		restore = false;

	/* Reaching this point means the destroy request is handled. */
	bg_record->destroy = 0;

	/* Sanity check: make sure the block leaves every list. */
	remove_from_bg_list(bg_lists->booted, bg_record);
	if (remove_from_bg_list(bg_lists->job_running, bg_record)
	    == SLURM_SUCCESS) {
		debug2("_post_block_free: we are freeing block %s and "
		       "it was in the job_running list.  This can happen if a "
		       "block is removed while waiting for mmcs to finish "
		       "removing the job from the block.",
		       bg_record->bg_block_id);
		num_unused_cpus += bg_record->cpu_cnt;
	}

	/* A block without any mp_count is removed even when restore
	 * was requested. */
	if (restore && bg_record->mp_count)
		return SLURM_SUCCESS;

	if (remove_from_bg_list(bg_lists->main, bg_record) != SLURM_SUCCESS) {
		/* This should only happen if called from
		 * bg_job_place.c where the block was never added to
		 * the list. */
		debug("_post_block_free: It appears this block %s isn't "
		      "in the main list anymore.",
		      bg_record->bg_block_id);
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("_post_block_free: removing %s from database",
		     bg_record->bg_block_id);
	}

	remove_rc = bridge_block_remove(bg_record);
	if (remove_rc == SLURM_SUCCESS) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("_post_block_free: done %s(%p)",
			     bg_record->bg_block_id, bg_record);
		}
	} else if (remove_rc == BG_ERROR_BLOCK_NOT_FOUND) {
		debug("_post_block_free: block %s is not found",
		      bg_record->bg_block_id);
	} else {
		error("_post_block_free: "
		      "bridge_block_remove(%s): %s",
		      bg_record->bg_block_id,
		      bg_err_str(remove_rc));
	}

	destroy_bg_record(bg_record);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("_post_block_free: destroyed");

	return SLURM_SUCCESS;
}
예제 #6
0
/*
 * Record a newly reported state for a block and react to the
 * transition (list membership, boot flags, boot retries).
 *
 * bg_record IN/OUT - block whose state is being updated
 * state IN - newly reported state; it is compared against the block's
 *	current state with BG_BLOCK_ERROR_FLAG masked off
 * kill_job_list IN/OUT - if non-NULL, a kill_job_struct_t carrying
 *	bg_record->job_running is pushed onto it when set_block_user()
 *	fails on a block that has just become ready
 * RET 1 if bg_record->state was changed, 0 otherwise
 *
 * NOTE(review): block_state_mutex is unlocked and re-locked around
 * requeue_and_error() below, so the caller presumably holds it on
 * entry - confirm against the callers.
 */
extern int bg_status_update_block_state(bg_record_t *bg_record,
					uint16_t state,
					List kill_job_list)
{
	bool skipped_dealloc = false;
	kill_job_struct_t *freeit = NULL;
	int updated = 0;
	/* Current state without the error flag */
	uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

	/* Nothing changed, nothing to do */
	if (real_state == state)
		return 0;

	debug("state of Block %s was %s and now is %s",
	      bg_record->bg_block_id,
	      bg_block_state_string(bg_record->state),
	      bg_block_state_string(state));

	/*
	  check to make sure block went
	  through freeing correctly
	*/
	if ((real_state != BG_BLOCK_TERM
	     && !(bg_record->state & BG_BLOCK_ERROR_FLAG))
	    && state == BG_BLOCK_FREE)
		skipped_dealloc = 1;
	else if ((real_state == BG_BLOCK_INITED)
		 && (state == BG_BLOCK_BOOTING)) {
		/* This means the user did a reboot through
		   mpirun but we missed the state
		   change */
		debug("Block %s skipped rebooting, "
		      "but it really is.  "
		      "Setting target_name back to %s",
		      bg_record->bg_block_id,
		      bg_record->user_name);
		xfree(bg_record->target_name);
		bg_record->target_name = xstrdup(bg_record->user_name);
	} else if ((real_state == BG_BLOCK_TERM)
		   && (state == BG_BLOCK_BOOTING))
		/* This is a funky state IBM says
		   isn't a bug, but all their
		   documentation says this doesn't
		   happen, but IBM says oh yeah, you
		   weren't really suppose to notice
		   that. So we will just skip this
		   state and act like this didn't happen. */
		goto nochange_state;
	real_state = state;
	/* Carry an existing error flag over onto the stored state */
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		state |= BG_BLOCK_ERROR_FLAG;

	bg_record->state = state;

	if (real_state == BG_BLOCK_TERM || skipped_dealloc)
		_block_is_deallocating(bg_record, kill_job_list);
	else if (real_state == BG_BLOCK_BOOTING) {
		debug("Setting bootflag for %s", bg_record->bg_block_id);
		bg_record->boot_state = 1;
	} else if (real_state == BG_BLOCK_FREE) {
		/* A free block is in neither the job_running nor the
		   booted list; give the cpus back if it was running */
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;
		remove_from_bg_list(bg_lists->booted,
				    bg_record);
	} else if (real_state & BG_BLOCK_ERROR_FLAG) {
		if (bg_record->boot_state)
			error("Block %s in an error state while booting.",
			      bg_record->bg_block_id);
		else
			error("Block %s in an error state.",
			      bg_record->bg_block_id);
		remove_from_bg_list(bg_lists->booted, bg_record);
		trigger_block_error();
	} else if (real_state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	}
	updated = 1;
	/* Reached directly (with updated still 0 and real_state still the
	   old state) when a TERM -> BOOTING report is ignored above */
nochange_state:

	/* check the boot state */
	debug3("boot state for block %s is %d",
	       bg_record->bg_block_id, bg_record->boot_state);
	if (bg_record->boot_state) {
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* If we get an error on boot that
			 * means it is a transparent L3 error
			 * and should be trying to fix
			 * itself.  If this is the case we
			 * just hang out waiting for the state
			 * to go to free where we will try to
			 * boot again below.
			 */
			return updated;
		}

		switch (real_state) {
		case BG_BLOCK_BOOTING:
			debug3("checking to make sure user %s "
			       "is the user.",
			       bg_record->target_name);

			if (update_block_user(bg_record, 0) == 1)
				last_bg_update = time(NULL);
			if (bg_record->job_ptr) {
				bg_record->job_ptr->job_state |=
					JOB_CONFIGURING;
				last_job_update = time(NULL);
			}
			break;
		case BG_BLOCK_FREE:
			/* The block went free while a boot was pending:
			   retry the boot until RETRY_BOOT_COUNT is hit,
			   then give up and requeue/error the block */
			if (bg_record->boot_count < RETRY_BOOT_COUNT) {
				bridge_block_boot(bg_record);

				if (bg_record->magic == BLOCK_MAGIC) {
					debug("boot count for block %s is %d",
					      bg_record->bg_block_id,
					      bg_record->boot_count);
					bg_record->boot_count++;
				}
			} else {
				char *reason = (char *)
					"status_check: Boot fails ";

				error("Couldn't boot Block %s for user %s",
				      bg_record->bg_block_id,
				      bg_record->target_name);

				/* requeue_and_error() must be called with
				   block_state_mutex unlocked */
				slurm_mutex_unlock(&block_state_mutex);
				requeue_and_error(bg_record, reason);
				slurm_mutex_lock(&block_state_mutex);

				bg_record->boot_state = 0;
				bg_record->boot_count = 0;
				if (remove_from_bg_list(
					    bg_lists->job_running, bg_record)
				    == SLURM_SUCCESS)
					num_unused_cpus += bg_record->cpu_cnt;

				remove_from_bg_list(bg_lists->booted,
						    bg_record);
			}
			break;
		case BG_BLOCK_INITED:
			debug("block %s is ready.",
			      bg_record->bg_block_id);
			if (bg_record->job_ptr) {
				bg_record->job_ptr->job_state &=
					(~JOB_CONFIGURING);
				last_job_update = time(NULL);
			}
			/* boot flags are reset here */
			if (kill_job_list &&
			    set_block_user(bg_record) == SLURM_ERROR) {
				freeit = (kill_job_struct_t *)
					xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = bg_record->job_running;
				list_push(kill_job_list, freeit);
			}
			break;
		case BG_BLOCK_TERM:
			debug2("Block %s is in a deallocating state "
			       "during a boot.  Doing nothing until "
			       "free state.",
			       bg_record->bg_block_id);
			break;
		case BG_BLOCK_REBOOTING:
			debug2("Block %s is rebooting.",
			       bg_record->bg_block_id);
			break;
		default:
			debug("Hey the state of block "
			      "%s is %d(%s) doing nothing.",
			      bg_record->bg_block_id,
			      real_state,
			      bg_block_state_string(bg_record->state));
			break;
		}
	}

	return updated;
}
예제 #7
0
파일: bg_job_run.c 프로젝트: HPCNow/slurm
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_action_t *bg_action_ptr = NULL;
	select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = jobinfo->bg_record;

	if (!bg_record || !block_ptr_exist_in_list(bg_lists->main, bg_record)) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      jobinfo->bg_block_id, job_ptr->job_id);
		return SLURM_ERROR;
	}

	if ((jobinfo->conn_type[0] != SELECT_NAV)
	    && (jobinfo->conn_type[0] < SELECT_SMALL)) {
		int dim;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			jobinfo->conn_type[dim] = bg_record->conn_type[dim];
	}

	/* If it isn't 0 then it was setup previous (sub-block)
	*/
	if (jobinfo->geometry[SYSTEM_DIMENSIONS] == 0)
		memcpy(jobinfo->geometry, bg_record->geo,
		       sizeof(bg_record->geo));

	if (bg_record->job_list) {
		/* Mark the ba_mp cnodes as used now. */
		ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
		xassert(ba_mp);
		xassert(ba_mp->cnode_bitmap);
		bit_or(ba_mp->cnode_bitmap, jobinfo->units_avail);
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = job_ptr->job_id;
		bg_record->job_ptr = job_ptr;
	}

	job_ptr->job_state |= JOB_CONFIGURING;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	/* FIXME: The below get_select_jobinfo calls could be avoided
	 * by just using the jobinfo as we do above.
	 */
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just in case something happens to free this block before we
	   start the job we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	last_bg_update = time(NULL);

	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
예제 #8
0
파일: bg_job_run.c 프로젝트: HPCNow/slurm
/*
 * Perform job initiation work for a START_OP action.
 *
 * bg_action_ptr IN - action built by start_job(); names the block and
 *	carries the job pointer, requested images, conn_type and reboot
 *	flag.
 *
 * Steps, in order:
 *  - look the block up in bg_lists->main; requeue the job if it is
 *    gone or if a job is running on an overlapping block,
 *  - free every overlapping block,
 *  - wait until no other thread is freeing our block,
 *  - if the requested images/conn_type differ from the block's (or a
 *    reboot was requested) free the block and apply the changes,
 *  - boot the block if it is free, and sync the users if it is
 *    already initialized.
 *
 * block_state_mutex is taken and released inside this function; every
 * return path leaves it unlocked.
 */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;
	uint32_t req_job_id = bg_action_ptr->job_ptr->job_id;
	bool block_inited = 0;
	bool delete_it = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		/* There is no record to reset the modifying flag on;
		 * touching bg_record here would dereference NULL. */
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(req_job_id, 1, 0, JOB_BOOT_FAIL, false);
		return;
	}

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		bg_record->modifying = 0;
		// bg_reset_block(bg_record); should already happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	if ((bg_record->state == BG_BLOCK_TERM) || bg_record->free_cnt) {
		/* It doesn't appear state of a small block
		   (conn_type) is held on a BGP system so
		   if we to reset it so, just set the reboot flag and
		   handle it later in that code. */
		bg_action_ptr->reboot = 1;
	}

	/* Collect every block overlapping ours so it can be freed.  A
	 * job found running on one of them means we have to requeue. */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		if (bg_record == found_record)
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		if (found_record->job_ptr
		    || (found_record->job_list
			&& list_count(found_record->job_list))) {
			struct job_record *job_ptr = found_record->job_ptr;
			if (!found_record->job_ptr)
				job_ptr = find_job_in_bg_record(
					found_record, NO_VAL);
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld.  "
			      "This should never happen.",
			      req_job_id,
			      bg_record->bg_block_id,
			      job_ptr->job_id,
			      found_record->bg_block_id,
			      job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		FREE_NULL_LIST(delete_list);

		bg_reset_block(bg_record, bg_action_ptr->job_ptr);

		bg_record->modifying = 0;
		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(req_job_id, 0, 0, JOB_BOOT_FAIL, false);
		return;
	}

	slurm_mutex_unlock(&block_state_mutex);

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		delete_it = 1;
	free_block_list(req_job_id, delete_list, delete_it, 1);
	FREE_NULL_LIST(delete_list);

	while (1) {
		slurm_mutex_lock(&block_state_mutex);
		/* Failure will unlock block_state_mutex so no need to
		   unlock before return.  No need to reset modifying
		   here if the block doesn't exist.
		*/
		if (!_make_sure_block_still_exists(bg_action_ptr, bg_record)) {
			error("Problem with deallocating blocks to run job %u "
			      "on block %s", req_job_id,
			      bg_action_ptr->bg_block_id);
			return;
		}
		/* If another thread is freeing this block we need to
		   wait until it is done or we will get into a state
		   where this job will be killed.
		*/
		if (!bg_record->free_cnt)
			break;
		debug("Waiting for block %s to free for job %u.  "
		      "%d thread(s) trying to free it",
		      bg_record->bg_block_id, req_job_id,
		      bg_record->free_cnt);
		slurm_mutex_unlock(&block_state_mutex);
		sleep(1);
	}
	/* This was set in the start_job function to close the above
	   window where a job could be mistakenly requeued if another
	   thread is trying to free this block as we are trying to run
	   on it, which is fine since we will reboot it later.
	*/
	bg_record->modifying = 0;

	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		// bg_reset_block(bg_record); should already happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      req_job_id);
		return;
	}

	if (bg_record->job_list
	    && (bg_action_ptr->job_ptr->total_cpus != bg_record->cpu_cnt)
	    && (list_count(bg_record->job_list) != 1)) {
		/* We don't allow modification of a block or reboot of
		   a block if we are running multiple jobs on the
		   block.
		*/
		debug2("no reboot");
		goto no_reboot;
	}

	/* From here until the reboot check below rc is used as a flag:
	 * non-zero means the block's settings were changed and it has
	 * to be freed and modified. */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	   && xstrcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	   && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		if (bg_conf->slurm_debug_level >= LOG_LEVEL_DEBUG3) {
			char *req_conn_type =
				conn_type_string_full(bg_action_ptr->conn_type);
			char *conn_type =
				conn_type_string_full(bg_record->conn_type);
			debug3("changing small block mode from %s to %s",
			       conn_type, req_conn_type);
			xfree(req_conn_type);
			xfree(conn_type);
		}
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here
		 */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	   && xstrcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	   && xstrcasecmp(bg_action_ptr->ramdiskimage,
			 bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	   && xstrcasecmp(bg_action_ptr->mloaderimage,
			 bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't loose this
		 * block since bg_free_block will unlock block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));

#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can be
			   set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));

#endif
		bg_record->modifying = 0;
	}

no_reboot:
	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state.  This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			/* requeue_and_error() must be called with
			 * block_state_mutex unlocked */
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}


	if ((bg_record->job_running <= NO_JOB_RUNNING)
	    && !find_job_in_bg_record(bg_record, req_job_id)) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      req_job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	/* bg_record->boot_count = 0; */
	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is already ready.", bg_record->bg_block_id);
		/* Just in case reset the boot flags */
		bg_record->boot_state = 0;
		bg_record->boot_count = 0;
		set_user_rc = bridge_block_sync_users(bg_record);
		block_inited = 1;
	}
	slurm_mutex_unlock(&block_state_mutex);

	/* This lock needs to happen after the block_state_mutex to
	   avoid deadlock.
	*/
	if (block_inited && bg_action_ptr->job_ptr) {
		slurmctld_lock_t job_write_lock = {
			NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
		lock_slurmctld(job_write_lock);
		bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
		last_job_update = time(NULL);
		unlock_slurmctld(job_write_lock);
	}

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* wait for the slurmd to begin
		   the batch script, slurm_fail_job()
		   is a no-op if issued prior
		   to the script initiation do clean up just
		   in case the fail job isn't ran */
		(void) slurm_fail_job(req_job_id, JOB_BOOT_FAIL);
	}
}
예제 #9
0
파일: bg_job_run.c 프로젝트: Cray/slurm
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;

	bg_action_t *bg_action_ptr = NULL;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);
	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      bg_action_ptr->bg_block_id, job_ptr->job_id);
		_destroy_bg_action(bg_action_ptr);
		return SLURM_ERROR;
	}

	last_bg_update = time(NULL);

	if (bg_record->job_list) {
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = bg_action_ptr->job_ptr->job_id;
		bg_record->job_ptr = bg_action_ptr->job_ptr;
	}
	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just incase something happens to free this block before we
	   start the job we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
예제 #10
0
/*
 * Perform job initiation work for one queued start action.
 *
 * Sequence (block_state_mutex is taken and dropped by this function itself):
 *  1. Find the block named by bg_action_ptr->bg_block_id in bg_lists->main;
 *     requeue the job if the block is gone, and return quietly if the block
 *     no longer has a job running on it.
 *  2. Collect every other block in bg_lists->main that overlaps this one.
 *     If any of them has a job attached, reset this block and requeue the
 *     job; otherwise free the overlapping blocks (with the mutex released).
 *  3. Re-take the mutex, re-validate the block, and compare the images /
 *     connection type requested by the job with what the block currently
 *     has.  Any difference, or an explicit reboot request, frees the block
 *     and (on real hardware builds) pushes the new settings to the bridge.
 *  4. Boot the block if it is free, record the job's user as target_name,
 *     and if the block is already initialized set the block user and clear
 *     JOB_CONFIGURING on the job.
 *  5. If setting the user failed, fail the job and take the block off the
 *     job_running list.
 *
 * IN bg_action_ptr - action describing the job and the requested block
 *                    settings.  It is not freed anywhere in this function.
 */
static void _start_agent(bg_action_t *bg_action_ptr)
{
	int rc, set_user_rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_record_t *found_record = NULL;
	ListIterator itr;
	List delete_list = NULL;
	int requeue_job = 0;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);

	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("block %s not found in bg_lists->main",
		      bg_action_ptr->bg_block_id);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 1);
		return;
	}

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		// bg_reset_block(bg_record); should already have happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the queueing job "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}
	if (bg_record->state == BG_BLOCK_TERM) {
		debug("Block is in Deallocating state, waiting for free.");
		/* It doesn't appear the state of a small block
		   (conn_type) is held on a BGP system, so rather than
		   resetting it here just set the reboot flag and
		   handle it later in that code. */
		bg_action_ptr->reboot = 1;
	}

	/* Gather every other block that overlaps the one we want to use.
	 * delete_list only borrows the records (no destructor is given). */
	delete_list = list_create(NULL);
	itr = list_iterator_create(bg_lists->main);
	while ((found_record = list_next(itr))) {
		/* The NULL test is redundant with the loop condition. */
		if ((!found_record) || (bg_record == found_record))
			continue;

		if (!blocks_overlap(bg_record, found_record)) {
			debug2("block %s isn't part of %s",
			       found_record->bg_block_id,
			       bg_record->bg_block_id);
			continue;
		}

		/* An overlapping block with a job attached means we cannot
		 * use this block now; stop scanning and requeue below. */
		if (found_record->job_ptr) {
			error("Trying to start job %u on block %s, "
			      "but there is a job %u running on an overlapping "
			      "block %s it will not end until %ld.  "
			      "This should never happen.",
			      bg_action_ptr->job_ptr->job_id,
			      bg_record->bg_block_id,
			      found_record->job_ptr->job_id,
			      found_record->bg_block_id,
			      found_record->job_ptr->end_time);
			requeue_job = 1;
			break;
		}

		debug2("need to make sure %s is free, it's part of %s",
		       found_record->bg_block_id,
		       bg_record->bg_block_id);
		list_push(delete_list, found_record);
	}
	list_iterator_destroy(itr);

	if (requeue_job) {
		list_destroy(delete_list);

		bg_reset_block(bg_record);

		slurm_mutex_unlock(&block_state_mutex);
		bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	/* Freeing the overlapping blocks is done without holding the
	 * mutex. */
	slurm_mutex_unlock(&block_state_mutex);

	rc = free_block_list(bg_action_ptr->job_ptr->job_id, delete_list, 0, 1);
	list_destroy(delete_list);
	if (rc != SLURM_SUCCESS) {
		error("Problem with deallocating blocks to run job %u "
		      "on block %s", bg_action_ptr->job_ptr->job_id,
		      bg_action_ptr->bg_block_id);
		if (IS_JOB_CONFIGURING(bg_action_ptr->job_ptr))
			bg_requeue_job(bg_action_ptr->job_ptr->job_id, 0);
		return;
	}

	slurm_mutex_lock(&block_state_mutex);
	/* Failure will unlock block_state_mutex so no need to unlock
	   before return.
	*/
	if (!_make_sure_block_still_exists(bg_action_ptr, bg_record))
		return;

	if (bg_record->job_running <= NO_JOB_RUNNING) {
		// bg_reset_block(bg_record); should already have happened
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u already finished before boot",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* From here until the "rc || reboot" test below, rc is reused as a
	 * "block settings changed" flag rather than a return code. */
	rc = 0;
#ifdef HAVE_BGL
	if (bg_action_ptr->blrtsimage
	   && strcasecmp(bg_action_ptr->blrtsimage, bg_record->blrtsimage)) {
		debug3("changing BlrtsImage from %s to %s",
		       bg_record->blrtsimage, bg_action_ptr->blrtsimage);
		xfree(bg_record->blrtsimage);
		bg_record->blrtsimage = xstrdup(bg_action_ptr->blrtsimage);
		rc = 1;
	}
#elif defined HAVE_BGP
	if ((bg_action_ptr->conn_type[0] >= SELECT_SMALL)
	   && (bg_action_ptr->conn_type[0] != bg_record->conn_type[0])) {
		debug3("changing small block mode from %s to %s",
		       conn_type_string(bg_record->conn_type[0]),
		       conn_type_string(bg_action_ptr->conn_type[0]));
		rc = 1;
# ifndef HAVE_BG_FILES
		/* since we don't check state on an emulated system we
		 * have to change it here
		 */
		bg_record->conn_type[0] = bg_action_ptr->conn_type[0];
# endif
	}
#endif

#ifdef HAVE_BG_L_P
	if (bg_action_ptr->linuximage
	   && strcasecmp(bg_action_ptr->linuximage, bg_record->linuximage)) {
# ifdef HAVE_BGL
		debug3("changing LinuxImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# else
		debug3("changing CnloadImage from %s to %s",
		       bg_record->linuximage, bg_action_ptr->linuximage);
# endif
		xfree(bg_record->linuximage);
		bg_record->linuximage = xstrdup(bg_action_ptr->linuximage);
		rc = 1;
	}
	if (bg_action_ptr->ramdiskimage
	   && strcasecmp(bg_action_ptr->ramdiskimage,
			 bg_record->ramdiskimage)) {
# ifdef HAVE_BGL
		debug3("changing RamDiskImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# else
		debug3("changing IoloadImage from %s to %s",
		       bg_record->ramdiskimage, bg_action_ptr->ramdiskimage);
# endif
		xfree(bg_record->ramdiskimage);
		bg_record->ramdiskimage = xstrdup(bg_action_ptr->ramdiskimage);
		rc = 1;
	}
#endif
	if (bg_action_ptr->mloaderimage
	   && strcasecmp(bg_action_ptr->mloaderimage,
			 bg_record->mloaderimage)) {
		debug3("changing MloaderImage from %s to %s",
		       bg_record->mloaderimage, bg_action_ptr->mloaderimage);
		xfree(bg_record->mloaderimage);
		bg_record->mloaderimage = xstrdup(bg_action_ptr->mloaderimage);
		rc = 1;
	}

	/* Something changed, or a reboot was requested: free the block and
	 * push the new settings before booting it again. */
	if (rc || bg_action_ptr->reboot) {
		bg_record->modifying = 1;

		/* Increment free_cnt to make sure we don't lose this
		 * block since bg_free_block will unlock block_state_mutex.
		 */
		bg_record->free_cnt++;
		bg_free_block(bg_record, 1, 1);
		bg_record->free_cnt--;

#if defined HAVE_BG_FILES && defined HAVE_BG_L_P
#ifdef HAVE_BGL
		/* Failures of the bridge_block_modify() calls below are only
		 * logged; rc is overwritten by each call. */
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_BlrtsImg,
					      bg_record->blrtsimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_BlrtsImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_LinuxImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_LinuxImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_RamdiskImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_RamdiskImg): %s",
			      bg_err_str(rc));

#elif defined HAVE_BGP
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_CnloadImg,
					      bg_record->linuximage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_CnloadImg): %s",
			      bg_err_str(rc));

		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_IoloadImg,
					      bg_record->ramdiskimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_IoloadImg): %s",
			      bg_err_str(rc));

		if (bg_action_ptr->conn_type[0] > SELECT_SMALL) {
			/* Stays NULL if conn_type[0] matches none of the
			 * cases below. */
			char *conn_type = NULL;
			switch(bg_action_ptr->conn_type[0]) {
			case SELECT_HTC_S:
				conn_type = "s";
				break;
			case SELECT_HTC_D:
				conn_type = "d";
				break;
			case SELECT_HTC_V:
				conn_type = "v";
				break;
			case SELECT_HTC_L:
				conn_type = "l";
				break;
			default:
				break;
			}
			/* the option has to be set before the pool can be
			   set */
			if ((rc = bridge_block_modify(
				     bg_record->bg_block_id,
				     RM_MODIFY_Options,
				     conn_type)) != SLURM_SUCCESS)
				error("bridge_set_data(RM_MODIFY_Options): %s",
				      bg_err_str(rc));
		}
#endif
		if ((rc = bridge_block_modify(bg_record->bg_block_id,
					      RM_MODIFY_MloaderImg,
					      bg_record->mloaderimage))
		    != SLURM_SUCCESS)
			error("bridge_block_modify(RM_MODIFY_MloaderImg): %s",
			      bg_err_str(rc));

#endif
		bg_record->modifying = 0;
	}

	if (bg_record->state == BG_BLOCK_FREE) {
		if ((rc = bridge_block_boot(bg_record)) != SLURM_SUCCESS) {
			char reason[200];

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			if (rc == BG_ERROR_INVALID_STATE)
				snprintf(reason, sizeof(reason),
					 "Block %s is in an incompatible "
					 "state.  This usually means "
					 "hardware is allocated "
					 "by another block (maybe outside "
					 "of SLURM).",
					 bg_record->bg_block_id);
			else
				snprintf(reason, sizeof(reason),
					 "Couldn't boot block %s: %s",
					 bg_record->bg_block_id,
					 bg_err_str(rc));
			/* requeue_and_error() is called with the mutex
			 * released. */
			slurm_mutex_unlock(&block_state_mutex);
			requeue_and_error(bg_record, reason);
			return;
		}
	} else if (bg_record->state == BG_BLOCK_BOOTING) {
#ifdef HAVE_BG_FILES
		bg_record->boot_state = 1;
#else
		/* Without HAVE_BG_FILES the block is marked initialized
		 * right away. */
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
		bg_record->state = BG_BLOCK_INITED;
		last_bg_update = time(NULL);
#endif
	}


	if (bg_record->job_running <= NO_JOB_RUNNING) {
		slurm_mutex_unlock(&block_state_mutex);
		debug("job %u finished during the start of the boot "
		      "(everything is ok)",
		      bg_action_ptr->job_ptr->job_id);
		return;
	}

	/* Don't reset boot_count, it will be reset when state
	   changes, and needs to outlast a job allocation.
	*/
	/* bg_record->boot_count = 0; */
	xfree(bg_record->target_name);
	bg_record->target_name = uid_to_string(bg_action_ptr->job_ptr->user_id);
	debug("setting the target_name for Block %s to %s",
	      bg_record->bg_block_id, bg_record->target_name);

	if (bg_record->state == BG_BLOCK_INITED) {
		debug("block %s is ready.", bg_record->bg_block_id);
		set_user_rc = set_block_user(bg_record);
		if (bg_action_ptr->job_ptr) {
			bg_action_ptr->job_ptr->job_state &= (~JOB_CONFIGURING);
			last_job_update = time(NULL);
		}
	}
	slurm_mutex_unlock(&block_state_mutex);

	if (set_user_rc == SLURM_ERROR) {
		sleep(2);
		/* wait for the slurmd to begin
		   the batch script, slurm_fail_job()
		   is a no-op if issued prior
		   to the script initiation; do clean up just
		   in case the fail job isn't run */
		/* NOTE(review): bg_record->job_running is read here after
		 * the mutex was released above - confirm this is safe. */
		(void) slurm_fail_job(bg_record->job_running);
		slurm_mutex_lock(&block_state_mutex);
		if (remove_from_bg_list(bg_lists->job_running, bg_record)
		    == SLURM_SUCCESS)
			num_unused_cpus += bg_record->cpu_cnt;

		slurm_mutex_unlock(&block_state_mutex);
	}
}
예제 #11
0
/*
 * Update block user and reboot as needed.
 *
 * Looks the block up in bg_lists->main, attaches the job to it (job id, job
 * pointer, cpu counts, select jobinfo block pointer) and makes sure it is on
 * the job_running and booted lists.  If the block is already initialized the
 * job's JOB_CONFIGURING flag is cleared and, when the block's user differs
 * from the job's, the block user is reset; a failure there fails the job.
 * If the block is not initialized the action is passed to _start_agent().
 *
 * block_state_mutex is acquired and released inside this function.
 *
 * IN bg_action_ptr - action naming the block and carrying the job pointer.
 */
static void _sync_agent(bg_action_t *bg_action_ptr)
{
	struct job_record *job_ptr = bg_action_ptr->job_ptr;
	bg_record_t *bg_record = NULL;
	int set_user_rc = SLURM_SUCCESS;
	int fail_job_id = NO_JOB_RUNNING;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);
	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("No block %s", bg_action_ptr->bg_block_id);
		bg_requeue_job(job_ptr->job_id, 1);
		return;
	}

	last_bg_update = time(NULL);
	/* The job gets the whole block, so its cpu counts follow it. */
	job_ptr->total_cpus =
		job_ptr->details->min_cpus = bg_record->cpu_cnt;
	bg_record->job_running = job_ptr->job_id;
	bg_record->job_ptr = job_ptr;
	set_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_PTR,
			   bg_record);

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
		list_push(bg_lists->job_running, bg_record);
		num_unused_cpus -= bg_record->cpu_cnt;
	}
	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	if (bg_record->state != BG_BLOCK_INITED) {
		if (bg_record->state != BG_BLOCK_BOOTING) {
			error("Block %s isn't ready and isn't "
			      "being configured! Starting job again.",
			      bg_action_ptr->bg_block_id);
		} else {
			debug("Block %s is booting, job ok",
			      bg_action_ptr->bg_block_id);
		}
		/* _start_agent() takes the mutex itself. */
		slurm_mutex_unlock(&block_state_mutex);
		_start_agent(bg_action_ptr);
		return;
	}

	/* Block is up: the job is no longer waiting on configuration.
	 * job_ptr was already dereferenced above, so no NULL check. */
	job_ptr->job_state &= (~JOB_CONFIGURING);
	last_job_update = time(NULL);

	if (bg_record->user_uid != job_ptr->user_id) {
		debug("User isn't correct for job %u on %s, "
		      "fixing...",
		      job_ptr->job_id,
		      bg_action_ptr->bg_block_id);
		xfree(bg_record->target_name);
		bg_record->target_name = uid_to_string(job_ptr->user_id);
		set_user_rc = set_block_user(bg_record);
		/* Capture the job id while the mutex is still held;
		 * bg_record must not be read once it is released. */
		fail_job_id = bg_record->job_running;
	}
	slurm_mutex_unlock(&block_state_mutex);

	if (set_user_rc == SLURM_ERROR)
		(void) slurm_fail_job(fail_job_id);
}
예제 #12
0
/*
 * Apply a newly observed state to a block and drive any boot that is in
 * progress on it.
 *
 * The comparison against the current state ignores BG_BLOCK_ERROR_FLAG; if
 * the block already carries that flag it is preserved on the stored state.
 * After the state is recorded, the booted list and boot_state are adjusted
 * to match.  Then, if a boot is in progress (bg_record->boot_state set), the
 * JOB_CONFIGURING flag of the attached job(s) is set or cleared, a free
 * block is re-booted up to RETRY_BOOT_COUNT times, and an initialized block
 * has its users synced.
 *
 * IN bg_record     - block to update.
 * IN state         - state just observed for the block.
 * IN kill_job_list - if non-NULL, receives a kill_job_struct_t for the
 *                    block's running job when syncing users fails after the
 *                    block reaches the initialized state.
 * RET 1 if bg_record->state was changed, 0 if not.
 *
 * NOTE(review): the boot-failure path below unlocks and re-locks
 * block_state_mutex and the slurmctld job_read_lock, so the caller
 * presumably holds both on entry - confirm against callers.
 */
extern int bg_status_update_block_state(bg_record_t *bg_record,
					uint16_t state,
					List kill_job_list)
{
	bool skipped_dealloc = false;
	kill_job_struct_t *freeit = NULL;
	int updated = 0;
	/* Current state with the error flag masked off. */
	uint16_t real_state = bg_record->state & (~BG_BLOCK_ERROR_FLAG);

	if (real_state == state)
		return 0;

	debug("state of Block %s was %s and now is %s",
	      bg_record->bg_block_id,
	      bg_block_state_string(bg_record->state),
	      bg_block_state_string(state));

	/*
	  check to make sure block went
	  through freeing correctly
	*/
	if ((real_state != BG_BLOCK_TERM
	     && !(bg_record->state & BG_BLOCK_ERROR_FLAG))
	    && state == BG_BLOCK_FREE)
		skipped_dealloc = 1;
	else if ((real_state == BG_BLOCK_INITED)
		 && (state == BG_BLOCK_BOOTING)) {
		/* This means the user did a reboot through
		   mpirun but we missed the state
		   change */
		debug("Block %s skipped rebooting, "
		      "but it really is.",
		      bg_record->bg_block_id);
	} else if ((real_state == BG_BLOCK_TERM)
		   && (state == BG_BLOCK_BOOTING))
		/* This is a funky state IBM says
		   isn't a bug, but all their
		   documentation says this doesn't
		   happen, but IBM says oh yeah, you
		   weren't really supposed to notice
		   that. So we will just skip this
		   state and act like this didn't happen. */
		goto nochange_state;
	/* From here on real_state is the NEW state (without the error flag
	 * carried over from the block); on the goto path above it keeps the
	 * old value. */
	real_state = state;
	if (bg_record->state & BG_BLOCK_ERROR_FLAG)
		state |= BG_BLOCK_ERROR_FLAG;

	bg_record->state = state;

	if (real_state == BG_BLOCK_TERM || skipped_dealloc)
		_block_is_deallocating(bg_record, kill_job_list);
	else if (real_state == BG_BLOCK_BOOTING) {
		debug("Setting bootflag for %s", bg_record->bg_block_id);
		bg_record->boot_state = 1;
	} else if (real_state == BG_BLOCK_FREE) {
		/* Make sure block is cleaned up.  If there are
		 * running jobs on the block this happens when they
		 * are cleaned off. */
		if (bg_record->job_running == NO_JOB_RUNNING
		    && (!bg_record->job_list
			|| !list_count(bg_record->job_list)))
			bg_reset_block(bg_record, NULL);
		remove_from_bg_list(bg_lists->booted, bg_record);
	} else if (real_state & BG_BLOCK_ERROR_FLAG) {
		if (bg_record->boot_state)
			error("Block %s in an error state while booting.",
			      bg_record->bg_block_id);
		else
			error("Block %s in an error state.",
			      bg_record->bg_block_id);
		remove_from_bg_list(bg_lists->booted, bg_record);
		trigger_block_error();
	} else if (real_state == BG_BLOCK_INITED) {
		if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
			list_push(bg_lists->booted, bg_record);
	}
	updated = 1;
	last_bg_update = time(NULL);
nochange_state:

	/* check the boot state */
	debug3("boot state for block %s is %d",
	       bg_record->bg_block_id, bg_record->boot_state);
	if (bg_record->boot_state) {
		if (bg_record->state & BG_BLOCK_ERROR_FLAG) {
			/* If we get an error on boot that
			 * means it is a transparent L3 error
			 * and should be trying to fix
			 * itself.  If this is the case we
			 * just hang out waiting for the state
			 * to go to free where we will try to
			 * boot again below.
			 */
			return updated;
		}

		switch (real_state) {
		case BG_BLOCK_BOOTING:
			/* Mark the attached job(s) as configuring while the
			 * block boots. */
			if (bg_record->job_ptr
			    && !IS_JOB_CONFIGURING(bg_record->job_ptr)) {
				debug3("Setting job %u on block %s "
				       "to configuring",
				       bg_record->job_ptr->job_id,
				       bg_record->bg_block_id);
				bg_record->job_ptr->job_state |=
					JOB_CONFIGURING;
				last_job_update = time(NULL);
			} else if (bg_record->job_list
				   && list_count(bg_record->job_list)) {
				struct job_record *job_ptr;
				ListIterator job_itr =
					list_iterator_create(
						bg_record->job_list);
				while ((job_ptr = list_next(job_itr))) {
					/* Drop entries that no longer look
					 * like valid job records. */
					if (job_ptr->magic != JOB_MAGIC) {
						error("bg_status_update_"
						      "block_state: 1 "
						      "bad magic found when "
						      "looking at block %s",
						      bg_record->bg_block_id);
						list_delete_item(job_itr);
						continue;
					}
					job_ptr->job_state |= JOB_CONFIGURING;
				}
				list_iterator_destroy(job_itr);
				last_job_update = time(NULL);
			}
			break;
		case BG_BLOCK_FREE:
			/* The block went free while a boot was in progress:
			 * retry the boot until the retry limit is hit. */
			if (bg_record->boot_count < RETRY_BOOT_COUNT) {
				bridge_block_boot(bg_record);

				/* Only touch the record if it still looks
				 * valid after the boot call. */
				if (bg_record->magic == BLOCK_MAGIC) {
					debug("boot count for block %s is %d",
					      bg_record->bg_block_id,
					      bg_record->boot_count);
					bg_record->boot_count++;
				}
			} else {
				char *reason = (char *)
					"status_check: Boot fails ";

				error("Couldn't boot Block %s",
				      bg_record->bg_block_id);

				/* We can't push on the kill_job_list
				   here since we have to put this
				   block in an error and that means
				   the killing has to take place
				   before the erroring of the block.
				*/
				slurm_mutex_unlock(&block_state_mutex);
				unlock_slurmctld(job_read_lock);
				requeue_and_error(bg_record, reason);
				lock_slurmctld(job_read_lock);
				slurm_mutex_lock(&block_state_mutex);

				/* NOTE(review): bg_record is written here
				 * without re-checking its magic after the
				 * locks were dropped - confirm it cannot be
				 * destroyed in between. */
				bg_record->boot_state = 0;
				bg_record->boot_count = 0;

				remove_from_bg_list(bg_lists->booted,
						    bg_record);
			}
			break;
		case BG_BLOCK_INITED:
			debug("block %s is ready.",
			      bg_record->bg_block_id);
			/* Boot finished: the attached job(s) are no longer
			 * configuring. */
			if (bg_record->job_ptr
			    && IS_JOB_CONFIGURING(bg_record->job_ptr)) {
				bg_record->job_ptr->job_state &=
					(~JOB_CONFIGURING);
				last_job_update = time(NULL);
			} else if (bg_record->job_list
				   && list_count(bg_record->job_list)) {
				struct job_record *job_ptr;
				ListIterator job_itr =
					list_iterator_create(
						bg_record->job_list);
				while ((job_ptr = list_next(job_itr))) {
					if (job_ptr->magic != JOB_MAGIC) {
						error("bg_status_update_"
						      "block_state: 2 "
						      "bad magic found when "
						      "looking at block %s",
						      bg_record->bg_block_id);
						list_delete_item(job_itr);
						continue;
					}
					job_ptr->job_state &=
						(~JOB_CONFIGURING);
				}
				list_iterator_destroy(job_itr);
				last_job_update = time(NULL);
			}

			bg_record->boot_state = 0;
			bg_record->boot_count = 0;

			/* Users are only synced when the caller supplied a
			 * list to report the failed job on. */
			if (kill_job_list &&
			    bridge_block_sync_users(bg_record)
			    == SLURM_ERROR) {
				freeit = (kill_job_struct_t *)
					xmalloc(sizeof(kill_job_struct_t));
				freeit->jobid = bg_record->job_running;
				list_push(kill_job_list, freeit);
			}
			break;
		case BG_BLOCK_TERM:
			debug2("Block %s is in a deallocating state "
			       "during a boot.  Doing nothing until "
			       "free state.",
			       bg_record->bg_block_id);
			break;
		case BG_BLOCK_REBOOTING:
			debug2("Block %s is rebooting.",
			       bg_record->bg_block_id);
			break;
		default:
			debug("Hey the state of block "
			      "%s is %d(%s) doing nothing.",
			      bg_record->bg_block_id,
			      real_state,
			      bg_block_state_string(bg_record->state));
			break;
		}
	}

	return updated;
}
예제 #13
0
/*
 * Put a block into the error state so that nothing is scheduled on it.
 *
 * block_state_mutex must be unlocked before calling this: the function
 * first polls (sleeping one second per pass) until no job is running on the
 * block, and only then takes the mutex itself.
 *
 * Once locked, the block is added to the job_running and booted lists,
 * job_running is set to BLOCK_ERROR_STATE, BG_BLOCK_ERROR_FLAG is OR-ed
 * into the state, and the block's user/target names are reset to the
 * configured slurm user.  The block's stored reason is replaced with a copy
 * of `reason`.
 *
 * IN bg_record - block to mark; must not be NULL.
 * IN reason    - text explaining the error.  May be NULL, in which case
 *                node accounting is not updated.
 * RET SLURM_ERROR if the block was destroyed while waiting for its job to
 *     finish or is no longer in bg_lists->main, SLURM_SUCCESS otherwise.
 */
extern int put_block_in_error_state(bg_record_t *bg_record, char *reason)
{
	uid_t pw_uid;
	char *new_reason = NULL;
	int have_reason = (reason != NULL);

	xassert(bg_record);

	/* Only check this if the blocks are created, meaning this
	   isn't at startup.
	*/
	if (blocks_are_created) {
		/* Since we are putting this block in an error state we need
		   to wait for the job to be removed.  We don't really
		   need to free the block though since we may just
		   want it to be in an error state for some reason. */
		while (bg_record->job_running > NO_JOB_RUNNING) {
			if (bg_record->magic != BLOCK_MAGIC) {
				error("While putting block %s in a error "
				      "state it was destroyed",
				      bg_record->bg_block_id);
				return SLURM_ERROR;
			}
			debug2("block %s is still running job %d",
			       bg_record->bg_block_id, bg_record->job_running);
			sleep(1);
		}
	}

	slurm_mutex_lock(&block_state_mutex);
	if (!block_ptr_exist_in_list(bg_lists->main, bg_record)) {
		slurm_mutex_unlock(&block_state_mutex);
		error("while trying to put block in "
		      "error state it disappeared");
		return SLURM_ERROR;
	}

	info("Setting Block %s to ERROR state. (reason: '%s')",
	     bg_record->bg_block_id, reason);
	/* we add the block to these lists so we don't try to schedule
	   on them. */
	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record)) {
		list_push(bg_lists->job_running, bg_record);
		num_unused_cpus -= bg_record->cpu_cnt;
	}
	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);

	bg_record->job_running = BLOCK_ERROR_STATE;
	bg_record->state |= BG_BLOCK_ERROR_FLAG;

	xfree(bg_record->user_name);
	xfree(bg_record->target_name);
	bg_record->user_name = xstrdup(bg_conf->slurm_user_name);
	bg_record->target_name = xstrdup(bg_conf->slurm_user_name);

	/* Release any previously stored reason so it is not leaked.  The
	 * copy is made before the old string is freed in case the caller
	 * passed bg_record->reason itself. */
	new_reason = xstrdup(reason);
	xfree(bg_record->reason);
	bg_record->reason = new_reason;

	if (uid_from_string (bg_record->user_name, &pw_uid) < 0)
		error("No such user: %s", bg_record->user_name);
	else
		bg_record->user_uid = pw_uid;

	/* Only send if reason is set.  If it isn't set then
	   accounting should already know about this error state.
	   The block's own copy is passed since `reason` may have aliased
	   the string freed above. */
	if (have_reason)
		_set_block_nodes_accounting(bg_record, bg_record->reason);
	slurm_mutex_unlock(&block_state_mutex);

	trigger_block_error();
	return SLURM_SUCCESS;
}