Example No. 1
/*
 * Perform any work required to terminate a job
 * job_ptr IN - pointer to the job being terminated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd terminating
 * the job. Ensure that this function, mpirun, and the epilog can
 * all deal with termination race conditions.
 */
int term_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_action_t *bg_action_ptr = NULL;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = TERM_OP;
	bg_action_ptr->job_ptr = job_ptr;
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	info("Queue termination of job %u in BG block %s",
	     job_ptr->job_id, bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);

	return rc;
}
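
term_job() does not terminate anything inline: it packages the request into a bg_action_t and hands it to _block_op() for asynchronous processing. A minimal, self-contained sketch of that allocate/fill/queue pattern follows; action_t, queue_op(), and the opcode value are illustrative stand-ins, not SLURM's types.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
	int op;			/* stands in for a TERM_OP-style opcode */
	unsigned job_id;
	char block_id[16];
} action_t;

/* Stands in for _block_op(): consume and free the queued action. */
static void queue_op(action_t *a)
{
	printf("queued op %d for job %u on block %s\n",
	       a->op, a->job_id, a->block_id);
	free(a);
}

int main(void)
{
	action_t *a = calloc(1, sizeof(*a));	/* xmalloc() also zeroes */

	a->op = 1;				/* "terminate" opcode */
	a->job_id = 42;
	strncpy(a->block_id, "RMP0", sizeof(a->block_id) - 1);
	queue_op(a);
	return 0;
}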
Example No. 2
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block ownership
 * changes were queued.
 */
extern int sync_jobs(List job_list)
{
	ListIterator itr;
	struct job_record  *job_ptr = NULL;
	List block_list = NULL, kill_list = NULL;
	static bool run_already = false;
	bg_record_t *bg_record = NULL;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}
	slurm_mutex_lock(&block_state_mutex);
	/* Ensure that all running jobs own the specified block */
	itr = list_iterator_create(job_list);
	while ((job_ptr = list_next(itr))) {
		bg_action_t *bg_action_ptr = NULL;
		if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_COMPLETING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		if (IS_JOB_COMPLETING(job_ptr))
			bg_action_ptr->op = TERM_OP;
		else
			bg_action_ptr->op = START_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
		} else if (!(bg_record = find_bg_record_in_list(
				     bg_lists->main,
				     bg_action_ptr->bg_block_id))) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
		}

		if (!bg_record) {
			/* Can't fail it just now, we have locks in
			   place. */
			bg_status_add_job_kill_list(job_ptr, &kill_list);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}
		/* _sync_agent will destroy the bg_action_ptr */
		_sync_agent(bg_action_ptr, bg_record);
	}
	list_iterator_destroy(itr);

	block_list = list_create(destroy_bg_record);
	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		bg_record_t *rm_record;
		if (bg_record->job_ptr
		    || (bg_record->job_list
			&& list_count(bg_record->job_list)))
			continue;
		rm_record = xmalloc(sizeof(bg_record_t));
		rm_record->magic = BLOCK_MAGIC;
		rm_record->bg_block_id = xstrdup(bg_record->bg_block_id);
		rm_record->mp_str = xstrdup(bg_record->mp_str);
		list_append(block_list, rm_record);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	if (kill_list) {
		/* slurmctld is already locked up, so handle this right after
		 * the unlock of block_state_mutex.
		 */
		bg_status_process_kill_job_list(kill_list, JOB_BOOT_FAIL, 1);
		FREE_NULL_LIST(kill_list);
	}

	/* Ensure that all other blocks are free of users */
	if (block_list) {
		itr = list_iterator_create(block_list);
		while ((bg_record = list_next(itr))) {
			info("Queue clearing of users of BG block %s",
			     bg_record->bg_block_id);
			term_jobs_on_block(bg_record->bg_block_id);
		}
		list_iterator_destroy(itr);
		FREE_NULL_LIST(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
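
Note how sync_jobs() only builds the kill list while block_state_mutex is held and processes it after the unlock, per the comment above. A self-contained sketch of that deferred-action pattern, with a plain pthread mutex standing in for slurm_mutex_* and the work items purely illustrative:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	int pending[8];
	int n_pending = 0;

	/* Phase 1: under the state lock, only record what needs doing. */
	pthread_mutex_lock(&state_mutex);
	for (int i = 0; i < 3; i++)
		pending[n_pending++] = i;
	pthread_mutex_unlock(&state_mutex);

	/* Phase 2: the state lock is dropped, so it is now safe to take
	 * other locks (e.g. slurmctld's job locks) while processing. */
	for (int i = 0; i < n_pending; i++)
		printf("processing deferred item %d\n", pending[i]);
	return 0;
}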
Example No. 3
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;
	bg_action_t *bg_action_ptr = NULL;
	select_jobinfo_t *jobinfo = job_ptr->select_jobinfo->data;

	slurm_mutex_lock(&block_state_mutex);
	bg_record = jobinfo->bg_record;

	if (!bg_record || !block_ptr_exist_in_list(bg_lists->main, bg_record)) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      jobinfo->bg_block_id, job_ptr->job_id);
		return SLURM_ERROR;
	}

	if ((jobinfo->conn_type[0] != SELECT_NAV)
	    && (jobinfo->conn_type[0] < SELECT_SMALL)) {
		int dim;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			jobinfo->conn_type[dim] = bg_record->conn_type[dim];
	}

	/* If it isn't 0 then it was set up previously (sub-block). */
	if (jobinfo->geometry[SYSTEM_DIMENSIONS] == 0)
		memcpy(jobinfo->geometry, bg_record->geo,
		       sizeof(bg_record->geo));

	if (bg_record->job_list) {
		/* Mark the ba_mp cnodes as used now. */
		ba_mp_t *ba_mp = list_peek(bg_record->ba_mp_list);
		xassert(ba_mp);
		xassert(ba_mp->cnode_bitmap);
		bit_or(ba_mp->cnode_bitmap, jobinfo->units_avail);
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = job_ptr->job_id;
		bg_record->job_ptr = job_ptr;
	}

	job_ptr->job_state |= JOB_CONFIGURING;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	/* FIXME: The below get_select_jobinfo calls could be avoided
	 * by just using the jobinfo as we do above.
	 */
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just in case something happens to free this block before we
	   start the job, we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	last_bg_update = time(NULL);

	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
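
Every image field above follows the same fallback rule: if the job supplied no image, duplicate the configured default and write it back into the job's select data. A self-contained sketch of that pattern; image_or_default() and the default name are illustrative, and strdup() stands in for xstrdup().

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the job's value if set, else a fresh copy of the default. */
static char *image_or_default(char *value, const char *def)
{
	return value ? value : strdup(def);
}

int main(void)
{
	char *mloaderimage = NULL;	/* job supplied no image */

	mloaderimage = image_or_default(mloaderimage, "default_mloader");
	/* The real code would also write this back with
	 * set_select_jobinfo(..., SELECT_JOBDATA_MLOADER_IMAGE, ...). */
	printf("using image %s\n", mloaderimage);
	free(mloaderimage);
	return 0;
}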
Example No. 4
/*
 * Find the best match for a given job request.
 *
 * OUT found_bg_record - pointer to the matched block, NULL otherwise
 * RET - SLURM_SUCCESS if a match was found, SLURM_ERROR otherwise
 */
static int _find_best_block_match(List block_list,
				  int *blocks_added,
				  struct job_record* job_ptr,
				  bitstr_t* slurm_block_bitmap,
				  uint32_t min_nodes, uint32_t max_nodes,
				  uint32_t req_nodes,
				  bg_record_t** found_bg_record,
				  uint16_t query_mode, int avail_cpus)
{
	bg_record_t *bg_record = NULL;
	uint16_t req_geometry[SYSTEM_DIMENSIONS];
	uint16_t target_size = 0;
	uint32_t req_procs = job_ptr->details->min_cpus;
	select_ba_request_t request;
	int i, dim;
	int overlap_check = 0;
	int allow = 0;
	int check_image = 1;
	uint32_t max_cpus = job_ptr->details->max_cpus;
	char tmp_char[256];
	static int total_cpus = 0;
	int rc = SLURM_SUCCESS;
	int create_try = 0;
	List overlapped_list = NULL;
	bool is_test = SELECT_IS_TEST(query_mode);

	if (!total_cpus) {
		int *cluster_dims = select_g_ba_get_dims();
		total_cpus = 1;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			total_cpus *= cluster_dims[dim];
		total_cpus *= bg_conf->cpus_per_mp;
	}

	if (req_nodes > max_nodes) {
		error("can't run this job max mps is %u asking for %u",
		      max_nodes, req_nodes);
		return SLURM_ERROR;
	}

	if (!is_test && (req_procs > avail_cpus)) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("asking for %u I only have %d",
			     req_procs, avail_cpus);
		return SLURM_ERROR;
	}

	if (!block_list) {
		error("_find_best_block_match: There is no block_list");
		return SLURM_ERROR;
	}

	memset(&request, 0, sizeof(select_ba_request_t));

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &request.conn_type);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_GEOMETRY, &req_geometry);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_ROTATE, &request.rotate);

	if ((rc = _check_images(job_ptr, &request)) == SLURM_ERROR)
		goto end_it;

	if (req_geometry[0] != 0 && req_geometry[0] != (uint16_t)NO_VAL) {
		char tmp_geo[SYSTEM_DIMENSIONS+1];

		target_size = 1;
		for (i=0; i<SYSTEM_DIMENSIONS; i++) {
			target_size *= req_geometry[i];
			tmp_geo[i] = alpha_num[req_geometry[i]];
		}
		tmp_geo[i] = '\0';

		if (target_size != min_nodes) {
			debug2("min_nodes not set correctly %u "
			       "should be %u from %s",
			       min_nodes, target_size,
			       tmp_geo);
			min_nodes = target_size;
		}
		if (!req_nodes)
			req_nodes = min_nodes;
	} else {
		req_geometry[0] = (uint16_t)NO_VAL;
		target_size = min_nodes;
	}

	*found_bg_record = NULL;
	allow = 0;

	memcpy(request.geometry, req_geometry, sizeof(req_geometry));

	request.deny_pass = (uint16_t)NO_VAL;
	request.save_name = NULL;
	request.size = target_size;
	request.procs = req_procs;
	request.elongate = request.rotate;
	/* request.start[0] = 1; */
	/* request.start[1] = 2; */
	/* request.start[2] = 0; */
	/* request.start[3] = 2; */
	/* request.start_req = 1; */

	if (job_ptr->details->req_node_bitmap)
		request.avail_mp_bitmap = job_ptr->details->req_node_bitmap;
	else
		request.avail_mp_bitmap = slurm_block_bitmap;

	/* Since we only look at procs after this and not nodes, we
	 * need to set max_cpus if it wasn't given.
	 */
	if (max_cpus == (uint32_t)NO_VAL)
		max_cpus = max_nodes * bg_conf->cpus_per_mp;

	while (1) {
		/* Here we are creating a list of all the blocks that
		 * have overlapped jobs, so if we don't find one that
		 * works we can look and see the earliest time the
		 * job can start.  This doesn't apply to Dynamic mode.
		 */
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)
		    && bg_conf->layout_mode != LAYOUT_DYNAMIC)
			overlapped_list = list_create(NULL);

		bg_record = _find_matching_block(block_list,
						 job_ptr,
						 slurm_block_bitmap,
						 &request,
						 max_cpus,
						 &allow, check_image,
						 overlap_check,
						 overlapped_list,
						 query_mode);
		/* this could get altered in _find_matching_block so we
		   need to reset it */
		memcpy(request.geometry, req_geometry, sizeof(req_geometry));

		if (!bg_record && overlapped_list
		    && list_count(overlapped_list)) {
			ListIterator itr =
				list_iterator_create(overlapped_list);
			bg_record_t *tmp_rec = NULL;
			while ((tmp_rec = list_next(itr))) {
				if (!bg_record ||
				    (tmp_rec->job_ptr->end_time <
				     bg_record->job_ptr->end_time))
					bg_record = tmp_rec;
			}
			list_iterator_destroy(itr);
		}

		if (overlapped_list)
			list_destroy(overlapped_list);

		/* set the bitmap and do other allocation activities */
		if (bg_record) {
#ifdef HAVE_BG_L_P
			if (!is_test) {
				if (bridge_block_check_mp_states(
					    bg_record->bg_block_id, 1)
				    != SLURM_SUCCESS) {
					/* bridge_block_check_mp_states
					   will set this block in the
					   main list to an error state,
					   but we aren't looking at the
					   main list, so we need to set
					   this copy of the block to an
					   error state as well.
					*/
					bg_record->job_running =
						BLOCK_ERROR_STATE;
					bg_record->state |= BG_BLOCK_ERROR_FLAG;
					error("_find_best_block_match: Picked "
					      "block (%s) had some issues with "
					      "hardware, trying a different "
					      "one.",
					      bg_record->bg_block_id);
					continue;
				}
			}
#endif
			format_node_name(bg_record, tmp_char, sizeof(tmp_char));

			debug("_find_best_block_match %s <%s>",
			      bg_record->bg_block_id, tmp_char);
			bit_and(slurm_block_bitmap, bg_record->mp_bitmap);
			rc = SLURM_SUCCESS;
			*found_bg_record = bg_record;
			goto end_it;
		}

		/* see if we can just reset the image and reboot the block */
		if (allow) {
			check_image = 0;
			allow = 0;
			continue;
		}

		check_image = 1;

		/* All of these checks assume that bg_record is NULL. */

		if (bg_conf->layout_mode == LAYOUT_OVERLAP
		    && !is_test && overlap_check < 2) {
			overlap_check++;
			continue;
		}

		if (create_try || bg_conf->layout_mode != LAYOUT_DYNAMIC)
			goto no_match;

		if ((rc = _dynamically_request(block_list, blocks_added,
					       &request,
					       job_ptr->details->req_nodes,
					       query_mode))
		    == SLURM_SUCCESS) {
			create_try = 1;
			continue;
		}

		/* Only look at the full system if we aren't going to
		   preempt jobs and look again later.
		*/
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)) {
			List new_blocks = NULL;
			List job_list = list_create(NULL);
			ListIterator itr = NULL;
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("trying with empty machine");

			/* Here we need to make sure the blocks in the
			   job list are those in the block list, so go
			   through, grab them, and add them to a
			   separate list.
			*/
			itr = list_iterator_create(block_list);
			while ((bg_record = list_next(itr))) {
				if (bg_record->job_running != NO_JOB_RUNNING)
					list_append(job_list, bg_record);
				/* Since the error blocks are at the
				   end we only really need to look at
				   the first one to make sure it will
				   work, so don't add more than one to
				   the job list.
				   We do need to check for at least
				   one error block because that lets
				   us know not to hold up the entire
				   machine for a job that won't run
				   until the error is removed which
				   could be a very long time.
				*/
				if (bg_record->job_running == BLOCK_ERROR_STATE)
					break;
			}
			list_iterator_destroy(itr);

			/* The block list is already in the correct order,
			   earliest available first, so the job list
			   will also be.  No need to sort. */
			while (1) {
				bool track_down_nodes = true;

				if ((bg_record = list_pop(job_list))) {
					if (bg_record->job_ptr) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("taking off "
							     "%d(%s) started "
							     "at %ld ends "
							     "at %ld",
							     bg_record->
							     job_running,
							     bg_record->
							     bg_block_id,
							     bg_record->
							     job_ptr->
							     start_time,
							     bg_record->
							     job_ptr->
							     end_time);
						/* Mark the block as
						   not running a job,
						   this should
						   correspond to the
						   pointer in the
						   block_list.  We
						   only look at the
						   job_running var so
						   don't remove the
						   job_ptr.
						*/
						bg_record->job_running =
							NO_JOB_RUNNING;
					} else if ((bg_record->job_running
						    == BLOCK_ERROR_STATE)
						   && (bg_conf->
						       slurm_debug_flags
						       & DEBUG_FLAG_BG_PICK))
						info("taking off (%s) "
						     "which is in an "
						     "error state",
						     bg_record->bg_block_id);
				} else
					/* This means we didn't have
					   any jobs to take off
					   anymore so we are making
					   sure we can look at every
					   node on the system.
					*/
					track_down_nodes = false;

				if (!(new_blocks = create_dynamic_block(
					      block_list, &request, job_list,
					      track_down_nodes))) {
					if (errno == ESLURM_INTERCONNECT_FAILURE
					    || !list_count(job_list)) {
						char *nodes;
						if (slurmctld_conf.
						    slurmctld_debug < 5)
							break;
						nodes = bitmap2node_name(
							slurm_block_bitmap);
						debug("job %u not "
						      "runable on %s",
						      job_ptr->job_id,
						      nodes);
						xfree(nodes);
						break;
					}
					continue;
				}
				rc = SLURM_SUCCESS;
				/* outside of the job_test_list this
				 * gets destroyed later, so don't worry
				 * about it now
				 */
				(*found_bg_record) = list_pop(new_blocks);
				if (!(*found_bg_record)) {
					list_destroy(new_blocks);
					if (!bg_record) {
						/* This should never happen */
						error("got an empty list back");
						rc = SLURM_ERROR;
						break;
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Appears we are trying "
						     "to place this job on "
						     "the block we just "
						     "removed %s.",
						     bg_record->bg_block_id);
					/* This means we placed the job on
					   the block we just popped off.
					*/
					bit_and(slurm_block_bitmap,
						bg_record->mp_bitmap);
					*found_bg_record = bg_record;
					break;
				}
				bit_and(slurm_block_bitmap,
					(*found_bg_record)->mp_bitmap);

				if (bg_record) {
					(*found_bg_record)->job_running =
						bg_record->job_running;
					(*found_bg_record)->job_ptr
						= bg_record->job_ptr;
				}
				list_destroy(new_blocks);
				break;
			}

			list_destroy(job_list);

			goto end_it;
		} else {
			break;
		}
	}

no_match:
	debug("_find_best_block_match none found");
	rc = SLURM_ERROR;

end_it:

	xfree(request.blrtsimage);
	xfree(request.linuximage);
	xfree(request.mloaderimage);
	xfree(request.ramdiskimage);

	return rc;
}
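
When an explicit geometry is requested, target_size is the product of the requested extent in each dimension, which is why a mismatched min_nodes gets overwritten above. A self-contained sketch of that computation; the dimension count and geometry values are illustrative, since SYSTEM_DIMENSIONS varies by BlueGene generation.

#include <stdio.h>
#include <stdint.h>

#define DIMS 4	/* illustrative stand-in for SYSTEM_DIMENSIONS */

int main(void)
{
	uint16_t req_geometry[DIMS] = {2, 2, 2, 1};
	uint32_t target_size = 1;

	/* The target block size is the product over all dimensions. */
	for (int i = 0; i < DIMS; i++)
		target_size *= req_geometry[i];

	printf("geometry 2x2x2x1 maps to %u midplanes\n", target_size);
	return 0;
}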
Example No. 5
static int _check_images(struct job_record* job_ptr,
			 select_ba_request_t *request)
{
	int allow = 0;

#ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE, &request->blrtsimage);

	if (request->blrtsimage) {
		allow = _test_image_perms(request->blrtsimage,
					  bg_conf->blrts_list,
					  job_ptr);
		if (!allow) {
			error("User %u:%u is not allowed to use BlrtsImage %s",
			      job_ptr->user_id, job_ptr->group_id,
			      request->blrtsimage);
			return SLURM_ERROR;

		}
	}
#endif

#ifdef HAVE_BG_L_P
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE, &request->linuximage);
	if (request->linuximage) {
		allow = _test_image_perms(request->linuximage,
					  bg_conf->linux_list,
					  job_ptr);
		if (!allow) {
			error("User %u:%u is not allowed to use LinuxImage %s",
			      job_ptr->user_id, job_ptr->group_id,
			      request->linuximage);
			return SLURM_ERROR;
		}
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &request->ramdiskimage);
	if (request->ramdiskimage) {
		allow = _test_image_perms(request->ramdiskimage,
					  bg_conf->ramdisk_list,
					  job_ptr);
		if (!allow) {
			error("User %u:%u is not allowed "
			      "to use RamDiskImage %s",
			      job_ptr->user_id, job_ptr->group_id,
			      request->ramdiskimage);
			return SLURM_ERROR;
		}
	}
#endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &request->mloaderimage);
	if (request->mloaderimage) {
		allow = _test_image_perms(request->mloaderimage,
					  bg_conf->mloader_list,
					  job_ptr);
		if (!allow) {
			error("User %u:%u is not allowed "
			      "to use MloaderImage %s",
			      job_ptr->user_id, job_ptr->group_id,
			      request->mloaderimage);
			return SLURM_ERROR;
		}
	}

	return SLURM_SUCCESS;
}
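
Each image type goes through the same gate: fetch the job's requested image and, if one was set, verify that the user may use it via _test_image_perms(). That helper's internals are not shown here, so the self-contained sketch below is only a hypothetical stand-in that matches the image name against an allow-list with a "*" wildcard; the real helper consults the configured image lists (bg_conf->*_list).

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for _test_image_perms(). */
static int test_image_perms(const char *image, const char **allowed, int n)
{
	for (int i = 0; i < n; i++)
		if (!strcmp(allowed[i], image) || !strcmp(allowed[i], "*"))
			return 1;	/* permitted */
	return 0;			/* denied */
}

int main(void)
{
	const char *allowed[] = { "default_mloader", "*" };

	printf("allow=%d\n", test_image_perms("custom_mloader", allowed, 2));
	return 0;
}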
Example No. 6
/*
 * Try to find resources for a given job request
 * IN job_ptr - pointer to job record in slurmctld
 * IN/OUT bitmap - nodes available for assignment to job, clear those not to
 *	be used
 * IN min_nodes, max_nodes  - minimum and maximum number of nodes to allocate
 *	to this job (considers slurm block limits)
 * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
 *           SELECT_MODE_TEST_ONLY: test if job can ever run
 *           SELECT_MODE_WILL_RUN: determine when and where job can run
 * IN preemptee_candidates - List of pointers to jobs which can be preempted.
 * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
 *		jobs to be preempted to initiate the pending job. Not set
 *		if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL.
 * RET - SLURM_SUCCESS if job runnable now, error code otherwise
 */
extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
		      uint32_t min_nodes, uint32_t max_nodes,
		      uint32_t req_nodes, uint16_t mode,
		      List preemptee_candidates,
		      List *preemptee_job_list)
{
	int rc = SLURM_SUCCESS;
	bg_record_t* bg_record = NULL;
	char buf[256];
	uint16_t conn_type[SYSTEM_DIMENSIONS];
	List block_list = NULL;
	int blocks_added = 0;
	time_t starttime = time(NULL);
	uint16_t local_mode = mode;
	int avail_cpus = num_unused_cpus;
	int dim = 0;

	for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
		conn_type[dim] = (uint16_t)NO_VAL;
	if (preemptee_candidates && preemptee_job_list
	    && list_count(preemptee_candidates))
		local_mode |= SELECT_MODE_PREEMPT_FLAG;
	else
		local_mode |= SELECT_MODE_CHECK_FULL;

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		slurm_mutex_lock(&create_dynamic_mutex);

	slurm_mutex_lock(&block_state_mutex);
	block_list = copy_bg_list(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &conn_type);
	if (conn_type[0] == SELECT_NAV) {
		if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)
			conn_type[0] = SELECT_SMALL;
		else if (min_nodes > 1) {
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				conn_type[dim] = SELECT_TORUS;
		} else if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
			conn_type[0] = SELECT_SMALL;
		else {
			for (dim=1; dim<SYSTEM_DIMENSIONS; dim++)
				conn_type[dim] = SELECT_NAV;
		}
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &conn_type);
	}

	if (slurm_block_bitmap && !bit_set_count(slurm_block_bitmap)) {
		error("no nodes given to place job %u.", job_ptr->job_id);

		if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
			slurm_mutex_unlock(&create_dynamic_mutex);

		return SLURM_ERROR;
	}

	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_MIXED);

	debug("bluegene:submit_job: %u mode=%d %s nodes=%u-%u-%u",
	      job_ptr->job_id, local_mode, buf,
	      min_nodes, req_nodes, max_nodes);

#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_BLRTS_IMAGE);
	debug3("BlrtsImage=%s", buf);
# endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_LINUX_IMAGE);
# ifdef HAVE_BGL
	debug3("LinuxImage=%s", buf);
# else
	debug3("ComputNodeImage=%s", buf);
# endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_RAMDISK_IMAGE);
# ifdef HAVE_BGL
	debug3("RamDiskImage=%s", buf);
# else
	debug3("RamDiskIoLoadImage=%s", buf);
# endif
#endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_MLOADER_IMAGE);
	debug3("MloaderImage=%s", buf);

	/* First look at the empty space, and then remove the
	   preemptable jobs and try again. */
	list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc);

	rc = _find_best_block_match(block_list, &blocks_added,
				    job_ptr, slurm_block_bitmap, min_nodes,
				    max_nodes, req_nodes,
				    &bg_record, local_mode, avail_cpus);

	if (rc == SLURM_SUCCESS && SELECT_IS_PREEMPT_SET(local_mode)) {
		ListIterator itr;
		ListIterator job_itr;
		bg_record_t *found_record;
		struct job_record *preempt_job_ptr;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("doing preemption");
		local_mode |= SELECT_MODE_CHECK_FULL;

		job_itr = list_iterator_create(preemptee_candidates);
		itr = list_iterator_create(block_list);
		while ((preempt_job_ptr = list_next(job_itr))) {
			while ((found_record = list_next(itr))) {
				if (found_record->job_ptr == preempt_job_ptr) {
					/* info("removing job %u running on %s", */
					/*      preempt_job_ptr->job_id, */
					/*      found_record->bg_block_id); */
					found_record->job_ptr = NULL;
					found_record->job_running =
						NO_JOB_RUNNING;
					avail_cpus += found_record->cpu_cnt;
					break;
				}
			}
			if (!found_record) {
				list_iterator_reset(itr);
				error("Job %u wasn't found running anywhere, "
				      "can't preempt",
				      preempt_job_ptr->job_id);
				continue;
			} else if (job_ptr->details->min_cpus > avail_cpus)
				continue;

			list_sort(block_list,
				  (ListCmpF)bg_record_sort_aval_inc);
			if ((rc = _find_best_block_match(
				     block_list, &blocks_added,
				     job_ptr, slurm_block_bitmap,
				     min_nodes, max_nodes, req_nodes,
				     &bg_record, local_mode, avail_cpus))
			    == SLURM_SUCCESS)
				break;

			list_iterator_reset(itr);
		}
		list_iterator_destroy(itr);
		list_iterator_destroy(job_itr);
	}

	if (rc == SLURM_SUCCESS) {
		if (!bg_record)
			fatal("we got a success, but no block back");
		/* Here we check whether a job is running on the block.
		 * Since some jobs take a while to finish, we need to
		 * make sure the end time is in the future.  If it isn't
		 * (meaning it is in the past or the current time), we
		 * add 5 seconds to it so we don't use the block
		 * immediately.
		 */
		if (bg_record->job_ptr
		    && bg_record->job_ptr->end_time) {
			if (bg_record->job_ptr->end_time <= starttime)
				starttime += 5;
			else
				starttime = bg_record->job_ptr->end_time;
		} else if (bg_record->job_running == BLOCK_ERROR_STATE)
			starttime = INFINITE;

		/* make sure the job is eligible to run */
		if (job_ptr->details->begin_time > starttime)
			starttime = job_ptr->details->begin_time;

		job_ptr->start_time = starttime;

		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_NODES,
				   bg_record->mp_str);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_IONODES,
				   bg_record->ionode_str);
		if (!bg_record->bg_block_id) {
			debug("%d can start unassigned job %u "
			      "at %ld on %s",
			      local_mode, job_ptr->job_id,
			      starttime, bg_record->mp_str);

			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_BLOCK_PTR,
					   NULL);
			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &bg_record->cnode_cnt);
		} else {
			if ((bg_record->ionode_str)
			    && (job_ptr->part_ptr->max_share <= 1))
				error("Small block used in "
				      "non-shared partition");

			debug("%d(%d) can start job %u "
			      "at %ld on %s(%s) %d",
			      local_mode, mode, job_ptr->job_id,
			      starttime, bg_record->bg_block_id,
			      bg_record->mp_str,
			      SELECT_IS_MODE_RUN_NOW(local_mode));

			if (SELECT_IS_MODE_RUN_NOW(local_mode)) {
				/* Set this up to be the
				   correct pointer since we
				   probably are working off a
				   copy.
				*/
				if (bg_record->original)
					bg_record = bg_record->original;
				set_select_jobinfo(
					job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_BLOCK_PTR,
					bg_record);
				if (job_ptr) {
					bg_record->job_running =
						job_ptr->job_id;
					bg_record->job_ptr = job_ptr;

					job_ptr->job_state |= JOB_CONFIGURING;
					last_bg_update = time(NULL);
				}
			} else {
				set_select_jobinfo(
					job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_BLOCK_PTR,
					NULL);
				/* Just to make sure we don't
				   end up using this on
				   another job, or have to
				   wait until preemption is
				   done.
				*/
				bg_record->job_ptr = NULL;
				bg_record->job_running = NO_JOB_RUNNING;
			}

			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &bg_record->cnode_cnt);
		}
		if (SELECT_IS_MODE_RUN_NOW(local_mode))
			_build_select_struct(job_ptr,
					     slurm_block_bitmap,
					     bg_record->cnode_cnt);
		/* set up the preempted job list */
		if (SELECT_IS_PREEMPT_SET(local_mode)) {
			if (*preemptee_job_list)
				list_destroy(*preemptee_job_list);
			*preemptee_job_list = _get_preemptables(
				local_mode, bg_record,
				preemptee_candidates);
		}
		if (!bg_record->bg_block_id) {
			/* This is a fake record, so we need to
			 * destroy it after we get the info from
			 * it.  If we were just testing, then we
			 * added this record to the block_list;
			 * in that case it would be handled when
			 * we sync the lists.  But we don't want
			 * that, so set blocks_added to 0 so it
			 * doesn't happen. */
			if (!blocks_added) {
				destroy_bg_record(bg_record);
				bg_record = NULL;
			}
			blocks_added = 0;
		}
		last_job_update = time(NULL);
	}

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
		slurm_mutex_lock(&block_state_mutex);
		if (blocks_added)
			_sync_block_lists(block_list, bg_lists->main);
		slurm_mutex_unlock(&block_state_mutex);
		slurm_mutex_unlock(&create_dynamic_mutex);
	}

	list_destroy(block_list);
	return rc;
}
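
submit_job() composes its query mode from bit flags: the caller's mode is OR'd with SELECT_MODE_PREEMPT_FLAG when preemption candidates exist, otherwise with SELECT_MODE_CHECK_FULL. A self-contained sketch of that flag composition; the flag values below are made up for illustration and are not SLURM's actual encodings.

#include <stdio.h>
#include <stdint.h>

#define MODE_RUN_NOW		0x0001
#define MODE_PREEMPT_FLAG	0x0100
#define MODE_CHECK_FULL		0x0200

int main(void)
{
	uint16_t local_mode = MODE_RUN_NOW;
	int have_preemptees = 1;

	/* Same shape as submit_job(): set the preempt flag if candidates
	 * exist, otherwise check against the full set of blocks. */
	if (have_preemptees)
		local_mode |= MODE_PREEMPT_FLAG;
	else
		local_mode |= MODE_CHECK_FULL;

	printf("local_mode=0x%04x preempt=%d\n",
	       local_mode, !!(local_mode & MODE_PREEMPT_FLAG));
	return 0;
}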
Example No. 7
/*
 * Perform any setup required to initiate a job
 * job_ptr IN - pointer to the job being initiated
 * RET - SLURM_SUCCESS or an error code
 *
 * NOTE: This happens in parallel with srun and slurmd spawning
 * the job. A prolog script is expected to defer initiation of
 * the job script until the BG block is available for use.
 */
extern int start_job(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	bg_record_t *bg_record = NULL;

	bg_action_t *bg_action_ptr = NULL;

	bg_action_ptr = xmalloc(sizeof(bg_action_t));
	bg_action_ptr->op = START_OP;
	bg_action_ptr->job_ptr = job_ptr;

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLOCK_ID,
			   &(bg_action_ptr->bg_block_id));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_REBOOT,
			   &(bg_action_ptr->reboot));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_MLOADER_IMAGE,
			   &(bg_action_ptr->mloaderimage));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_BLRTS_IMAGE,
			   &(bg_action_ptr->blrtsimage));
	if (!bg_action_ptr->blrtsimage) {
		bg_action_ptr->blrtsimage =
			xstrdup(bg_conf->default_blrtsimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   bg_action_ptr->blrtsimage);
	}
# elif defined HAVE_BGP
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE,
			   &(bg_action_ptr->conn_type));
# endif
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_LINUX_IMAGE,
			   &(bg_action_ptr->linuximage));
	if (!bg_action_ptr->linuximage) {
		bg_action_ptr->linuximage =
			xstrdup(bg_conf->default_linuximage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   bg_action_ptr->linuximage);
	}

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_RAMDISK_IMAGE,
			   &(bg_action_ptr->ramdiskimage));
	if (!bg_action_ptr->ramdiskimage) {
		bg_action_ptr->ramdiskimage =
			xstrdup(bg_conf->default_ramdiskimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   bg_action_ptr->ramdiskimage);
	}

#endif
	if (!bg_action_ptr->mloaderimage) {
		bg_action_ptr->mloaderimage =
			xstrdup(bg_conf->default_mloaderimage);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   bg_action_ptr->mloaderimage);
	}

	slurm_mutex_lock(&block_state_mutex);
	bg_record = find_bg_record_in_list(bg_lists->main,
					   bg_action_ptr->bg_block_id);
	if (!bg_record) {
		slurm_mutex_unlock(&block_state_mutex);
		error("bg_record %s doesn't exist, requested for job (%d)",
		      bg_action_ptr->bg_block_id, job_ptr->job_id);
		_destroy_bg_action(bg_action_ptr);
		return SLURM_ERROR;
	}

	last_bg_update = time(NULL);

	if (bg_record->job_list) {
		if (!find_job_in_bg_record(bg_record, job_ptr->job_id))
			list_append(bg_record->job_list, job_ptr);
	} else {
		bg_record->job_running = bg_action_ptr->job_ptr->job_id;
		bg_record->job_ptr = bg_action_ptr->job_ptr;
	}
	num_unused_cpus -= job_ptr->total_cpus;

	if (!block_ptr_exist_in_list(bg_lists->job_running, bg_record))
		list_push(bg_lists->job_running, bg_record);

	if (!block_ptr_exist_in_list(bg_lists->booted, bg_record))
		list_push(bg_lists->booted, bg_record);
	/* Just in case something happens to free this block before we
	   start the job, we will make it so this job doesn't get blown
	   away.
	*/
	bg_record->modifying = 1;
	slurm_mutex_unlock(&block_state_mutex);

	info("Queue start of job %u in BG block %s",
	     job_ptr->job_id,
	     bg_action_ptr->bg_block_id);
	_block_op(bg_action_ptr);
	return rc;
}
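
This variant of start_job() builds the action first and only then looks up the block under block_state_mutex, so the error path must both drop the lock and free the half-built action. A self-contained sketch of that lookup-or-bail cleanup pattern; pthreads and the helper names are stand-ins, and the lookup is simulated to always fail.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t state_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Simulated lookup that never finds the record. */
static void *find_record(const char *id)
{
	(void) id;
	return NULL;
}

int main(void)
{
	char *action = malloc(16);	/* stands in for the bg_action_t */

	pthread_mutex_lock(&state_mutex);
	if (!find_record("RMP0")) {
		/* Error path: unlock first, then free the half-built
		 * action, as start_job() does via _destroy_bg_action(). */
		pthread_mutex_unlock(&state_mutex);
		free(action);
		fprintf(stderr, "block RMP0 doesn't exist\n");
		return 1;
	}
	pthread_mutex_unlock(&state_mutex);
	free(action);
	return 0;
}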
Example No. 8
/*
 * Synchronize BG block state to that of currently active jobs.
 * This can recover from slurmctld crashes when block ownership
 * changes were queued.
 */
extern int sync_jobs(List job_list)
{
	ListIterator job_iterator;
	struct job_record  *job_ptr = NULL;
	bg_action_t *bg_action_ptr = NULL;
	List block_list = NULL;
	static bool run_already = false;

	/* Execute only on initial startup. We don't support bgblock
	 * creation on demand today, so there is no need to re-sync data. */
	if (run_already)
		return SLURM_SUCCESS;
	run_already = true;

	if (!job_list) {
		error("sync_jobs: no job_list");
		return SLURM_ERROR;
	}
	/* Ensure that all running jobs own the specified block */
	block_list = _get_all_allocated_blocks();
	job_iterator = list_iterator_create(job_list);
	while ((job_ptr = list_next(job_iterator))) {
		bool good_block = true;
		if (!IS_JOB_RUNNING(job_ptr))
			continue;

		bg_action_ptr = xmalloc(sizeof(bg_action_t));
		bg_action_ptr->op = SYNC_OP;
		bg_action_ptr->job_ptr = job_ptr;

		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLOCK_ID,
				   &(bg_action_ptr->bg_block_id));
#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_BLRTS_IMAGE,
				   &(bg_action_ptr->blrtsimage));
# else
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &(bg_action_ptr->conn_type));
# endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_LINUX_IMAGE,
				   &(bg_action_ptr->linuximage));
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_RAMDISK_IMAGE,
				   &(bg_action_ptr->ramdiskimage));
#endif
		get_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_MLOADER_IMAGE,
				   &(bg_action_ptr->mloaderimage));

		if (bg_action_ptr->bg_block_id == NULL) {
			error("Running job %u has bgblock==NULL",
			      job_ptr->job_id);
			good_block = false;
		} else if (job_ptr->nodes == NULL) {
			error("Running job %u has nodes==NULL",
			      job_ptr->job_id);
			good_block = false;
		} else if (_excise_block(block_list,
					 bg_action_ptr->bg_block_id,
					 job_ptr->nodes)
			   != SLURM_SUCCESS) {
			error("Kill job %u belongs to defunct "
			      "bgblock %s",
			      job_ptr->job_id,
			      bg_action_ptr->bg_block_id);
			good_block = false;
		}
		if (!good_block) {
			job_ptr->job_state = JOB_FAILED
				| JOB_COMPLETING;
			job_ptr->end_time = time(NULL);
			last_job_update = time(NULL);
			_destroy_bg_action(bg_action_ptr);
			continue;
		}

		debug3("Queue sync of job %u in BG block %s "
		       "ending at %ld",
		       job_ptr->job_id,
		       bg_action_ptr->bg_block_id,
		       job_ptr->end_time);
		_block_op(bg_action_ptr);
	}
	list_iterator_destroy(job_iterator);

	/* Ensure that all other blocks are free of users */
	if (block_list) {
		bridge_reset_block_list(block_list);
		list_destroy(block_list);
	} else {
		/* this should never happen,
		 * vestigial logic */
		error("sync_jobs: no block_list");
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
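
Both sync_jobs() variants rely on the same run-once guard: a function-scoped static flag that turns every call after the first into a no-op. A self-contained sketch of the guard; as in the original, it assumes the first call happens during single-threaded startup, since the flag itself is not protected by a lock.

#include <stdio.h>
#include <stdbool.h>

static int sync_once(void)
{
	static bool run_already = false;

	if (run_already)
		return 0;	/* every later call is a no-op */
	run_already = true;
	printf("performing one-time startup sync\n");
	return 0;
}

int main(void)
{
	sync_once();	/* does the work */
	sync_once();	/* skipped */
	return 0;
}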