Example #1
0
/* Reset a job's time limit (and end_time) as high as possible
 *	within the range job_ptr->time_min and job_ptr->time_limit.
 *	Avoid using resources reserved for pending jobs or in resource
 *	reservations */
static void _reset_job_time_limit(struct job_record *job_ptr, time_t now,
				  node_space_map_t *node_space)
{
	int32_t j, resv_delay;
	uint32_t orig_time_limit = job_ptr->time_limit;

	for (j=0; ; ) {
		if ((node_space[j].begin_time != now) &&
		    (node_space[j].begin_time < job_ptr->end_time) &&
		    (!bit_super_set(job_ptr->node_bitmap,
				    node_space[j].avail_bitmap))) {
			/* Job overlaps pending job's resource reservation */
			resv_delay = difftime(node_space[j].begin_time, now);
			resv_delay /= 60;	/* seconds to minutes */
			if (resv_delay < job_ptr->time_limit)
				job_ptr->time_limit = resv_delay;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}
	job_ptr->time_limit = MAX(job_ptr->time_min, job_ptr->time_limit);
	job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60);

	job_time_adj_resv(job_ptr);

	if (orig_time_limit != job_ptr->time_limit) {
		info("backfill: job %u time limit changed from %u to %u",
		     job_ptr->job_id, orig_time_limit, job_ptr->time_limit);
	}
}
Example #2
0
/*
 * Determine if the resource specification for a new job overlaps with a
 *	reservation that the backfill scheduler has made for a job to be
 *	started in the future.
 * IN use_bitmap - nodes to be allocated
 * IN start_time - start time of job
 * IN end_reserve - end time of job
 */
static bool _test_resv_overlap(node_space_map_t *node_space,
			       bitstr_t *use_bitmap, uint32_t start_time,
			       uint32_t end_reserve)
{
	bool overlap = false;
	int j;

	for (j=0; ; ) {
		if ((node_space[j].end_time   > start_time) &&
		    (node_space[j].begin_time < end_reserve) &&
		    (!bit_super_set(use_bitmap, node_space[j].avail_bitmap))) {
			overlap = true;
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}
	return overlap;
}
Example #3
0
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int sched_timeout = 2, yield_sleep = 1;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_yields = 0;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	while ((job_queue_rec = (job_queue_rec_t *)
				list_pop_bottom(job_queue, sort_job_queue2))) {
		job_ptr  = job_queue_rec->job_ptr;
		orig_time_limit = job_ptr->time_limit;

		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}

		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != (uint16_t) NO_VAL) {
			if (reject_array_job_id == job_ptr->array_job_id)
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
		 	continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks 2"
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res   = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL)
					job_ptr->time_limit = comp_time_limit;
				else
					job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happended. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
Example #4
0
/*
 * Attempt to start a job
 * jobid     (IN) - job id
 * task_cnt  (IN) - total count of tasks to start
 * hostlist  (IN) - SLURM hostlist expression with no repeated hostnames
 * tasklist  (IN/OUT) - comma separated list of hosts with tasks to be started,
 *                  list hostname once per task to start
 * comment_ptr (IN) - new comment field for the job or NULL for no change
 * err_code (OUT) - Moab error code
 * err_msg  (OUT) - Moab error message
 */
static int	_start_job(uint32_t jobid, int task_cnt, char *hostlist,
			char *tasklist, char *comment_ptr,
			int *err_code, char **err_msg)
{
	int rc = 0, old_task_cnt = 1;
	struct job_record *job_ptr;
	/* Write lock on job info, read lock on node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
	char *new_node_list = NULL;
	static char tmp_msg[128];
	bitstr_t *new_bitmap = (bitstr_t *) NULL;
	bitstr_t *save_req_bitmap = (bitstr_t *) NULL;
	bitoff_t i, bsize;
	int ll; /* layout info index */
	char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL;
	size_t node_name_len;
	static uint32_t cr_test = 0, cr_enabled = 0;

	if (cr_test == 0) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
						&cr_enabled);
		cr_test = 1;
	}

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		rc = -1;
		goto fini;
	}

	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "Job not pending, can't start";
		error("wiki: Attempt to start job %u in state %s",
			jobid, job_state_string(job_ptr->job_state));
		rc = -1;
		goto fini;
	}

	if (comment_ptr) {
		char *reserved = strstr(comment_ptr, "RESERVED:");
		if (reserved) {
			reserved += 9;
			job_ptr->details->reserved_resources =
				strtol(reserved, NULL, 10);
		}
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
	}

	if (task_cnt) {
		new_node_list = xstrdup(hostlist);
		if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) {
			*err_code = -700;
			*err_msg = "Invalid TASKLIST";
			error("wiki: Attempt to set invalid node list for "
				"job %u, %s",
				jobid, hostlist);
			xfree(new_node_list);
			rc = -1;
			goto fini;
		}

		if (!bit_super_set(new_bitmap, avail_node_bitmap)) {
			/* Selected node is UP and not responding
			 * or it just went DOWN */
			*err_code = -700;
			*err_msg = "TASKLIST includes non-responsive node";
			error("wiki: Attempt to use non-responsive nodes for "
				"job %u, %s",
				jobid, hostlist);
			xfree(new_node_list);
			FREE_NULL_BITMAP(new_bitmap);
			rc = -1;
			goto fini;
		}

		/* User excluded node list incompatible with Wiki
		 * Exclude all nodes not explicitly requested */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}

	/* Build layout information from tasklist (assuming that Moab
	 * sends a non-bracketed list of nodes, repeated as many times
	 * as cpus should be used per node); at this point, node names
	 * are comma-separated. This is _not_ a fast algorithm as it
	 * performs many string compares. */
	xfree(job_ptr->details->req_node_layout);
	if (task_cnt && cr_enabled) {
		uint16_t cpus_per_task = MAX(1, job_ptr->details->cpus_per_task);
		job_ptr->details->req_node_layout = (uint16_t *)
			xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t));
		bsize = bit_size(new_bitmap);
		for (i = 0, ll = -1; i < bsize; i++) {
			if (!bit_test(new_bitmap, i))
				continue;
			ll++;
			node_name = node_record_table_ptr[i].name;
			node_name_len  = strlen(node_name);
			if (node_name_len == 0)
				continue;
			node_cur = tasklist;
			while (*node_cur) {
				if ((node_idx = strstr(node_cur, node_name))) {
					if ((node_idx[node_name_len] == ',') ||
				 	    (node_idx[node_name_len] == '\0')) {
						job_ptr->details->
							req_node_layout[ll] +=
							cpus_per_task;
					}
					node_cur = strchr(node_idx, ',');
					if (node_cur)
						continue;
				}
				break;
			}
		}
	}

	/* save and update job state to start now */
	save_req_nodes = job_ptr->details->req_nodes;
	job_ptr->details->req_nodes = new_node_list;
	save_req_bitmap = job_ptr->details->req_node_bitmap;
	job_ptr->details->req_node_bitmap = new_bitmap;
	old_task_cnt = job_ptr->details->min_cpus;
	job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt);
	job_ptr->priority = 100000000;

 fini:	unlock_slurmctld(job_write_lock);
	if (rc)
		return rc;

	/* No errors so far */
	(void) schedule(INFINITE);	/* provides own locking */

	/* Check to insure the job was actually started */
	lock_slurmctld(job_write_lock);
	if (job_ptr->job_id != jobid)
		job_ptr = find_job_record(jobid);

	if (job_ptr && (job_ptr->job_id == jobid) &&
	    (!IS_JOB_RUNNING(job_ptr))) {
		uint16_t wait_reason = 0;
		char *wait_string;

		if (IS_JOB_FAILED(job_ptr))
			wait_string = "Invalid request, job aborted";
		else {
			wait_reason = job_ptr->state_reason;
			if (wait_reason == WAIT_HELD) {
				/* some job is completing, slurmctld did
				 * not even try to schedule this job */
				wait_reason = WAIT_RESOURCES;
			}
			wait_string = job_reason_string(wait_reason);
			job_ptr->state_reason = WAIT_HELD;
			xfree(job_ptr->state_desc);
		}
		*err_code = -910 - wait_reason;
		snprintf(tmp_msg, sizeof(tmp_msg),
			"Could not start job %u(%s): %s",
			jobid, new_node_list, wait_string);
		*err_msg = tmp_msg;
		error("wiki: %s", tmp_msg);

		/* restore some of job state */
		job_ptr->priority = 0;
		job_ptr->details->min_cpus = old_task_cnt;
		rc = -1;
	}

	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) {
		/* Restore required node list in case job requeued */
		xfree(job_ptr->details->req_nodes);
		job_ptr->details->req_nodes = save_req_nodes;
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		job_ptr->details->req_node_bitmap = save_req_bitmap;
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		xfree(job_ptr->details->req_node_layout);
	} else {
		error("wiki: start_job(%u) job missing", jobid);
		xfree(save_req_nodes);
		FREE_NULL_BITMAP(save_req_bitmap);
	}

	unlock_slurmctld(job_write_lock);
	schedule_node_save();	/* provides own locking */
	schedule_job_save();	/* provides own locking */
	return rc;
}
Example #5
0
static int _breakup_blocks(List block_list, List new_blocks,
			   select_ba_request_t *request, List my_block_list,
			   bool only_free, bool only_small)
{
	int rc = SLURM_ERROR;
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL, bit_itr = NULL;
	int total_cnode_cnt=0;
	char start_char[SYSTEM_DIMENSIONS+1];
	bitstr_t *ionodes = bit_alloc(bg_conf->ionodes_per_mp);
	int cnodes = request->procs / bg_conf->cpu_ratio;
	int curr_mp_bit = -1;
	int dim;

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("cpu_count=%d cnodes=%d o_free=%d o_small=%d",
		     request->procs, cnodes, only_free, only_small);

	switch(cnodes) {
	case 16:
		/* a 16 can go anywhere */
		break;
	case 32:
		bit_itr = list_iterator_create(bg_lists->valid_small32);
		break;
	case 64:
		bit_itr = list_iterator_create(bg_lists->valid_small64);
		break;
	case 128:
		bit_itr = list_iterator_create(bg_lists->valid_small128);
		break;
	case 256:
		bit_itr = list_iterator_create(bg_lists->valid_small256);
		break;
	default:
		error("We shouldn't be here with this size %d", cnodes);
		goto finished;
		break;
	}

	/* First try with free blocks a midplane or less.  Then try with the
	 * smallest blocks.
	 */
	itr = list_iterator_create(block_list);
	while ((bg_record = list_next(itr))) {
		if (bg_record->free_cnt) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("%s being freed by other job(s), skipping",
				     bg_record->bg_block_id);
			continue;
		}
		/* never look at a block if a job is running */
		if (bg_record->job_running != NO_JOB_RUNNING)
			continue;
		/* on the third time through look for just a block
		 * that isn't used */

		/* check for free blocks on the first and second time */
		if (only_free && (bg_record->state != BG_BLOCK_FREE))
			continue;

		/* check small blocks first */
		if (only_small
		    && (bg_record->cnode_cnt > bg_conf->mp_cnode_cnt))
			continue;

		if (request->avail_mp_bitmap &&
		    !bit_super_set(bg_record->bitmap,
				   request->avail_mp_bitmap)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s has nodes not usable "
				     "by this job",
				     bg_record->bg_block_id);
			continue;
		}

		if (bg_record->cnode_cnt == cnodes) {
			ba_mp_t *ba_mp = NULL;
			if (bg_record->ba_mp_list)
				ba_mp = list_peek(bg_record->ba_mp_list);
			if (!ba_mp) {
				for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
					start_char[dim] = alpha_num[
						bg_record->start[dim]];
				start_char[dim] = '\0';
				request->save_name = xstrdup(start_char);
			} else
				request->save_name = xstrdup(ba_mp->coord_str);

			rc = SLURM_SUCCESS;
			goto finished;
		}
		/* lets see if we can combine some small ones */
		if (bg_record->cnode_cnt < cnodes) {
			char bitstring[BITSIZE];
			bitstr_t *bitstr = NULL;
			int num_over = 0;
			int num_cnodes = bg_record->cnode_cnt;
			int rec_mp_bit = bit_ffs(bg_record->bitmap);

			if (curr_mp_bit != rec_mp_bit) {
				/* Got a different node than
				 * previously, since the list should
				 * be in order of nodes for small blocks
				 * just clear here since the last node
				 * doesn't have any more. */
				curr_mp_bit = rec_mp_bit;
				bit_nclear(ionodes, 0,
					   (bg_conf->ionodes_per_mp-1));
				total_cnode_cnt = 0;
			}

			/* On really busy systems we can get
			   overlapping blocks here.  If that is the
			   case only add that which doesn't overlap.
			*/
			if ((num_over = bit_overlap(
				     ionodes, bg_record->ionode_bitmap))) {
				/* Since the smallest block size is
				   the number of cnodes in an io node,
				   just multiply the num_over by that to
				   get the number of cnodes to remove.
				*/
				if ((num_cnodes -=
				     num_over * bg_conf->smallest_block) <= 0)
					continue;
			}
			bit_or(ionodes, bg_record->ionode_bitmap);

			/* check and see if the bits set are a valid
			   combo */
			if (bit_itr) {
				while ((bitstr = list_next(bit_itr))) {
					if (bit_super_set(ionodes, bitstr))
						break;
				}
				list_iterator_reset(bit_itr);
			}
			if (!bitstr) {
				bit_nclear(ionodes, 0,
					   (bg_conf->ionodes_per_mp-1));
				bit_or(ionodes, bg_record->ionode_bitmap);
				total_cnode_cnt = num_cnodes =
					bg_record->cnode_cnt;
			} else
				total_cnode_cnt += num_cnodes;

			bit_fmt(bitstring, BITSIZE, ionodes);
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("combine adding %s %s %d got %d set "
				     "ionodes %s total is %s",
				     bg_record->bg_block_id, bg_record->mp_str,
				     num_cnodes, total_cnode_cnt,
				     bg_record->ionode_str, bitstring);
			if (total_cnode_cnt == cnodes) {
				ba_mp_t *ba_mp = NULL;
				if (bg_record->ba_mp_list)
					ba_mp = list_peek(
						bg_record->ba_mp_list);
				if (!ba_mp) {
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++)
						start_char[dim] = alpha_num[
							bg_record->start[dim]];
					start_char[dim] = '\0';
					request->save_name =
						xstrdup(start_char);
				} else
					request->save_name =
						xstrdup(ba_mp->coord_str);

				if (!my_block_list) {
					rc = SLURM_SUCCESS;
					goto finished;
				}

				bg_record = create_small_record(bg_record,
								ionodes,
								cnodes);
				list_append(new_blocks, bg_record);

				rc = SLURM_SUCCESS;
				goto finished;
			}
			continue;
		}
		/* we found a block that is bigger than requested */
		break;
	}

	if (bg_record) {
		ba_mp_t *ba_mp = NULL;
		if (bg_record->ba_mp_list)
			ba_mp = list_peek(bg_record->ba_mp_list);
		if (!ba_mp) {
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				start_char[dim] = alpha_num[
					bg_record->start[dim]];
			start_char[dim] = '\0';
			request->save_name = xstrdup(start_char);
		} else
			request->save_name = xstrdup(ba_mp->coord_str);
		/* It appears we don't need this original record
		 * anymore, just work off the copy if indeed it is a copy. */

		/* bg_record_t *found_record = NULL; */

		/* if (bg_record->original) { */
		/* 	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) */
		/* 		info("1 This was a copy %s", */
		/* 		     bg_record->bg_block_id); */
		/* 	found_record = bg_record->original; */
		/* } else { */
		/* 	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) */
		/* 		info("looking for original"); */
		/* 	found_record = find_org_in_bg_list( */
		/* 		bg_lists->main, bg_record); */
		/* } */
		if ((bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		    && bg_record->original
		    && (bg_record->original->magic != BLOCK_MAGIC)) {
			info("This record %s has bad magic, it must be "
			     "getting freed.  No worries it will all be "
			     "figured out later.",
			     bg_record->bg_block_id);
		}

		/* if (!found_record || found_record->magic != BLOCK_MAGIC) { */
		/* 	error("this record wasn't found in the list!"); */
		/* 	rc = SLURM_ERROR; */
		/* 	goto finished; */
		/* } */

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
			char tmp_char[256];
			format_node_name(bg_record, tmp_char,
					 sizeof(tmp_char));
			info("going to split %s, %s",
			     bg_record->bg_block_id,
			     tmp_char);
		}

		if (!my_block_list) {
			rc = SLURM_SUCCESS;
			goto finished;
		}
		_split_block(block_list, new_blocks, bg_record, cnodes);
		rc = SLURM_SUCCESS;
		goto finished;
	}

finished:
	if (bit_itr)
		list_iterator_destroy(bit_itr);

	FREE_NULL_BITMAP(ionodes);
	if (itr)
		list_iterator_destroy(itr);

	return rc;
}
Example #6
0
static char *	_will_run_test2(uint32_t jobid, time_t start_time,
				char *node_list,
				uint32_t *preemptee, int preemptee_cnt,
				int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL, *pre_ptr;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t start_res;
	uint32_t min_nodes, max_nodes, req_nodes;
	List preemptee_candidates = NULL, preempted_jobs = NULL;
	time_t orig_start_time;
	char *reply_msg = NULL;
	int i, rc;
	bool resv_overlap = false;

	xassert(node_list);
	debug2("wiki2: will_run2 job_id=%u start_time=%u node_list=%s",
		jobid, (uint32_t)start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node "
		      "list for job %u, %s", jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap, &resv_overlap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
			part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available "
			   "on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partitions max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (preemptee_cnt) {
		preemptee_candidates = list_create(NULL);
		for (i=0; i<preemptee_cnt; i++) {
			if ((pre_ptr = find_job_record(preemptee[i])))
				list_append(preemptee_candidates, pre_ptr);
		}
	}

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes,
			       req_nodes, SELECT_MODE_WILL_RUN,
			       preemptee_candidates, &preempted_jobs,
			       exc_core_bitmap);
	FREE_NULL_LIST(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char *hostlist, *sep, tmp_str[128];
		uint32_t pre_cnt = 0, proc_cnt = 0;

#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				     SELECT_JOBDATA_NODE_CNT, &proc_cnt);
#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str),
			 "STARTINFO=%u TASKS=%u STARTTIME=%u NODES=",
			 job_ptr->job_id, proc_cnt,
			 (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);

		if (preempted_jobs) {
			while ((pre_ptr = list_pop(preempted_jobs))) {
				if (pre_cnt++)
					sep = ",";
				else
					sep = " PREEMPT=";
				snprintf(tmp_str, sizeof(tmp_str), "%s%u",
					 sep, pre_ptr->job_id);
				xstrcat(reply_msg, tmp_str);
			}
			FREE_NULL_LIST(preempted_jobs);
		}
	} else {
		xstrcat(reply_msg, "Jobs not runable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;

	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	return reply_msg;
}
Example #7
0
static bg_record_t *_find_matching_block(List block_list,
					 struct job_record* job_ptr,
					 bitstr_t* slurm_block_bitmap,
					 select_ba_request_t *request,
					 uint32_t max_cpus,
					 int *allow, int check_image,
					 int overlap_check,
					 List overlapped_list,
					 uint16_t query_mode)
{
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL;
	char tmp_char[256];

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("number of blocks to check: %d state %d "
		     "asking for %u-%u cpus",
		     list_count(block_list),
		     query_mode, request->procs, max_cpus);

	itr = list_iterator_create(block_list);
	while ((bg_record = list_next(itr))) {
		/* If test_only we want to fall through to tell the
		   scheduler that it is runnable just not right now.
		*/

		/* The job running could be reset so set it back up
		   here if there is a job_ptr
		*/
		if (bg_record->job_ptr)
			bg_record->job_running = bg_record->job_ptr->job_id;

		/*block is messed up some how (BLOCK_ERROR_STATE_FLAG)
		 * ignore it or if state == BG_BLOCK_ERROR */
		if ((bg_record->job_running == BLOCK_ERROR_STATE)
		    || (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("block %s is in an error "
				     "state (can't use)",
				     bg_record->bg_block_id);
			continue;
		} else if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			   || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				|| SELECT_IS_MODE_RUN_NOW(query_mode))
			       && (bg_conf->layout_mode != LAYOUT_DYNAMIC))) {
			if (bg_record->free_cnt) {
				/* No reason to look at a block that
				   is being freed unless we are
				   running static and looking at the
				   full set.
				*/
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("block %s being free for other "
					     "job(s), skipping",
					     bg_record->bg_block_id);
				continue;
			} else if ((bg_record->job_running != NO_JOB_RUNNING)
				   && (bg_record->job_running
				       != job_ptr->job_id)) {
				/* Look here if you are trying to run now or
				   if you aren't looking at the full set.  We
				   don't continue on running blocks for the
				   full set because we are seeing if the job
				   can ever run so look here.
				*/
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("block %s in use by %s job %d",
					     bg_record->bg_block_id,
					     bg_record->user_name,
					     bg_record->job_running);
				continue;
			}
		}

		/* Check processor count */
		if ((bg_record->cpu_cnt < request->procs)
		    || ((max_cpus != NO_VAL)
			&& (bg_record->cpu_cnt > max_cpus))) {
			/* We use the proccessor count per block here
			   mostly to see if we can run on a smaller block.
			*/
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
				convert_num_unit((float)bg_record->cpu_cnt,
						 tmp_char,
						 sizeof(tmp_char), UNIT_NONE);
				info("block %s CPU count (%s) not suitable",
				     bg_record->bg_block_id,
				     tmp_char);
			}
			continue;
		}

		/*
		 * Next we check that this block's bitmap is within
		 * the set of nodes which the job can use.
		 * Nodes not available for the job could be down,
		 * drained, allocated to some other job, or in some
		 * SLURM block not available to this job.
		 */
		if (!bit_super_set(bg_record->mp_bitmap, slurm_block_bitmap)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
				char *temp = bitmap2node_name(
					bg_record->mp_bitmap);
				char *temp2 = bitmap2node_name(
					slurm_block_bitmap);
				info("bg block %s has nodes not "
				     "usable by this job %s %s",
				     bg_record->bg_block_id, temp, temp2);
				xfree(temp);
				xfree(temp2);
			}
			continue;
		}

		/*
		 * Insure that any required nodes are in this BG block
		 */
		if (job_ptr->details->req_node_bitmap
		    && (!bit_super_set(job_ptr->details->req_node_bitmap,
				       bg_record->mp_bitmap))) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s lacks required nodes",
				     bg_record->bg_block_id);
			continue;
		}

		if (_check_for_booted_overlapping_blocks(
			    block_list, itr, bg_record,
			    overlap_check, overlapped_list, query_mode))
			continue;

		if (check_image) {
#ifdef HAVE_BGL
			if (request->blrtsimage &&
			    strcasecmp(request->blrtsimage,
				       bg_record->blrtsimage)) {
				*allow = 1;
				continue;
			}
#endif
#ifdef HAVE_BG_L_P
			if (request->linuximage &&
			    strcasecmp(request->linuximage,
				       bg_record->linuximage)) {
				*allow = 1;
				continue;
			}

			if (request->ramdiskimage &&
			    strcasecmp(request->ramdiskimage,
				       bg_record->ramdiskimage)) {
				*allow = 1;
				continue;
			}
#endif
			if (request->mloaderimage &&
			    strcasecmp(request->mloaderimage,
				       bg_record->mloaderimage)) {
				*allow = 1;
				continue;
			}
		}

		/***********************************************/
		/* check the connection type specified matches */
		/***********************************************/
		if ((request->conn_type[0] != bg_record->conn_type[0])
		    && (request->conn_type[0] != SELECT_NAV)) {
#ifdef HAVE_BGP
			if (request->conn_type[0] >= SELECT_SMALL) {
				/* we only want to reboot blocks if
				   they have to be so skip booted
				   blocks if in small state
				*/
				if (check_image
				    && (bg_record->state
					== BG_BLOCK_INITED)) {
					*allow = 1;
					continue;
				}
				goto good_conn_type;
			} else if (bg_record->conn_type[0] >= SELECT_SMALL) {
				/* since we already checked to see if
				   the cpus were good this means we are
				   looking for a block in a range that
				   includes small and regular blocks.
				   So we can just continue on.
				*/
				goto good_conn_type;
			}
#endif
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s conn-type not usable "
				     "asking for %s bg_record is %s",
				     bg_record->bg_block_id,
				     conn_type_string(request->conn_type[0]),
				     conn_type_string(bg_record->conn_type[0]));
			continue;
		}
#ifdef HAVE_BGP
	good_conn_type:
#endif
		/*****************************************/
		/* match up geometry as "best" possible  */
		/*****************************************/
		if ((request->geometry[0] != (uint16_t)NO_VAL)
		    && (!_check_rotate_geo(bg_record->geo, request->geometry,
					   request->rotate)))
			continue;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("we found one! %s", bg_record->bg_block_id);
		break;
	}
	list_iterator_destroy(itr);

	return bg_record;
}
Example #8
0
/*
 * create_dynamic_block - create new block(s) to be used for a new
 * job allocation.
 * RET - a list of created block(s) or NULL on failure errno is set.
 */
extern List create_dynamic_block(List block_list,
				 select_ba_request_t *request,
				 List my_block_list,
				 bool track_down_nodes)
{
	int rc = SLURM_SUCCESS;

	ListIterator itr, itr2;
	bg_record_t *bg_record = NULL, *found_record = NULL;
	List results = NULL;
	List new_blocks = NULL;
	bitstr_t *my_bitmap = NULL;
	select_ba_request_t blockreq;
	int cnodes = request->procs / bg_conf->cpu_ratio;
	uint16_t start_geo[SYSTEM_DIMENSIONS];

	if (cnodes < bg_conf->smallest_block) {
		error("Can't create this size %d "
		      "on this system ionodes_per_mp is %d",
		      request->procs,
		      bg_conf->ionodes_per_mp);
		goto finished;
	}
	memset(&blockreq, 0, sizeof(select_ba_request_t));
	memcpy(start_geo, request->geometry, sizeof(start_geo));

	/* We need to lock this just incase a blocks_overlap is called
	   which will in turn reset and set the system as it sees fit.
	*/
	slurm_mutex_lock(&block_state_mutex);
	if (my_block_list) {
		reset_ba_system(track_down_nodes);
		itr = list_iterator_create(my_block_list);
		while ((bg_record = list_next(itr))) {
			if (bg_record->magic != BLOCK_MAGIC) {
				/* This should never happen since we
				   only call this on copies of blocks
				   and we check on this during the
				   copy.
				*/
				error("create_dynamic_block: "
				      "got a block with bad magic?");
				continue;
			}
			if (bg_record->free_cnt) {
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK) {
					int dim;
					char start_geo[SYSTEM_DIMENSIONS+1];
					char geo[SYSTEM_DIMENSIONS+1];
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++) {
						start_geo[dim] = alpha_num[
							bg_record->start[dim]];
						geo[dim] = alpha_num[
							bg_record->geo[dim]];
					}
					start_geo[dim] = '\0';
					geo[dim] = '\0';
					info("not adding %s(%s) %s %s %s %u "
					     "(free_cnt)",
					     bg_record->bg_block_id,
					     bg_record->mp_str,
					     bg_block_state_string(
						     bg_record->state),
					     start_geo,
					     geo,
					     bg_record->cnode_cnt);
				}
				continue;
			}

			if (!my_bitmap) {
				my_bitmap =
					bit_alloc(bit_size(bg_record->bitmap));
			}

			if (!bit_super_set(bg_record->bitmap, my_bitmap)) {
				bit_or(my_bitmap, bg_record->bitmap);

				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK) {
					int dim;
					char start_geo[SYSTEM_DIMENSIONS+1];
					char geo[SYSTEM_DIMENSIONS+1];
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++) {
						start_geo[dim] = alpha_num[
							bg_record->start[dim]];
						geo[dim] = alpha_num[
							bg_record->geo[dim]];
					}
					start_geo[dim] = '\0';
					geo[dim] = '\0';
					info("adding %s(%s) %s %s %s %u",
					     bg_record->bg_block_id,
					     bg_record->mp_str,
					     bg_block_state_string(
						     bg_record->state),
					     start_geo, geo,
					     bg_record->cnode_cnt);
				}
				if (check_and_set_mp_list(
					    bg_record->ba_mp_list)
				    == SLURM_ERROR) {
					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("something happened in "
						     "the load of %s",
						     bg_record->bg_block_id);
					list_iterator_destroy(itr);
					FREE_NULL_BITMAP(my_bitmap);
					rc = SLURM_ERROR;
					goto finished;
				}
			} else {
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK) {
					int dim;
					char start_geo[SYSTEM_DIMENSIONS+1];
					char geo[SYSTEM_DIMENSIONS+1];
					for (dim=0; dim<SYSTEM_DIMENSIONS;
					     dim++) {
						start_geo[dim] = alpha_num[
							bg_record->start[dim]];
						geo[dim] = alpha_num[
							bg_record->geo[dim]];
					}
					start_geo[dim] = '\0';
					geo[dim] = '\0';
					info("not adding %s(%s) %s %s %s %u ",
					     bg_record->bg_block_id,
					     bg_record->mp_str,
					     bg_block_state_string(
						     bg_record->state),
					     start_geo,
					     geo,
					     bg_record->cnode_cnt);
				}
				/* just so we don't look at it later */
				bg_record->free_cnt = -1;
			}
		}
		list_iterator_destroy(itr);
		FREE_NULL_BITMAP(my_bitmap);
	} else {
		reset_ba_system(false);
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("No list was given");
	}

	if (request->avail_mp_bitmap)
		ba_set_removable_mps(request->avail_mp_bitmap, 1);

	if (request->size==1 && cnodes < bg_conf->mp_cnode_cnt) {
		switch(cnodes) {
#ifdef HAVE_BGL
		case 32:
			blockreq.small32 = 4;
			blockreq.small128 = 3;
			break;
		case 128:
			blockreq.small128 = 4;
			break;
#else
		case 16:
			blockreq.small16 = 2;
			blockreq.small32 = 1;
			blockreq.small64 = 1;
			blockreq.small128 = 1;
			blockreq.small256 = 1;
			break;
		case 32:
			blockreq.small32 = 2;
			blockreq.small64 = 1;
			blockreq.small128 = 1;
			blockreq.small256 = 1;
			break;
		case 64:
			blockreq.small64 = 2;
			blockreq.small128 = 1;
			blockreq.small256 = 1;
			break;
		case 128:
			blockreq.small128 = 2;
			blockreq.small256 = 1;
			break;
		case 256:
			blockreq.small256 = 2;
			break;
#endif
		default:
			error("This size %d is unknown on this system", cnodes);
			goto finished;
			break;
		}

		/* Sort the list so the small blocks are in the order
		 * of ionodes. */
		list_sort(block_list, (ListCmpF)bg_record_cmpf_inc);
		request->conn_type[0] = SELECT_SMALL;
		new_blocks = list_create(destroy_bg_record);
		/* check only blocks that are free and small */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    true, true)
		    == SLURM_SUCCESS)
			goto finished;

		/* check only blocks that are free and any size */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    true, false)
		    == SLURM_SUCCESS)
			goto finished;

		/* check usable blocks that are small with any state */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    false, true)
		    == SLURM_SUCCESS)
			goto finished;

		/* check all usable blocks */
		if (_breakup_blocks(block_list, new_blocks,
				    request, my_block_list,
				    false, false)
		    == SLURM_SUCCESS)
			goto finished;

		/* Re-sort the list back to the original order. */
		list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc);
		list_destroy(new_blocks);
		new_blocks = NULL;
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("small block not able to be placed inside others");
	}

	if (request->conn_type[0] == SELECT_NAV)
		request->conn_type[0] = SELECT_TORUS;

	//debug("going to create %d", request->size);
	if (!new_ba_request(request)) {
		if (request->geometry[0] != (uint16_t)NO_VAL) {
			char *geo = give_geo(request->geometry);
			error("Problems with request for size %d geo %s",
			      request->size, geo);
			xfree(geo);
		} else {
			error("Problems with request for size %d.  "
			      "No geo given.",
			      request->size);
		}
		rc = ESLURM_INTERCONNECT_FAILURE;
		goto finished;
	}

	/* try on free midplanes */
	rc = SLURM_SUCCESS;
	if (results)
		list_flush(results);
	else {
#ifdef HAVE_BGQ
		results = list_create(destroy_ba_mp);
#else
		results = list_create(NULL);
#endif
	}

	rc = allocate_block(request, results);
	/* This could be changed in allocate_block so set it back up */
	memcpy(request->geometry, start_geo, sizeof(start_geo));

	if (rc) {
		rc = SLURM_SUCCESS;
		goto setup_records;
	}

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("allocate failure for size %d base "
		     "partitions of free midplanes",
		     request->size);
	rc = SLURM_ERROR;

	if (!list_count(my_block_list) || !my_block_list)
		goto finished;

	/*Try to put block starting in the smallest of the exisiting blocks*/
	itr = list_iterator_create(my_block_list);
	itr2 = list_iterator_create(my_block_list);
	while ((bg_record = (bg_record_t *) list_next(itr)) != NULL) {
		bool is_small = 0;
		/* never check a block with a job running */
		if (bg_record->free_cnt
		    || bg_record->job_running != NO_JOB_RUNNING)
			continue;

		/* Here we are only looking for the first
		   block on the midplane.  So either the count
		   is greater or equal than
		   bg_conf->mp_cnode_cnt or the first bit is
		   set in the ionode_bitmap.
		*/
		if (bg_record->cnode_cnt < bg_conf->mp_cnode_cnt) {
			bool found = 0;
			if (bit_ffs(bg_record->ionode_bitmap) != 0)
				continue;
			/* Check to see if we have other blocks in
			   this midplane that have jobs running.
			*/
			while ((found_record = list_next(itr2))) {
				if (!found_record->free_cnt
				    && (found_record->job_running
					!= NO_JOB_RUNNING)
				    && bit_overlap(bg_record->bitmap,
						   found_record->bitmap)) {
					found = 1;
					break;
				}
			}
			list_iterator_reset(itr2);
			if (found)
				continue;
			is_small = 1;
		}

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("removing %s(%s) for request %d",
			     bg_record->bg_block_id,
			     bg_record->mp_str, request->size);

		remove_block(bg_record->ba_mp_list, is_small);
		rc = SLURM_SUCCESS;
		if (results)
			list_flush(results);
		else {
#ifdef HAVE_BGQ
			results = list_create(destroy_ba_mp);
#else
			results = list_create(NULL);
#endif
		}

		rc = allocate_block(request, results);
		/* This could be changed in allocate_block so set it back up */
		memcpy(request->geometry, start_geo, sizeof(start_geo));
		if (rc) {
			rc = SLURM_SUCCESS;
			break;
		}

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("allocate failure for size %d base partitions",
			     request->size);
		rc = SLURM_ERROR;
	}
	list_iterator_destroy(itr);
	list_iterator_destroy(itr2);

setup_records:
	if (rc == SLURM_SUCCESS) {
		/*set up bg_record(s) here */
		new_blocks = list_create(destroy_bg_record);

		blockreq.save_name = request->save_name;
#ifdef HAVE_BGL
		blockreq.blrtsimage = request->blrtsimage;
#endif
		blockreq.linuximage = request->linuximage;
		blockreq.mloaderimage = request->mloaderimage;
		blockreq.ramdiskimage = request->ramdiskimage;
		memcpy(blockreq.conn_type, request->conn_type,
		       sizeof(blockreq.conn_type));

		add_bg_record(new_blocks, &results, &blockreq, 0, 0);
	}

finished:
	if (request->avail_mp_bitmap
	    && (bit_ffc(request->avail_mp_bitmap) == -1))
		ba_reset_all_removed_mps();
	slurm_mutex_unlock(&block_state_mutex);

	/* reset the ones we mucked with */
	itr = list_iterator_create(my_block_list);
	while ((bg_record = (bg_record_t *) list_next(itr))) {
		if (bg_record->free_cnt == -1)
			bg_record->free_cnt = 0;
	}
	list_iterator_destroy(itr);


	xfree(request->save_name);

	if (results)
		list_destroy(results);
	errno = rc;
	return new_blocks;
}
Example #9
0
static char *	_will_run_test(uint32_t jobid, time_t start_time,
			       char *node_list, int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	char *hostlist, *reply_msg = NULL;
	uint32_t min_nodes, max_nodes, req_nodes;
	int rc;
	time_t start_res, orig_start_time;
	List preemptee_candidates;

	debug2("wiki2: will_run job_id=%u start_time=%u node_list=%s",
		jobid, (uint32_t)start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if ((node_list == NULL) || (node_list[0] == '\0')) {
		/* assume all nodes available to job for testing */
		avail_bitmap = bit_copy(avail_node_bitmap);
	} else if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node "
		      "list for job %u, %s", jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}
	start_time = MAX(start_time, start_res);
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
			part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available "
			   "on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partitions max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap,
			       min_nodes, max_nodes, req_nodes,
			       SELECT_MODE_WILL_RUN,
			       preemptee_candidates, NULL, exc_core_bitmap);
	if (preemptee_candidates)
		list_destroy(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char tmp_str[128];
		*err_code = 0;
		uint32_t proc_cnt = 0;

		xstrcat(reply_msg, "STARTINFO=");
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
                             		    SELECT_JOBDATA_NODE_CNT,
					    &proc_cnt);

#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,",
			 jobid, proc_cnt, (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);
	} else {
		xstrcat(reply_msg, "Jobs not runable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;
	FREE_NULL_BITMAP(avail_bitmap);
	return reply_msg;
}
Example #10
0
int
main(int argc, char *argv[])
{
	note("Testing static decl");
	{
		bitstr_t bit_decl(bs, 65);
		/*bitstr_t *bsp = bs;*/

		bit_set(bs,9);
		bit_set(bs,14);
		TEST(bit_test(bs,9), "bit 9 set"); 
		TEST(!bit_test(bs,12), "bit 12 not set");
		TEST(bit_test(bs,14), "bit 14 set" );
		/*bit_free(bsp);*/	/* triggers TEST in bit_free - OK */
	}
	note("Testing basic vixie functions");
	{
		bitstr_t *bs = bit_alloc(16), *bs2;


		/*bit_set(bs, 42);*/ 	/* triggers TEST in bit_set - OK */
		bit_set(bs,9);
		bit_set(bs,14);
		TEST(bit_test(bs,9), "bit 9 set"); 
		TEST(!bit_test(bs,12), "bit 12 not set" );
		TEST(bit_test(bs,14), "bit 14 set");

		bs2 = bit_copy(bs);
		bit_fill_gaps(bs2);
		TEST(bit_ffs(bs2) == 9, "first bit set = 9 ");
		TEST(bit_fls(bs2) == 14, "last bit set = 14");
		TEST(bit_set_count(bs2) == 6, "bitstring");
		TEST(bit_test(bs2,12), "bitstring");
		TEST(bit_super_set(bs,bs2) == 1, "bitstring");
		TEST(bit_super_set(bs2,bs) == 0, "bitstring");

		bit_clear(bs,14);
		TEST(!bit_test(bs,14), "bitstring");

		bit_nclear(bs,9,14);
		TEST(!bit_test(bs,9), "bitstring");
		TEST(!bit_test(bs,12), "bitstring");
		TEST(!bit_test(bs,14), "bitstring");

		bit_nset(bs,9,14);
		TEST(bit_test(bs,9), "bitstring");
		TEST(bit_test(bs,12), "bitstring");
		TEST(bit_test(bs,14), "bitstring");

		TEST(bit_ffs(bs) == 9, "ffs");
		TEST(bit_ffc(bs) == 0, "ffc");
		bit_nset(bs,0,8);
		TEST(bit_ffc(bs) == 15, "ffc");

		bit_free(bs);
		/*bit_set(bs,9); */	/* triggers TEST in bit_set - OK */
	}
	note("Testing and/or/not");
	{
		bitstr_t *bs1 = bit_alloc(128);
		bitstr_t *bs2 = bit_alloc(128);

		bit_set(bs1, 100);
		bit_set(bs1, 104);
		bit_set(bs2, 100);

		bit_and(bs1, bs2);
		TEST(bit_test(bs1, 100), "and");
		TEST(!bit_test(bs1, 104), "and");

		bit_set(bs2, 110);
		bit_set(bs2, 111);
		bit_set(bs2, 112);
		bit_or(bs1, bs2);
		TEST(bit_test(bs1, 100), "or");
		TEST(bit_test(bs1, 110), "or");
		TEST(bit_test(bs1, 111), "or");
		TEST(bit_test(bs1, 112), "or");

		bit_not(bs1);
		TEST(!bit_test(bs1, 100), "not");
		TEST(bit_test(bs1, 12), "not");

		bit_free(bs1);
		bit_free(bs2);
	}

	note("testing bit selection");
	{
		bitstr_t *bs1 = bit_alloc(128), *bs2;
		bit_set(bs1, 21);
		bit_set(bs1, 100);
		bit_fill_gaps(bs1);
		bs2 = bit_pick_cnt(bs1,20);

		if (bs2) {
			TEST(bit_set_count(bs2) == 20, "pick");
			TEST(bit_ffs(bs2) == 21, "pick");
			TEST(bit_fls(bs2) == 40, "pick");
			bit_free(bs2);
		}
		else
			TEST(0, "alloc fail");

		bit_free(bs1);
	}
	note("Testing realloc");
	{
		bitstr_t *bs = bit_alloc(1);

		TEST(bit_ffs(bs) == -1, "bitstring");
		bit_set(bs,0);
		/*bit_set(bs, 1000);*/	/* triggers TEST in bit_set - OK */
		bs = bit_realloc(bs,1048576);
		bit_set(bs,1000);
		bit_set(bs,1048575);
		TEST(bit_test(bs, 0), "bitstring");
		TEST(bit_test(bs, 1000), "bitstring");
		TEST(bit_test(bs, 1048575), "bitstring");
		TEST(bit_set_count(bs) == 3, "bitstring");
		bit_clear(bs,0);
		bit_clear(bs,1000);
		TEST(bit_set_count(bs) == 1, "bitstring");
		TEST(bit_ffs(bs) == 1048575, "bitstring");
		bit_free(bs);
	}
	note("Testing bit_fmt");
	{
		char tmpstr[1024];
		bitstr_t *bs = bit_alloc(1024);

		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), ""), "bitstring");
		bit_set(bs,42);
		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42"), "bitstring");
		bit_set(bs,102);
		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42,102"), "bitstring");
		bit_nset(bs,9,14);
		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr), bs), 
					"9-14,42,102"), "bitstring");
	}

	note("Testing bit_nffc/bit_nffs");
	{
		bitstr_t *bs = bit_alloc(1024);

		bit_set(bs, 2);
		bit_set(bs, 6);
		bit_set(bs, 7);
		bit_nset(bs,12,1018); 

		TEST(bit_nffc(bs, 2) == 0, "bitstring");
		TEST(bit_nffc(bs, 3) == 3, "bitstring");
		TEST(bit_nffc(bs, 4) == 8, "bitstring");
		TEST(bit_nffc(bs, 5) == 1019, "bitstring");
		TEST(bit_nffc(bs, 6) == -1, "bitstring");

		TEST(bit_nffs(bs, 1) == 2, "bitstring");
		TEST(bit_nffs(bs, 2) == 6, "bitstring");
		TEST(bit_nffs(bs, 100) == 12, "bitstring");
		TEST(bit_nffs(bs, 1023) == -1, "bitstring");

		bit_free(bs);
	}

	note("Testing bit_unfmt");
	{
		bitstr_t *bs = bit_alloc(1024);
		bitstr_t *bs2 = bit_alloc(1024);
		char tmpstr[4096];

		bit_set(bs,1);
		bit_set(bs,3);
		bit_set(bs,30);
		bit_nset(bs,42,64);
		bit_nset(bs,97,1000);

		bit_fmt(tmpstr, sizeof(tmpstr), bs);
		TEST(bit_unfmt(bs2, tmpstr) != -1, "bitstring");
		TEST(bit_equal(bs, bs2), "bitstring");
	}

	totals();
	return failed;
}
Example #11
0
/*
 * route_p_split_hostlist - logic to split an input hostlist into
 *                           a set of hostlists to forward to.
 *
 * IN: hl        - hostlist_t   - list of every node to send message to
 *                                will be empty on return;
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: created hostlist will have to be freed independently using
 *       hostlist_destroy by the caller.
 * Note: the hostlist_t array will have to be xfree.
 */
extern int route_p_split_hostlist(hostlist_t hl,
				  hostlist_t** sp_hl,
				  int* count)
{
	int i, j, k, hl_ndx, msg_count, sw_count, lst_count;
	char  *buf;
	bitstr_t *nodes_bitmap = NULL;		/* nodes in message list */
	bitstr_t *fwd_bitmap = NULL;		/* nodes in forward list */

	msg_count = hostlist_count(hl);
	if (switch_record_cnt == 0) {
		/* configs have not already been processed */
		slurm_conf_init(NULL);
		if (init_node_conf()) {
			fatal("ROUTE: Failed to init slurm config");
		}
		if (build_all_nodeline_info(false)) {
			fatal("ROUTE: Failed to build node config");
		}
		rehash_node();

		if (slurm_topo_build_config() != SLURM_SUCCESS) {
			fatal("ROUTE: Failed to build topology config");
		}
	}
	*sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t));
	/* create bitmap of nodes to send message too */
	if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) {
		buf = hostlist_ranged_string_xmalloc(hl);
		fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf);
	}

	/* Find lowest level switch containing all the nodes in the list */
	j = 0;
	for (i = 0; i <= switch_levels; i++) {
		for (j=0; j<switch_record_cnt; j++) {
			if (switch_record_table[j].level == i) {
				if (bit_super_set(nodes_bitmap,
						  switch_record_table[j].
						  node_bitmap)) {
					/* All nodes in message list are in
					 * this switch */
					break;
				}
			}
		}
		if (j < switch_record_cnt) {
			/* Got here via break after bit_super_set */
			break; // 'j' is our switch
		} /* else, no switches at this level reach all nodes */
	}
	if (i > switch_levels) {
		/* This can only happen if trying to schedule multiple physical
		 * clusters as a single logical cluster under the control of a
		 * single slurmctld daemon, and sending something like a
		 * node_registation request to all nodes.
		 * Revert to default behavior*/
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc(hl);
			debug("ROUTE: didn't find switch containing nodes=%s",
			      buf);
			xfree(buf);
		}
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}
	if (switch_record_table[j].level == 0) {
		/* This is a leaf switch. Construct list based on TreeWidth */
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}
	/* loop through children, construction a hostlist for each child switch
	 * with nodes in the message list */
	hl_ndx = 0;
	lst_count = 0;
	for (i=0; i < switch_record_table[j].num_switches; i++) {
		k = switch_record_table[j].switch_index[i];
		fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap);
		bit_and(fwd_bitmap, nodes_bitmap);
		sw_count = bit_set_count(fwd_bitmap);
		if (sw_count == 0) {
			continue; /* no nodes on this switch in message list */
		}
		(*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap);
		/* Now remove nodes from this switch from message list */
		bit_not(fwd_bitmap);
		bit_and(nodes_bitmap, fwd_bitmap);
		FREE_NULL_BITMAP(fwd_bitmap);
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]);
			debug("ROUTE: ... sublist[%d] switch=%s :: %s",
			      i, switch_record_table[i].name, buf);
			xfree(buf);
		}
		hl_ndx++;
		lst_count += sw_count;
		if (lst_count == msg_count)
			break; /* all nodes in message are in a child list */
	}
	FREE_NULL_BITMAP(nodes_bitmap);

	*count = hl_ndx;
	return SLURM_SUCCESS;

}
Example #12
0
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr, **bf_part_ptr = NULL;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit, part_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end, window_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0, bf_parts = 0, *bf_part_jobs = NULL;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;
	struct part_record *reject_array_part = NULL;
	uint32_t job_start_cnt = 0, start_time;
	time_t config_update = slurmctld_conf.last_update;
	time_t part_update = last_part_update;
	struct timeval start_tv;

	bf_last_yields = 0;
#ifdef HAVE_ALPS_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1000000);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	else
		debug("backfill: beginning");
	sched_start = now = time(NULL);
	gettimeofday(&start_tv, NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true, true);
	if (list_count(job_queue) == 0) {
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill: no jobs to backfill");
		else
			debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	non_cg_bitmap = bit_copy(cg_node_bitmap);
	bit_not(non_cg_bitmap);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt * 2 + 1));
	node_space[0].begin_time = sched_start;
	window_end = sched_start + backfill_window;
	node_space[0].end_time = window_end;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_part) {
		ListIterator part_iterator;
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_part_ptr  = xmalloc(sizeof(struct part_record *) * bf_parts);
		bf_part_jobs = xmalloc(sizeof(int) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_part_ptr[i++] = part_ptr;
		}
		list_iterator_destroy(part_iterator);
	}
	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	sort_job_queue(job_queue);
	while (1) {
		job_queue_rec = (job_queue_rec_t *) list_pop(job_queue);
		if (!job_queue_rec) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: reached end of job queue");
			break;
		}
		if (slurmctld_config.shutdown_time)
			break;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				xfree(job_queue_rec);
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 0;
			START_TIMER;
		}

		job_ptr  = job_queue_rec->job_ptr;
		/* With bf_continue configured, the original job could have
		 * been cancelled and purged. Validate pointer here. */
		if ((job_ptr->magic  != JOB_MAGIC) ||
		    (job_ptr->job_id != job_queue_rec->job_id)) {
			xfree(job_queue_rec);
			continue;
		}
		orig_time_limit = job_ptr->time_limit;
		part_ptr = job_queue_rec->part_ptr;

		job_test_count++;
		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != NO_VAL) {
			if ((reject_array_job_id == job_ptr->array_job_id) &&
			    (reject_array_part   == part_ptr))
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
			reject_array_part   = part_ptr;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill test for JobID=%u Prio=%u Partition=%s",
			     job_ptr->job_id, job_ptr->priority,
			     job_ptr->part_ptr->name);
		}

		if (max_backfill_job_per_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_jobs[j]++ >=
				    max_backfill_job_per_part)
					skip_job = true;
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: have already "
					     "checked %u jobs for "
					     "partition %s; skipping "
					     "job %u",
					     max_backfill_job_per_part,
					     job_ptr->part_ptr->name,
					     job_ptr->job_id);
				continue;
			}
		}
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] >= max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						info("backfill: have already "
						     "checked %u jobs for "
						     "user %u; skipping "
						     "job %u",
						     max_backfill_job_per_user,
						     job_ptr->user_id,
						     job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL) ||
		    ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: partition %s not usable",
				     job_ptr->part_ptr->name);
			continue;
		}

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u not runable now",
				     job_ptr->job_id);
			continue;
		}

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u node count too high",
				     job_ptr->job_id);
			continue;
		}

		/* Determine job's expected completion time */
		if (part_ptr->max_time == INFINITE)
			part_time_limit = 365 * 24 * 60; /* one year */
		else
			part_time_limit = part_ptr->max_time;
		if (job_ptr->time_limit == NO_VAL) {
			time_limit = part_time_limit;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_time_limit);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if (slurmctld_config.shutdown_time)
			break;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			uint32_t save_job_id = job_ptr->job_id;
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);

			/* With bf_continue configured, the original job could
			 * have been scheduled or cancelled and purged.
			 * Revalidate job the record here. */
			if ((job_ptr->magic  != JOB_MAGIC) ||
			    (job_ptr->job_id != save_job_id))
				continue;
			if (!IS_JOB_PENDING(job_ptr))
				continue;
			if (!avail_front_end(job_ptr))
				continue;	/* No available frontend */

			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res   = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u reservation defer",
				     job_ptr->job_id);
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		bit_and(avail_bitmap, non_cg_bitmap);
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if (resv_end && (++resv_end < window_end) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features OR
		 *	no change since previously tested nodes (only changes
		 *	in other partition nodes) */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}

			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_job_test(job_ptr, avail_bitmap, start_res);
		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {	/* Can start now */
			uint32_t save_time_limit = job_ptr->time_limit;
			uint32_t hard_limit;
			bool reset_time = false;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL) {
					acct_policy_alter_job(
						job_ptr, comp_time_limit);
					job_ptr->time_limit = comp_time_limit;
				} else {
					acct_policy_alter_job(
						job_ptr, orig_time_limit);
					job_ptr->time_limit = orig_time_limit;
				}
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				reset_time = true;
			} else if (orig_time_limit == NO_VAL) {
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
			} else {
				acct_policy_alter_job(job_ptr, orig_time_limit);
				job_ptr->time_limit = orig_time_limit;

			}
			if (job_ptr->time_limit == INFINITE)
				hard_limit = 365 * 24 * 60;	/* one year */
			else
				hard_limit = job_ptr->time_limit;
			job_ptr->end_time = job_ptr->start_time +
					    (hard_limit * 60);
			if (reset_time) {
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			}

			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: planned start of job %u"
					     " failed: %s", job_ptr->job_id,
					     slurm_strerror(rc));
				}
				/* Drop through and reserve these resources.
				 * Likely due to state changes during sleep.
				 * Make best-effort based upon original state */
				job_ptr->time_limit = orig_time_limit;
				later_start = 0;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;
				reject_array_part   = NULL;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				job_start_cnt++;
				if (max_backfill_jobs_start &&
				    (job_start_cnt >= max_backfill_jobs_start)){
					if (debug_flags & DEBUG_FLAG_BACKFILL) {
						info("backfill: bf_max_job_start"
						     " limit of %d reached",
						     max_backfill_jobs_start);
					}
					break;
				}
				continue;
			}
		} else {
			job_ptr->time_limit = orig_time_limit;
		}

		start_time  = job_ptr->start_time;
		end_reserve = job_ptr->start_time + (time_limit * 60);
		start_time  = (start_time / backfill_resolution) *
			      backfill_resolution;
		end_reserve = (end_reserve / backfill_resolution) *
			      backfill_resolution;

		if (later_start && (start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				_dump_job_sched(job_ptr, end_reserve,
						avail_bitmap);
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: table size limit of %u reached",
				     max_backfill_job_cnt);
			}
			break;
		}

		if ((job_ptr->start_time > now) &&
		    _test_resv_overlap(node_space, avail_bitmap,
				       start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		reject_array_part   = NULL;
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		xfree(job_ptr->sched_nodes);
		job_ptr->sched_nodes = bitmap2node_name(avail_bitmap);
		bit_not(avail_bitmap);
		_add_reservation(start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_node_space_table(node_space);
	}
	xfree(bf_part_jobs);
	xfree(bf_part_ptr);
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	FREE_NULL_BITMAP(non_cg_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %u(%d) jobs, %s",
		     slurmctld_diag_stats.bf_last_depth,
		     job_test_count, TIME_STR);
	}
	return rc;
}
Example #13
0
static int _attempt_backfill(void)
{
    bool filter_root = false;
    List job_queue;
    job_queue_rec_t *job_queue_rec;
    slurmdb_qos_rec_t *qos_ptr = NULL;
    int i, j, node_space_recs;
    struct job_record *job_ptr;
    struct part_record *part_ptr;
    uint32_t end_time, end_reserve;
    uint32_t time_limit, comp_time_limit, orig_time_limit;
    uint32_t min_nodes, max_nodes, req_nodes;
    bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
    time_t now = time(NULL), sched_start, later_start, start_res;
    node_space_map_t *node_space;
    static int sched_timeout = 0;
    int this_sched_timeout = 0, rc = 0;

    sched_start = now;
    if (sched_timeout == 0) {
        sched_timeout = slurm_get_msg_timeout() / 2;
        sched_timeout = MAX(sched_timeout, 1);
        sched_timeout = MIN(sched_timeout, 10);
    }
    this_sched_timeout = sched_timeout;

#ifdef HAVE_CRAY
    /*
     * Run a Basil Inventory immediately before setting up the schedule
     * plan, to avoid race conditions caused by ALPS node state change.
     * Needs to be done with the node-state lock taken.
     */
    if (select_g_reconfigure()) {
        debug4("backfill: not scheduling due to ALPS");
        return SLURM_SUCCESS;
    }
#endif

    if (slurm_get_root_filter())
        filter_root = true;

    job_queue = build_job_queue(true);
    if (list_count(job_queue) <= 1) {
        debug("backfill: no jobs to backfill");
        list_destroy(job_queue);
        return 0;
    }

    node_space = xmalloc(sizeof(node_space_map_t) *
                         (max_backfill_job_cnt + 3));
    node_space[0].begin_time = sched_start;
    node_space[0].end_time = sched_start + backfill_window;
    node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
    node_space[0].next = 0;
    node_space_recs = 1;
    if (debug_flags & DEBUG_FLAG_BACKFILL)
        _dump_node_space_table(node_space);

    while ((job_queue_rec = (job_queue_rec_t *)
                            list_pop_bottom(job_queue, sort_job_queue2))) {
        job_ptr  = job_queue_rec->job_ptr;
        part_ptr = job_queue_rec->part_ptr;
        xfree(job_queue_rec);
        if (!IS_JOB_PENDING(job_ptr))
            continue;	/* started in other partition */
        job_ptr->part_ptr = part_ptr;

        if (debug_flags & DEBUG_FLAG_BACKFILL)
            info("backfill test for job %u", job_ptr->job_id);

        if ((job_ptr->state_reason == WAIT_ASSOC_JOB_LIMIT) ||
                (job_ptr->state_reason == WAIT_ASSOC_RESOURCE_LIMIT) ||
                (job_ptr->state_reason == WAIT_ASSOC_TIME_LIMIT) ||
                (job_ptr->state_reason == WAIT_QOS_JOB_LIMIT) ||
                (job_ptr->state_reason == WAIT_QOS_RESOURCE_LIMIT) ||
                (job_ptr->state_reason == WAIT_QOS_TIME_LIMIT) ||
                !acct_policy_job_runnable(job_ptr)) {
            debug2("backfill: job %u is not allowed to run now. "
                   "Skipping it. State=%s. Reason=%s. Priority=%u",
                   job_ptr->job_id,
                   job_state_string(job_ptr->job_state),
                   job_reason_string(job_ptr->state_reason),
                   job_ptr->priority);
            continue;
        }

        if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
                (part_ptr->node_bitmap == NULL))
            continue;
        if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
            continue;

        if ((!job_independent(job_ptr, 0)) ||
                (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
            continue;

        /* Determine minimum and maximum node counts */
        min_nodes = MAX(job_ptr->details->min_nodes,
                        part_ptr->min_nodes);
        if (job_ptr->details->max_nodes == 0)
            max_nodes = part_ptr->max_nodes;
        else
            max_nodes = MIN(job_ptr->details->max_nodes,
                            part_ptr->max_nodes);
        max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
        if (job_ptr->details->max_nodes)
            req_nodes = max_nodes;
        else
            req_nodes = min_nodes;
        if (min_nodes > max_nodes) {
            /* job's min_nodes exceeds partition's max_nodes */
            continue;
        }

        /* Determine job's expected completion time */
        if (job_ptr->time_limit == NO_VAL) {
            if (part_ptr->max_time == INFINITE)
                time_limit = 365 * 24 * 60; /* one year */
            else
                time_limit = part_ptr->max_time;
        } else {
            if (part_ptr->max_time == INFINITE)
                time_limit = job_ptr->time_limit;
            else
                time_limit = MIN(job_ptr->time_limit,
                                 part_ptr->max_time);
        }
        comp_time_limit = time_limit;
        orig_time_limit = job_ptr->time_limit;
        if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
            time_limit = job_ptr->time_limit = 1;
        else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
            time_limit = job_ptr->time_limit = job_ptr->time_min;

        /* Determine impact of any resource reservations */
        later_start = now;
TRY_LATER:
        FREE_NULL_BITMAP(avail_bitmap);
        start_res   = later_start;
        later_start = 0;
        j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap);
        if (j != SLURM_SUCCESS) {
            job_ptr->time_limit = orig_time_limit;
            continue;
        }
        if (start_res > now)
            end_time = (time_limit * 60) + start_res;
        else
            end_time = (time_limit * 60) + now;

        /* Identify usable nodes for this job */
        bit_and(avail_bitmap, part_ptr->node_bitmap);
        bit_and(avail_bitmap, up_node_bitmap);
        for (j=0; ; ) {
            if ((node_space[j].end_time > start_res) &&
                    node_space[j].next && (later_start == 0))
                later_start = node_space[j].end_time;
            if (node_space[j].end_time <= start_res)
                ;
            else if (node_space[j].begin_time <= end_time) {
                bit_and(avail_bitmap,
                        node_space[j].avail_bitmap);
            } else
                break;
            if ((j = node_space[j].next) == 0)
                break;
        }

        if (job_ptr->details->exc_node_bitmap) {
            bit_not(job_ptr->details->exc_node_bitmap);
            bit_and(avail_bitmap,
                    job_ptr->details->exc_node_bitmap);
            bit_not(job_ptr->details->exc_node_bitmap);
        }

        /* Test if insufficient nodes remain OR
         *	required nodes missing OR
         *	nodes lack features */
        if ((bit_set_count(avail_bitmap) < min_nodes) ||
                ((job_ptr->details->req_node_bitmap) &&
                 (!bit_super_set(job_ptr->details->req_node_bitmap,
                                 avail_bitmap))) ||
                (job_req_node_filter(job_ptr, avail_bitmap))) {
            if (later_start) {
                job_ptr->start_time = 0;
                goto TRY_LATER;
            }
            job_ptr->time_limit = orig_time_limit;
            continue;
        }

        /* Identify nodes which are definitely off limits */
        FREE_NULL_BITMAP(resv_bitmap);
        resv_bitmap = bit_copy(avail_bitmap);
        bit_not(resv_bitmap);

        if ((time(NULL) - sched_start) >= this_sched_timeout) {
            debug("backfill: loop taking too long, yielding locks");
            if (_yield_locks()) {
                debug("backfill: system state changed, "
                      "breaking out");
                rc = 1;
                break;
            } else {
                this_sched_timeout += sched_timeout;
            }
        }
        /* this is the time consuming operation */
        debug2("backfill: entering _try_sched for job %u.",
               job_ptr->job_id);
        j = _try_sched(job_ptr, &avail_bitmap,
                       min_nodes, max_nodes, req_nodes);
        debug2("backfill: finished _try_sched for job %u.",
               job_ptr->job_id);
        now = time(NULL);
        if (j != SLURM_SUCCESS) {
            job_ptr->time_limit = orig_time_limit;
            job_ptr->start_time = 0;
            continue;	/* not runable */
        }

        if (start_res > job_ptr->start_time) {
            job_ptr->start_time = start_res;
            last_job_update = now;
        }
        if (job_ptr->start_time <= now) {
            int rc = _start_job(job_ptr, resv_bitmap);
            if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
                job_ptr->time_limit = orig_time_limit;
            else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
                /* Set time limit as high as possible */
                job_ptr->time_limit = comp_time_limit;
                job_ptr->end_time = job_ptr->start_time +
                                    (comp_time_limit * 60);
                _reset_job_time_limit(job_ptr, now,
                                      node_space);
                time_limit = job_ptr->time_limit;
            } else {
                job_ptr->time_limit = orig_time_limit;
            }
            if (rc == ESLURM_ACCOUNTING_POLICY) {
                /* Unknown future start time, just skip job */
                job_ptr->start_time = 0;
                continue;
            } else if (rc != SLURM_SUCCESS) {
                /* Planned to start job, but something bad
                 * happended. */
                job_ptr->start_time = 0;
                break;
            } else {
                /* Started this job, move to next one */
                continue;
            }
        } else
            job_ptr->time_limit = orig_time_limit;

        if (later_start && (job_ptr->start_time > later_start)) {
            /* Try later when some nodes currently reserved for
             * pending jobs are free */
            job_ptr->start_time = 0;
            goto TRY_LATER;
        }

        if (job_ptr->start_time > (sched_start + backfill_window)) {
            /* Starts too far in the future to worry about */
            continue;
        }

        if (node_space_recs >= max_backfill_job_cnt) {
            /* Already have too many jobs to deal with */
            break;
        }

        end_reserve = job_ptr->start_time + (time_limit * 60);
        if (_test_resv_overlap(node_space, avail_bitmap,
                               job_ptr->start_time, end_reserve)) {
            /* This job overlaps with an existing reservation for
             * job to be backfill scheduled, which the sched
             * plugin does not know about. Try again later. */
            later_start = job_ptr->start_time;
            job_ptr->start_time = 0;
            goto TRY_LATER;
        }

        /*
         * Add reservation to scheduling table if appropriate
         */
        qos_ptr = job_ptr->qos_ptr;
        if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
            continue;
        bit_not(avail_bitmap);
        _add_reservation(job_ptr->start_time, end_reserve,
                         avail_bitmap, node_space, &node_space_recs);
        if (debug_flags & DEBUG_FLAG_BACKFILL)
            _dump_node_space_table(node_space);
    }
    FREE_NULL_BITMAP(avail_bitmap);
    FREE_NULL_BITMAP(resv_bitmap);

    for (i=0; ; ) {
        FREE_NULL_BITMAP(node_space[i].avail_bitmap);
        if ((i = node_space[i].next) == 0)
            break;
    }
    xfree(node_space);
    list_destroy(job_queue);
    return rc;
}