extern uint16_t job_preempt_mode(struct job_record *job_ptr) { if (job_ptr->qos_ptr && job_ptr->qos_ptr->preempt_mode) return job_ptr->qos_ptr->preempt_mode; return (slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG)); }
extern uint16_t job_preempt_mode(struct job_record *job_ptr) { if (job_ptr->part_ptr && (job_ptr->part_ptr->preempt_mode != (uint16_t) NO_VAL)) return job_ptr->part_ptr->preempt_mode; return (slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG)); }
/* *********************************************************************** */ void slurm_sched_partition_change( void ) { if ( slurm_sched_init() < 0 ) return; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_reconfig() != SLURM_SUCCESS)) error( "cannot reconfigure gang scheduler" ); (*(g_sched_context->ops.partition_change))(); }
/* *********************************************************************** */ extern int slurm_sched_reconfig( void ) { if ( slurm_sched_init() < 0 ) return SLURM_ERROR; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_reconfig() != SLURM_SUCCESS)) error( "cannot reconfigure gang scheduler" ); return (*(g_sched_context->ops.reconfig))(); }
/*
 * Return the preemption mode applicable to a job based upon its
 * partition.  PREEMPT_MODE_GANG is always filtered out of the result
 * since it is not meaningful as a per-job preemption mechanism here.
 */
extern uint16_t job_preempt_mode(struct job_record *job_ptr)
{
	struct part_record *part_ptr = job_ptr->part_ptr;

	if (part_ptr && (part_ptr->preempt_mode != (uint16_t) NO_VAL)) {
		if (part_ptr->preempt_mode & PREEMPT_MODE_GANG) {
			/* Slurm's log functions append their own newline;
			 * the former trailing "\n" emitted a blank line. */
			verbose("Partition '%s' preempt mode 'gang' has no "
				"sense. Filtered out.",
				part_ptr->name);
		}
		return (part_ptr->preempt_mode & (~PREEMPT_MODE_GANG));
	}

	/* No partition override: use the cluster-wide default */
	return (slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG));
}
/* *********************************************************************** */ int slurm_sched_g_newalloc( struct job_record *job_ptr ) { if ( slurm_sched_init() < 0 ) return SLURM_ERROR; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_start( job_ptr ) != SLURM_SUCCESS)) { error( "gang scheduler problem starting job %u", job_ptr->job_id); } return (*(ops.newalloc))( job_ptr ); }
/* *********************************************************************** */ int slurm_sched_freealloc( struct job_record *job_ptr ) { if ( slurm_sched_init() < 0 ) return SLURM_ERROR; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_job_fini( job_ptr ) != SLURM_SUCCESS)) { error( "gang scheduler problem finishing job %u", job_ptr->job_id); } return (*(g_sched_context->ops.freealloc))( job_ptr ); }
/* *********************************************************************** */
/* Invoke the scheduler plugin's schedule() operation.  A gang-scheduler
 * job rescan that would precede it is currently compiled out (#if 0),
 * since it requires job write and node read locks to be held. */
int slurm_sched_schedule( void )
{
	if ( slurm_sched_init() < 0 )
		return SLURM_ERROR;
#if 0
	/* Must have job write lock and node read lock set here */
	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) &&
	     (gs_job_scan() != SLURM_SUCCESS))
		error( "gang scheduler could not rescan jobs" );
#endif
	return (*(g_sched_context->ops.schedule))();
}
/*
 * Print a partition's preemption mode field for sinfo output.
 * With sinfo_data == NULL the column header is printed instead.
 * A partition value of NO_VAL falls back to the cluster default.
 */
int _print_preempt_mode(sinfo_data_t * sinfo_data, int width,
			bool right_justify, char *suffix)
{
	if (sinfo_data == NULL) {
		/* header row */
		_print_str("PREEMPT_MODE", width, right_justify, true);
	} else {
		uint16_t mode = sinfo_data->part_info->preempt_mode;

		if (mode == (uint16_t) NO_VAL)
			mode = slurm_get_preempt_mode();
		_print_str(preempt_mode_string(mode), width,
			   right_justify, true);
	}

	if (suffix)
		printf("%s", suffix);
	return SLURM_SUCCESS;
}
/* *********************************************************************** */ extern int slurm_sched_fini( void ) { int rc; if (!g_sched_context) return SLURM_SUCCESS; rc = slurm_sched_context_destroy(g_sched_context); g_sched_context = NULL; if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) && (gs_fini() != SLURM_SUCCESS)) error( "cannot stop gang scheduler" ); return rc; }
extern uint16_t job_preempt_mode(struct job_record *job_ptr) { uint16_t mode; if (job_ptr->qos_ptr && ((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->preempt_mode) { mode = ((slurmdb_qos_rec_t *)job_ptr->qos_ptr)->preempt_mode; if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) { info("%s: in job_preempt_mode return = %s", plugin_type, preempt_mode_string(mode)); } return mode; } mode = slurm_get_preempt_mode() & (~PREEMPT_MODE_GANG); if (slurm_get_debug_flags() & DEBUG_FLAG_PRIO) { info("%s: in job_preempt_mode return = %s", plugin_type, preempt_mode_string(mode)); } return mode; }
/* *********************************************************************** */
/* Initialize the scheduler plugin context (idempotent; guarded by
 * g_sched_context_lock).  Creates the plugin context from the configured
 * SchedulerType, resolves its operations table, and starts the gang
 * scheduler when gang preemption is configured.
 * RET SLURM_SUCCESS or SLURM_ERROR */
extern int slurm_sched_init( void )
{
	int retval = SLURM_SUCCESS;
	char *sched_type = NULL;

	slurm_mutex_lock( &g_sched_context_lock );

	/* Already initialized by an earlier caller */
	if ( g_sched_context )
		goto done;

	sched_type = slurm_get_sched_type();
	g_sched_context = slurm_sched_context_create( sched_type );
	if ( g_sched_context == NULL ) {
		error( "cannot create scheduler context for %s", sched_type );
		retval = SLURM_ERROR;
		goto done;
	}

	if ( slurm_sched_get_ops( g_sched_context ) == NULL ) {
		/* Plugin loaded but its symbol table is incomplete;
		 * roll back the partially created context */
		error( "cannot resolve scheduler plugin operations" );
		slurm_sched_context_destroy( g_sched_context );
		g_sched_context = NULL;
		retval = SLURM_ERROR;
		goto done;
	}

	/* Gang scheduler start failure is logged but not fatal */
	if ( (slurm_get_preempt_mode() & PREEMPT_MODE_GANG) &&
	     (gs_init() != SLURM_SUCCESS))
		error( "cannot start gang scheduler ");

done:
	slurm_mutex_unlock( &g_sched_context_lock );
	xfree(sched_type);
	return retval;
}
/* _attempt_backfill - scan the pending job queue in priority order and try
 *	to start each job, or record a future resource reservation for it,
 *	without delaying higher priority work (conservative backfill).
 * RET 0 on normal completion, 1 if the cycle was aborted early because
 *	system state changed while locks were yielded.
 * NOTE(review): appears to assume the slurmctld job/node/partition locks
 *	are held by the caller -- confirm against the backfill agent thread. */
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int sched_timeout = 2, yield_sleep = 1;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif
	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);
	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	/* Record queue depth statistics (reported by sdiag) */
	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_yields = 0;
	slurmctld_diag_stats.bf_active = 1;

	/* node_space is an index-linked list of time windows, each holding
	 * a bitmap of nodes expected to be available in that window */
	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	while ((job_queue_rec = (job_queue_rec_t *)
				list_pop_bottom(job_queue, sort_job_queue2))) {
		job_ptr = job_queue_rec->job_ptr;
		orig_time_limit = job_ptr->time_limit;

		/* Periodically yield the locks so other RPCs can progress */
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}
		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != (uint16_t) NO_VAL) {
			if (reject_array_job_id == job_ptr->array_job_id)
				continue; /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		/* Enforce the per-user limit on jobs tested per cycle */
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		/* Partition must be schedulable and have nodes */
		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
			continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		/* NO_RESERVE QOS jobs are planned with a nominal one-minute
		 * limit so they never hold a reservation */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks 2"
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);

		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		/* Walk node_space windows overlapping this job's interval;
		 * remember the first later window as a retry point */
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		/* NOTE(review): the post-increment below both tests the old
		 * value of resv_end and bumps the reservation end by one
		 * second before using it as a retry time.  Confirm whether a
		 * plain test of resv_end was intended. */
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			/* Job can start now: attempt it */
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL)
					job_ptr->time_limit = comp_time_limit;
				else
					job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happended. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(
							acct_db_conn, job_ptr);
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later.
			 */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	/* Release every window bitmap in the node_space list */
	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
/* Report whether any preemption mode is configured for the cluster. */
extern bool preemption_enabled(void)
{
	bool enabled = (slurm_get_preempt_mode() != PREEMPT_MODE_OFF);

	return enabled;
}
/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode,
			uint16_t cr_type, enum node_cr_state job_node_req,
			uint32_t cr_node_cnt,
			struct part_res_record *cr_part_ptr,
			struct node_use_record *node_usage)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS;
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL;
	bool test_only;
	uint32_t c, i, j, k, n, csize, save_mem = 0;
	int first_bit;	/* signed holder for bit_ffs() result */
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	/* Latch the gang scheduling configuration on first call */
	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS)
			return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		/* NOTE: repaired corrupted string concatenation here
		 * (stray "******" between the two literals) */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("select/serial: cr_job_test: test 0 pass: "
			     "test_only");
		return SLURM_SUCCESS;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement if found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! here is we use a placement
	 *         algorithm similar to Step 5 on jobs from lower-priority
	 *         partitions.
	 */

	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);
	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("select/serial: could not find partition for job %u",
		      job_ptr->job_id);
		return SLURM_ERROR;	/* Fix CLANG false positive */
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if ((p_ptr->part_ptr->priority <=
		     jp_ptr->part_ptr->priority) &&
		    (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF))
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* jobs from low-priority partitions are the only thing left
		 * in our way. for now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job
		 * overlap" between this job (in these idle nodes) and the
		 * low-priority jobs */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}

	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */
	if (!jp_ptr || !jp_ptr->row) {
		/* there's no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
				info("select/serial: cr_job_test: "
				     "test 4 pass - row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 fail - row %i", i);
		}
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 trying empty row %i",i);
		}
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;
	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if (!cpu_count) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */

	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN))
		job_ptr->total_cpus = 1;

	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	/* BUG FIX: bit_ffs() returns -1 when no bit is set, but "n" is
	 * unsigned so the former "if (n < 0)" test could never be true.
	 * Use a signed intermediate so an empty node bitmap is detected. */
	first_bit = bit_ffs(bitmap);
	if (first_bit < 0) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}
	n = first_bit;
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res **/
	job_res = create_job_resources();
	job_res->node_bitmap = bit_copy(bitmap);
	job_res->nodes = bitmap2node_name(bitmap);
	job_res->nhosts = bit_set_count(bitmap);
	job_res->ncpus = job_res->nhosts;
	if (job_ptr->details->ntasks_per_node)
		job_res->ncpus *= details_ptr->ntasks_per_node;
	job_res->ncpus = MAX(job_res->ncpus, details_ptr->min_cpus);
	job_res->ncpus = MAX(job_res->ncpus, details_ptr->pn_min_cpus);
	job_res->node_req = job_node_req;
	job_res->cpus = cpu_count;
	job_res->cpus_used = xmalloc(job_res->nhosts * sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used = xmalloc(job_res->nhosts * sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					 select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	/* Mark the first free core of the first allocated node
	 * (select/serial allocates a single core) */
	c = 0;
	csize = bit_size(job_res->core_bitmap);
	j = cr_get_coremap_offset(n);
	k = cr_get_coremap_offset(n + 1);
	for (; j < k; j++, c++) {
		if (!bit_test(free_cores, j))
			continue;
		if (c >= csize) {
			error("select/serial: cr_job_test "
			      "core_bitmap index error on node %s",
			      select_node_record[n].node_ptr->name);
			drain_nodes(select_node_record[n].node_ptr->name,
				    "Bad core count", getuid());
			free_job_resources(&job_res);
			FREE_NULL_BITMAP(free_cores);
			return SLURM_ERROR;
		}
		bit_set(job_res->core_bitmap, c);
		break;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d "
		     "nbits %u", job_ptr->job_id, job_res->ncpus,
		     bit_set_count(free_cores), 1, job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	job_ptr->total_cpus = build_job_resources_cpu_array(job_res);

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		job_res->memory_allocated[0] = job_res->cpus[0] * save_mem;
	} else {
		/* memory is per-node */
		job_res->memory_allocated[0] = save_mem;
	}
	return error_code;
}
/*
 * slurm_sprint_partition_info - output information about a specific Slurm
 *	partition based upon message as loaded using slurm_load_partitions
 * IN part_ptr - an individual partition information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	     NULL is returned on failure.
 *
 * Output is built incrementally into "out" with xstrcat(); records are
 * separated by a single space in one-liner mode, or by "\n " otherwise.
 */
char *slurm_sprint_partition_info ( partition_info_t * part_ptr,
				    int one_liner )
{
	char tmp1[16], tmp2[16];
	/* scratch buffer for each "Key=value" fragment before appending */
	char tmp_line[MAXHOSTRANGELEN];
	char *out = NULL;
	char *allow_deny, *value;
	uint16_t force, preempt_mode, val;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	/****** Line 1 ******/
	snprintf(tmp_line, sizeof(tmp_line), "PartitionName=%s",
		 part_ptr->name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 2 ******/
	/* NULL or empty allow_groups means no restriction */
	if ((part_ptr->allow_groups == NULL) ||
	    (part_ptr->allow_groups[0] == '\0'))
		sprintf(tmp_line, "AllowGroups=ALL");
	else {
		snprintf(tmp_line, sizeof(tmp_line), "AllowGroups=%s",
			 part_ptr->allow_groups);
	}
	xstrcat(out, tmp_line);

	/* Allow and Deny account lists are mutually exclusive: print the
	 * Deny form only when deny_accounts is set and allow_accounts is
	 * not (both NULL prints AllowAccounts=ALL) */
	if (part_ptr->allow_accounts || !part_ptr->deny_accounts) {
		allow_deny = "Allow";
		if ((part_ptr->allow_accounts == NULL) ||
		    (part_ptr->allow_accounts[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_accounts;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_accounts;
	}
	snprintf(tmp_line, sizeof(tmp_line), " %sAccounts=%s",
		 allow_deny, value);
	xstrcat(out, tmp_line);

	/* Same Allow/Deny selection logic for QOS lists */
	if (part_ptr->allow_qos || !part_ptr->deny_qos) {
		allow_deny = "Allow";
		if ((part_ptr->allow_qos == NULL) ||
		    (part_ptr->allow_qos[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_qos;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_qos;
	}
	snprintf(tmp_line, sizeof(tmp_line), " %sQos=%s",
		 allow_deny, value);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 3 ******/
	if (part_ptr->allow_alloc_nodes == NULL)
		snprintf(tmp_line, sizeof(tmp_line), "AllocNodes=%s","ALL");
	else
		snprintf(tmp_line, sizeof(tmp_line), "AllocNodes=%s",
			 part_ptr->allow_alloc_nodes);
	xstrcat(out, tmp_line);

	/* Alternate partition is only printed when configured */
	if (part_ptr->alternate != NULL) {
		snprintf(tmp_line, sizeof(tmp_line), " Alternate=%s",
			 part_ptr->alternate);
		xstrcat(out, tmp_line);
	}

	if (part_ptr->flags & PART_FLAG_DEFAULT)
		sprintf(tmp_line, " Default=YES");
	else
		sprintf(tmp_line, " Default=NO");
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 4 added here for BG partitions only
	 ****** to maintain alphabetized output ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		snprintf(tmp_line, sizeof(tmp_line), "BasePartitions=%s",
			 part_ptr->nodes);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n ");
	}

	/****** Line 5 ******/
	/* default_time is in minutes; INFINITE and NO_VAL are sentinels */
	if (part_ptr->default_time == INFINITE)
		sprintf(tmp_line, "DefaultTime=UNLIMITED");
	else if (part_ptr->default_time == NO_VAL)
		sprintf(tmp_line, "DefaultTime=NONE");
	else {
		char time_line[32];
		secs2time_str(part_ptr->default_time * 60, time_line,
			      sizeof(time_line));
		sprintf(tmp_line, "DefaultTime=%s", time_line);
	}
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_NO_ROOT)
		sprintf(tmp_line, " DisableRootJobs=YES");
	else
		sprintf(tmp_line, " DisableRootJobs=NO");
	xstrcat(out, tmp_line);
	sprintf(tmp_line, " GraceTime=%u", part_ptr->grace_time);
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_HIDDEN)
		sprintf(tmp_line, " Hidden=YES");
	else
		sprintf(tmp_line, " Hidden=NO");
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 6 ******/
	/* On BlueGene systems node counts are scaled for display */
	if (part_ptr->max_nodes == INFINITE)
		sprintf(tmp_line, "MaxNodes=UNLIMITED");
	else {
		if (cluster_flags & CLUSTER_FLAG_BG)
			convert_num_unit((float)part_ptr->max_nodes,
					 tmp1, sizeof(tmp1), UNIT_NONE);
		else
			snprintf(tmp1, sizeof(tmp1),"%u",
				 part_ptr->max_nodes);
		sprintf(tmp_line, "MaxNodes=%s", tmp1);
	}
	xstrcat(out, tmp_line);
	if (part_ptr->max_time == INFINITE)
		sprintf(tmp_line, " MaxTime=UNLIMITED");
	else {
		char time_line[32];
		secs2time_str(part_ptr->max_time * 60, time_line,
			      sizeof(time_line));
		sprintf(tmp_line, " MaxTime=%s", time_line);
	}
	xstrcat(out, tmp_line);
	if (cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->min_nodes, tmp1,
				 sizeof(tmp1), UNIT_NONE);
	else
		snprintf(tmp1, sizeof(tmp1), "%u", part_ptr->min_nodes);
	sprintf(tmp_line, " MinNodes=%s", tmp1);
	xstrcat(out, tmp_line);
	if (part_ptr->max_cpus_per_node == INFINITE)
		sprintf(tmp_line, " MaxCPUsPerNode=UNLIMITED");
	else {
		sprintf(tmp_line, " MaxCPUsPerNode=%u",
			part_ptr->max_cpus_per_node);
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line added here for non BG nodes
	 to keep with alphabetized output******/
	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		snprintf(tmp_line, sizeof(tmp_line), "Nodes=%s",
			 part_ptr->nodes);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n ");
	}

	/****** Line 7 ******/
	sprintf(tmp_line, "Priority=%u", part_ptr->priority);
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_ROOT_ONLY)
		sprintf(tmp_line, " RootOnly=YES");
	else
		sprintf(tmp_line, " RootOnly=NO");
	xstrcat(out, tmp_line);
	if (part_ptr->flags & PART_FLAG_REQ_RESV)
		sprintf(tmp_line, " ReqResv=YES");
	else
		sprintf(tmp_line, " ReqResv=NO");
	xstrcat(out, tmp_line);

	/* max_share packs a FORCE flag bit with the share count */
	force = part_ptr->max_share & SHARED_FORCE;
	val = part_ptr->max_share & (~SHARED_FORCE);
	if (val == 0)
		xstrcat(out, " Shared=EXCLUSIVE");
	else if (force) {
		sprintf(tmp_line, " Shared=FORCE:%u", val);
		xstrcat(out, tmp_line);
	} else if (val == 1)
		xstrcat(out, " Shared=NO");
	else {
		sprintf(tmp_line, " Shared=YES:%u", val);
		xstrcat(out, tmp_line);
	}

	preempt_mode = part_ptr->preempt_mode;
	if (preempt_mode == (uint16_t) NO_VAL)
		preempt_mode = slurm_get_preempt_mode(); /* use cluster param */
	snprintf(tmp_line, sizeof(tmp_line), " PreemptMode=%s",
		 preempt_mode_string(preempt_mode));
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 8 ******/
	if (part_ptr->state_up == PARTITION_UP)
		sprintf(tmp_line, "State=UP");
	else if (part_ptr->state_up == PARTITION_DOWN)
		sprintf(tmp_line, "State=DOWN");
	else if (part_ptr->state_up == PARTITION_INACTIVE)
		sprintf(tmp_line, "State=INACTIVE");
	else if (part_ptr->state_up == PARTITION_DRAIN)
		sprintf(tmp_line, "State=DRAIN");
	else
		sprintf(tmp_line, "State=UNKNOWN");
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->total_cpus, tmp1,
				 sizeof(tmp1), UNIT_NONE);
	else
		snprintf(tmp1, sizeof(tmp1), "%u", part_ptr->total_cpus);
	sprintf(tmp_line, " TotalCPUs=%s", tmp1);
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG)
		convert_num_unit((float)part_ptr->total_nodes, tmp2,
				 sizeof(tmp2), UNIT_NONE);
	else
		snprintf(tmp2, sizeof(tmp2), "%u", part_ptr->total_nodes);
	sprintf(tmp_line, " TotalNodes=%s", tmp2);
	xstrcat(out, tmp_line);

	if (part_ptr->cr_type & CR_CORE)
		sprintf(tmp_line, " SelectTypeParameters=CR_CORE");
	else if (part_ptr->cr_type & CR_SOCKET)
		sprintf(tmp_line, " SelectTypeParameters=CR_SOCKET");
	else
		sprintf(tmp_line, " SelectTypeParameters=N/A");
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n ");

	/****** Line 9 ******/
	/* MEM_PER_CPU is a flag bit: set means the value is a per-CPU
	 * limit, clear means per-node; zero means unlimited */
	if (part_ptr->def_mem_per_cpu & MEM_PER_CPU) {
		snprintf(tmp_line, sizeof(tmp_line), "DefMemPerCPU=%u",
			 part_ptr->def_mem_per_cpu & (~MEM_PER_CPU));
		xstrcat(out, tmp_line);
	} else if (part_ptr->def_mem_per_cpu == 0) {
		xstrcat(out, "DefMemPerNode=UNLIMITED");
	} else {
		snprintf(tmp_line, sizeof(tmp_line), "DefMemPerNode=%u",
			 part_ptr->def_mem_per_cpu);
		xstrcat(out, tmp_line);
	}
	if (part_ptr->max_mem_per_cpu & MEM_PER_CPU) {
		snprintf(tmp_line, sizeof(tmp_line), " MaxMemPerCPU=%u",
			 part_ptr->max_mem_per_cpu & (~MEM_PER_CPU));
		xstrcat(out, tmp_line);
	} else if (part_ptr->max_mem_per_cpu == 0) {
		xstrcat(out, " MaxMemPerNode=UNLIMITED");
	} else {
		snprintf(tmp_line, sizeof(tmp_line), " MaxMemPerNode=%u",
			 part_ptr->max_mem_per_cpu);
		xstrcat(out, tmp_line);
	}

	/* Terminate record: blank line between records in multi-line mode */
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
/*
 * slurm_sprint_partition_info - output information about a specific Slurm
 *	partition based upon message as loaded using slurm_load_partitions
 * IN part_ptr - an individual partition information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	     NULL is returned on failure.
 *
 * Newer variant: builds output with xstrfmtcat() (growing heap string)
 * instead of fixed scratch buffers, and factors the record separator
 * into line_end.
 */
char *slurm_sprint_partition_info ( partition_info_t * part_ptr,
				    int one_liner )
{
	char tmp[16];
	char *out = NULL;
	char *allow_deny, *value;
	uint16_t force, preempt_mode, val;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	/* separator between logical output lines */
	char *line_end = (one_liner) ? " " : "\n ";

	/****** Line 1 ******/
	xstrfmtcat(out, "PartitionName=%s", part_ptr->name);
	xstrcat(out, line_end);

	/****** Line 2 ******/
	/* NULL or empty allow_groups means no restriction */
	if ((part_ptr->allow_groups == NULL) ||
	    (part_ptr->allow_groups[0] == '\0'))
		xstrcat(out, "AllowGroups=ALL");
	else {
		xstrfmtcat(out, "AllowGroups=%s", part_ptr->allow_groups);
	}

	/* Allow and Deny account lists are mutually exclusive: print the
	 * Deny form only when deny_accounts is set and allow_accounts is
	 * not (both NULL prints AllowAccounts=ALL) */
	if (part_ptr->allow_accounts || !part_ptr->deny_accounts) {
		allow_deny = "Allow";
		if ((part_ptr->allow_accounts == NULL) ||
		    (part_ptr->allow_accounts[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_accounts;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_accounts;
	}
	xstrfmtcat(out, " %sAccounts=%s", allow_deny, value);

	/* Same Allow/Deny selection logic for QOS lists */
	if (part_ptr->allow_qos || !part_ptr->deny_qos) {
		allow_deny = "Allow";
		if ((part_ptr->allow_qos == NULL) ||
		    (part_ptr->allow_qos[0] == '\0'))
			value = "ALL";
		else
			value = part_ptr->allow_qos;
	} else {
		allow_deny = "Deny";
		value = part_ptr->deny_qos;
	}
	xstrfmtcat(out, " %sQos=%s", allow_deny, value);
	xstrcat(out, line_end);

	/****** Line 3 ******/
	if (part_ptr->allow_alloc_nodes == NULL)
		xstrcat(out, "AllocNodes=ALL");
	else
		xstrfmtcat(out, "AllocNodes=%s",
			   part_ptr->allow_alloc_nodes);

	/* Alternate partition is only printed when configured */
	if (part_ptr->alternate != NULL) {
		xstrfmtcat(out, " Alternate=%s", part_ptr->alternate);
	}

	if (part_ptr->flags & PART_FLAG_DEFAULT)
		xstrcat(out, " Default=YES");
	else
		xstrcat(out, " Default=NO");

	if (part_ptr->qos_char)
		xstrfmtcat(out, " QoS=%s", part_ptr->qos_char);
	else
		xstrcat(out, " QoS=N/A");
	xstrcat(out, line_end);

	/****** Line 4 added here for BG partitions only
	 ****** to maintain alphabetized output ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		xstrfmtcat(out, "Midplanes=%s", part_ptr->nodes);
		xstrcat(out, line_end);
	}

	/****** Line 5 ******/
	/* default_time is in minutes; INFINITE and NO_VAL are sentinels */
	if (part_ptr->default_time == INFINITE)
		xstrcat(out, "DefaultTime=UNLIMITED");
	else if (part_ptr->default_time == NO_VAL)
		xstrcat(out, "DefaultTime=NONE");
	else {
		char time_line[32];
		secs2time_str(part_ptr->default_time * 60, time_line,
			      sizeof(time_line));
		xstrfmtcat(out, "DefaultTime=%s", time_line);
	}
	if (part_ptr->flags & PART_FLAG_NO_ROOT)
		xstrcat(out, " DisableRootJobs=YES");
	else
		xstrcat(out, " DisableRootJobs=NO");
	if (part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)
		xstrcat(out, " ExclusiveUser=YES");
	else
		xstrcat(out, " ExclusiveUser=NO");
	xstrfmtcat(out, " GraceTime=%u", part_ptr->grace_time);
	if (part_ptr->flags & PART_FLAG_HIDDEN)
		xstrcat(out, " Hidden=YES");
	else
		xstrcat(out, " Hidden=NO");
	xstrcat(out, line_end);

	/****** Line 6 ******/
	/* On BlueGene systems node counts are scaled for display */
	if (part_ptr->max_nodes == INFINITE)
		xstrcat(out, "MaxNodes=UNLIMITED");
	else {
		if (cluster_flags & CLUSTER_FLAG_BG) {
			convert_num_unit((float)part_ptr->max_nodes, tmp,
					 sizeof(tmp), UNIT_NONE, NO_VAL,
					 CONVERT_NUM_UNIT_EXACT);
			xstrfmtcat(out, "MaxNodes=%s", tmp);
		} else
			xstrfmtcat(out, "MaxNodes=%u",
				   part_ptr->max_nodes);
	}
	if (part_ptr->max_time == INFINITE)
		xstrcat(out, " MaxTime=UNLIMITED");
	else {
		char time_line[32];
		secs2time_str(part_ptr->max_time * 60, time_line,
			      sizeof(time_line));
		xstrfmtcat(out, " MaxTime=%s", time_line);
	}
	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)part_ptr->min_nodes, tmp,
				 sizeof(tmp), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, " MinNodes=%s", tmp);
	} else
		xstrfmtcat(out, " MinNodes=%u", part_ptr->min_nodes);
	if (part_ptr->flags & PART_FLAG_LLN)
		xstrcat(out, " LLN=YES");
	else
		xstrcat(out, " LLN=NO");
	if (part_ptr->max_cpus_per_node == INFINITE)
		xstrcat(out, " MaxCPUsPerNode=UNLIMITED");
	else {
		xstrfmtcat(out, " MaxCPUsPerNode=%u",
			   part_ptr->max_cpus_per_node);
	}
	xstrcat(out, line_end);

	/****** Line added here for non BG nodes
	 to keep with alphabetized output******/
	if (!(cluster_flags & CLUSTER_FLAG_BG)) {
		xstrfmtcat(out, "Nodes=%s", part_ptr->nodes);
		xstrcat(out, line_end);
	}

	/****** Line 7 ******/
	xstrfmtcat(out, "PriorityJobFactor=%u",
		   part_ptr->priority_job_factor);
	xstrfmtcat(out, " PriorityTier=%u", part_ptr->priority_tier);
	if (part_ptr->flags & PART_FLAG_ROOT_ONLY)
		xstrcat(out, " RootOnly=YES");
	else
		xstrcat(out, " RootOnly=NO");
	if (part_ptr->flags & PART_FLAG_REQ_RESV)
		xstrcat(out, " ReqResv=YES");
	else
		xstrcat(out, " ReqResv=NO");

	/* max_share packs a FORCE flag bit with the share count */
	force = part_ptr->max_share & SHARED_FORCE;
	val = part_ptr->max_share & (~SHARED_FORCE);
	if (val == 0)
		xstrcat(out, " OverSubscribe=EXCLUSIVE");
	else if (force)
		xstrfmtcat(out, " OverSubscribe=FORCE:%u", val);
	else if (val == 1)
		xstrcat(out, " OverSubscribe=NO");
	else
		xstrfmtcat(out, " OverSubscribe=YES:%u", val);
	xstrcat(out, line_end);

	/****** Line ******/
	if (part_ptr->over_time_limit == NO_VAL16)
		xstrfmtcat(out, "OverTimeLimit=NONE");
	else if (part_ptr->over_time_limit == (uint16_t) INFINITE)
		xstrfmtcat(out, "OverTimeLimit=UNLIMITED");
	else
		xstrfmtcat(out, "OverTimeLimit=%u",
			   part_ptr->over_time_limit);

	preempt_mode = part_ptr->preempt_mode;
	if (preempt_mode == NO_VAL16)
		preempt_mode = slurm_get_preempt_mode(); /* use cluster param */
	xstrfmtcat(out, " PreemptMode=%s",
		   preempt_mode_string(preempt_mode));
	xstrcat(out, line_end);

	/****** Line ******/
	if (part_ptr->state_up == PARTITION_UP)
		xstrcat(out, "State=UP");
	else if (part_ptr->state_up == PARTITION_DOWN)
		xstrcat(out, "State=DOWN");
	else if (part_ptr->state_up == PARTITION_INACTIVE)
		xstrcat(out, "State=INACTIVE");
	else if (part_ptr->state_up == PARTITION_DRAIN)
		xstrcat(out, "State=DRAIN");
	else
		xstrcat(out, "State=UNKNOWN");

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)part_ptr->total_cpus, tmp,
				 sizeof(tmp), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, " TotalCPUs=%s", tmp);
	} else
		xstrfmtcat(out, " TotalCPUs=%u", part_ptr->total_cpus);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)part_ptr->total_nodes, tmp,
				 sizeof(tmp), UNIT_NONE, NO_VAL,
				 CONVERT_NUM_UNIT_EXACT);
		xstrfmtcat(out, " TotalNodes=%s", tmp);
	} else
		xstrfmtcat(out, " TotalNodes=%u", part_ptr->total_nodes);

	xstrfmtcat(out, " SelectTypeParameters=%s",
		   select_type_param_string(part_ptr->cr_type));
	xstrcat(out, line_end);

	/****** Line 9 ******/
	/* MEM_PER_CPU is a flag bit: set means the value is a per-CPU
	 * limit, clear means per-node; value of exactly MEM_PER_CPU (no
	 * count bits) or zero means unlimited */
	if (part_ptr->def_mem_per_cpu & MEM_PER_CPU) {
		if (part_ptr->def_mem_per_cpu == MEM_PER_CPU) {
			xstrcat(out, "DefMemPerCPU=UNLIMITED");
		} else {
			xstrfmtcat(out, "DefMemPerCPU=%"PRIu64"",
				   part_ptr->def_mem_per_cpu &
				   (~MEM_PER_CPU));
		}
	} else if (part_ptr->def_mem_per_cpu == 0) {
		xstrcat(out, "DefMemPerNode=UNLIMITED");
	} else {
		xstrfmtcat(out, "DefMemPerNode=%"PRIu64"",
			   part_ptr->def_mem_per_cpu);
	}
	if (part_ptr->max_mem_per_cpu & MEM_PER_CPU) {
		if (part_ptr->max_mem_per_cpu == MEM_PER_CPU) {
			xstrcat(out, " MaxMemPerCPU=UNLIMITED");
		} else {
			xstrfmtcat(out, " MaxMemPerCPU=%"PRIu64"",
				   part_ptr->max_mem_per_cpu &
				   (~MEM_PER_CPU));
		}
	} else if (part_ptr->max_mem_per_cpu == 0) {
		xstrcat(out, " MaxMemPerNode=UNLIMITED");
	} else {
		xstrfmtcat(out, " MaxMemPerNode=%"PRIu64"",
			   part_ptr->max_mem_per_cpu);
	}

	/****** Line 10 ******/
	/* TRESBillingWeights is only printed when configured */
	if (part_ptr->billing_weights_str) {
		xstrcat(out, line_end);
		xstrfmtcat(out, "TRESBillingWeights=%s",
			   part_ptr->billing_weights_str);
	}

	/* Terminate record: blank line between records in multi-line mode */
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");
	return out;
}
/*
 * _attempt_backfill - run one backfill scheduling cycle.
 *
 * Pops jobs off a priority-sorted queue and, for each, searches the
 * node_space time/bitmap table for the earliest window in which the job
 * could run without delaying higher-priority work. Jobs that can start
 * now are started; otherwise a resource reservation is added to the
 * table (unless the job's QOS has QOS_FLAG_NO_RESERVE).
 *
 * Periodically yields locks (_yield_locks) when RPCs are pending or the
 * scheduling time slice expires; if system state changed while yielded,
 * the cycle aborts.
 *
 * RET 0 normally, 1 if the cycle was aborted because system state
 *	changed while locks were yielded (see rc assignments below).
 */
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr, **bf_part_ptr = NULL;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit,
		part_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end,
		window_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0, bf_parts = 0,
		*bf_part_jobs = NULL;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;
	struct part_record *reject_array_part = NULL;
	uint32_t job_start_cnt = 0, start_time;
	/* snapshots used to detect config/partition changes after a
	 * lock yield */
	time_t config_update = slurmctld_conf.last_update;
	time_t part_update = last_part_update;
	struct timeval start_tv;

	bf_last_yields = 0;
#ifdef HAVE_ALPS_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1000000);
#endif
	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	else
		debug("backfill: beginning");
	sched_start = now = time(NULL);
	gettimeofday(&start_tv, NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true, true);
	if (list_count(job_queue) == 0) {
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill: no jobs to backfill");
		else
			debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	/* nodes NOT in completing state */
	non_cg_bitmap = bit_copy(cg_node_bitmap);
	bit_not(non_cg_bitmap);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	slurmctld_diag_stats.bf_active = 1;

	/* node_space is a linked list (via .next indices) of time windows,
	 * each with the bitmap of nodes available in that window; entry 0
	 * spans the whole backfill window */
	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt * 2 + 1));
	node_space[0].begin_time = sched_start;
	window_end = sched_start + backfill_window;
	node_space[0].end_time = window_end;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_part) {
		ListIterator part_iterator;
		/* NOTE: intentionally shadows the outer part_ptr for this
		 * initialization scope only */
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_part_ptr = xmalloc(sizeof(struct part_record *) *
				      bf_parts);
		bf_part_jobs = xmalloc(sizeof(int) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_part_ptr[i++] = part_ptr;
		}
		list_iterator_destroy(part_iterator);
	}
	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	sort_job_queue(job_queue);
	while (1) {
		job_queue_rec = (job_queue_rec_t *) list_pop(job_queue);
		if (!job_queue_rec) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: reached end of job queue");
			break;
		}
		/* NOTE(review): this break does not xfree(job_queue_rec),
		 * unlike the state-change break below — looks like a small
		 * leak on shutdown; confirm against upstream */
		if (slurmctld_config.shutdown_time)
			break;
		/* Yield locks when RPCs are backed up or the time slice
		 * for this cycle has expired */
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >=
		      defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) &&
			     !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.
					     bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				xfree(job_queue_rec);
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 0;
			START_TIMER;
		}
		job_ptr = job_queue_rec->job_ptr;
		/* With bf_continue configured, the original job could have
		 * been cancelled and purged. Validate pointer here. */
		if ((job_ptr->magic != JOB_MAGIC) ||
		    (job_ptr->job_id != job_queue_rec->job_id)) {
			xfree(job_queue_rec);
			continue;
		}
		orig_time_limit = job_ptr->time_limit;
		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;
		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;
		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != NO_VAL) {
			if ((reject_array_job_id == job_ptr->array_job_id) &&
			    (reject_array_part == part_ptr))
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
			reject_array_part = part_ptr;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill test for JobID=%u Prio=%u Partition=%s",
			     job_ptr->job_id, job_ptr->priority,
			     job_ptr->part_ptr->name);
		}

		/* Enforce bf_max_job_part: count jobs tested per partition */
		if (max_backfill_job_per_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_jobs[j]++ >=
				    max_backfill_job_per_part)
					skip_job = true;
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: have already "
					     "checked %u jobs for "
					     "partition %s; skipping "
					     "job %u",
					     max_backfill_job_per_part,
					     job_ptr->part_ptr->name,
					     job_ptr->job_id);
				continue;
			}
		}
		/* Enforce bf_max_job_user: count jobs tested per user;
		 * users beyond BF_MAX_USERS are not limited */
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags &
					    DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] >= max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags &
					    DEBUG_FLAG_BACKFILL)
						info("backfill: have already "
						     "checked %u jobs for "
						     "user %u; skipping "
						     "job %u",
						     max_backfill_job_per_user,
						     job_ptr->user_id,
						     job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL) ||
		    ((part_ptr->flags & PART_FLAG_ROOT_ONLY) &&
		     filter_root)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: partition %s not usable",
				     job_ptr->part_ptr->name);
			continue;
		}

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) !=
		     SLURM_SUCCESS)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u not runable now",
				     job_ptr->job_id);
			continue;
		}

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u node count too high",
				     job_ptr->job_id);
			continue;
		}

		/* Determine job's expected completion time */
		if (part_ptr->max_time == INFINITE)
			part_time_limit = 365 * 24 * 60; /* one year */
		else
			part_time_limit = part_ptr->max_time;
		if (job_ptr->time_limit == NO_VAL) {
			time_limit = part_time_limit;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_time_limit);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		/* NO_RESERVE QOS with preemption active: test with minimal
		 * (1 minute) time limit; restored further below */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min &&
			 (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if (slurmctld_config.shutdown_time)
			break;
		/* Same lock-yield logic as at loop top, but must also
		 * save/restore this job's id and time limit across the
		 * yield */
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >=
		      defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			uint32_t save_job_id = job_ptr->job_id;
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) &&
			     !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.
					     bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);

			/* With bf_continue configured, the original job could
			 * have been scheduled or cancelled and purged.
			 * Revalidate job the record here. */
			if ((job_ptr->magic != JOB_MAGIC) ||
			    (job_ptr->job_id != save_job_id))
				continue;
			if (!IS_JOB_PENDING(job_ptr))
				continue;
			if (!avail_front_end(job_ptr))
				continue;	/* No available frontend */
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u reservation defer",
				     job_ptr->job_id);
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		bit_and(avail_bitmap, non_cg_bitmap);
		/* Walk the node_space list, AND-ing in the availability of
		 * every window overlapping [start_res, end_time]; remember
		 * the next window boundary as a possible retry time */
		for (j = 0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if (resv_end && (++resv_end < window_end) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}
		/* Temporarily invert the job's excluded-node bitmap to
		 * mask those nodes out, then restore it */
		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features OR
		 *	no change since previously tested nodes (only changes
		 *	in other partition nodes) */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_job_test(job_ptr, avail_bitmap, start_res);
		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) { /* Can start now */
			uint32_t save_time_limit = job_ptr->time_limit;
			uint32_t hard_limit;
			bool reset_time = false;
			/* NOTE(review): this rc intentionally shadows the
			 * function-level rc (which carries the
			 * state-changed flag); easy to misread */
			int rc = _start_job(job_ptr, resv_bitmap);
			/* Restore/raise the job's time limit, which may
			 * have been lowered for the backfill test above */
			if (qos_ptr &&
			    (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL) {
					acct_policy_alter_job(
						job_ptr, comp_time_limit);
					job_ptr->time_limit =
						comp_time_limit;
				} else {
					acct_policy_alter_job(
						job_ptr, orig_time_limit);
					job_ptr->time_limit =
						orig_time_limit;
				}
			} else if ((rc == SLURM_SUCCESS) &&
				   job_ptr->time_min) {
				/* Set time limit as high as possible */
				acct_policy_alter_job(job_ptr,
						      comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				reset_time = true;
			} else if (orig_time_limit == NO_VAL) {
				acct_policy_alter_job(job_ptr,
						      comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
			} else {
				acct_policy_alter_job(job_ptr,
						      orig_time_limit);
				job_ptr->time_limit = orig_time_limit;
			}
			if (job_ptr->time_limit == INFINITE)
				hard_limit = 365 * 24 * 60; /* one year */
			else
				hard_limit = job_ptr->time_limit;
			job_ptr->end_time = job_ptr->start_time +
					    (hard_limit * 60);
			if (reset_time) {
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			}

			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: planned start of job %u"
					     " failed: %s", job_ptr->job_id,
					     slurm_strerror(rc));
				}
				/* Drop through and reserve these resources.
				 * Likely due to state changes during sleep.
				 * Make best-effort based upon original state */
				job_ptr->time_limit = orig_time_limit;
				later_start = 0;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;
				reject_array_part = NULL;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(
						acct_db_conn, job_ptr);
				job_start_cnt++;
				if (max_backfill_jobs_start &&
				    (job_start_cnt >=
				     max_backfill_jobs_start)) {
					if (debug_flags &
					    DEBUG_FLAG_BACKFILL) {
						info("backfill: bf_max_job_start"
						     " limit of %d reached",
						     max_backfill_jobs_start);
					}
					break;
				}
				continue;
			}
		} else {
			job_ptr->time_limit = orig_time_limit;
		}

		/* Round the planned reservation window down to the
		 * configured backfill time resolution */
		start_time = job_ptr->start_time;
		end_reserve = job_ptr->start_time + (time_limit * 60);
		start_time = (start_time / backfill_resolution) *
			     backfill_resolution;
		end_reserve = (end_reserve / backfill_resolution) *
			      backfill_resolution;

		if (later_start && (start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				_dump_job_sched(job_ptr, end_reserve,
						avail_bitmap);
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: table size limit of %u "
				     "reached", max_backfill_job_cnt);
			}
			break;
		}

		if ((job_ptr->start_time > now) &&
		    _test_resv_overlap(node_space, avail_bitmap,
				       start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		reject_array_part = NULL;
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		xfree(job_ptr->sched_nodes);
		job_ptr->sched_nodes = bitmap2node_name(avail_bitmap);
		/* _add_reservation takes the set of BUSY nodes */
		bit_not(avail_bitmap);
		_add_reservation(start_time, end_reserve,
				 avail_bitmap, node_space,
				 &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_node_space_table(node_space);
	}

	/* Cleanup: release all per-cycle allocations and bitmaps */
	xfree(bf_part_jobs);
	xfree(bf_part_ptr);
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	FREE_NULL_BITMAP(non_cg_bitmap);
	for (i = 0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %u(%d) jobs, %s",
		     slurmctld_diag_stats.bf_last_depth,
		     job_test_count, TIME_STR);
	}
	return rc;
}