Example #1
/* If slurmctld crashes, the node state that it recovers could differ
 * from the actual hardware state (e.g. ResumeProgram failed to complete).
 * To address that, when a node that should be powered up for a running
 * job is not responding, run ResumeProgram on it again. */
static void _re_wake(void)
{
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	int i;

	node_ptr = node_record_table_ptr;
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_ALLOCATED(node_ptr)   &&
		    IS_NODE_NO_RESPOND(node_ptr)  &&
		    !IS_NODE_POWER_SAVE(node_ptr) &&
		    (bit_test(suspend_node_bitmap, i) == 0) &&
		    (bit_test(resume_node_bitmap,  i) == 0)) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			bit_set(wake_node_bitmap, i);
		}
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes) {
			pid_t pid = _run_prog(resume_prog, nodes, NULL);
			info("power_save: pid %d rewaking nodes %s",
			     (int) pid, nodes);
		} else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
	}
}
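Every example in this collection hinges on the same idiom: build a bitstr_t of node indexes, render it with bitmap2node_name(), log the string, then release both objects. A minimal sketch of that pattern, assuming the usual slurmctld headers; the helper name _log_node_bitmap is hypothetical:

static void _log_node_bitmap(void)
{
	bitstr_t *node_bitmap = bit_alloc(node_record_count);
	char *nodes;

	bit_set(node_bitmap, 0);	/* select node index 0 */
	nodes = bitmap2node_name(node_bitmap);	/* caller must xfree() */
	if (nodes)
		info("selected nodes: %s", nodes);
	else
		error("bitmap2node_name failed");
	xfree(nodes);			/* xfree(NULL) is a no-op */
	FREE_NULL_BITMAP(node_bitmap);
}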
Example #2
extern uint64_t RRD_consolidate(time_t step_starttime, time_t step_endtime,
				bitstr_t* bitmap_of_nodes)
{
	uint64_t consumed_energy = 0;
	uint64_t tmp;
	char *node_name = NULL;
	hostlist_t hl;
	char* path;

	node_name = bitmap2node_name(bitmap_of_nodes);
	hl = hostlist_create(node_name);
	xfree(node_name);
	while ((node_name = hostlist_shift(hl))) {
		if (!(path = _get_node_rrd_path(node_name,
						EXT_SENSORS_VALUE_ENERGY))) {
			/* No RRD path for this node; record the failure and
			 * stop rather than consolidating a NULL path. */
			consumed_energy = (uint64_t)NO_VAL;
			free(node_name);
			break;
		}
		free(node_name);
		if ((tmp = _rrd_consolidate_one(
			     step_starttime, step_endtime, path,
			     ext_sensors_cnf->energy_rra_name, true)) == NO_VAL)
			consumed_energy = (uint64_t)NO_VAL;
		xfree(path);
		if (consumed_energy == (uint64_t)NO_VAL)
			break;
		consumed_energy += tmp;
	}
	hostlist_destroy(hl);

	return consumed_energy;
}
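The round trip above (ranged node string in, individual host names out) is the standard hostlist pattern; note that hostlist_shift() returns a malloc'd string released with free(), not xfree(). A minimal sketch under the same assumptions, with a hypothetical helper name:

static void _print_each_node(bitstr_t *node_bitmap)
{
	char *ranged = bitmap2node_name(node_bitmap);	/* e.g. "tux[0-2]" */
	hostlist_t hl = hostlist_create(ranged);
	char *name;

	xfree(ranged);
	while ((name = hostlist_shift(hl))) {	/* "tux0", "tux1", ... */
		info("node: %s", name);
		free(name);	/* hostlist memory is not xmalloc'd */
	}
	hostlist_destroy(hl);
}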
Example #3
/* Try to start the job on any non-reserved nodes */
static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
{
	int rc;
	bitstr_t *orig_exc_nodes = NULL;
	static uint32_t fail_jobid = 0;

	if (job_ptr->details->exc_node_bitmap) {
		orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap);
		bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap);
	} else
		job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap);

	rc = select_nodes(job_ptr, false, NULL);
	if (job_ptr->details) { /* select_nodes() might cancel the job! */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = orig_exc_nodes;
	} else
		FREE_NULL_BITMAP(orig_exc_nodes);
	if (rc == SLURM_SUCCESS) {
		/* job initiated */
		last_job_update = time(NULL);
		info("backfill: Started JobId=%u on %s",
		     job_ptr->job_id, job_ptr->nodes);
		if (job_ptr->batch_flag == 0)
			srun_allocate(job_ptr->job_id);
		else if ((job_ptr->details == NULL) ||
			 (job_ptr->details->prolog_running == 0))
			launch_job(job_ptr);
		slurmctld_diag_stats.backfilled_jobs++;
		slurmctld_diag_stats.last_backfilled_jobs++;
		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill: Jobs backfilled since boot: %u",
			     slurmctld_diag_stats.backfilled_jobs);
		}
	} else if ((job_ptr->job_id != fail_jobid) &&
		   (rc != ESLURM_ACCOUNTING_POLICY)) {
		char *node_list;
		bit_not(resv_bitmap);
		node_list = bitmap2node_name(resv_bitmap);
		/* This happens when a job has sharing disabled and
		 * a selected node is still completing some job,
		 * which should be a temporary situation. */
		verbose("backfill: Failed to start JobId=%u on %s: %s",
			job_ptr->job_id, node_list, slurm_strerror(rc));
		xfree(node_list);
		fail_jobid = job_ptr->job_id;
	} else {
		debug3("backfill: Failed to start JobId=%u: %s",
		       job_ptr->job_id, slurm_strerror(rc));
	}

	return rc;
}
Example #4
/* Log resources to be allocated to a pending job */
static void _dump_job_sched(struct job_record *job_ptr, time_t end_time,
			    bitstr_t *avail_bitmap)
{
	char begin_buf[32], end_buf[32], *node_list;

	slurm_make_time_str(&job_ptr->start_time, begin_buf, sizeof(begin_buf));
	slurm_make_time_str(&end_time, end_buf, sizeof(end_buf));
	node_list = bitmap2node_name(avail_bitmap);
	info("Job %u to start at %s, end at %s on %s",
	     job_ptr->job_id, begin_buf, end_buf, node_list);
	xfree(node_list);
}
Example #5
static void _dump_job_test(struct job_record *job_ptr, bitstr_t *avail_bitmap,
			   time_t start_time)
{
	char begin_buf[32], *node_list;

	if (start_time == 0)
		strcpy(begin_buf, "NOW");
	else
		slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf));
	node_list = bitmap2node_name(avail_bitmap);
	info("Test job %u at %s on %s", job_ptr->job_id, begin_buf, node_list);
	xfree(node_list);
}
Example #6
/* power_job_reboot - Reboot compute nodes for a job from the head node */
extern int power_job_reboot(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	int i, i_first, i_last;
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	time_t now = time(NULL);
	char *nodes, *features = NULL;

	wake_node_bitmap = bit_alloc(node_record_count);
	i_first = bit_ffs(job_ptr->node_bitmap);
	if (i_first >= 0)
		i_last = bit_fls(job_ptr->node_bitmap);
	else
		i_last = i_first - 1;	/* empty bitmap: skip the loop */
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(job_ptr->node_bitmap, i))
			continue;
		node_ptr = node_record_table_ptr + i;
		resume_cnt++;
		resume_cnt_f++;
		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
		node_ptr->node_state |=   NODE_STATE_POWER_UP;
		node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
		bit_clear(power_node_bitmap, i);
		bit_clear(avail_node_bitmap, i);
		node_ptr->last_response = now + resume_timeout;
		bit_set(wake_node_bitmap,    i);
		bit_set(resume_node_bitmap,  i);
	}

	nodes = bitmap2node_name(wake_node_bitmap);
	if (nodes) {
#if _DEBUG
		info("power_save: reboot nodes %s", nodes);
#else
		verbose("power_save: reboot nodes %s", nodes);
#endif
		if (job_ptr->details && job_ptr->details->features)
			features = xlate_features(job_ptr->details->features);
		_run_prog(resume_prog, nodes, features);
		xfree(features);
	} else {
		error("power_save: bitmap2nodename");
		rc = SLURM_ERROR;
	}
	xfree(nodes);
	FREE_NULL_BITMAP(wake_node_bitmap);
	last_node_update = now;

	return rc;
}
Example #7
static void _dump_resv_port_info(void)
{
#if _DEBUG
	int i;
	char *tmp_char;

	for (i=0; i<port_resv_cnt; i++) {
		if (bit_set_count(port_resv_table[i]) == 0)
			continue;

		tmp_char = bitmap2node_name(port_resv_table[i]);
		info("Port %d: %s", (i+port_resv_min), tmp_char);
		xfree(tmp_char);
	}
#endif
}
Example #8
static void _log_switches(void)
{
	int i;
	struct switch_record *switch_ptr;

	switch_ptr = switch_record_table;
	for (i=0; i<switch_record_cnt; i++, switch_ptr++) {
		if (!switch_ptr->nodes) {
			switch_ptr->nodes = bitmap2node_name(switch_ptr->
							     node_bitmap);
		}
		debug("Switch level:%d name:%s nodes:%s switches:%s",
		      switch_ptr->level, switch_ptr->name,
		      switch_ptr->nodes, switch_ptr->switches);
	}
}
Example #9
static void _build_select_struct(struct job_record *job_ptr,
				 bitstr_t *bitmap, uint32_t node_cnt)
{
	int i;
	uint32_t total_cpus = 0;
	job_resources_t *job_resrcs_ptr;

	xassert(job_ptr);

	if (job_ptr->job_resrcs) {
		error("select_p_job_test: already have select_job");
		free_job_resources(&job_ptr->job_resrcs);
	}

	job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources();
	job_resrcs_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t));
	job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t));
	job_resrcs_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt);
	job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
/* 	job_resrcs_ptr->nhosts = node_cnt; */
	job_resrcs_ptr->nhosts = bit_set_count(bitmap);
	job_resrcs_ptr->ncpus = job_ptr->details->min_cpus;
	job_resrcs_ptr->node_bitmap = bit_copy(bitmap);
	job_resrcs_ptr->nodes = bitmap2node_name(bitmap);
	if (job_resrcs_ptr->node_bitmap == NULL)
		fatal("bit_copy malloc failure");

	job_resrcs_ptr->cpu_array_cnt = 1;
	if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
		job_resrcs_ptr->cpu_array_value[0] = job_ptr->details->min_cpus;
	else
		job_resrcs_ptr->cpu_array_value[0] = bg_conf->cpus_per_mp;
	job_resrcs_ptr->cpu_array_reps[0] = node_cnt;
	total_cpus = bg_conf->cpu_ratio * node_cnt;

	for (i=0; i<node_cnt; i++)
		job_resrcs_ptr->cpus[i] = bg_conf->cpu_ratio;

	if (job_resrcs_ptr->ncpus != total_cpus) {
		error("select_p_job_test: ncpus mismatch %u != %u",
		      job_resrcs_ptr->ncpus, total_cpus);
	}
}
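The cpu_array_value/cpu_array_reps pair filled in above is a run-length encoding of the per-node CPU counts. A sketch of how one hypothetical entry expands:

/* cpu_array_cnt = 1, cpu_array_value[0] = 4, cpu_array_reps[0] = 8
 * is the compressed form of the per-node array:
 *   cpus[] = { 4, 4, 4, 4, 4, 4, 4, 4 }	(4 CPUs on each of 8 nodes)
 */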
Example #10
/* Log resource allocation table */
static void _dump_node_space_table(node_space_map_t *node_space_ptr)
{
	int i = 0;
	char begin_buf[32], end_buf[32], *node_list;

	info("=========================================");
	while (1) {
		slurm_make_time_str(&node_space_ptr[i].begin_time,
				    begin_buf, sizeof(begin_buf));
		slurm_make_time_str(&node_space_ptr[i].end_time,
				    end_buf, sizeof(end_buf));
		node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap);
		info("Begin:%s End:%s Nodes:%s",
		     begin_buf, end_buf, node_list);
		xfree(node_list);
		if ((i = node_space_ptr[i].next) == 0)
			break;
	}
	info("=========================================");
}
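The while loop above walks an index-linked array rather than a pointer list: element 0 is always the head, .next holds the index of the following record, and a .next of 0 terminates the walk. A sketch of the layout it expects (times illustrative):

/* node_space_ptr[0]: begin=17:00 end=18:00 next=1
 * node_space_ptr[1]: begin=18:00 end=20:00 next=0	<- tail
 */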
Example #11
/*
 * bitmap2wiki_node_name  - given a bitmap, build a list of colon separated
 *	node names (if we can't use node range expressions), or the
 *	normal slurm node name expression
 *
 * IN bitmap - bitmap pointer
 * RET pointer to node list or NULL on error
 * globals: node_record_table_ptr - pointer to node table
 * NOTE: the caller must xfree the returned pointer when no longer required
 */
extern char *	bitmap2wiki_node_name(bitstr_t *bitmap)
{
	int i;
	char *buf = NULL;

	if (use_host_exp)
		return bitmap2node_name(bitmap);

	if (bitmap == NULL)
		return xstrdup("");

	for (i = 0; i < node_record_count; i++) {
		if (bit_test (bitmap, i) == 0)
			continue;
		if (buf)
			xstrcat(buf, ":");
		xstrcat(buf, node_record_table_ptr[i].name);
	}
	return buf;
}
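A hypothetical before/after makes the two output formats concrete (node names illustrative):

	bitstr_t *bm = bit_alloc(node_record_count);
	char *wiki;

	bit_nset(bm, 0, 2);	/* hypothetical nodes tux0..tux2 */
	wiki = bitmap2wiki_node_name(bm);
	/* use_host_exp == false -> "tux0:tux1:tux2"
	 * use_host_exp == true  -> "tux[0-2]" (same as bitmap2node_name) */
	xfree(wiki);
	FREE_NULL_BITMAP(bm);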
Example #12
static void _validate_switches(void)
{
	slurm_conf_switches_t *ptr, **ptr_array;
	int depth, i, j;
	struct switch_record *switch_ptr, *prior_ptr;
	hostlist_t hl, invalid_hl = NULL;
	char *child, *buf;
	bool  have_root = false;
	bitstr_t *multi_homed_bitmap = NULL;	/* nodes on >1 leaf switch */
	bitstr_t *switches_bitmap = NULL;	/* nodes on any leaf switch */
	bitstr_t *tmp_bitmap = NULL;

	_free_switch_record_table();

	switch_record_cnt = _read_topo_file(&ptr_array);
	if (switch_record_cnt == 0) {
		error("No switches configured");
		s_p_hashtbl_destroy(conf_hashtbl);
		return;
	}

	switch_record_table = xmalloc(sizeof(struct switch_record) *
				      switch_record_cnt);
	multi_homed_bitmap = bit_alloc(node_record_count);
	switch_ptr = switch_record_table;
	for (i=0; i<switch_record_cnt; i++, switch_ptr++) {
		ptr = ptr_array[i];
		switch_ptr->name = xstrdup(ptr->switch_name);
		/* See if switch name has already been defined. */
		prior_ptr = switch_record_table;
		for (j=0; j<i; j++, prior_ptr++) {
			if (strcmp(switch_ptr->name, prior_ptr->name) == 0) {
				fatal("Switch (%s) has already been defined",
				      prior_ptr->name);
			}
		}
		switch_ptr->link_speed = ptr->link_speed;
		if (ptr->nodes) {
			switch_ptr->level = 0;	/* leaf switch */
			switch_ptr->nodes = xstrdup(ptr->nodes);
			if (_node_name2bitmap(ptr->nodes, 
					      &switch_ptr->node_bitmap, 
					      &invalid_hl)) {
				fatal("Invalid node name (%s) in switch "
				      "config (%s)",
				      ptr->nodes, ptr->switch_name);
			}
			if (switches_bitmap) {
				tmp_bitmap = bit_copy(switch_ptr->node_bitmap);
				bit_and(tmp_bitmap, switches_bitmap);
				bit_or(multi_homed_bitmap, tmp_bitmap);
				FREE_NULL_BITMAP(tmp_bitmap);
				bit_or(switches_bitmap,
				       switch_ptr->node_bitmap);
			} else {
				switches_bitmap = bit_copy(switch_ptr->
							   node_bitmap);
			}
		} else if (ptr->switches) {
			switch_ptr->level = -1;	/* determine later */
			switch_ptr->switches = xstrdup(ptr->switches);
		} else {
			fatal("Switch configuration (%s) lacks children",
			      ptr->switch_name);
		}
	}

	for (depth=1; ; depth++) {
		bool resolved = true;
		switch_ptr = switch_record_table;
		for (i=0; i<switch_record_cnt; i++, switch_ptr++) {
			if (switch_ptr->level != -1)
				continue;
			hl = hostlist_create(switch_ptr->switches);
			if (!hl) {
				fatal("Invalid switches: %s",
				      switch_ptr->switches);
			}
			while ((child = hostlist_pop(hl))) {
				j = _get_switch_inx(child);
				if ((j < 0) || (j == i)) {
					fatal("Switch configuration %s has "
					      "invalid child (%s)",
					      switch_ptr->name, child);
				}
				if (switch_record_table[j].level == -1) {
					/* Children not resolved */
					resolved = false;
					switch_ptr->level = -1;
					FREE_NULL_BITMAP(switch_ptr->
							 node_bitmap);
					free(child);
					break;
				}
				if (switch_ptr->level == -1) {
					switch_ptr->level = 1 +
						switch_record_table[j].level;
					switch_ptr->node_bitmap =
						bit_copy(switch_record_table[j].
							 node_bitmap);
				} else {
					switch_ptr->level =
						MAX(switch_ptr->level,
						     (switch_record_table[j].
						      level + 1));
					bit_or(switch_ptr->node_bitmap,
					       switch_record_table[j].
					       node_bitmap);
				}
				free(child);
			}
			hostlist_destroy(hl);
		}
		if (resolved)
			break;
		if (depth > 20)	/* Prevent infinite loop */
			fatal("Switch configuration is not a tree");
	}

	switch_levels = 0;
	switch_ptr = switch_record_table;
	for (i=0; i<switch_record_cnt; i++, switch_ptr++) {
		switch_levels = MAX(switch_levels, switch_ptr->level);
		if (switch_ptr->node_bitmap == NULL)
			error("switch %s has no nodes", switch_ptr->name);
	}
	if (switches_bitmap) {
		bit_not(switches_bitmap);
		i = bit_set_count(switches_bitmap);
		if (i > 0) {
			child = bitmap2node_name(switches_bitmap);
			error("WARNING: switches lack access to %d nodes: %s",
			      i, child);
			xfree(child);
		}
		FREE_NULL_BITMAP(switches_bitmap);
	} else
		fatal("switches contain no nodes");

	if (invalid_hl) {
		buf = hostlist_ranged_string_xmalloc(invalid_hl);
		error("WARNING: Invalid hostnames in switch configuration: %s",
		      buf);
		xfree(buf);
		hostlist_destroy(invalid_hl);
	}

	/* Report nodes on multiple leaf switches,
	 * possibly due to bad configuration file */
	i = bit_set_count(multi_homed_bitmap);
	if (i > 0) {
		child = bitmap2node_name(multi_homed_bitmap);
		error("WARNING: Multiple leaf switches contain nodes: %s",
		      child);
		xfree(child);
	}
	FREE_NULL_BITMAP(multi_homed_bitmap);

	/* Create array of indexes of children of each switch,
	 * and see if any switch can reach all nodes */
	for (i = 0; i < switch_record_cnt; i++) {
		if (switch_record_table[i].level != 0) {
			_find_child_switches (i);
		}
		if (node_record_count ==
			bit_set_count(switch_record_table[i].node_bitmap)) {
			have_root = true;
		}
	}
	if (!have_root) {
		info("TOPOLOGY: warning -- no switch can reach all nodes"
				" through its descendants."
				"Do not use route/topology");
	}
	s_p_hashtbl_destroy(conf_hashtbl);
	_log_switches();
}
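For context, a minimal topology.conf of the shape this function validates: two leaf switches plus a root that reaches every node, so have_root becomes true. Switch and node names are hypothetical:

SwitchName=leaf1 Nodes=tux[0-15]
SwitchName=leaf2 Nodes=tux[16-31]
SwitchName=root Switches=leaf1,leaf2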
Example #13
/* Initialize power_save module parameters.
 * Return 0 on valid configuration to run power saving,
 * otherwise log the problem and return -1 */
static int _init_power_config(void)
{
	slurm_ctl_conf_t *conf = slurm_conf_lock();

	last_config     = slurmctld_conf.last_update;
	idle_time       = conf->suspend_time - 1;
	suspend_rate    = conf->suspend_rate;
	resume_timeout  = conf->resume_timeout;
	resume_rate     = conf->resume_rate;
	slurmd_timeout  = conf->slurmd_timeout;
	suspend_timeout = conf->suspend_timeout;
	_clear_power_config();
	if (conf->suspend_program)
		suspend_prog = xstrdup(conf->suspend_program);
	if (conf->resume_program)
		resume_prog = xstrdup(conf->resume_program);
	if (conf->suspend_exc_nodes)
		exc_nodes = xstrdup(conf->suspend_exc_nodes);
	if (conf->suspend_exc_parts)
		exc_parts = xstrdup(conf->suspend_exc_parts);
	slurm_conf_unlock();

	if (idle_time < 0) {	/* not an error */
		debug("power_save module disabled, SuspendTime < 0");
		return -1;
	}
	if (suspend_rate < 0) {
		error("power_save module disabled, SuspendRate < 0");
		return -1;
	}
	if (resume_rate < 0) {
		error("power_save module disabled, ResumeRate < 0");
		return -1;
	}
	if (suspend_prog == NULL) {
		error("power_save module disabled, NULL SuspendProgram");
		return -1;
	} else if (!_valid_prog(suspend_prog)) {
		error("power_save module disabled, invalid SuspendProgram %s",
		      suspend_prog);
		return -1;
	}
	if (resume_prog == NULL) {
		error("power_save module disabled, NULL ResumeProgram");
		return -1;
	} else if (!_valid_prog(resume_prog)) {
		error("power_save module disabled, invalid ResumeProgram %s",
		      resume_prog);
		return -1;
	}

	if (exc_nodes &&
	    (node_name2bitmap(exc_nodes, false, &exc_node_bitmap))) {
		error("power_save module disabled, "
		      "invalid SuspendExcNodes %s", exc_nodes);
		return -1;
	}

	if (exc_parts) {
		char *tmp = NULL, *one_part = NULL, *part_list = NULL;
		struct part_record *part_ptr = NULL;
		int rc = 0;

		part_list = xstrdup(exc_parts);
		one_part = strtok_r(part_list, ",", &tmp);
		while (one_part != NULL) {
			part_ptr = find_part_record(one_part);
			if (!part_ptr) {
				error("power_save module disabled, "
					"invalid SuspendExcPart %s",
					one_part);
				rc = -1;
				break;
			}
			if (exc_node_bitmap)
				bit_or(exc_node_bitmap, part_ptr->node_bitmap);
			else
				exc_node_bitmap = bit_copy(part_ptr->
							   node_bitmap);
			one_part = strtok_r(NULL, ",", &tmp);
		}
		xfree(part_list);
		if (rc)
			return rc;
	}

	if (exc_node_bitmap) {
		char *tmp = bitmap2node_name(exc_node_bitmap);
		debug("power_save module, excluded nodes %s", tmp);
		xfree(tmp);
	}

	return 0;
}
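A hypothetical slurm.conf excerpt that would pass every check above (program paths and values are illustrative):

SuspendTime=600			# idle_time = SuspendTime - 1; <= 0 disables power saving
SuspendRate=60			# nodes per minute; 0 means unlimited
ResumeRate=300
ResumeTimeout=600
SuspendTimeout=30
SuspendProgram=/usr/sbin/node_suspend.sh
ResumeProgram=/usr/sbin/node_resume.sh
SuspendExcNodes=tux[0-3]
SuspendExcParts=debug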
Example #14
/* power_job_reboot - Reboot compute nodes for a job from the head node */
extern int power_job_reboot(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	int i, i_first, i_last;
	struct node_record *node_ptr;
	bitstr_t *boot_node_bitmap = NULL;
	time_t now = time(NULL);
	char *nodes, *features = NULL;
	pid_t pid;

	boot_node_bitmap = node_features_reboot(job_ptr);
	if (boot_node_bitmap == NULL)
		return SLURM_SUCCESS;

	i_first = bit_ffs(boot_node_bitmap);
	if (i_first >= 0)
		i_last = bit_fls(boot_node_bitmap);
	else
		i_last = i_first - 1;
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(boot_node_bitmap, i))
			continue;
		node_ptr = node_record_table_ptr + i;
		resume_cnt++;
		resume_cnt_f++;
		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
		node_ptr->node_state |=   NODE_STATE_POWER_UP;
		node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
		bit_clear(power_node_bitmap, i);
		bit_clear(avail_node_bitmap, i);
		node_ptr->last_response = now + resume_timeout;
		bit_set(resume_node_bitmap,  i);
	}

	nodes = bitmap2node_name(boot_node_bitmap);
	if (nodes) {
		job_ptr->job_state |= JOB_CONFIGURING;
		job_ptr->wait_all_nodes = 1;
		if (job_ptr->details && job_ptr->details->features &&
		    node_features_g_user_update(job_ptr->user_id)) {
			features = node_features_g_job_xlate(
					job_ptr->details->features);
		}
		pid = _run_prog(resume_prog, nodes, features);
#if _DEBUG
		info("power_save: pid %d reboot nodes %s features %s",
		     (int) pid, nodes, features);
#else
		verbose("power_save: pid %d reboot nodes %s features %s",
			(int) pid, nodes, features);
#endif
		xfree(features);
	} else {
		error("power_save: bitmap2nodename");
		rc = SLURM_ERROR;
	}
	xfree(nodes);
	FREE_NULL_BITMAP(boot_node_bitmap);
	last_node_update = now;

	return rc;
}
Example #15
/* Perform any power change work to nodes */
static void _do_power_work(time_t now)
{
	static time_t last_log = 0, last_work_scan = 0;
	int i, wake_cnt = 0, sleep_cnt = 0, susp_total = 0;
	time_t delta_t;
	uint32_t susp_state;
	bitstr_t *wake_node_bitmap = NULL, *sleep_node_bitmap = NULL;
	struct node_record *node_ptr;
	bool run_suspend = false;

	/* Set limit on counts of nodes to have state changed */
	delta_t = now - last_work_scan;
	if (delta_t >= 60) {
		suspend_cnt_f = 0.0;
		resume_cnt_f  = 0.0;
	} else {
		float rate = (60 - delta_t) / 60.0;
		suspend_cnt_f *= rate;
		resume_cnt_f  *= rate;
	}
	suspend_cnt = (suspend_cnt_f + 0.5);
	resume_cnt  = (resume_cnt_f  + 0.5);

	if (now > (last_suspend + suspend_timeout)) {
		/* ready to start another round of node suspends */
		run_suspend = true;
		if (last_suspend) {
			bit_nclear(suspend_node_bitmap, 0,
				   (node_record_count - 1));
			bit_nclear(resume_node_bitmap, 0,
				   (node_record_count - 1));
			last_suspend = (time_t) 0;
		}
	}

	last_work_scan = now;

	/* Build bitmaps identifying each node which should change state */
	for (i = 0, node_ptr = node_record_table_ptr;
	     i < node_record_count; i++, node_ptr++) {
		susp_state = IS_NODE_POWER_SAVE(node_ptr);

		if (susp_state)
			susp_total++;

		/* Resume nodes as appropriate */
		if (susp_state &&
		    ((resume_rate == 0) || (resume_cnt < resume_rate))	&&
		    (bit_test(suspend_node_bitmap, i) == 0)		&&
		    (IS_NODE_ALLOCATED(node_ptr) ||
		     (node_ptr->last_idle > (now - idle_time)))) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			wake_cnt++;
			resume_cnt++;
			resume_cnt_f++;
			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
			node_ptr->node_state |=   NODE_STATE_POWER_UP;
			node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
			bit_clear(power_node_bitmap, i);
			bit_clear(avail_node_bitmap, i);
			node_ptr->last_response = now + resume_timeout;
			bit_set(wake_node_bitmap,    i);
			bit_set(resume_node_bitmap,  i);
		}

		/* Suspend nodes as appropriate */
		if (run_suspend 					&&
		    (susp_state == 0)					&&
		    ((suspend_rate == 0) || (suspend_cnt < suspend_rate)) &&
		    (IS_NODE_IDLE(node_ptr) || IS_NODE_DOWN(node_ptr))	&&
		    (node_ptr->sus_job_cnt == 0)			&&
		    (!IS_NODE_COMPLETING(node_ptr))			&&
		    (!IS_NODE_POWER_UP(node_ptr))			&&
		    (node_ptr->last_idle != 0)				&&
		    (node_ptr->last_idle < (now - idle_time))		&&
		    ((exc_node_bitmap == NULL) ||
		     (bit_test(exc_node_bitmap, i) == 0))) {
			if (sleep_node_bitmap == NULL) {
				sleep_node_bitmap =
					bit_alloc(node_record_count);
			}
			sleep_cnt++;
			suspend_cnt++;
			suspend_cnt_f++;
			node_ptr->node_state |= NODE_STATE_POWER_SAVE;
			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
			if (!IS_NODE_DOWN(node_ptr) &&
			    !IS_NODE_DRAIN(node_ptr))
				bit_set(avail_node_bitmap,   i);
			bit_set(power_node_bitmap,   i);
			bit_set(sleep_node_bitmap,   i);
			bit_set(suspend_node_bitmap, i);
			last_suspend = now;
		}
	}
	if (((now - last_log) > 600) && (susp_total > 0)) {
		info("Power save mode: %d nodes", susp_total);
		last_log = now;
	}

	if (sleep_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(sleep_node_bitmap);
		if (nodes)
			_do_suspend(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(sleep_node_bitmap);
		/* last_node_update could be changed already by another thread!
		last_node_update = now; */
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes)
			_do_resume(nodes);
		else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
		/* last_node_update could be changed already by another thread!
		last_node_update = now; */
	}
}
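The rate limiter at the top of _do_power_work() decays the carried-over suspend/resume counts linearly with the time since the last scan. A standalone sketch of that arithmetic (hypothetical helper name):

/* delta_t = 15 keeps 75% of the old count; delta_t >= 60 resets it. */
static float _decay_count(float cnt, time_t delta_t)
{
	if (delta_t >= 60)
		return 0.0;
	return cnt * ((60 - delta_t) / 60.0);
}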
Example #16
/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode,
			uint16_t cr_type, enum node_cr_state job_node_req, 
			uint32_t cr_node_cnt,
			struct part_res_record *cr_part_ptr,
			struct node_use_record *node_usage)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS;
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL;
	bool test_only;
	uint32_t c, i, j, k, csize, save_mem = 0;
	int n;	/* bit_ffs() can return -1, so this must be signed */
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN  */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS)
			return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores
	 */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("select/serial: cr_job_test: test 0 pass: "******"test_only");
		return SLURM_SUCCESS;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement is found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! Here we use a placement
	 *         algorithm similar to Step 5 on jobs from lower-priority
	 *         partitions.
	 */


	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("select/serial: could not find partition for job %u",
		      job_ptr->job_id);
		return SLURM_ERROR;	/* Fix CLANG false positive */
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if ((p_ptr->part_ptr->priority <= jp_ptr->part_ptr->priority) &&
		    (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF))
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* Jobs from low-priority partitions are the only thing left
		 * in our way. For now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and the low-priority
		 * jobs */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}


	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */

	if (!jp_ptr || !jp_ptr->row) {
		/* There are no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
				info("select/serial: cr_job_test: "
				     "test 4 pass - row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 fail - row %i", i);
		}
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 trying empty row %i",i);
		}
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;

	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if (!cpu_count) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */

	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN))
		job_ptr->total_cpus = 1;
	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	n = bit_ffs(bitmap);
	if (n < 0) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res  **/
	job_res                   = create_job_resources();
	job_res->node_bitmap      = bit_copy(bitmap);
	job_res->nodes            = bitmap2node_name(bitmap);
	job_res->nhosts           = bit_set_count(bitmap);
	job_res->ncpus            = job_res->nhosts;
	if (job_ptr->details->ntasks_per_node)
		job_res->ncpus   *= details_ptr->ntasks_per_node;
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->min_cpus);
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->pn_min_cpus);
	job_res->node_req         = job_node_req;
	job_res->cpus             = cpu_count;
	job_res->cpus_used        = xmalloc(job_res->nhosts *
					    sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used      = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					  select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	c = 0;
	csize = bit_size(job_res->core_bitmap);
	j = cr_get_coremap_offset(n);
	k = cr_get_coremap_offset(n + 1);
	for (; j < k; j++, c++) {
		if (!bit_test(free_cores, j))
			continue;
		if (c >= csize)	{
			error("select/serial: cr_job_test "
			      "core_bitmap index error on node %s", 
			      select_node_record[n].node_ptr->name);
			drain_nodes(select_node_record[n].node_ptr->name,
				    "Bad core count", getuid());
			free_job_resources(&job_res);
			FREE_NULL_BITMAP(free_cores);
			return SLURM_ERROR;
		}
		bit_set(job_res->core_bitmap, c);
		break;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d "
		     "nbits %u", job_ptr->job_id,
		     job_res->ncpus, bit_set_count(free_cores), 1,
		     job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	job_ptr->total_cpus = build_job_resources_cpu_array(job_res);

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		job_res->memory_allocated[0] = job_res->cpus[0] * save_mem;
	} else {
		/* memory is per-node */
		job_res->memory_allocated[0] = save_mem;
	}
	return error_code;
}
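One detail near the end is easy to miss: details_ptr->pn_min_memory overloads a single field, using the MEM_PER_CPU flag bit to distinguish a per-CPU limit from a per-node limit. A minimal decoding sketch (values illustrative):

	uint32_t mem = details_ptr->pn_min_memory;

	if (mem & MEM_PER_CPU) {
		mem &= (~MEM_PER_CPU);	/* e.g. 2048 -> 2048 MB per CPU */
		/* per-node total = mem * CPUs allocated on that node */
	} else {
		/* mem is already the per-node limit in MB */
	}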
Example #17
static char *	_will_run_test2(uint32_t jobid, time_t start_time,
				char *node_list,
				uint32_t *preemptee, int preemptee_cnt,
				int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL, *pre_ptr;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t start_res;
	uint32_t min_nodes, max_nodes, req_nodes;
	List preemptee_candidates = NULL, preempted_jobs = NULL;
	time_t orig_start_time;
	char *reply_msg = NULL;
	int i, rc;
	bool resv_overlap = false;

	xassert(node_list);
	debug2("wiki2: will_run2 job_id=%u start_time=%u node_list=%s",
		jobid, (uint32_t)start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node "
		      "list for job %u, %s", jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap, &resv_overlap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
			part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available "
			   "on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partitions max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
			jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		return NULL;
	}

	if (preemptee_cnt) {
		preemptee_candidates = list_create(NULL);
		for (i=0; i<preemptee_cnt; i++) {
			if ((pre_ptr = find_job_record(preemptee[i])))
				list_append(preemptee_candidates, pre_ptr);
		}
	}

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap, min_nodes, max_nodes,
			       req_nodes, SELECT_MODE_WILL_RUN,
			       preemptee_candidates, &preempted_jobs,
			       exc_core_bitmap);
	FREE_NULL_LIST(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char *hostlist, *sep, tmp_str[128];
		uint32_t pre_cnt = 0, proc_cnt = 0;

#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
				     SELECT_JOBDATA_NODE_CNT, &proc_cnt);
#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str),
			 "STARTINFO=%u TASKS=%u STARTTIME=%u NODES=",
			 job_ptr->job_id, proc_cnt,
			 (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);

		if (preempted_jobs) {
			while ((pre_ptr = list_pop(preempted_jobs))) {
				if (pre_cnt++)
					sep = ",";
				else
					sep = " PREEMPT=";
				snprintf(tmp_str, sizeof(tmp_str), "%s%u",
					 sep, pre_ptr->job_id);
				xstrcat(reply_msg, tmp_str);
			}
			FREE_NULL_LIST(preempted_jobs);
		}
	} else {
		xstrcat(reply_msg, "Jobs not runable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;

	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	return reply_msg;
}
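For reference, the reply assembled above has this shape (all numbers hypothetical). NODES comes from bitmap2node_name() and the PREEMPT list is appended only when preemption would be required:

STARTINFO=1234 TASKS=16 STARTTIME=1365012345 NODES=tux[0-3] PREEMPT=1230,1231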
Example #18
/*
 * finds the best match for a given job request
 *
 * OUT found_bg_record - matching block, NULL otherwise
 * RET - SLURM_SUCCESS if a match was found, SLURM_ERROR otherwise (no match)
 */
static int _find_best_block_match(List block_list,
				  int *blocks_added,
				  struct job_record* job_ptr,
				  bitstr_t* slurm_block_bitmap,
				  uint32_t min_nodes, uint32_t max_nodes,
				  uint32_t req_nodes,
				  bg_record_t** found_bg_record,
				  uint16_t query_mode, int avail_cpus)
{
	bg_record_t *bg_record = NULL;
	uint16_t req_geometry[SYSTEM_DIMENSIONS];
	uint16_t target_size = 0;
	uint32_t req_procs = job_ptr->details->min_cpus;
	select_ba_request_t request;
	int i, dim;
	int overlap_check = 0;
	int allow = 0;
	int check_image = 1;
	uint32_t max_cpus = job_ptr->details->max_cpus;
	char tmp_char[256];
	static int total_cpus = 0;
	int rc = SLURM_SUCCESS;
	int create_try = 0;
	List overlapped_list = NULL;
	bool is_test = SELECT_IS_TEST(query_mode);

	if (!total_cpus) {
		int *cluster_dims = select_g_ba_get_dims();
		total_cpus = 1;
		for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
			total_cpus *= cluster_dims[dim];
		total_cpus *= bg_conf->cpus_per_mp;
	}

	if (req_nodes > max_nodes) {
		error("can't run this job max mps is %u asking for %u",
		      max_nodes, req_nodes);
		return SLURM_ERROR;
	}

	if (!is_test && (req_procs > avail_cpus)) {
		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("asking for %u I only have %d",
			     req_procs, avail_cpus);
		return SLURM_ERROR;
	}

	if (!block_list) {
		error("_find_best_block_match: There is no block_list");
		return SLURM_ERROR;
	}

	memset(&request, 0, sizeof(select_ba_request_t));

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &request.conn_type);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_GEOMETRY, &req_geometry);
	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_ROTATE, &request.rotate);

	if ((rc = _check_images(job_ptr, &request)) == SLURM_ERROR)
		goto end_it;

	if (req_geometry[0] != 0 && req_geometry[0] != (uint16_t)NO_VAL) {
		char tmp_geo[SYSTEM_DIMENSIONS+1];

		target_size = 1;
		for (i=0; i<SYSTEM_DIMENSIONS; i++) {
			target_size *= req_geometry[i];
			tmp_geo[i] = alpha_num[req_geometry[i]];
		}
		tmp_geo[i] = '\0';

		if (target_size != min_nodes) {
			debug2("min_nodes not set correctly %u "
			       "should be %u from %s",
			       min_nodes, target_size,
			       tmp_geo);
			min_nodes = target_size;
		}
		if (!req_nodes)
			req_nodes = min_nodes;
	} else {
		req_geometry[0] = (uint16_t)NO_VAL;
		target_size = min_nodes;
	}

	*found_bg_record = NULL;
	allow = 0;

	memcpy(request.geometry, req_geometry, sizeof(req_geometry));

	request.deny_pass = (uint16_t)NO_VAL;
	request.save_name = NULL;
	request.size = target_size;
	request.procs = req_procs;
	request.elongate = request.rotate;
	/* request.start[0] = 1; */
	/* request.start[1] = 2; */
	/* request.start[2] = 0; */
	/* request.start[3] = 2; */
	/* request.start_req = 1; */

	if (job_ptr->details->req_node_bitmap)
		request.avail_mp_bitmap = job_ptr->details->req_node_bitmap;
	else
		request.avail_mp_bitmap = slurm_block_bitmap;

	/* Since we only look at procs after this and not nodes, we
	 * need to set a max_cpus if none was given.
	 */
	if (max_cpus == (uint32_t)NO_VAL)
		max_cpus = max_nodes * bg_conf->cpus_per_mp;

	while (1) {
		/* Here we are creating a list of all the blocks that
		 * have overlapping jobs so that, if we don't find one that
		 * works, we can look and see the earliest time
		 * the job can start.  This doesn't apply to Dynamic mode.
		 */
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)
		    && bg_conf->layout_mode != LAYOUT_DYNAMIC)
			overlapped_list = list_create(NULL);

		bg_record = _find_matching_block(block_list,
						 job_ptr,
						 slurm_block_bitmap,
						 &request,
						 max_cpus,
						 &allow, check_image,
						 overlap_check,
						 overlapped_list,
						 query_mode);
		/* this could get altered in _find_matching_block so we
		   need to reset it */
		memcpy(request.geometry, req_geometry, sizeof(req_geometry));

		if (!bg_record && overlapped_list
		    && list_count(overlapped_list)) {
			ListIterator itr =
				list_iterator_create(overlapped_list);
			bg_record_t *tmp_rec = NULL;
			while ((tmp_rec = list_next(itr))) {
				if (!bg_record ||
				    (tmp_rec->job_ptr->end_time <
				     bg_record->job_ptr->end_time))
					bg_record = tmp_rec;
			}
			list_iterator_destroy(itr);
		}

		if (overlapped_list)
			list_destroy(overlapped_list);

		/* set the bitmap and do other allocation activities */
		if (bg_record) {
#ifdef HAVE_BG_L_P
			if (!is_test) {
				if (bridge_block_check_mp_states(
					    bg_record->bg_block_id, 1)
				    != SLURM_SUCCESS) {
					/* check_block_mp_states will
					   set this block in the main
					   list to an error state, but
					   we aren't looking
					   at the main list, so we
					   need to set this copy of
					   the block in an
					   error state as well.
					*/
					bg_record->job_running =
						BLOCK_ERROR_STATE;
					bg_record->state |= BG_BLOCK_ERROR_FLAG;
					error("_find_best_block_match: Picked "
					      "block (%s) had some issues with "
					      "hardware, trying a different "
					      "one.",
					      bg_record->bg_block_id);
					continue;
				}
			}
#endif
			format_node_name(bg_record, tmp_char, sizeof(tmp_char));

			debug("_find_best_block_match %s <%s>",
			      bg_record->bg_block_id, tmp_char);
			bit_and(slurm_block_bitmap, bg_record->mp_bitmap);
			rc = SLURM_SUCCESS;
			*found_bg_record = bg_record;
			goto end_it;
		}

		/* see if we can just reset the image and reboot the block */
		if (allow) {
			check_image = 0;
			allow = 0;
			continue;
		}

		check_image = 1;

		/* all these assume that the *bg_record is NULL */

		if (bg_conf->layout_mode == LAYOUT_OVERLAP
		    && !is_test && overlap_check < 2) {
			overlap_check++;
			continue;
		}

		if (create_try || bg_conf->layout_mode != LAYOUT_DYNAMIC)
			goto no_match;

		if ((rc = _dynamically_request(block_list, blocks_added,
					       &request,
					       job_ptr->details->req_nodes,
					       query_mode))
		    == SLURM_SUCCESS) {
			create_try = 1;
			continue;
		}

		/* Only look at the full system if we aren't going to
		   preempt jobs later and look at it then.
		*/
		if (is_test && SELECT_IS_CHECK_FULL_SET(query_mode)) {
			List new_blocks = NULL;
			List job_list = list_create(NULL);
			ListIterator itr = NULL;
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("trying with empty machine");

			/* Here we need to make sure the blocks in the
			   job list are those in the block list so go
			   through and grab them and add them to a
			   separate list.
			*/
			itr = list_iterator_create(block_list);
			while ((bg_record = list_next(itr))) {
				if (bg_record->job_running != NO_JOB_RUNNING)
					list_append(job_list, bg_record);
				/* Since the error blocks are at the
				   end we only really need to look at
				   the first one to make sure it will
				   work, so don't add more than one to
				   the job list.
				   We do need to check for at least
				   one error block because that lets
				   us know not to hold up the entire
				   machine for a job that won't run
				   until the error is removed which
				   could be a very long time.
				*/
				if (bg_record->job_running == BLOCK_ERROR_STATE)
					break;
			}
			list_iterator_destroy(itr);

			/* Block list is already in the correct order,
			   earliest available first,
			   so the job list will also be. No need to
			   sort. */
			while (1) {
				bool track_down_nodes = true;

				if ((bg_record = list_pop(job_list))) {
					if (bg_record->job_ptr) {
						if (bg_conf->slurm_debug_flags
						    & DEBUG_FLAG_BG_PICK)
							info("taking off "
							     "%d(%s) started "
							     "at %ld ends "
							     "at %ld",
							     bg_record->
							     job_running,
							     bg_record->
							     bg_block_id,
							     bg_record->
							     job_ptr->
							     start_time,
							     bg_record->
							     job_ptr->
							     end_time);
						/* Mark the block as
						   not running a job,
						   this should
						   correspond to the
						   pointer in the
						   block_list.  We
						   only look at the
						   job_running var so
						   don't remove the
						   job_ptr.
						*/
						bg_record->job_running =
							NO_JOB_RUNNING;
					} else if ((bg_record->job_running
						    == BLOCK_ERROR_STATE)
						   && (bg_conf->
						       slurm_debug_flags
						       & DEBUG_FLAG_BG_PICK))
						info("taking off (%s) "
						     "which is in an "
						     "error state",
						     bg_record->bg_block_id);
				} else
					/* This means we didn't have
					   any jobs to take off
					   anymore so we are making
					   sure we can look at every
					   node on the system.
					*/
					track_down_nodes = false;

				if (!(new_blocks = create_dynamic_block(
					      block_list, &request, job_list,
					      track_down_nodes))) {
					if (errno == ESLURM_INTERCONNECT_FAILURE
					    || !list_count(job_list)) {
						char *nodes;
						if (slurmctld_conf.
						    slurmctld_debug < 5)
							break;
						nodes = bitmap2node_name(
							slurm_block_bitmap);
						debug("job %u not "
						      "runable on %s",
						      job_ptr->job_id,
						      nodes);
						xfree(nodes);
						break;
					}
					continue;
				}
				rc = SLURM_SUCCESS;
				/* outside of the job_test_list this
				 * gets destroyed later, so don't worry
				 * about it now
				 */
				(*found_bg_record) = list_pop(new_blocks);
				if (!(*found_bg_record)) {
					list_destroy(new_blocks);
					if (!bg_record) {
						/* This should never happen */
						error("got an empty list back");
						rc = SLURM_ERROR;
						break;
					}

					if (bg_conf->slurm_debug_flags
					    & DEBUG_FLAG_BG_PICK)
						info("Appears we are trying "
						     "to place this job on "
						     "the block we just "
						     "removed %s.",
						     bg_record->bg_block_id);
					/* This means we placed the job on
					   the block we just popped off.
					*/
					bit_and(slurm_block_bitmap,
						bg_record->mp_bitmap);
					*found_bg_record = bg_record;
					break;
				}
				bit_and(slurm_block_bitmap,
					(*found_bg_record)->mp_bitmap);

				if (bg_record) {
					(*found_bg_record)->job_running =
						bg_record->job_running;
					(*found_bg_record)->job_ptr
						= bg_record->job_ptr;
				}
				list_destroy(new_blocks);
				break;
			}

			list_destroy(job_list);

			goto end_it;
		} else {
			break;
		}
	}

no_match:
	debug("_find_best_block_match none found");
	rc = SLURM_ERROR;

end_it:

	xfree(request.blrtsimage);
	xfree(request.linuximage);
	xfree(request.mloaderimage);
	xfree(request.ramdiskimage);

	return rc;
}
Example #19
static bg_record_t *_find_matching_block(List block_list,
					 struct job_record* job_ptr,
					 bitstr_t* slurm_block_bitmap,
					 select_ba_request_t *request,
					 uint32_t max_cpus,
					 int *allow, int check_image,
					 int overlap_check,
					 List overlapped_list,
					 uint16_t query_mode)
{
	bg_record_t *bg_record = NULL;
	ListIterator itr = NULL;
	char tmp_char[256];

	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("number of blocks to check: %d state %d "
		     "asking for %u-%u cpus",
		     list_count(block_list),
		     query_mode, request->procs, max_cpus);

	itr = list_iterator_create(block_list);
	while ((bg_record = list_next(itr))) {
		/* If test_only we want to fall through to tell the
		   scheduler that it is runnable just not right now.
		*/

		/* job_running could have been reset, so restore it
		   here if there is a job_ptr.
		*/
		if (bg_record->job_ptr)
			bg_record->job_running = bg_record->job_ptr->job_id;

		/* The block is broken somehow (job_running is
		 * BLOCK_ERROR_STATE or the state has
		 * BG_BLOCK_ERROR_FLAG set), so ignore it. */
		if ((bg_record->job_running == BLOCK_ERROR_STATE)
		    || (bg_record->state & BG_BLOCK_ERROR_FLAG)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("block %s is in an error "
				     "state (can't use)",
				     bg_record->bg_block_id);
			continue;
		} else if ((bg_conf->layout_mode == LAYOUT_DYNAMIC)
			   || ((!SELECT_IS_CHECK_FULL_SET(query_mode)
				|| SELECT_IS_MODE_RUN_NOW(query_mode))
			       && (bg_conf->layout_mode != LAYOUT_DYNAMIC))) {
			if (bg_record->free_cnt) {
				/* No reason to look at a block that
				   is being freed unless we are
				   running static and looking at the
				   full set.
				*/
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("block %s being free for other "
					     "job(s), skipping",
					     bg_record->bg_block_id);
				continue;
			} else if ((bg_record->job_running != NO_JOB_RUNNING)
				   && (bg_record->job_running
				       != job_ptr->job_id)) {
				/* Check this when trying to run now or when
				   not looking at the full set.  For the full
				   set we don't skip running blocks, since we
				   are testing whether the job can ever run.
				*/
				if (bg_conf->slurm_debug_flags
				    & DEBUG_FLAG_BG_PICK)
					info("block %s in use by %s job %d",
					     bg_record->bg_block_id,
					     bg_record->user_name,
					     bg_record->job_running);
				continue;
			}
		}

		/* Check processor count */
		if ((bg_record->cpu_cnt < request->procs)
		    || ((max_cpus != NO_VAL)
			&& (bg_record->cpu_cnt > max_cpus))) {
			/* We use the processor count per block here
			   mostly to see if we can run on a smaller block.
			*/
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
				convert_num_unit((float)bg_record->cpu_cnt,
						 tmp_char,
						 sizeof(tmp_char), UNIT_NONE);
				info("block %s CPU count (%s) not suitable",
				     bg_record->bg_block_id,
				     tmp_char);
			}
			continue;
		}

		/*
		 * Next we check that this block's bitmap is within
		 * the set of nodes which the job can use.
		 * Nodes not available for the job could be down,
		 * drained, allocated to some other job, or in some
		 * SLURM block not available to this job.
		 */
		if (!bit_super_set(bg_record->mp_bitmap, slurm_block_bitmap)) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK) {
				char *temp = bitmap2node_name(
					bg_record->mp_bitmap);
				char *temp2 = bitmap2node_name(
					slurm_block_bitmap);
				info("bg block %s has nodes not "
				     "usable by this job %s %s",
				     bg_record->bg_block_id, temp, temp2);
				xfree(temp);
				xfree(temp2);
			}
			continue;
		}

		/*
		 * Ensure that any required nodes are in this BG block
		 */
		if (job_ptr->details->req_node_bitmap
		    && (!bit_super_set(job_ptr->details->req_node_bitmap,
				       bg_record->mp_bitmap))) {
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s lacks required nodes",
				     bg_record->bg_block_id);
			continue;
		}

		if (_check_for_booted_overlapping_blocks(
			    block_list, itr, bg_record,
			    overlap_check, overlapped_list, query_mode))
			continue;

		if (check_image) {
#ifdef HAVE_BGL
			if (request->blrtsimage &&
			    strcasecmp(request->blrtsimage,
				       bg_record->blrtsimage)) {
				*allow = 1;
				continue;
			}
#endif
#ifdef HAVE_BG_L_P
			if (request->linuximage &&
			    strcasecmp(request->linuximage,
				       bg_record->linuximage)) {
				*allow = 1;
				continue;
			}

			if (request->ramdiskimage &&
			    strcasecmp(request->ramdiskimage,
				       bg_record->ramdiskimage)) {
				*allow = 1;
				continue;
			}
#endif
			if (request->mloaderimage &&
			    strcasecmp(request->mloaderimage,
				       bg_record->mloaderimage)) {
				*allow = 1;
				continue;
			}
		}

		/***********************************************/
		/* check the connection type specified matches */
		/***********************************************/
		if ((request->conn_type[0] != bg_record->conn_type[0])
		    && (request->conn_type[0] != SELECT_NAV)) {
#ifdef HAVE_BGP
			if (request->conn_type[0] >= SELECT_SMALL) {
				/* We only want to reboot blocks when
				   we have to, so skip already booted
				   blocks when a small block is
				   requested.
				*/
				if (check_image
				    && (bg_record->state
					== BG_BLOCK_INITED)) {
					*allow = 1;
					continue;
				}
				goto good_conn_type;
			} else if (bg_record->conn_type[0] >= SELECT_SMALL) {
				/* Since the cpu count was already
				   checked, we are looking for a block
				   in a range that includes both small
				   and regular blocks, so just
				   continue on.
				*/
				goto good_conn_type;
			}
#endif
			if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
				info("bg block %s conn-type not usable "
				     "asking for %s bg_record is %s",
				     bg_record->bg_block_id,
				     conn_type_string(request->conn_type[0]),
				     conn_type_string(bg_record->conn_type[0]));
			continue;
		}
#ifdef HAVE_BGP
	good_conn_type:
#endif
		/*****************************************/
		/* match up geometry as "best" possible  */
		/*****************************************/
		if ((request->geometry[0] != (uint16_t)NO_VAL)
		    && (!_check_rotate_geo(bg_record->geo, request->geometry,
					   request->rotate)))
			continue;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("we found one! %s", bg_record->bg_block_id);
		break;
	}
	list_iterator_destroy(itr);

	return bg_record;
}
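
/* bit_super_set(a, b) above is true when every bit set in a is also set
 * in b; it is how the loop verifies that a block's midplanes all lie
 * within the job's usable node set. Below is a minimal, self-contained
 * sketch of that test over word-packed bitmaps; it is an illustration,
 * not SLURM's actual bitstring implementation. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool bitmap_super_set(const uint64_t *a, const uint64_t *b,
			     size_t nwords)
{
	size_t i;

	for (i = 0; i < nwords; i++) {
		/* any bit present in 'a' but absent from 'b' breaks
		 * the superset relation */
		if (a[i] & ~b[i])
			return false;
	}
	return true;
}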
Example #20
0
static char *	_will_run_test(uint32_t jobid, time_t start_time,
			       char *node_list, int *err_code, char **err_msg)
{
	struct job_record *job_ptr = NULL;
	struct part_record *part_ptr;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	char *hostlist, *reply_msg = NULL;
	uint32_t min_nodes, max_nodes, req_nodes;
	int rc;
	time_t start_res, orig_start_time;
	List preemptee_candidates;

	debug2("wiki2: will_run job_id=%u start_time=%u node_list=%s",
		jobid, (uint32_t)start_time, node_list);

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		return NULL;
	}
	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "WillRun not applicable to non-pending job";
		error("wiki: WillRun on non-pending job %u", jobid);
		return NULL;
	}

	part_ptr = job_ptr->part_ptr;
	if (part_ptr == NULL) {
		*err_code = -700;
		*err_msg = "Job lacks a partition";
		error("wiki: Job %u lacks a partition", jobid);
		return NULL;
	}

	if ((node_list == NULL) || (node_list[0] == '\0')) {
		/* assume all nodes available to job for testing */
		avail_bitmap = bit_copy(avail_node_bitmap);
	} else if (node_name2bitmap(node_list, false, &avail_bitmap) != 0) {
		*err_code = -700;
		*err_msg = "Invalid available nodes value";
		error("wiki: Attempt to set invalid available node "
		      "list for job %u, %s", jobid, node_list);
		return NULL;
	}

	/* Enforce reservation: access control, time and nodes */
	start_res = start_time;
	rc = job_test_resv(job_ptr, &start_res, true, &resv_bitmap,
			   &exc_core_bitmap);
	if (rc != SLURM_SUCCESS) {
		*err_code = -730;
		*err_msg = "Job denied access to reservation";
		error("wiki: reservation access denied for job %u", jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}
	start_time = MAX(start_time, start_res);
	bit_and(avail_bitmap, resv_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	/* Only consider nodes that are not DOWN or DRAINED */
	bit_and(avail_bitmap, avail_node_bitmap);

	/* Consider only nodes in this job's partition */
	if (part_ptr->node_bitmap)
		bit_and(avail_bitmap, part_ptr->node_bitmap);
	else {
		*err_code = -730;
		*err_msg = "Job's partition has no nodes";
		error("wiki: no nodes in partition %s for job %u",
			part_ptr->name, jobid);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	if (job_req_node_filter(job_ptr, avail_bitmap) != SLURM_SUCCESS) {
		/* Job probably has invalid feature list */
		*err_code = -730;
		*err_msg = "Job's required features not available "
			   "on selected nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}
	if (job_ptr->details->exc_node_bitmap) {
		bit_not(job_ptr->details->exc_node_bitmap);
		bit_and(avail_bitmap, job_ptr->details->exc_node_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}
	if ((job_ptr->details->req_node_bitmap) &&
	    (!bit_super_set(job_ptr->details->req_node_bitmap,
			    avail_bitmap))) {
		*err_code = -730;
		*err_msg = "Job's required nodes not available";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

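	/* Derive the node-count range handed to the select plugin:
	 * honor both job and partition limits, and target the job's
	 * maximum when one was given, otherwise its minimum */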
	min_nodes = MAX(job_ptr->details->min_nodes, part_ptr->min_nodes);
	if (job_ptr->details->max_nodes == 0)
		max_nodes = part_ptr->max_nodes;
	else
		max_nodes = MIN(job_ptr->details->max_nodes,
				part_ptr->max_nodes);
	max_nodes = MIN(max_nodes, 500000); /* prevent overflows */
	if (job_ptr->details->max_nodes)
		req_nodes = max_nodes;
	else
		req_nodes = min_nodes;
	if (min_nodes > max_nodes) {
		/* job's min_nodes exceeds partition's max_nodes */
		*err_code = -730;
		*err_msg = "Job's min_nodes > max_nodes";
		error("wiki: job %u not runnable on hosts=%s",
		      jobid, node_list);
		FREE_NULL_BITMAP(avail_bitmap);
		return NULL;
	}

	preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);

	orig_start_time = job_ptr->start_time;
	rc = select_g_job_test(job_ptr, avail_bitmap,
			       min_nodes, max_nodes, req_nodes,
			       SELECT_MODE_WILL_RUN,
			       preemptee_candidates, NULL, exc_core_bitmap);
	if (preemptee_candidates)
		list_destroy(preemptee_candidates);

	if (rc == SLURM_SUCCESS) {
		char tmp_str[128];
		uint32_t proc_cnt = 0;
		*err_code = 0;

		xstrcat(reply_msg, "STARTINFO=");
#ifdef HAVE_BG
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &proc_cnt);
#else
		proc_cnt = job_ptr->total_cpus;
#endif
		snprintf(tmp_str, sizeof(tmp_str), "%u:%u@%u,",
			 jobid, proc_cnt, (uint32_t) job_ptr->start_time);
		xstrcat(reply_msg, tmp_str);
		hostlist = bitmap2node_name(avail_bitmap);
		xstrcat(reply_msg, hostlist);
		xfree(hostlist);
	} else {
		xstrcat(reply_msg, "Jobs not runable on selected nodes");
		error("wiki: jobs not runnable on nodes");
	}

	/* Restore pending job's expected start time */
	job_ptr->start_time = orig_start_time;
	FREE_NULL_BITMAP(avail_bitmap);
	return reply_msg;
}
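
/* The success path above builds a reply of the form
 * "STARTINFO=<jobid>:<proc_cnt>@<start_time>,<hostlist>". A minimal,
 * self-contained sketch of that formatting step follows, using plain
 * libc calls instead of SLURM's xstrcat; the helper name is
 * hypothetical. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static char *_build_startinfo(uint32_t jobid, uint32_t proc_cnt,
			      time_t start_time, const char *hostlist)
{
	/* 64 bytes comfortably covers the three numeric fields and
	 * the fixed text; the caller frees the result */
	size_t len = 64 + strlen(hostlist);
	char *reply = malloc(len);

	if (reply == NULL)
		return NULL;
	snprintf(reply, len, "STARTINFO=%u:%u@%u,%s",
		 jobid, proc_cnt, (uint32_t) start_time, hostlist);
	return reply;
}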
Example #21
0
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr, **bf_part_ptr = NULL;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit, part_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL;
	bitstr_t *previous_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0, bf_parts = 0, *bf_part_jobs = NULL;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;
	struct part_record *reject_array_part = NULL;
	uint32_t job_start_cnt = 0;
	time_t config_update = slurmctld_conf.last_update;
	time_t part_update = last_part_update;
	struct timeval start_tv;

	bf_last_yields = 0;
#ifdef HAVE_ALPS_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1000000);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	else
		debug("backfill: beginning");
	sched_start = now = time(NULL);
	gettimeofday(&start_tv, NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true, true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	non_cg_bitmap = bit_copy(cg_node_bitmap);
	bit_not(non_cg_bitmap);
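	/* non_cg_bitmap now marks nodes that are NOT completing a job
	 * (the inverse of cg_node_bitmap) */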

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt * 2 + 1));
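	/* Sizing note: each backfilled job appears to add at most two
	 * records (its reservation can split an existing record),
	 * hence max_backfill_job_cnt * 2 + 1 */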
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_part) {
		ListIterator part_iterator;
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_part_ptr  = xmalloc(sizeof(struct part_record *) * bf_parts);
		bf_part_jobs = xmalloc(sizeof(int) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_part_ptr[i++] = part_ptr;
		}
		list_iterator_destroy(part_iterator);
	}
	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	sort_job_queue(job_queue);
	while ((job_queue_rec = (job_queue_rec_t *) list_pop(job_queue))) {
		if (slurmctld_config.shutdown_time)
			break;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				xfree(job_queue_rec);
				break;
			}
			/* cg_node_bitmap may have changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 0;
			START_TIMER;
		}

		job_ptr  = job_queue_rec->job_ptr;
		/* With bf_continue configured, the original job could have
		 * been cancelled and purged. Validate pointer here. */
		if ((job_ptr->magic  != JOB_MAGIC) ||
		    (job_ptr->job_id != job_queue_rec->job_id)) {
			xfree(job_queue_rec);
			continue;
		}
		orig_time_limit = job_ptr->time_limit;
		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != NO_VAL) {
			if ((reject_array_job_id == job_ptr->array_job_id) &&
			    (reject_array_part   == part_ptr))
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
			reject_array_part   = part_ptr;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (max_backfill_job_per_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_jobs[j]++ >=
				    max_backfill_job_per_part)
					skip_job = true;
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug("backfill: have already "
					      "checked %u jobs for "
					      "partition %s; skipping "
					      "job %u",
					      max_backfill_job_per_part,
					      job_ptr->part_ptr->name,
					      job_ptr->job_id);
				continue;
			}
		}
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] >= max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
			continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (part_ptr->max_time == INFINITE)
			part_time_limit = 365 * 24 * 60; /* one year */
		else
			part_time_limit = part_ptr->max_time;
		if (job_ptr->time_limit == NO_VAL) {
			time_limit = part_time_limit;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_time_limit);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
		FREE_NULL_BITMAP(previous_bitmap);
 TRY_LATER:
		if (slurmctld_config.shutdown_time)
			break;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			uint32_t save_job_id = job_ptr->job_id;
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			/* cg_node_bitmap may have changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);

			/* With bf_continue configured, the original job could
			 * have been scheduled or cancelled and purged.
			 * Revalidate the job record here. */
			if ((job_ptr->magic  != JOB_MAGIC) ||
			    (job_ptr->job_id != save_job_id))
				continue;
			if (!IS_JOB_PENDING(job_ptr))
				continue;
			if (!avail_front_end(job_ptr))
				continue;	/* No available frontend */

			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res   = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		bit_and(avail_bitmap, non_cg_bitmap);
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
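		/* resv_end is zero when find_resv_end() found no
		 * reservation end; the post-increment both tests that
		 * and moves the retry time just past the end */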
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features OR
		 *	no change since previously tested nodes (only changes
		 *	in other partition nodes) */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap)) ||
		    (previous_bitmap &&
		     bit_equal(previous_bitmap, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job cannot start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		FREE_NULL_BITMAP(previous_bitmap);
		previous_bitmap = bit_copy(avail_bitmap);

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time-consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_test(job_ptr, avail_bitmap, start_res);
		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runnable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL) {
					acct_policy_alter_job(
						job_ptr, comp_time_limit);
					job_ptr->time_limit = comp_time_limit;
				} else {
					acct_policy_alter_job(
						job_ptr, orig_time_limit);
					job_ptr->time_limit = orig_time_limit;
				}
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else if (orig_time_limit == NO_VAL) {
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
					(job_ptr->time_limit * 60);
			} else {
				acct_policy_alter_job(job_ptr, orig_time_limit);
				job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
					(job_ptr->time_limit * 60);
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happened. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;
				reject_array_part   = NULL;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				if (max_backfill_jobs_start &&
				    (++job_start_cnt >= max_backfill_jobs_start))
					break;
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation
			 * for a job to be backfill scheduled, which the
			 * sched plugin does not know about. Try again
			 * later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		reject_array_part   = NULL;
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		xfree(job_ptr->sched_nodes);
		job_ptr->sched_nodes = bitmap2node_name(avail_bitmap);
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(bf_part_jobs);
	xfree(bf_part_ptr);
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	FREE_NULL_BITMAP(non_cg_bitmap);
	FREE_NULL_BITMAP(previous_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
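
/* node_space is an array-backed, time-ordered singly linked list:
 * record 0 is the head and a next index of 0 terminates the chain,
 * which is why the merge loop and the cleanup loop both iterate with
 * "if ((i = node_space[i].next) == 0) break;". A minimal, self-contained
 * sketch of that traversal, using a simplified record type rather than
 * the plugin's full node_space_map_t: */
#include <stdio.h>
#include <time.h>

typedef struct {
	time_t begin_time;
	time_t end_time;
	int    next;	/* index of the next record; 0 ends the chain */
} node_space_t;

static void _dump_node_space(const node_space_t *node_space)
{
	int i = 0;

	while (1) {
		printf("window: %ld .. %ld\n",
		       (long) node_space[i].begin_time,
		       (long) node_space[i].end_time);
		if ((i = node_space[i].next) == 0)
			break;	/* wrapped back to index 0: done */
	}
}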