Exemple #1
0
void task_state_print (task_state_t ts, log_f fn)
{
    bitstr_t *unseen;

    if (!ts)	/* Not built yet */
        return;

    unseen = bit_alloc (ts->n_tasks);
    if (bit_set_count (ts->start_failed)) {
        _do_log_msg (ts->start_failed, fn, "failed to start");
        bit_or (unseen, ts->start_failed);
    }
    if (bit_set_count (ts->running)) {
        _do_log_msg (ts->running, fn, "running");
        bit_or (unseen, ts->running);
    }
    if (bit_set_count (ts->abnormal_exit)) {
        _do_log_msg (ts->abnormal_exit, fn, "exited abnormally");
        bit_or (unseen, ts->abnormal_exit);
    }
    if (bit_set_count (ts->normal_exit)) {
        _do_log_msg (ts->normal_exit, fn, "exited");
        bit_or (unseen, ts->normal_exit);
    }
    bit_not (unseen);
    if (bit_set_count (unseen))
        _do_log_msg (unseen, fn, "unknown");
    FREE_NULL_BITMAP(unseen);
}
Exemple #2
0
/*
 * Update the state of a specific task ID in a specific task_state structure
 */
extern void task_state_update(task_state_t ts, int task_id, task_state_type_t t)
{
	xassert(ts != NULL);
	xassert(task_id >= 0);
	xassert(task_id < ts->n_tasks);

	if (ts->pack_group == NO_VAL) {
		debug3("%s: step=%u.%u task_id=%d, %s", __func__,
		       ts->job_id, ts->step_id, task_id,
		       _task_state_type_str(t));
	} else {
		debug3("%s: step=%u.%u pack_group=%u task_id=%d, %s", __func__,
		       ts->job_id, ts->step_id, ts->pack_group, task_id,
		       _task_state_type_str(t));
	}

	switch (t) {
	case TS_START_SUCCESS:
		bit_set (ts->running, task_id);
		ts->n_started++;
		break;
	case TS_START_FAILURE:
		bit_set (ts->start_failed, task_id);
		break;
	case TS_NORMAL_EXIT:
		bit_clear(ts->running, task_id);
		if (bit_test(ts->normal_exit, task_id) ||
		    bit_test(ts->abnormal_exit, task_id)) {
			error("Task %d reported exit for a second time.",
			      task_id);
		} else {
			bit_set (ts->normal_exit, task_id);
			ts->n_exited++;
		}
		break;
	case TS_ABNORMAL_EXIT:
		bit_clear(ts->running, task_id);
		if (bit_test(ts->normal_exit, task_id) ||
		    bit_test(ts->abnormal_exit, task_id)) {
			error("Task %d reported exit for a second time.",
			      task_id);
		} else {
			bit_set (ts->abnormal_exit, task_id);
			ts->n_exited++;
			ts->n_abnormal++;
		}
		break;
	}

	xassert((bit_set_count(ts->abnormal_exit) +
		 bit_set_count(ts->normal_exit)) == ts->n_exited);
}
Exemple #3
0
/* Select the best set of resources for the given job
 * IN: job_ptr      - pointer to the job requesting resources
 * IN/OUT: node_map - bitmap of available nodes / bitmap of selected nodes
 * IN: cr_node_cnt  - total number of nodes in the cluster
 * IN/OUT: core_map - bitmap of available cores / bitmap of selected cores
 * IN: cr_type      - resource type
 * IN: test_only    - ignore allocated memory check
 * RET - array with number of CPUs available per node or NULL if not runnable
 */
static uint16_t *_select_nodes(struct job_record *job_ptr,
				bitstr_t *node_map, uint32_t cr_node_cnt,
				bitstr_t *core_map,
				struct node_use_record *node_usage,
				uint16_t cr_type, bool test_only)
{
	int node_inx;
	uint16_t *cpu_cnt, *cpus = NULL;

	if (bit_set_count(node_map) == 0)
		return NULL;

	/* get resource usage for this job from first available node */
	node_inx = _get_res_usage(job_ptr, node_map, core_map, cr_node_cnt,
				  node_usage, cr_type, &cpu_cnt, test_only);

	/* if successful, sync up the core_map with the node_map, and
	 * create a cpus array */
	if (node_inx  >= 0) {
		cpus = xmalloc(sizeof(uint16_t));
		cpus[0] = cpu_cnt[node_inx];
		if (node_inx != 0) {
			bit_nclear(core_map, 0,
				   (cr_get_coremap_offset(node_inx))-1);
		}
		if (node_inx < (cr_node_cnt - 1)) {
			bit_nclear(core_map,
				   (cr_get_coremap_offset(node_inx + 1)),
				   (cr_get_coremap_offset(cr_node_cnt) - 1));
		}
	}

	xfree(cpu_cnt);
	return cpus;
}
/*
 * Test if job can fit into the given full-length core_bitmap
 * IN job_resrcs_ptr - resources allocated to a job
 * IN full_bitmap - bitmap of available CPUs
 * IN bits_per_node - bits per node in the full_bitmap
 * RET 1 on success, 0 otherwise
 */
extern int job_fits_into_cores(job_resources_t *job_resrcs_ptr,
			       bitstr_t *full_bitmap,
			       const uint16_t *bits_per_node)
{
	int full_node_inx = 0, full_bit_inx  = 0, job_bit_inx  = 0, i;
	int job_node_cnt;

	if (!full_bitmap)
		return 1;

	job_node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap);
	for (full_node_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
	     job_node_cnt > 0; full_node_inx++) {
		if (bit_test(job_resrcs_ptr->node_bitmap, full_node_inx)) {
			full_bit_inx = cr_node_cores_offset[full_node_inx];
			for (i = 0; i < bits_per_node[full_node_inx]; i++) {
				if (!bit_test(full_bitmap, full_bit_inx + i))
					continue;
				if (job_resrcs_ptr->whole_node ||
				    bit_test(job_resrcs_ptr->core_bitmap,
					     job_bit_inx + i)) {
					return 0;
				}
			}
			job_bit_inx += bits_per_node[full_node_inx];
			job_node_cnt --;
		}
	}
	return 1;
}
/*
 * Remove job from full-length core_bitmap
 * IN job_resrcs_ptr - resources allocated to a job
 * IN/OUT full_bitmap - bitmap of available CPUs, allocate as needed
 * IN bits_per_node - bits per node in the full_bitmap
 * RET 1 on success, 0 otherwise
 */
extern void remove_job_from_cores(job_resources_t *job_resrcs_ptr,
				  bitstr_t **full_core_bitmap,
				  const uint16_t *bits_per_node)
{
	int full_node_inx = 0, job_node_cnt;
	int job_bit_inx  = 0, full_bit_inx  = 0, i;

	if (!job_resrcs_ptr->core_bitmap)
		return;

	/* add the job to the row_bitmap */
	if (*full_core_bitmap == NULL) {
		uint32_t size = 0;
		for (i = 0; i < node_record_count; i++)
			size += bits_per_node[i];
		*full_core_bitmap = bit_alloc(size);
	}

	job_node_cnt = bit_set_count(job_resrcs_ptr->node_bitmap);
	for (full_node_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
	     job_node_cnt > 0; full_node_inx++) {
		if (bit_test(job_resrcs_ptr->node_bitmap, full_node_inx)) {
			full_bit_inx = cr_node_cores_offset[full_node_inx];
			for (i = 0; i < bits_per_node[full_node_inx]; i++) {
				if (!job_resrcs_ptr->whole_node &&
				    !bit_test(job_resrcs_ptr->core_bitmap,
					      job_bit_inx + i))
					continue;
				bit_clear(*full_core_bitmap, full_bit_inx + i);
			}
			job_bit_inx += bits_per_node[full_node_inx];
			job_node_cnt --;
		}
	}
}
Exemple #6
0
static void _print_jobs(struct gs_part *p_ptr)
{
	int i;

	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG) {
		info("gang:  part %s has %u jobs, %u shadows:",
		     p_ptr->part_name, p_ptr->num_jobs, p_ptr->num_shadows);
		for (i = 0; i < p_ptr->num_shadows; i++) {
			info("gang:   shadow job %u row_s %s, sig_s %s",
			     p_ptr->shadow[i]->job_ptr->job_id,
			     _print_flag(p_ptr->shadow[i]->row_state),
			     _print_flag(p_ptr->shadow[i]->sig_state));
		}
		for (i = 0; i < p_ptr->num_jobs; i++) {
			info("gang:   job %u row_s %s, sig_s %s",
			     p_ptr->job_list[i]->job_ptr->job_id,
			     _print_flag(p_ptr->job_list[i]->row_state),
			     _print_flag(p_ptr->job_list[i]->sig_state));
		}
		if (p_ptr->active_resmap) {
			int s = bit_size(p_ptr->active_resmap);
			i = bit_set_count(p_ptr->active_resmap);
			info("gang:  active resmap has %d of %d bits set",
		  	     i, s);
		}
	}
}
Exemple #7
0
/* Return 1 if job fits in this row, else return 0 */
static int _job_fits_in_active_row(struct job_record *job_ptr,
				   struct gs_part *p_ptr)
{
	job_resources_t *job_res = job_ptr->job_resrcs;
	int count;
	bitstr_t *job_map;
	uint16_t job_gr_type;

	if ((p_ptr->active_resmap == NULL) || (p_ptr->jobs_active == 0))
		return 1;

	job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
	if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
	    (job_gr_type == GS_SOCKET)) {
		return job_fits_into_cores(job_res, p_ptr->active_resmap,
					   gs_bits_per_node);
	}

	/* job_gr_type == GS_NODE || job_gr_type == GS_CPU */
	job_map = bit_copy(job_res->node_bitmap);
	bit_and(job_map, p_ptr->active_resmap);
	/* any set bits indicate contention for the same resource */
	count = bit_set_count(job_map);
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _job_fits_in_active_row: %d bits conflict", count);
	FREE_NULL_BITMAP(job_map);
	if (count == 0)
		return 1;
	if (job_gr_type == GS_CPU) {
		/* For GS_CPU we check the CPU arrays */
		return _can_cpus_fit(job_ptr, p_ptr);
	}

	return 0;
}
Exemple #8
0
/* Reset the node_bitmap in a job_resources data structure
 * This is needed after a restart/reconfiguration since nodes can
 * be added or removed from the system resulting in changing in
 * the bitmap size or bit positions */
extern int reset_node_bitmap(job_resources_t *job_resrcs_ptr, uint32_t job_id)
{
	int i;

	if (!job_resrcs_ptr)
		return SLURM_SUCCESS;

	if (job_resrcs_ptr->node_bitmap)
		FREE_NULL_BITMAP(job_resrcs_ptr->node_bitmap);

	if (job_resrcs_ptr->nodes &&
	    (node_name2bitmap(job_resrcs_ptr->nodes, false,
			      &job_resrcs_ptr->node_bitmap))) {
		error("Invalid nodes (%s) for job_id %u",
		      job_resrcs_ptr->nodes, job_id);
		return SLURM_ERROR;
	} else if (job_resrcs_ptr->nodes == NULL) {
		job_resrcs_ptr->node_bitmap = bit_alloc(node_record_count);
	}

	i = bit_set_count(job_resrcs_ptr->node_bitmap);
	if (job_resrcs_ptr->nhosts != i) {
		error("Invalid change in resource allocation node count for "
		      "job %u, %u to %d", job_id, job_resrcs_ptr->nhosts, i);
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
Exemple #9
0
void task_state_update (task_state_t ts, int taskid, task_state_type_t t)
{
    xassert (ts != NULL);
    xassert (taskid >= 0);
    xassert (taskid < ts->n_tasks);

    debug3("task_state_update(taskid=%d, %s)",
           taskid, _task_state_type_str (t));

    switch (t) {
    case TS_START_SUCCESS:
        bit_set (ts->running, taskid);
        ts->n_started++;
        break;
    case TS_START_FAILURE:
        bit_set (ts->start_failed, taskid);
        break;
    case TS_NORMAL_EXIT:
        bit_clear (ts->running, taskid);
        if (bit_test(ts->normal_exit, taskid)) {
            error("Task %d reported exit for a second time.",
                  taskid);
        } else {
            bit_set (ts->normal_exit, taskid);
            ts->n_exited++;
        }
        break;
    case TS_ABNORMAL_EXIT:
        bit_clear (ts->running, taskid);
        if (bit_test(ts->abnormal_exit, taskid)) {
            error("Task %d reported exit for a second time.",
                  taskid);
        } else {
            bit_set (ts->abnormal_exit, taskid);
            ts->n_exited++;
            ts->n_abnormal++;
        }
        break;
    }

    xassert ((bit_set_count(ts->abnormal_exit) +
              bit_set_count(ts->normal_exit)) == ts->n_exited);
}
Exemple #10
0
static void _do_log_msg(task_state_t ts, bitstr_t *b, log_f fn,
			const char *msg)
{
	char buf[4096];
	char *s = bit_set_count (b) == 1 ? "" : "s";
	if (ts->pack_group == NO_VAL) {
		(*fn) ("step:%u.%u task%s %s: %s",
		       ts->job_id, ts->step_id,
		       s, bit_fmt(buf, sizeof(buf), b), msg);
	} else {
		(*fn) ("step:%u.%u pack_group:%u task%s %s: %s",
		       ts->job_id, ts->step_id, ts->pack_group,
		       s, bit_fmt(buf, sizeof(buf), b), msg);
	}
}
Exemple #11
0
/* Rebuild active_feature_list for given node index,
 * IN node_inx - Node index, if -1 then copy alloc_feature_list into
 *		 acitve_feature_list, if -2 then log state
 */
extern void  build_active_feature_list2(int node_inx, char *active_features)
{
	node_feature_t *feature_ptr;
	ListIterator feature_iter;
	char *tmp_str, *token, *last = NULL;

	if (node_inx == -1) {
		_copy_feature_list();
		return;
	}
	if (node_inx == -2) {
#if _DEBUG
		feature_iter = list_iterator_create(active_feature_list);
		while ((feature_ptr = (node_feature_t *)
		        list_next(feature_iter))) {
			info("ACTIVE FEATURE: NAME:%s CNT:%d",
			     feature_ptr->name,
			     bit_set_count(feature_ptr->node_bitmap));
		}
		list_iterator_destroy(feature_iter);
#endif
		return;
	}

	if ((node_inx < 0) || (node_inx >= node_record_count)) {
		error("%s: Invalid node_inx:%d", __func__, node_inx);
		return;
	}

	/* Clear this node from the feature_list record,
	 * then restore as needed */
	feature_iter = list_iterator_create(active_feature_list);
	while ((feature_ptr = (node_feature_t *) list_next(feature_iter))) {
		bit_clear(feature_ptr->node_bitmap, node_inx);
	}
	list_iterator_destroy(feature_iter);

	if (active_features) {
		tmp_str = xstrdup(active_features);
		token = strtok_r(tmp_str, ",", &last);
		while (token) {
			_add_config_feature_inx(active_feature_list, token,
						node_inx);
			token = strtok_r(NULL, ",", &last);
		}
		xfree(tmp_str);
	}
}
Exemple #12
0
static void _dump_resv_port_info(void)
{
#if _DEBUG
	int i;
	char *tmp_char;

	for (i=0; i<port_resv_cnt; i++) {
		if (bit_set_count(port_resv_table[i]) == 0)
			continue;

		tmp_char = bitmap2node_name(port_resv_table[i]);
		info("Port %d: %s", (i+port_resv_min), tmp_char);
		xfree(tmp_char);
	}
#endif
}
Exemple #13
0
static void _build_select_struct(struct job_record *job_ptr,
				 bitstr_t *bitmap, uint32_t node_cnt)
{
	int i;
	uint32_t total_cpus = 0;
	job_resources_t *job_resrcs_ptr;

	xassert(job_ptr);

	if (job_ptr->job_resrcs) {
		error("select_p_job_test: already have select_job");
		free_job_resources(&job_ptr->job_resrcs);
	}

	job_ptr->job_resrcs = job_resrcs_ptr = create_job_resources();
	job_resrcs_ptr->cpu_array_reps = xmalloc(sizeof(uint32_t));
	job_resrcs_ptr->cpu_array_value = xmalloc(sizeof(uint16_t));
	job_resrcs_ptr->cpus = xmalloc(sizeof(uint16_t) * node_cnt);
	job_resrcs_ptr->cpus_used = xmalloc(sizeof(uint16_t) * node_cnt);
/* 	job_resrcs_ptr->nhosts = node_cnt; */
	job_resrcs_ptr->nhosts = bit_set_count(bitmap);
	job_resrcs_ptr->ncpus = job_ptr->details->min_cpus;
	job_resrcs_ptr->node_bitmap = bit_copy(bitmap);
	job_resrcs_ptr->nodes = bitmap2node_name(bitmap);
	if (job_resrcs_ptr->node_bitmap == NULL)
		fatal("bit_copy malloc failure");

	job_resrcs_ptr->cpu_array_cnt = 1;
	if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
		job_resrcs_ptr->cpu_array_value[0] = job_ptr->details->min_cpus;
	else
		job_resrcs_ptr->cpu_array_value[0] = bg_conf->cpus_per_mp;
	job_resrcs_ptr->cpu_array_reps[0] = node_cnt;
	total_cpus = bg_conf->cpu_ratio * node_cnt;

	for (i=0; i<node_cnt; i++)
		job_resrcs_ptr->cpus[i] = bg_conf->cpu_ratio;

	if (job_resrcs_ptr->ncpus != total_cpus) {
		error("select_p_job_test: ncpus mismatch %u != %u",
		      job_resrcs_ptr->ncpus, total_cpus);
	}
}
Exemple #14
0
int _print_job_job_id(job_info_t * job, int width, bool right, char* suffix)
{
	if (job == NULL) {	/* Print the Header instead */
		_print_str("JOBID", width, right, true);
	} else if ((job->array_task_id != NO_VAL) &&
		   !params.array_flag && IS_JOB_PENDING(job)  &&
		   job->node_inx) {
		uint32_t i, local_width = width, max_task_id = 0;
		char *id, *task_str;
		bitstr_t *task_bits;
		for (i = 1; i <= job->node_inx[0]; i++)
			max_task_id = MAX(max_task_id, job->node_inx[i]);
		task_bits = bit_alloc(max_task_id + 1);
		for (i = 1; i <= job->node_inx[0]; i++)
			bit_set(task_bits, job->node_inx[i]);
		if (local_width == 0) {
			local_width = bit_set_count(task_bits) *
				      FORMAT_STRING_SIZE;
		}
		id = xmalloc(local_width);
		task_str = xmalloc(local_width);
		bit_fmt(task_str, local_width, task_bits);
		snprintf(id, local_width, "%u_[%s]",
			 job->array_job_id, task_str);
		_print_str(id, width, right, true);
		bit_free(task_bits);
		xfree(id);
		xfree(task_str);
	} else if (job->array_task_id != NO_VAL) {
		char id[FORMAT_STRING_SIZE];
		snprintf(id, FORMAT_STRING_SIZE, "%u_%u",
			 job->array_job_id, job->array_task_id);
		_print_str(id, width, right, true);
	} else {
		char id[FORMAT_STRING_SIZE];
		snprintf(id, FORMAT_STRING_SIZE, "%u", job->job_id);
		_print_str(id, width, right, true);
	}
	if (suffix)
		printf("%s", suffix);
	return SLURM_SUCCESS;
}
Exemple #15
0
/* Wait for all identified computed nodes to enter "on" state */
static void _wait_all_nodes_on(void)
{
	char *argv[10], *resp_msg;
	int i, nid_cnt = 0, status = 0;
	json_object *j;
	uint32_t *nid_array;
	time_t start_time = time(NULL);

	while ((difftime(time(NULL), start_time) < (30 * 60)) &&
	       (bit_set_count(node_bitmap) > 0)) {
		sleep(20);
		argv[0] = "capmc";
		argv[1] = "node_status";
		argv[2] = NULL;
		resp_msg = _run_script(argv, &status);
		if (status != 0) {
			error("%s: capmc(%s,%s,%s): %d %s", log_file,
				argv[1], argv[2], argv[3], status, resp_msg);
			break;
		}
		j = json_tokener_parse(resp_msg);
		if (j == NULL) {
			error("%s: json parser failed on %s",
			      log_file, resp_msg);
			xfree(resp_msg);
			break;
		}
		xfree(resp_msg);
		nid_cnt = 0;
		nid_array = _json_parse_nids(j, "on", &nid_cnt);
		json_object_put(j);	/* Frees json memory */
		for (i = 0; i < nid_cnt; i++) {
			bit_clear(node_bitmap, nid_array[i]);
		}
		xfree(nid_array);
	}
}
Exemple #16
0
/* Attempt to schedule a specific job on specific available nodes
 * IN job_ptr - job to schedule
 * IN/OUT avail_bitmap - nodes available/selected to use
 * IN exc_core_bitmap - cores which can not be used
 * RET SLURM_SUCCESS on success, otherwise an error code
 */
static int  _try_sched(struct job_record *job_ptr, bitstr_t **avail_bitmap,
		       uint32_t min_nodes, uint32_t max_nodes,
		       uint32_t req_nodes, bitstr_t *exc_core_bitmap)
{
	bitstr_t *tmp_bitmap;
	int rc = SLURM_SUCCESS;
	int feat_cnt = _num_feature_count(job_ptr);
	List preemptee_candidates = NULL;

	if (feat_cnt) {
		/* Ideally schedule the job feature by feature,
		 * but I don't want to add that complexity here
		 * right now, so clear the feature counts and try
		 * to schedule. This will work if there is only
		 * one feature count. It should work fairly well
		 * in cases where there are multiple feature
		 * counts. */
		struct job_details *detail_ptr = job_ptr->details;
		ListIterator feat_iter;
		struct feature_record *feat_ptr;
		int i = 0, list_size;
		uint16_t *feat_cnt_orig = NULL, high_cnt = 0;

		/* Clear the feature counts */
		list_size = list_count(detail_ptr->feature_list);
		feat_cnt_orig = xmalloc(sizeof(uint16_t) * list_size);
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr =
			(struct feature_record *) list_next(feat_iter))) {
			high_cnt = MAX(high_cnt, feat_ptr->count);
			feat_cnt_orig[i++] = feat_ptr->count;
			feat_ptr->count = 0;
		}
		list_iterator_destroy(feat_iter);

		if ((job_req_node_filter(job_ptr, *avail_bitmap) !=
		     SLURM_SUCCESS) ||
		    (bit_set_count(*avail_bitmap) < high_cnt)) {
			rc = ESLURM_NODES_BUSY;
		} else {
			preemptee_candidates =
					slurm_find_preemptable_jobs(job_ptr);
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       high_cnt, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates, NULL,
					       exc_core_bitmap);
		}

		/* Restore the feature counts */
		i = 0;
		feat_iter = list_iterator_create(detail_ptr->feature_list);
		while ((feat_ptr =
			(struct feature_record *) list_next(feat_iter))) {
			feat_ptr->count = feat_cnt_orig[i++];
		}
		list_iterator_destroy(feat_iter);
		xfree(feat_cnt_orig);
	} else {
		/* Try to schedule the job. First on dedicated nodes
		 * then on shared nodes (if so configured). */
		uint16_t orig_shared;
		time_t now = time(NULL);
		char str[100];

		preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
		orig_shared = job_ptr->details->shared;
		job_ptr->details->shared = 0;
		tmp_bitmap = bit_copy(*avail_bitmap);

		if (exc_core_bitmap) {
			bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap);
			debug2(" _try_sched with exclude core bitmap: %s",str);
		}

		rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
				       max_nodes, req_nodes,
				       SELECT_MODE_WILL_RUN,
				       preemptee_candidates, NULL,
				       exc_core_bitmap);

		job_ptr->details->shared = orig_shared;

		if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) &&
		    (orig_shared != 0)) {
			FREE_NULL_BITMAP(*avail_bitmap);
			*avail_bitmap= tmp_bitmap;
			rc = select_g_job_test(job_ptr, *avail_bitmap,
					       min_nodes, max_nodes, req_nodes,
					       SELECT_MODE_WILL_RUN,
					       preemptee_candidates, NULL,
					       exc_core_bitmap);
		} else
			FREE_NULL_BITMAP(tmp_bitmap);
	}

	if (preemptee_candidates)
		list_destroy(preemptee_candidates);
	return rc;

}
Exemple #17
0
/*
 * Attempt to start a job
 * jobid     (IN) - job id
 * task_cnt  (IN) - total count of tasks to start
 * hostlist  (IN) - SLURM hostlist expression with no repeated hostnames
 * tasklist  (IN/OUT) - comma separated list of hosts with tasks to be started,
 *                  list hostname once per task to start
 * comment_ptr (IN) - new comment field for the job or NULL for no change
 * err_code (OUT) - Moab error code
 * err_msg  (OUT) - Moab error message
 */
static int	_start_job(uint32_t jobid, int task_cnt, char *hostlist,
			char *tasklist, char *comment_ptr,
			int *err_code, char **err_msg)
{
	int rc = 0, old_task_cnt = 1;
	struct job_record *job_ptr;
	/* Write lock on job info, read lock on node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, READ_LOCK, NO_LOCK };
	char *new_node_list = NULL;
	static char tmp_msg[128];
	bitstr_t *new_bitmap = (bitstr_t *) NULL;
	bitstr_t *save_req_bitmap = (bitstr_t *) NULL;
	bitoff_t i, bsize;
	int ll; /* layout info index */
	char *node_name, *node_idx, *node_cur, *save_req_nodes = NULL;
	size_t node_name_len;
	static uint32_t cr_test = 0, cr_enabled = 0;

	if (cr_test == 0) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
						&cr_enabled);
		cr_test = 1;
	}

	lock_slurmctld(job_write_lock);
	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		*err_code = -700;
		*err_msg = "No such job";
		error("wiki: Failed to find job %u", jobid);
		rc = -1;
		goto fini;
	}

	if ((job_ptr->details == NULL) || (!IS_JOB_PENDING(job_ptr))) {
		*err_code = -700;
		*err_msg = "Job not pending, can't start";
		error("wiki: Attempt to start job %u in state %s",
			jobid, job_state_string(job_ptr->job_state));
		rc = -1;
		goto fini;
	}

	if (comment_ptr) {
		char *reserved = strstr(comment_ptr, "RESERVED:");
		if (reserved) {
			reserved += 9;
			job_ptr->details->reserved_resources =
				strtol(reserved, NULL, 10);
		}
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
	}

	if (task_cnt) {
		new_node_list = xstrdup(hostlist);
		if (node_name2bitmap(new_node_list, false, &new_bitmap) != 0) {
			*err_code = -700;
			*err_msg = "Invalid TASKLIST";
			error("wiki: Attempt to set invalid node list for "
				"job %u, %s",
				jobid, hostlist);
			xfree(new_node_list);
			rc = -1;
			goto fini;
		}

		if (!bit_super_set(new_bitmap, avail_node_bitmap)) {
			/* Selected node is UP and not responding
			 * or it just went DOWN */
			*err_code = -700;
			*err_msg = "TASKLIST includes non-responsive node";
			error("wiki: Attempt to use non-responsive nodes for "
				"job %u, %s",
				jobid, hostlist);
			xfree(new_node_list);
			FREE_NULL_BITMAP(new_bitmap);
			rc = -1;
			goto fini;
		}

		/* User excluded node list incompatible with Wiki
		 * Exclude all nodes not explicitly requested */
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		job_ptr->details->exc_node_bitmap = bit_copy(new_bitmap);
		bit_not(job_ptr->details->exc_node_bitmap);
	}

	/* Build layout information from tasklist (assuming that Moab
	 * sends a non-bracketed list of nodes, repeated as many times
	 * as cpus should be used per node); at this point, node names
	 * are comma-separated. This is _not_ a fast algorithm as it
	 * performs many string compares. */
	xfree(job_ptr->details->req_node_layout);
	if (task_cnt && cr_enabled) {
		uint16_t cpus_per_task = MAX(1, job_ptr->details->cpus_per_task);
		job_ptr->details->req_node_layout = (uint16_t *)
			xmalloc(bit_set_count(new_bitmap) * sizeof(uint16_t));
		bsize = bit_size(new_bitmap);
		for (i = 0, ll = -1; i < bsize; i++) {
			if (!bit_test(new_bitmap, i))
				continue;
			ll++;
			node_name = node_record_table_ptr[i].name;
			node_name_len  = strlen(node_name);
			if (node_name_len == 0)
				continue;
			node_cur = tasklist;
			while (*node_cur) {
				if ((node_idx = strstr(node_cur, node_name))) {
					if ((node_idx[node_name_len] == ',') ||
				 	    (node_idx[node_name_len] == '\0')) {
						job_ptr->details->
							req_node_layout[ll] +=
							cpus_per_task;
					}
					node_cur = strchr(node_idx, ',');
					if (node_cur)
						continue;
				}
				break;
			}
		}
	}

	/* save and update job state to start now */
	save_req_nodes = job_ptr->details->req_nodes;
	job_ptr->details->req_nodes = new_node_list;
	save_req_bitmap = job_ptr->details->req_node_bitmap;
	job_ptr->details->req_node_bitmap = new_bitmap;
	old_task_cnt = job_ptr->details->min_cpus;
	job_ptr->details->min_cpus = MAX(task_cnt, old_task_cnt);
	job_ptr->priority = 100000000;

 fini:	unlock_slurmctld(job_write_lock);
	if (rc)
		return rc;

	/* No errors so far */
	(void) schedule(INFINITE);	/* provides own locking */

	/* Check to insure the job was actually started */
	lock_slurmctld(job_write_lock);
	if (job_ptr->job_id != jobid)
		job_ptr = find_job_record(jobid);

	if (job_ptr && (job_ptr->job_id == jobid) &&
	    (!IS_JOB_RUNNING(job_ptr))) {
		uint16_t wait_reason = 0;
		char *wait_string;

		if (IS_JOB_FAILED(job_ptr))
			wait_string = "Invalid request, job aborted";
		else {
			wait_reason = job_ptr->state_reason;
			if (wait_reason == WAIT_HELD) {
				/* some job is completing, slurmctld did
				 * not even try to schedule this job */
				wait_reason = WAIT_RESOURCES;
			}
			wait_string = job_reason_string(wait_reason);
			job_ptr->state_reason = WAIT_HELD;
			xfree(job_ptr->state_desc);
		}
		*err_code = -910 - wait_reason;
		snprintf(tmp_msg, sizeof(tmp_msg),
			"Could not start job %u(%s): %s",
			jobid, new_node_list, wait_string);
		*err_msg = tmp_msg;
		error("wiki: %s", tmp_msg);

		/* restore some of job state */
		job_ptr->priority = 0;
		job_ptr->details->min_cpus = old_task_cnt;
		rc = -1;
	}

	if (job_ptr && (job_ptr->job_id == jobid) && job_ptr->details) {
		/* Restore required node list in case job requeued */
		xfree(job_ptr->details->req_nodes);
		job_ptr->details->req_nodes = save_req_nodes;
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		job_ptr->details->req_node_bitmap = save_req_bitmap;
		FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
		xfree(job_ptr->details->req_node_layout);
	} else {
		error("wiki: start_job(%u) job missing", jobid);
		xfree(save_req_nodes);
		FREE_NULL_BITMAP(save_req_bitmap);
	}

	unlock_slurmctld(job_write_lock);
	schedule_node_save();	/* provides own locking */
	schedule_job_save();	/* provides own locking */
	return rc;
}
Exemple #18
0
extern int select_nodeinfo_set_all(void)
{
	ListIterator itr = NULL;
	struct node_record *node_ptr = NULL;
	int i=0;
	bg_record_t *bg_record = NULL;
	static time_t last_set_all = 0;
	ba_mp_t *ba_mp;
	node_subgrp_t *subgrp = NULL;
	int bit_count;

	//uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (!blocks_are_created)
		return SLURM_NO_CHANGE_IN_DATA;

	if (!g_bitmap_size) {
		/* if (cluster_flags & CLUSTER_FLAG_BGQ) */
		/* 	g_bitmap_size = bg_conf->mp_cnode_cnt; */
		/* else */
			g_bitmap_size = bg_conf->ionodes_per_mp;
	}

	/* only set this once when the last_bg_update is newer than
	   the last time we set things up. */
	if (last_set_all && (last_bg_update-1 < last_set_all)) {
		debug2("Node select info for set all hasn't "
		       "changed since %ld",
		       last_set_all);
		return SLURM_NO_CHANGE_IN_DATA;
	}
	last_set_all = last_bg_update;

	/* set this here so we know things have changed */
	last_node_update = time(NULL);

	slurm_mutex_lock(&block_state_mutex);
	for (i=0; i<node_record_count; i++) {
		select_nodeinfo_t *nodeinfo;

		node_ptr = &(node_record_table_ptr[i]);
		xassert(node_ptr->select_nodeinfo);
		nodeinfo = node_ptr->select_nodeinfo->data;
		xassert(nodeinfo);
		xassert(nodeinfo->subgrp_list);
		list_flush(nodeinfo->subgrp_list);
		if (nodeinfo->bitmap_size != g_bitmap_size)
			nodeinfo->bitmap_size = g_bitmap_size;
	}

	itr = list_iterator_create(bg_lists->main);
	while ((bg_record = list_next(itr))) {
		enum node_states state = NODE_STATE_UNKNOWN;
		select_nodeinfo_t *nodeinfo;
		bitstr_t *bitmap;
		ListIterator itr2 = NULL;

		/* Only mark unidle blocks */
		if (bg_record->job_list && list_count(bg_record->job_list)) {
			struct job_record *job_ptr;
			select_jobinfo_t *jobinfo;
			ListIterator itr =
				list_iterator_create(bg_record->job_list);
			ba_mp = list_peek(bg_record->ba_mp_list);
			node_ptr = &(node_record_table_ptr[ba_mp->index]);
			xassert(node_ptr->select_nodeinfo);
			nodeinfo = node_ptr->select_nodeinfo->data;
			xassert(nodeinfo);
			xassert(nodeinfo->subgrp_list);
			if (ba_mp->cnode_err_bitmap
			    && (bit_count =
				bit_set_count(ba_mp->cnode_err_bitmap))) {
				subgrp = _find_subgrp(nodeinfo->subgrp_list,
						      NODE_STATE_ERROR,
						      g_bitmap_size);
				/* FIXME: the subgrp->bitmap isn't set here. */
				subgrp->cnode_cnt += bit_count;
			}

			subgrp = _find_subgrp(nodeinfo->subgrp_list,
					      NODE_STATE_ALLOCATED,
					      g_bitmap_size);
			while ((job_ptr = list_next(itr))) {
				jobinfo = job_ptr->select_jobinfo->data;
				/* FIXME: the subgrp->bitmap isn't set here. */
				subgrp->cnode_cnt += jobinfo->cnode_cnt;
			}
			list_iterator_destroy(itr);
			continue;
		} else if (bg_record->job_running == NO_JOB_RUNNING)
			continue;

		if (bg_record->state & BG_BLOCK_ERROR_FLAG)
			state = NODE_STATE_ERROR;
		else if (bg_record->job_running > NO_JOB_RUNNING) {
			/* we don't need to set the allocated here
			 * since the whole midplane is allocated */
			if (bg_record->conn_type[0] < SELECT_SMALL)
				continue;
			state = NODE_STATE_ALLOCATED;
		} else {
			error("not sure why we got here with block %s %s",
			      bg_record->bg_block_id,
			      bg_block_state_string(bg_record->state));
			continue;
		}
		/* if ((cluster_flags & CLUSTER_FLAG_BGQ) */
		/*     && (state != NODE_STATE_ERROR)) */
		/* 	bitmap = bg_record->cnodes_used_bitmap; */
		/* else */
		bitmap = bg_record->ionode_bitmap;

		itr2 = list_iterator_create(bg_record->ba_mp_list);
		while ((ba_mp = list_next(itr2))) {
			if (!ba_mp->used)
				continue;

			node_ptr = &(node_record_table_ptr[ba_mp->index]);

			xassert(node_ptr->select_nodeinfo);
			nodeinfo = node_ptr->select_nodeinfo->data;
			xassert(nodeinfo);
			xassert(nodeinfo->subgrp_list);

			if (ba_mp->cnode_err_bitmap
			    && (state == NODE_STATE_ALLOCATED)
			    && (bit_count =
				bit_set_count(ba_mp->cnode_err_bitmap))) {
				subgrp = _find_subgrp(nodeinfo->subgrp_list,
						      NODE_STATE_ERROR,
						      g_bitmap_size);
				/* FIXME: the subgrp->bitmap isn't set here. */
				subgrp->cnode_cnt += bit_count;
			}

			subgrp = _find_subgrp(nodeinfo->subgrp_list,
					      state, g_bitmap_size);

			if (subgrp->cnode_cnt < bg_conf->mp_cnode_cnt) {
				/* if (cluster_flags & CLUSTER_FLAG_BGQ) { */
				/* 	bit_or(subgrp->bitmap, bitmap); */
				/* 	subgrp->cnode_cnt += */
				/* 		bit_set_count(bitmap); */
				/* } else */ if (bg_record->cnode_cnt
					   < bg_conf->mp_cnode_cnt) {
					bit_or(subgrp->bitmap, bitmap);
					subgrp->cnode_cnt +=
						bg_record->cnode_cnt;
				} else {
					bit_nset(subgrp->bitmap,
						 0,
						 (g_bitmap_size-1));
					subgrp->cnode_cnt =
						bg_conf->mp_cnode_cnt;
				}
			}
		}
		list_iterator_destroy(itr2);
	}
	list_iterator_destroy(itr);
	slurm_mutex_unlock(&block_state_mutex);

	return SLURM_SUCCESS;
}
Exemple #19
0
/*
 * _task_layout_lllp_block
 *
 * task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 *  Distribution at the lllp:
 *  -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (tasks
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket)  with is expressed cpu_bind masks.
 *
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
	masks = *masks_p;

	/* block distribution with oversubsciption */
	c = 0;
	while(taskcount < max_tasks) {
		if (taskcount == last_taskcount) {
			fatal("_task_layout_lllp_block infinite loop");
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it
		 */
		for (i = 0; i < size; i++) {
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;

			if (!masks[taskcount])
				masks[taskcount] = bit_alloc(
					conf->block_map_size);
			//info("setting %d %d", taskcount, i);
			bit_set(masks[taskcount], i);

			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				i += hw_threads-1;

			if (++c < req->cpus_per_task)
				continue;
			c = 0;
			if (++taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
			hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
Exemple #20
0
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr, **bf_part_ptr = NULL;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit, part_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL, *non_cg_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end, window_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0, bf_parts = 0, *bf_part_jobs = NULL;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;
	struct part_record *reject_array_part = NULL;
	uint32_t job_start_cnt = 0, start_time;
	time_t config_update = slurmctld_conf.last_update;
	time_t part_update = last_part_update;
	struct timeval start_tv;

	bf_last_yields = 0;
#ifdef HAVE_ALPS_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1000000);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	else
		debug("backfill: beginning");
	sched_start = now = time(NULL);
	gettimeofday(&start_tv, NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true, true);
	if (list_count(job_queue) == 0) {
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill: no jobs to backfill");
		else
			debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	non_cg_bitmap = bit_copy(cg_node_bitmap);
	bit_not(non_cg_bitmap);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt * 2 + 1));
	node_space[0].begin_time = sched_start;
	window_end = sched_start + backfill_window;
	node_space[0].end_time = window_end;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_part) {
		ListIterator part_iterator;
		struct part_record *part_ptr;
		bf_parts = list_count(part_list);
		bf_part_ptr  = xmalloc(sizeof(struct part_record *) * bf_parts);
		bf_part_jobs = xmalloc(sizeof(int) * bf_parts);
		part_iterator = list_iterator_create(part_list);
		i = 0;
		while ((part_ptr = (struct part_record *)
				   list_next(part_iterator))) {
			bf_part_ptr[i++] = part_ptr;
		}
		list_iterator_destroy(part_iterator);
	}
	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	sort_job_queue(job_queue);
	while (1) {
		job_queue_rec = (job_queue_rec_t *) list_pop(job_queue);
		if (!job_queue_rec) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: reached end of job queue");
			break;
		}
		if (slurmctld_config.shutdown_time)
			break;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				xfree(job_queue_rec);
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 0;
			START_TIMER;
		}

		job_ptr  = job_queue_rec->job_ptr;
		/* With bf_continue configured, the original job could have
		 * been cancelled and purged. Validate pointer here. */
		if ((job_ptr->magic  != JOB_MAGIC) ||
		    (job_ptr->job_id != job_queue_rec->job_id)) {
			xfree(job_queue_rec);
			continue;
		}
		orig_time_limit = job_ptr->time_limit;
		part_ptr = job_queue_rec->part_ptr;

		job_test_count++;
		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != NO_VAL) {
			if ((reject_array_job_id == job_ptr->array_job_id) &&
			    (reject_array_part   == part_ptr))
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
			reject_array_part   = part_ptr;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill test for JobID=%u Prio=%u Partition=%s",
			     job_ptr->job_id, job_ptr->priority,
			     job_ptr->part_ptr->name);
		}

		if (max_backfill_job_per_part) {
			bool skip_job = false;
			for (j = 0; j < bf_parts; j++) {
				if (bf_part_ptr[j] != job_ptr->part_ptr)
					continue;
				if (bf_part_jobs[j]++ >=
				    max_backfill_job_per_part)
					skip_job = true;
				break;
			}
			if (skip_job) {
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					info("backfill: have already "
					     "checked %u jobs for "
					     "partition %s; skipping "
					     "job %u",
					     max_backfill_job_per_part,
					     job_ptr->part_ptr->name,
					     job_ptr->job_id);
				continue;
			}
		}
		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				static bool bf_max_user_msg = true;
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else if (bf_max_user_msg) {
					bf_max_user_msg = false;
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] >= max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						info("backfill: have already "
						     "checked %u jobs for "
						     "user %u; skipping "
						     "job %u",
						     max_backfill_job_per_user,
						     job_ptr->user_id,
						     job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL) ||
		    ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: partition %s not usable",
				     job_ptr->part_ptr->name);
			continue;
		}

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u not runable now",
				     job_ptr->job_id);
			continue;
		}

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u node count too high",
				     job_ptr->job_id);
			continue;
		}

		/* Determine job's expected completion time */
		if (part_ptr->max_time == INFINITE)
			part_time_limit = 365 * 24 * 60; /* one year */
		else
			part_time_limit = part_ptr->max_time;
		if (job_ptr->time_limit == NO_VAL) {
			time_limit = part_time_limit;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_time_limit);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if (slurmctld_config.shutdown_time)
			break;
		if (((defer_rpc_cnt > 0) &&
		     (slurmctld_config.server_thread_count >= defer_rpc_cnt)) ||
		    (_delta_tv(&start_tv) >= sched_timeout)) {
			uint32_t save_job_id = job_ptr->job_id;
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %u(%d) jobs, %s",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count, TIME_STR);
			}
			if ((_yield_locks(yield_sleep) && !backfill_continue) ||
			    (slurmctld_conf.last_update != config_update) ||
			    (last_part_update != part_update)) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing "
					     "%u(%d) jobs",
					     slurmctld_diag_stats.bf_last_depth,
					     job_test_count);
				}
				rc = 1;
				break;
			}
			/* cg_node_bitmap may be changed */
			bit_copybits(non_cg_bitmap, cg_node_bitmap);
			bit_not(non_cg_bitmap);

			/* With bf_continue configured, the original job could
			 * have been scheduled or cancelled and purged.
			 * Revalidate job the record here. */
			if ((job_ptr->magic  != JOB_MAGIC) ||
			    (job_ptr->job_id != save_job_id))
				continue;
			if (!IS_JOB_PENDING(job_ptr))
				continue;
			if (!avail_front_end(job_ptr))
				continue;	/* No available frontend */

			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			gettimeofday(&start_tv, NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res   = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: job %u reservation defer",
				     job_ptr->job_id);
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		bit_and(avail_bitmap, non_cg_bitmap);
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if (resv_end && (++resv_end < window_end) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features OR
		 *	no change since previously tested nodes (only changes
		 *	in other partition nodes) */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}

			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_job_test(job_ptr, avail_bitmap, start_res);
		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {	/* Can start now */
			uint32_t save_time_limit = job_ptr->time_limit;
			uint32_t hard_limit;
			bool reset_time = false;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL) {
					acct_policy_alter_job(
						job_ptr, comp_time_limit);
					job_ptr->time_limit = comp_time_limit;
				} else {
					acct_policy_alter_job(
						job_ptr, orig_time_limit);
					job_ptr->time_limit = orig_time_limit;
				}
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
				reset_time = true;
			} else if (orig_time_limit == NO_VAL) {
				acct_policy_alter_job(job_ptr, comp_time_limit);
				job_ptr->time_limit = comp_time_limit;
			} else {
				acct_policy_alter_job(job_ptr, orig_time_limit);
				job_ptr->time_limit = orig_time_limit;

			}
			if (job_ptr->time_limit == INFINITE)
				hard_limit = 365 * 24 * 60;	/* one year */
			else
				hard_limit = job_ptr->time_limit;
			job_ptr->end_time = job_ptr->start_time +
					    (hard_limit * 60);
			if (reset_time) {
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			}

			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: planned start of job %u"
					     " failed: %s", job_ptr->job_id,
					     slurm_strerror(rc));
				}
				/* Drop through and reserve these resources.
				 * Likely due to state changes during sleep.
				 * Make best-effort based upon original state */
				job_ptr->time_limit = orig_time_limit;
				later_start = 0;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;
				reject_array_part   = NULL;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				job_start_cnt++;
				if (max_backfill_jobs_start &&
				    (job_start_cnt >= max_backfill_jobs_start)){
					if (debug_flags & DEBUG_FLAG_BACKFILL) {
						info("backfill: bf_max_job_start"
						     " limit of %d reached",
						     max_backfill_jobs_start);
					}
					break;
				}
				continue;
			}
		} else {
			job_ptr->time_limit = orig_time_limit;
		}

		start_time  = job_ptr->start_time;
		end_reserve = job_ptr->start_time + (time_limit * 60);
		start_time  = (start_time / backfill_resolution) *
			      backfill_resolution;
		end_reserve = (end_reserve / backfill_resolution) *
			      backfill_resolution;

		if (later_start && (start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				_dump_job_sched(job_ptr, end_reserve,
						avail_bitmap);
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: table size limit of %u reached",
				     max_backfill_job_cnt);
			}
			break;
		}

		if ((job_ptr->start_time > now) &&
		    _test_resv_overlap(node_space, avail_bitmap,
				       start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		reject_array_part   = NULL;
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_job_sched(job_ptr, end_reserve, avail_bitmap);
		xfree(job_ptr->sched_nodes);
		job_ptr->sched_nodes = bitmap2node_name(avail_bitmap);
		bit_not(avail_bitmap);
		_add_reservation(start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
			_dump_node_space_table(node_space);
	}
	xfree(bf_part_jobs);
	xfree(bf_part_ptr);
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);
	FREE_NULL_BITMAP(non_cg_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %u(%d) jobs, %s",
		     slurmctld_diag_stats.bf_last_depth,
		     job_test_count, TIME_STR);
	}
	return rc;
}
Exemple #21
0
/*
 * Try to find resources for a given job request
 * IN job_ptr - pointer to job record in slurmctld
 * IN/OUT bitmap - nodes available for assignment to job, clear those not to
 *	be used
 * IN min_nodes, max_nodes  - minimum and maximum number of nodes to allocate
 *	to this job (considers slurm block limits)
 * IN mode - SELECT_MODE_RUN_NOW: try to schedule job now
 *           SELECT_MODE_TEST_ONLY: test if job can ever run
 *           SELECT_MODE_WILL_RUN: determine when and where job can run
 * IN preemptee_candidates - List of pointers to jobs which can be preempted.
 * IN/OUT preemptee_job_list - Pointer to list of job pointers. These are the
 *		jobs to be preempted to initiate the pending job. Not set
 *		if mode=SELECT_MODE_TEST_ONLY or input pointer is NULL.
 * RET - SLURM_SUCCESS if job runnable now, error code otherwise
 */
extern int submit_job(struct job_record *job_ptr, bitstr_t *slurm_block_bitmap,
		      uint32_t min_nodes, uint32_t max_nodes,
		      uint32_t req_nodes, uint16_t mode,
		      List preemptee_candidates,
		      List *preemptee_job_list)
{
	int rc = SLURM_SUCCESS;
	bg_record_t* bg_record = NULL;
	char buf[256];
	uint16_t conn_type[SYSTEM_DIMENSIONS];
	List block_list = NULL;
	int blocks_added = 0;
	time_t starttime = time(NULL);
	uint16_t local_mode = mode;
	int avail_cpus = num_unused_cpus;
	int dim = 0;

	for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
		conn_type[dim] = (uint16_t)NO_VAL;
	if (preemptee_candidates && preemptee_job_list
	    && list_count(preemptee_candidates))
		local_mode |= SELECT_MODE_PREEMPT_FLAG;
	else
		local_mode |= SELECT_MODE_CHECK_FULL;

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
		slurm_mutex_lock(&create_dynamic_mutex);

	slurm_mutex_lock(&block_state_mutex);
	block_list = copy_bg_list(bg_lists->main);
	slurm_mutex_unlock(&block_state_mutex);

	get_select_jobinfo(job_ptr->select_jobinfo->data,
			   SELECT_JOBDATA_CONN_TYPE, &conn_type);
	if (conn_type[0] == SELECT_NAV) {
		if (bg_conf->mp_cnode_cnt == bg_conf->nodecard_cnode_cnt)
			conn_type[0] = SELECT_SMALL;
		else if (min_nodes > 1) {
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				conn_type[dim] = SELECT_TORUS;
		} else if (job_ptr->details->min_cpus < bg_conf->cpus_per_mp)
			conn_type[0] = SELECT_SMALL;
		else {
			for (dim=1; dim<SYSTEM_DIMENSIONS; dim++)
				conn_type[dim] = SELECT_NAV;
		}
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_CONN_TYPE,
				   &conn_type);
	}

	if (slurm_block_bitmap && !bit_set_count(slurm_block_bitmap)) {
		error("no nodes given to place job %u.", job_ptr->job_id);

		if (bg_conf->layout_mode == LAYOUT_DYNAMIC)
			slurm_mutex_unlock(&create_dynamic_mutex);

		return SLURM_ERROR;
	}

	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_MIXED);

	debug("bluegene:submit_job: %u mode=%d %s nodes=%u-%u-%u",
	      job_ptr->job_id, local_mode, buf,
	      min_nodes, req_nodes, max_nodes);

#ifdef HAVE_BG_L_P
# ifdef HAVE_BGL
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_BLRTS_IMAGE);
	debug3("BlrtsImage=%s", buf);
# endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_LINUX_IMAGE);
# ifdef HAVE_BGL
	debug3("LinuxImage=%s", buf);
# else
	debug3("ComputNodeImage=%s", buf);
# endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_RAMDISK_IMAGE);
# ifdef HAVE_BGL
	debug3("RamDiskImage=%s", buf);
# else
	debug3("RamDiskIoLoadImage=%s", buf);
# endif
#endif
	sprint_select_jobinfo(job_ptr->select_jobinfo->data,
			      buf, sizeof(buf),
			      SELECT_PRINT_MLOADER_IMAGE);
	debug3("MloaderImage=%s", buf);

	/* First look at the empty space, and then remove the
	   preemptable jobs and try again. */
	list_sort(block_list, (ListCmpF)bg_record_sort_aval_inc);

	rc = _find_best_block_match(block_list, &blocks_added,
				    job_ptr, slurm_block_bitmap, min_nodes,
				    max_nodes, req_nodes,
				    &bg_record, local_mode, avail_cpus);

	if (rc == SLURM_SUCCESS && SELECT_IS_PREEMPT_SET(local_mode)) {
		ListIterator itr;
		ListIterator job_itr;
		bg_record_t *found_record;
		struct job_record *preempt_job_ptr;

		if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
			info("doing preemption");
		local_mode |= SELECT_MODE_CHECK_FULL;

		job_itr = list_iterator_create(preemptee_candidates);
		itr = list_iterator_create(block_list);
		while ((preempt_job_ptr = list_next(job_itr))) {
			while ((found_record = list_next(itr))) {
				if (found_record->job_ptr == preempt_job_ptr) {
					/* info("removing job %u running on %s", */
					/*      preempt_job_ptr->job_id, */
					/*      found_record->bg_block_id); */
					found_record->job_ptr = NULL;
					found_record->job_running =
						NO_JOB_RUNNING;
					avail_cpus += found_record->cpu_cnt;
					break;
				}
			}
			if (!found_record) {
				list_iterator_reset(itr);
				error("Job %u wasn't found running anywhere, "
				      "can't preempt",
				      preempt_job_ptr->job_id);
				continue;
			} else if (job_ptr->details->min_cpus > avail_cpus)
				continue;

			list_sort(block_list,
				  (ListCmpF)bg_record_sort_aval_inc);
			if ((rc = _find_best_block_match(
				     block_list, &blocks_added,
				     job_ptr, slurm_block_bitmap,
				     min_nodes, max_nodes, req_nodes,
				     &bg_record, local_mode, avail_cpus))
			    == SLURM_SUCCESS)
				break;

			list_iterator_reset(itr);
		}
		list_iterator_destroy(itr);
		list_iterator_destroy(job_itr);
	}

	if (rc == SLURM_SUCCESS) {
		if (!bg_record)
			fatal("we got a success, but no block back");
		/* Here we see if there is a job running since
		 * some jobs take awhile to finish we need to
		 * make sure the time of the end is in the
		 * future.  If it isn't (meaning it is in the
		 * past or current time) we add 5 seconds to
		 * it so we don't use the block immediately.
		 */
		if (bg_record->job_ptr
		    && bg_record->job_ptr->end_time) {
			if (bg_record->job_ptr->end_time <= starttime)
				starttime += 5;
			else
				starttime = bg_record->job_ptr->end_time;
		} else if (bg_record->job_running == BLOCK_ERROR_STATE)
			starttime = INFINITE;

		/* make sure the job is eligible to run */
		if (job_ptr->details->begin_time > starttime)
			starttime = job_ptr->details->begin_time;

		job_ptr->start_time = starttime;

		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_NODES,
				   bg_record->mp_str);
		set_select_jobinfo(job_ptr->select_jobinfo->data,
				   SELECT_JOBDATA_IONODES,
				   bg_record->ionode_str);
		if (!bg_record->bg_block_id) {
			debug("%d can start unassigned job %u "
			      "at %ld on %s",
			      local_mode, job_ptr->job_id,
			      starttime, bg_record->mp_str);

			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_BLOCK_PTR,
					   NULL);
			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &bg_record->cnode_cnt);
		} else {
			if ((bg_record->ionode_str)
			    && (job_ptr->part_ptr->max_share <= 1))
				error("Small block used in "
				      "non-shared partition");

			debug("%d(%d) can start job %u "
			      "at %ld on %s(%s) %d",
			      local_mode, mode, job_ptr->job_id,
			      starttime, bg_record->bg_block_id,
			      bg_record->mp_str,
			      SELECT_IS_MODE_RUN_NOW(local_mode));

			if (SELECT_IS_MODE_RUN_NOW(local_mode)) {
				/* Set this up to be the
				   correct pointer since we
				   probably are working off a
				   copy.
				*/
				if (bg_record->original)
					bg_record = bg_record->original;
				set_select_jobinfo(
					job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_BLOCK_PTR,
					bg_record);
				if (job_ptr) {
					bg_record->job_running =
						job_ptr->job_id;
					bg_record->job_ptr = job_ptr;

					job_ptr->job_state |= JOB_CONFIGURING;
					last_bg_update = time(NULL);
				}
			} else {
				set_select_jobinfo(
					job_ptr->select_jobinfo->data,
					SELECT_JOBDATA_BLOCK_PTR,
					NULL);
				/* Just to make sure we don't
				   end up using this on
				   another job, or we have to
				   wait until preemption is
				   done.
				*/
				bg_record->job_ptr = NULL;
				bg_record->job_running = NO_JOB_RUNNING;
			}

			set_select_jobinfo(job_ptr->select_jobinfo->data,
					   SELECT_JOBDATA_NODE_CNT,
					   &bg_record->cnode_cnt);
		}
		if (SELECT_IS_MODE_RUN_NOW(local_mode))
			_build_select_struct(job_ptr,
					     slurm_block_bitmap,
					     bg_record->cnode_cnt);
		/* set up the preempted job list */
		if (SELECT_IS_PREEMPT_SET(local_mode)) {
			if (*preemptee_job_list)
				list_destroy(*preemptee_job_list);
			*preemptee_job_list = _get_preemptables(
				local_mode, bg_record,
				preemptee_candidates);
		}
		if (!bg_record->bg_block_id) {
			/* This is a fake record so we need to
			 * destroy it after we get the info from
			 * it.  If it was just testing then
			 * we added this record to the
			 * block_list.  If this is the case
			 * it will be handled if se sync the
			 * lists.  But we don't want to do
			 * that so we will set blocks_added to
			 * 0 so it doesn't happen. */
			if (!blocks_added) {
				destroy_bg_record(bg_record);
				bg_record = NULL;
			}
			blocks_added = 0;
		}
		last_job_update = time(NULL);
	}

	if (bg_conf->layout_mode == LAYOUT_DYNAMIC) {
		slurm_mutex_lock(&block_state_mutex);
		if (blocks_added)
			_sync_block_lists(block_list, bg_lists->main);
		slurm_mutex_unlock(&block_state_mutex);
		slurm_mutex_unlock(&create_dynamic_mutex);
	}

	list_destroy(block_list);
	return rc;
}
Exemple #22
0
/*
 * route_p_split_hostlist - logic to split an input hostlist into
 *                           a set of hostlists to forward to.
 *
 * IN: hl        - hostlist_t   - list of every node to send message to
 *                                will be empty on return;
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: created hostlist will have to be freed independently using
 *       hostlist_destroy by the caller.
 * Note: the hostlist_t array will have to be xfree.
 */
extern int route_p_split_hostlist(hostlist_t hl,
				  hostlist_t** sp_hl,
				  int* count)
{
	int i, j, k, hl_ndx, msg_count, sw_count, lst_count;
	char  *buf;
	bitstr_t *nodes_bitmap = NULL;		/* nodes in message list */
	bitstr_t *fwd_bitmap = NULL;		/* nodes in forward list */

	msg_count = hostlist_count(hl);
	if (switch_record_cnt == 0) {
		/* configs have not already been processed */
		slurm_conf_init(NULL);
		if (init_node_conf()) {
			fatal("ROUTE: Failed to init slurm config");
		}
		if (build_all_nodeline_info(false)) {
			fatal("ROUTE: Failed to build node config");
		}
		rehash_node();

		if (slurm_topo_build_config() != SLURM_SUCCESS) {
			fatal("ROUTE: Failed to build topology config");
		}
	}
	*sp_hl = (hostlist_t*) xmalloc(switch_record_cnt * sizeof(hostlist_t));
	/* create bitmap of nodes to send message too */
	if (hostlist2bitmap (hl, false, &nodes_bitmap) != SLURM_SUCCESS) {
		buf = hostlist_ranged_string_xmalloc(hl);
		fatal("ROUTE: Failed to make bitmap from hostlist=%s.", buf);
	}

	/* Find lowest level switch containing all the nodes in the list */
	j = 0;
	for (i = 0; i <= switch_levels; i++) {
		for (j=0; j<switch_record_cnt; j++) {
			if (switch_record_table[j].level == i) {
				if (bit_super_set(nodes_bitmap,
						  switch_record_table[j].
						  node_bitmap)) {
					/* All nodes in message list are in
					 * this switch */
					break;
				}
			}
		}
		if (j < switch_record_cnt) {
			/* Got here via break after bit_super_set */
			break; // 'j' is our switch
		} /* else, no switches at this level reach all nodes */
	}
	if (i > switch_levels) {
		/* This can only happen if trying to schedule multiple physical
		 * clusters as a single logical cluster under the control of a
		 * single slurmctld daemon, and sending something like a
		 * node_registation request to all nodes.
		 * Revert to default behavior*/
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc(hl);
			debug("ROUTE: didn't find switch containing nodes=%s",
			      buf);
			xfree(buf);
		}
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}
	if (switch_record_table[j].level == 0) {
		/* This is a leaf switch. Construct list based on TreeWidth */
		FREE_NULL_BITMAP(nodes_bitmap);
		xfree(*sp_hl);
		return route_split_hostlist_treewidth(hl, sp_hl, count);
	}
	/* loop through children, construction a hostlist for each child switch
	 * with nodes in the message list */
	hl_ndx = 0;
	lst_count = 0;
	for (i=0; i < switch_record_table[j].num_switches; i++) {
		k = switch_record_table[j].switch_index[i];
		fwd_bitmap = bit_copy(switch_record_table[k].node_bitmap);
		bit_and(fwd_bitmap, nodes_bitmap);
		sw_count = bit_set_count(fwd_bitmap);
		if (sw_count == 0) {
			continue; /* no nodes on this switch in message list */
		}
		(*sp_hl)[hl_ndx] = bitmap2hostlist(fwd_bitmap);
		/* Now remove nodes from this switch from message list */
		bit_not(fwd_bitmap);
		bit_and(nodes_bitmap, fwd_bitmap);
		FREE_NULL_BITMAP(fwd_bitmap);
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[hl_ndx]);
			debug("ROUTE: ... sublist[%d] switch=%s :: %s",
			      i, switch_record_table[i].name, buf);
			xfree(buf);
		}
		hl_ndx++;
		lst_count += sw_count;
		if (lst_count == msg_count)
			break; /* all nodes in message are in a child list */
	}
	FREE_NULL_BITMAP(nodes_bitmap);

	*count = hl_ndx;
	return SLURM_SUCCESS;

}
/*
 * _task_layout_lllp_block
 *
 * task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 *  Distribution at the lllp:
 *  -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (tasks
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket)  with is expressed cpu_bind masks.
 *
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;
	int core_inx, pu_per_core, *core_tasks = NULL;
	int sock_inx, pu_per_socket, *socket_tasks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
	    (max_cpus > (hw_sockets * hw_cores))) {
		/* More CPUs requested than available cores,
		 * disable core-level binding */
		req->cpu_bind_type &= (~CPU_BIND_ONE_THREAD_PER_CORE);
	}

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
	masks = *masks_p;

	pu_per_core = hw_threads;
	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
	pu_per_socket = hw_cores * hw_threads;
	socket_tasks = xmalloc(sizeof(int) * hw_sockets);

	/* block distribution with oversubsciption */
	c = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_block infinite loop");
		if (taskcount > 0) {
			/* Clear counters to over-subscribe, if necessary */
			memset(core_tasks, 0,
			       (sizeof(int) * hw_sockets * hw_cores));
			memset(socket_tasks, 0,
			       (sizeof(int) * hw_sockets));
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it
		 */
		for (i = 0; i < size; i++) {
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;

			core_inx = i / pu_per_core;
			if ((req->ntasks_per_core != 0) &&
			    (core_tasks[core_inx] >= req->ntasks_per_core))
				continue;
			sock_inx = i / pu_per_socket;
			if ((req->ntasks_per_socket != 0) &&
			    (socket_tasks[sock_inx] >= req->ntasks_per_socket))
				continue;
			socket_tasks[sock_inx]++;

			if (!masks[taskcount])
				masks[taskcount] = bit_alloc(
					conf->block_map_size);
			//info("setting %d %d", taskcount, i);
			bit_set(masks[taskcount], i);

			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				i += hw_threads - 1;

			if (++c < req->cpus_per_task)
				continue;
			core_tasks[core_inx]++;
			/* Binding to cores, skip remaining of the threads */
			if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
			    && ((req->cpu_bind_type & CPU_BIND_TO_CORES)
				|| (req->ntasks_per_core == 1))) {
				int threads_not_used;
				if (req->cpus_per_task < hw_threads)
					threads_not_used =
						hw_threads - req->cpus_per_task;
				else
					threads_not_used =
						req->cpus_per_task % hw_threads;
				i += threads_not_used;
			}
			c = 0;
			if (++taskcount >= max_tasks)
				break;
		}
	}
	xfree(core_tasks);
	xfree(socket_tasks);

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
			hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
Exemple #24
0
/*
 * _task_layout_lllp_block
 *
 * task_layout_lllp_block will create a block distribution at the
 * lowest level of logical processor which is either socket, core or
 * thread depending on the system architecture. The Block algorithm
 * is the same as the Block distribution performed in srun.
 *
 *  Distribution at the lllp:
 *  -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (tasks
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket)  with is expressed cpu_bind masks.
 *
 */
static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
				   uint32_t node_id, bitstr_t ***masks_p)
{
	int c, i, j, t, size, last_taskcount = -1, taskcount = 0;
	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	int *task_array;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;

	info("_task_layout_lllp_block ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map) {
		return SLURM_ERROR;
	}

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}
	size = bit_size(avail_map);

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
	masks = *masks_p;

	task_array = xmalloc(size * sizeof(int));
	if (!task_array) {
		error("In lllp_block: task_array memory error");
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}

	/* block distribution with oversubsciption */
	c = 0;
	while(taskcount < max_tasks) {
		if (taskcount == last_taskcount) {
			fatal("_task_layout_lllp_block infinite loop");
		}
		last_taskcount = taskcount;
		/* the abstract map is already laid out in block order,
		 * so just iterate over it
		 */
		for (i = 0; i < size; i++) {
			/* skip unrequested threads */
			if (i%hw_threads >= hw_threads)
				continue;
			/* skip unavailable resources */
			if (bit_test(avail_map, i) == 0)
				continue;
			/* if multiple CPUs per task, only
			 * count the task on the first CPU */
			if (c == 0)
				task_array[i] += 1;
			if (++c < req->cpus_per_task)
				continue;
			c = 0;
			if (++taskcount >= max_tasks)
				break;
		}
	}
	/* Distribute the tasks and create per-task masks that only
	 * contain the first CPU. Note that unused resources
	 * (task_array[i] == 0) will get skipped */
	taskcount = 0;
	for (i = 0; i < size; i++) {
		for (t = 0; t < task_array[i]; t++) {
			if (masks[taskcount] == NULL)
				masks[taskcount] = (bitstr_t *)bit_alloc(conf->block_map_size);
			bit_set(masks[taskcount++], i);
		}
	}
	/* now set additional CPUs for cpus_per_task > 1 */
	for (t=0; t<max_tasks && req->cpus_per_task>1; t++) {
		if (!masks[t])
			continue;
		c = 0;
		for (i = 0; i < size && c<req->cpus_per_task; i++) {
			if (bit_test(masks[t], i) == 0)
				continue;
			for (j=i+1,c=1; j<size && c<req->cpus_per_task;j++) {
				if (bit_test(avail_map, j) == 0)
					continue;
				bit_set(masks[t], j);
				c++;
			}
			if (c < req->cpus_per_task) {
				/* we haven't found all of the CPUs for this
				 * task, so we'll wrap the search to cover the
				 * whole node */
				for (j=0; j<i && c<req->cpus_per_task; j++) {
					if (bit_test(avail_map, j) == 0)
						continue;
					bit_set(masks[t], j);
					c++;
				}
			}
		}
	}

	xfree(task_array);

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
			hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
Exemple #25
0
/*
 * _task_layout_lllp_cyclic
 *
 * task_layout_lllp_cyclic creates a cyclic distribution at the
 * lowest level of logical processor which is either socket, core or
 * thread depending on the system architecture. The Cyclic algorithm
 * is the same as the Cyclic distribution performed in srun.
 *
 *  Distribution at the lllp:
 *  -m hostfile|plane|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|plane|block|cyclic" is computed
 * in srun. The second distribution "plane|block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (tasks
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket) with is expressed cpu_bind masks.
 *
 */
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
				    uint32_t node_id, bitstr_t ***masks_p)
{
	int last_taskcount = -1, taskcount = 0;
	uint16_t c, i, s, t, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	int size, max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	int avail_size;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;

	info ("_task_layout_lllp_cyclic ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map)
		return SLURM_ERROR;
	avail_size = bit_size(avail_map);

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
	masks = *masks_p;

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}

	i = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_cyclic failure");
		last_taskcount = taskcount;
		for (t = 0; t < hw_threads; t++) {
			for (c = 0; c < hw_cores; c++) {
				for (s = 0; s < hw_sockets; s++) {
					uint16_t bit = s*(hw_cores*hw_threads) +
						       c*(hw_threads) + t;
					/* In case hardware and config differ */
					bit %= avail_size;
					if (bit_test(avail_map, bit) == 0)
						continue;
					if (masks[taskcount] == NULL) {
						masks[taskcount] =
							(bitstr_t *)
							bit_alloc(conf->
								  block_map_size);
					}
					bit_set(masks[taskcount], bit);

					if (++i < req->cpus_per_task)
						continue;
					i = 0;
					if (++taskcount >= max_tasks)
						break;
				}
				if (taskcount >= max_tasks)
					break;
			}
			if (taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
			hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);

	return SLURM_SUCCESS;
}
Exemple #26
0
/* cr_job_test - does most of the real work for select_p_job_test(), which
 *	includes contiguous selection, load-leveling and max_share logic
 *
 * PROCEDURE:
 *
 * Step 1: compare nodes in "avail" bitmap with current node state data
 *         to find available nodes that match the job request
 *
 * Step 2: check resources in "avail" bitmap with allocated resources from
 *         higher priority partitions (busy resources are UNavailable)
 *
 * Step 3: select resource usage on remaining resources in "avail" bitmap
 *         for this job, with the placement influenced by existing
 *         allocations
 */
extern int cr_job_test(struct job_record *job_ptr, bitstr_t *bitmap, int mode,
			uint16_t cr_type, enum node_cr_state job_node_req, 
			uint32_t cr_node_cnt,
			struct part_res_record *cr_part_ptr,
			struct node_use_record *node_usage)
{
	static int gang_mode = -1;
	int error_code = SLURM_SUCCESS;
	bitstr_t *orig_map, *avail_cores, *free_cores;
	bitstr_t *tmpcore = NULL;
	bool test_only;
	uint32_t c, i, j, k, n, csize, save_mem = 0;
	job_resources_t *job_res;
	struct job_details *details_ptr;
	struct part_res_record *p_ptr, *jp_ptr;
	uint16_t *cpu_count;

	if (gang_mode == -1) {
		if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
			gang_mode = 1;
		else
			gang_mode = 0;
	}

	details_ptr = job_ptr->details;

	free_job_resources(&job_ptr->job_resrcs);

	if (mode == SELECT_MODE_TEST_ONLY)
		test_only = true;
	else	/* SELECT_MODE_RUN_NOW || SELECT_MODE_WILL_RUN  */
		test_only = false;

	/* check node_state and update the node bitmap as necessary */
	if (!test_only) {
		error_code = _verify_node_state(cr_part_ptr, job_ptr,
						bitmap, cr_type, node_usage,
						job_node_req);
		if (error_code != SLURM_SUCCESS)
			return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: evaluating job %u on %u nodes",
		     job_ptr->job_id, bit_set_count(bitmap));
	}

	orig_map = bit_copy(bitmap);
	avail_cores = _make_core_bitmap(bitmap);

	/* test to make sure that this job can succeed with all avail_cores
	 * if 'no' then return FAIL
	 * if 'yes' then we will seek the optimal placement for this job
	 *          within avail_cores
	 */
	free_cores = bit_copy(avail_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count == NULL) {
		/* job cannot fit */
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 0 fail: "
			     "insufficient resources");
		}
		return SLURM_ERROR;
	} else if (test_only) {
		FREE_NULL_BITMAP(orig_map);
		FREE_NULL_BITMAP(free_cores);
		FREE_NULL_BITMAP(avail_cores);
		xfree(cpu_count);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
			info("select/serial: cr_job_test: test 0 pass: "******"test_only");
		return SLURM_SUCCESS;
	}
	if (cr_type == CR_MEMORY) {
		/* CR_MEMORY does not care about existing CPU allocations,
		 * so we can jump right to job allocation from here */
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 0 pass - "
		     "job fits on given resources");
	}

	/* now that we know that this job can run with the given resources,
	 * let's factor in the existing allocations and seek the optimal set
	 * of resources for this job. Here is the procedure:
	 *
	 * Step 1: Seek idle CPUs across all partitions. If successful then
	 *         place job and exit. If not successful, then continue. Two
	 *         related items to note:
	 *          1. Jobs that don't share CPUs finish with step 1.
	 *          2. The remaining steps assume sharing or preemption.
	 *
	 * Step 2: Remove resources that are in use by higher-priority
	 *         partitions, and test that job can still succeed. If not
	 *         then exit.
	 *
	 * Step 3: Seek idle nodes among the partitions with the same
	 *         priority as the job's partition. If successful then
	 *         goto Step 6. If not then continue:
	 *
	 * Step 4: Seek placement within the job's partition. Search
	 *         row-by-row. If no placement if found, then exit. If a row
	 *         is found, then continue:
	 *
	 * Step 5: Place job and exit. FIXME! Here is where we need a
	 *         placement algorithm that recognizes existing job
	 *         boundaries and tries to "overlap jobs" as efficiently
	 *         as possible.
	 *
	 * Step 6: Place job and exit. FIXME! here is we use a placement
	 *         algorithm similar to Step 5 on jobs from lower-priority
	 *         partitions.
	 */


	/*** Step 1 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove all existing allocations from free_cores */
	tmpcore = bit_copy(free_cores);
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* job fits! We're done. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 pass - "
			     "idle resources found");
		}
		goto alloc_job;
	}

	if ((gang_mode == 0) && (job_node_req == NODE_CR_ONE_ROW)) {
		/* This job CANNOT share CPUs regardless of priority,
		 * so we fail here. Note that Shared=EXCLUSIVE was already
		 * addressed in _verify_node_state() and job preemption
		 * removes jobs from simulated resource allocation map
		 * before this point. */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 1 fail - "
			     "no idle resources available");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 1 fail - "
		     "not enough idle resources");
	}

	/*** Step 2 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	for (jp_ptr = cr_part_ptr; jp_ptr; jp_ptr = jp_ptr->next) {
		if (jp_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!jp_ptr) {
		fatal("select/serial: could not find partition for job %u",
		      job_ptr->job_id);
		return SLURM_ERROR;	/* Fix CLANG false positive */
	}

	/* remove existing allocations (jobs) from higher-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if ((p_ptr->part_ptr->priority <= jp_ptr->part_ptr->priority) &&
		    (p_ptr->part_ptr->preempt_mode != PREEMPT_MODE_OFF))
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	/* make these changes permanent */
	bit_copybits(avail_cores, free_cores);
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (!cpu_count) {
		/* job needs resources that are currently in use by
		 * higher-priority jobs, so fail for now */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 2 fail - "
			     "resources busy with higher priority jobs");
		}
		goto alloc_job;
	}
	xfree(cpu_count);
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 2 pass - "
		     "available resources for this priority");
	}

	/*** Step 3 ***/
	bit_copybits(bitmap, orig_map);
	bit_copybits(free_cores, avail_cores);

	/* remove existing allocations (jobs) from same-priority partitions
	 * from avail_cores */
	for (p_ptr = cr_part_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr->priority != jp_ptr->part_ptr->priority)
			continue;
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			bit_copybits(tmpcore, p_ptr->row[i].row_bitmap);
			bit_not(tmpcore); /* set bits now "free" resources */
			bit_and(free_cores, tmpcore);
		}
	}
	cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt, free_cores,
				  node_usage, cr_type, test_only);
	if (cpu_count) {
		/* jobs from low-priority partitions are the only thing left
		 * in our way. for now we'll ignore them, but FIXME: we need
		 * a good placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and the low-priority
		 * jobs */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 3 pass - "
			     "found resources");
		}
		goto alloc_job;
	}
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: test 3 fail - "
		     "not enough idle resources in same priority");
	}


	/*** Step 4 ***/
	/* try to fit the job into an existing row
	 *
	 * tmpcore = worker core_bitmap
	 * free_cores = core_bitmap to be built
	 * avail_cores = static core_bitmap of all available cores
	 */

	if (!jp_ptr || !jp_ptr->row) {
		/* there's no existing jobs in this partition, so place
		 * the job in avail_cores. FIXME: still need a good
		 * placement algorithm here that optimizes "job overlap"
		 * between this job (in these idle nodes) and existing
		 * jobs in the other partitions with <= priority to
		 * this partition */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 pass - "
			     "first row found");
		}
		goto alloc_job;
	}

	cr_sort_part_rows(jp_ptr);
	c = jp_ptr->num_rows;
	if (job_node_req != NODE_CR_AVAILABLE)
		c = 1;
	for (i = 0; i < c; i++) {
		if (!jp_ptr->row[i].row_bitmap)
			break;
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		bit_copybits(tmpcore, jp_ptr->row[i].row_bitmap);
		bit_not(tmpcore);
		bit_and(free_cores, tmpcore);
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
		if (cpu_count) {
			if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
				info("select/serial: cr_job_test: "
				     "test 4 pass - row %i", i);
			}
			break;
		}
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 fail - row %i", i);
		}
	}

	if ((i < c) && !jp_ptr->row[i].row_bitmap) {
		/* we've found an empty row, so use it */
		bit_copybits(bitmap, orig_map);
		bit_copybits(free_cores, avail_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: "
			     "test 4 trying empty row %i",i);
		}
		cpu_count = _select_nodes(job_ptr, bitmap, cr_node_cnt,
					  free_cores, node_usage, cr_type,
					  test_only);
	}

	if (!cpu_count) {
		/* job can't fit into any row, so exit */
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: cr_job_test: test 4 fail - "
			     "busy partition");
		}
		goto alloc_job;

	}

	/*** CONSTRUCTION ZONE FOR STEPs 5 AND 6 ***
	 * Note that while the job may have fit into a row, it should
	 * still be run through a good placement algorithm here that
	 * optimizes "job overlap" between this job (in these idle nodes)
	 * and existing jobs in the other partitions with <= priority to
	 * this partition */

alloc_job:
	/* at this point we've found a good set of
	 * bits to allocate to this job:
	 * - bitmap is the set of nodes to allocate
	 * - free_cores is the set of allocated cores
	 * - cpu_count is the number of cpus per allocated node
	 *
	 * Next steps are to cleanup the worker variables,
	 * create the job_resources struct,
	 * distribute the job on the bits, and exit
	 */
	FREE_NULL_BITMAP(orig_map);
	FREE_NULL_BITMAP(avail_cores);
	FREE_NULL_BITMAP(tmpcore);
	if (!cpu_count) {
		/* we were sent here to cleanup and exit */
		FREE_NULL_BITMAP(free_cores);
		if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
			info("select/serial: exiting cr_job_test with no "
			     "allocation");
		}
		return SLURM_ERROR;
	}

	/* At this point we have:
	 * - a bitmap of selected nodes
	 * - a free_cores bitmap of usable cores on each selected node
	 * - a per-alloc-node cpu_count array
	 */

	if ((mode != SELECT_MODE_WILL_RUN) && (job_ptr->part_ptr == NULL))
		error_code = EINVAL;
	if ((error_code == SLURM_SUCCESS) && (mode == SELECT_MODE_WILL_RUN))
		job_ptr->total_cpus = 1;
	if ((error_code != SLURM_SUCCESS) || (mode != SELECT_MODE_RUN_NOW)) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	n = bit_ffs(bitmap);
	if (n < 0) {
		FREE_NULL_BITMAP(free_cores);
		xfree(cpu_count);
		return error_code;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: distributing job %u",
		     job_ptr->job_id);
	}

	/** create the struct_job_res  **/
	job_res                   = create_job_resources();
	job_res->node_bitmap      = bit_copy(bitmap);
	job_res->nodes            = bitmap2node_name(bitmap);
	job_res->nhosts           = bit_set_count(bitmap);
	job_res->ncpus            = job_res->nhosts;
	if (job_ptr->details->ntasks_per_node)
		job_res->ncpus   *= details_ptr->ntasks_per_node;
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->min_cpus);
	job_res->ncpus            = MAX(job_res->ncpus,
					details_ptr->pn_min_cpus);
	job_res->node_req         = job_node_req;
	job_res->cpus             = cpu_count;
	job_res->cpus_used        = xmalloc(job_res->nhosts *
					    sizeof(uint16_t));
	job_res->memory_allocated = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));
	job_res->memory_used      = xmalloc(job_res->nhosts *
					    sizeof(uint32_t));

	/* store the hardware data for the selected nodes */
	error_code = build_job_resources(job_res, node_record_table_ptr,
					  select_fast_schedule);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_res);
		FREE_NULL_BITMAP(free_cores);
		return error_code;
	}

	c = 0;
	csize = bit_size(job_res->core_bitmap);
	j = cr_get_coremap_offset(n);
	k = cr_get_coremap_offset(n + 1);
	for (; j < k; j++, c++) {
		if (!bit_test(free_cores, j))
			continue;
		if (c >= csize)	{
			error("select/serial: cr_job_test "
			      "core_bitmap index error on node %s", 
			      select_node_record[n].node_ptr->name);
			drain_nodes(select_node_record[n].node_ptr->name,
				    "Bad core count", getuid());
			free_job_resources(&job_res);
			FREE_NULL_BITMAP(free_cores);
			return SLURM_ERROR;
		}
		bit_set(job_res->core_bitmap, c);
		break;
	}

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
		info("select/serial: cr_job_test: job %u ncpus %u cbits %u/%d "
		     "nbits %u", job_ptr->job_id,
		     job_res->ncpus, bit_set_count(free_cores), 1,
		     job_res->nhosts);
	}
	FREE_NULL_BITMAP(free_cores);

	/* distribute the tasks and clear any unused cores */
	job_ptr->job_resrcs = job_res;
	error_code = cr_dist(job_ptr, cr_type);
	if (error_code != SLURM_SUCCESS) {
		free_job_resources(&job_ptr->job_resrcs);
		return error_code;
	}

	/* translate job_res->cpus array into format with rep count */
	job_ptr->total_cpus = build_job_resources_cpu_array(job_res);

	if (!(cr_type & CR_MEMORY))
		return error_code;

	/* load memory allocated array */
	save_mem = details_ptr->pn_min_memory;
	if (save_mem & MEM_PER_CPU) {
		/* memory is per-cpu */
		save_mem &= (~MEM_PER_CPU);
		job_res->memory_allocated[0] = job_res->cpus[0] * save_mem;
	} else {
		/* memory is per-node */
		job_res->memory_allocated[0] = save_mem;
	}
	return error_code;
}
Exemple #27
0
static int _attempt_backfill(void)
{
	DEF_TIMERS;
	bool filter_root = false;
	List job_queue;
	job_queue_rec_t *job_queue_rec;
	slurmdb_qos_rec_t *qos_ptr = NULL;
	int i, j, node_space_recs;
	struct job_record *job_ptr;
	struct part_record *part_ptr;
	uint32_t end_time, end_reserve;
	uint32_t time_limit, comp_time_limit, orig_time_limit;
	uint32_t min_nodes, max_nodes, req_nodes;
	bitstr_t *avail_bitmap = NULL, *resv_bitmap = NULL;
	bitstr_t *exc_core_bitmap = NULL;
	time_t now, sched_start, later_start, start_res, resv_end;
	node_space_map_t *node_space;
	struct timeval bf_time1, bf_time2;
	int sched_timeout = 2, yield_sleep = 1;
	int rc = 0;
	int job_test_count = 0;
	uint32_t *uid = NULL, nuser = 0;
	uint16_t *njobs = NULL;
	bool already_counted;
	uint32_t reject_array_job_id = 0;

#ifdef HAVE_CRAY
	/*
	 * Run a Basil Inventory immediately before setting up the schedule
	 * plan, to avoid race conditions caused by ALPS node state change.
	 * Needs to be done with the node-state lock taken.
	 */
	START_TIMER;
	if (select_g_reconfigure()) {
		debug4("backfill: not scheduling due to ALPS");
		return SLURM_SUCCESS;
	}
	END_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: ALPS inventory completed, %s", TIME_STR);

	/* The Basil inventory can take a long time to complete. Process
	 * pending RPCs before starting the backfill scheduling logic */
	_yield_locks(1);
#endif

	START_TIMER;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		info("backfill: beginning");
	sched_start = now = time(NULL);

	if (slurm_get_root_filter())
		filter_root = true;

	job_queue = build_job_queue(true);
	if (list_count(job_queue) == 0) {
		debug("backfill: no jobs to backfill");
		list_destroy(job_queue);
		return 0;
	}

	gettimeofday(&bf_time1, NULL);

	slurmctld_diag_stats.bf_queue_len = list_count(job_queue);
	slurmctld_diag_stats.bf_queue_len_sum += slurmctld_diag_stats.
						 bf_queue_len;
	slurmctld_diag_stats.bf_last_depth = 0;
	slurmctld_diag_stats.bf_last_depth_try = 0;
	slurmctld_diag_stats.bf_when_last_cycle = now;
	bf_last_yields = 0;
	slurmctld_diag_stats.bf_active = 1;

	node_space = xmalloc(sizeof(node_space_map_t) *
			     (max_backfill_job_cnt + 3));
	node_space[0].begin_time = sched_start;
	node_space[0].end_time = sched_start + backfill_window;
	node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
	node_space[0].next = 0;
	node_space_recs = 1;
	if (debug_flags & DEBUG_FLAG_BACKFILL)
		_dump_node_space_table(node_space);

	if (max_backfill_job_per_user) {
		uid = xmalloc(BF_MAX_USERS * sizeof(uint32_t));
		njobs = xmalloc(BF_MAX_USERS * sizeof(uint16_t));
	}
	while ((job_queue_rec = (job_queue_rec_t *)
				list_pop_bottom(job_queue, sort_job_queue2))) {
		job_ptr  = job_queue_rec->job_ptr;
		orig_time_limit = job_ptr->time_limit;

		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks "
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 0;
			START_TIMER;
		}

		part_ptr = job_queue_rec->part_ptr;
		job_test_count++;

		xfree(job_queue_rec);
		if (!IS_JOB_PENDING(job_ptr))
			continue;	/* started in other partition */
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend for this job */
		if (job_ptr->array_task_id != (uint16_t) NO_VAL) {
			if (reject_array_job_id == job_ptr->array_job_id)
				continue;  /* already rejected array element */
			/* assume reject whole array for now, clear if OK */
			reject_array_job_id = job_ptr->array_job_id;
		}
		job_ptr->part_ptr = part_ptr;

		if (debug_flags & DEBUG_FLAG_BACKFILL)
			info("backfill test for job %u", job_ptr->job_id);

		slurmctld_diag_stats.bf_last_depth++;
		already_counted = false;

		if (max_backfill_job_per_user) {
			for (j = 0; j < nuser; j++) {
				if (job_ptr->user_id == uid[j]) {
					njobs[j]++;
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: user %u: "
						      "#jobs %u",
						      uid[j], njobs[j]);
					break;
				}
			}
			if (j == nuser) { /* user not found */
				if (nuser < BF_MAX_USERS) {
					uid[j] = job_ptr->user_id;
					njobs[j] = 1;
					nuser++;
				} else {
					error("backfill: too many users in "
					      "queue. Consider increasing "
					      "BF_MAX_USERS");
				}
				if (debug_flags & DEBUG_FLAG_BACKFILL)
					debug2("backfill: found new user %u. "
					       "Total #users now %u",
					       job_ptr->user_id, nuser);
			} else {
				if (njobs[j] > max_backfill_job_per_user) {
					/* skip job */
					if (debug_flags & DEBUG_FLAG_BACKFILL)
						debug("backfill: have already "
						      "checked %u jobs for "
						      "user %u; skipping "
						      "job %u",
						      max_backfill_job_per_user,
						      job_ptr->user_id,
						      job_ptr->job_id);
					continue;
				}
			}
		}

		if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
		    (part_ptr->node_bitmap == NULL))
		 	continue;
		if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && filter_root)
			continue;

		if ((!job_independent(job_ptr, 0)) ||
		    (license_job_test(job_ptr, time(NULL)) != SLURM_SUCCESS))
			continue;

		/* Determine minimum and maximum node counts */
		min_nodes = MAX(job_ptr->details->min_nodes,
				part_ptr->min_nodes);
		if (job_ptr->details->max_nodes == 0)
			max_nodes = part_ptr->max_nodes;
		else
			max_nodes = MIN(job_ptr->details->max_nodes,
					part_ptr->max_nodes);
		max_nodes = MIN(max_nodes, 500000);     /* prevent overflows */
		if (job_ptr->details->max_nodes)
			req_nodes = max_nodes;
		else
			req_nodes = min_nodes;
		if (min_nodes > max_nodes) {
			/* job's min_nodes exceeds partition's max_nodes */
			continue;
		}

		/* Determine job's expected completion time */
		if (job_ptr->time_limit == NO_VAL) {
			if (part_ptr->max_time == INFINITE)
				time_limit = 365 * 24 * 60; /* one year */
			else
				time_limit = part_ptr->max_time;
		} else {
			if (part_ptr->max_time == INFINITE)
				time_limit = job_ptr->time_limit;
			else
				time_limit = MIN(job_ptr->time_limit,
						 part_ptr->max_time);
		}
		comp_time_limit = time_limit;
		qos_ptr = job_ptr->qos_ptr;
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE) &&
		    slurm_get_preempt_mode())
			time_limit = job_ptr->time_limit = 1;
		else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
			time_limit = job_ptr->time_limit = job_ptr->time_min;

		/* Determine impact of any resource reservations */
		later_start = now;
 TRY_LATER:
		if ((time(NULL) - sched_start) >= sched_timeout) {
			uint32_t save_time_limit = job_ptr->time_limit;
			job_ptr->time_limit = orig_time_limit;
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				END_TIMER;
				info("backfill: completed yielding locks 2"
				     "after testing %d jobs, %s",
				     job_test_count, TIME_STR);
			}
			if (_yield_locks(yield_sleep) && !backfill_continue) {
				if (debug_flags & DEBUG_FLAG_BACKFILL) {
					info("backfill: system state changed, "
					     "breaking out after testing %d "
					     "jobs", job_test_count);
				}
				rc = 1;
				break;
			}
			job_ptr->time_limit = save_time_limit;
			/* Reset backfill scheduling timers, resume testing */
			sched_start = time(NULL);
			job_test_count = 1;
			START_TIMER;
		}

		FREE_NULL_BITMAP(avail_bitmap);
		FREE_NULL_BITMAP(exc_core_bitmap);
		start_res   = later_start;
		later_start = 0;
		j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap,
				  &exc_core_bitmap);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			continue;
		}
		if (start_res > now)
			end_time = (time_limit * 60) + start_res;
		else
			end_time = (time_limit * 60) + now;
		resv_end = find_resv_end(start_res);
		/* Identify usable nodes for this job */
		bit_and(avail_bitmap, part_ptr->node_bitmap);
		bit_and(avail_bitmap, up_node_bitmap);
		for (j=0; ; ) {
			if ((node_space[j].end_time > start_res) &&
			     node_space[j].next && (later_start == 0))
				later_start = node_space[j].end_time;
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				bit_and(avail_bitmap,
					node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
		if ((resv_end++) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}

		if (job_ptr->details->exc_node_bitmap) {
			bit_not(job_ptr->details->exc_node_bitmap);
			bit_and(avail_bitmap,
				job_ptr->details->exc_node_bitmap);
			bit_not(job_ptr->details->exc_node_bitmap);
		}

		/* Test if insufficient nodes remain OR
		 *	required nodes missing OR
		 *	nodes lack features */
		if ((bit_set_count(avail_bitmap) < min_nodes) ||
		    ((job_ptr->details->req_node_bitmap) &&
		     (!bit_super_set(job_ptr->details->req_node_bitmap,
				     avail_bitmap))) ||
		    (job_req_node_filter(job_ptr, avail_bitmap))) {
			if (later_start) {
				job_ptr->start_time = 0;
				goto TRY_LATER;
			}
			/* Job can not start until too far in the future */
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = sched_start + backfill_window;
			continue;
		}

		/* Identify nodes which are definitely off limits */
		FREE_NULL_BITMAP(resv_bitmap);
		resv_bitmap = bit_copy(avail_bitmap);
		bit_not(resv_bitmap);

		/* this is the time consuming operation */
		debug2("backfill: entering _try_sched for job %u.",
		       job_ptr->job_id);

		if (!already_counted) {
			slurmctld_diag_stats.bf_last_depth_try++;
			already_counted = true;
		}

		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);

		now = time(NULL);
		if (j != SLURM_SUCCESS) {
			job_ptr->time_limit = orig_time_limit;
			job_ptr->start_time = 0;
			continue;	/* not runable */
		}

		if (start_res > job_ptr->start_time) {
			job_ptr->start_time = start_res;
			last_job_update = now;
		}
		if (job_ptr->start_time <= now) {
			uint32_t save_time_limit = job_ptr->time_limit;
			int rc = _start_job(job_ptr, resv_bitmap);
			if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE)) {
				if (orig_time_limit == NO_VAL)
					job_ptr->time_limit = comp_time_limit;
				else
					job_ptr->time_limit = orig_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (job_ptr->time_limit * 60);
			} else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) {
				/* Set time limit as high as possible */
				job_ptr->time_limit = comp_time_limit;
				job_ptr->end_time = job_ptr->start_time +
						    (comp_time_limit * 60);
				_reset_job_time_limit(job_ptr, now,
						      node_space);
				time_limit = job_ptr->time_limit;
			} else {
				job_ptr->time_limit = orig_time_limit;
			}
			if (rc == ESLURM_ACCOUNTING_POLICY) {
				/* Unknown future start time, just skip job */
				job_ptr->start_time = 0;
				continue;
			} else if (rc != SLURM_SUCCESS) {
				/* Planned to start job, but something bad
				 * happended. */
				job_ptr->start_time = 0;
				break;
			} else {
				/* Started this job, move to next one */
				reject_array_job_id = 0;

				/* Update the database if job time limit
				 * changed and move to next job */
				if (save_time_limit != job_ptr->time_limit)
					jobacct_storage_g_job_start(acct_db_conn,
								    job_ptr);
				continue;
			}
		} else
			job_ptr->time_limit = orig_time_limit;

		if (later_start && (job_ptr->start_time > later_start)) {
			/* Try later when some nodes currently reserved for
			 * pending jobs are free */
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		if (job_ptr->start_time > (sched_start + backfill_window)) {
			/* Starts too far in the future to worry about */
			continue;
		}

		if (node_space_recs >= max_backfill_job_cnt) {
			/* Already have too many jobs to deal with */
			break;
		}

		end_reserve = job_ptr->start_time + (time_limit * 60);
		if (_test_resv_overlap(node_space, avail_bitmap,
				       job_ptr->start_time, end_reserve)) {
			/* This job overlaps with an existing reservation for
			 * job to be backfill scheduled, which the sched
			 * plugin does not know about. Try again later. */
			later_start = job_ptr->start_time;
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/*
		 * Add reservation to scheduling table if appropriate
		 */
		if (qos_ptr && (qos_ptr->flags & QOS_FLAG_NO_RESERVE))
			continue;
		reject_array_job_id = 0;
		bit_not(avail_bitmap);
		_add_reservation(job_ptr->start_time, end_reserve,
				 avail_bitmap, node_space, &node_space_recs);
		if (debug_flags & DEBUG_FLAG_BACKFILL)
			_dump_node_space_table(node_space);
	}
	xfree(uid);
	xfree(njobs);
	FREE_NULL_BITMAP(avail_bitmap);
	FREE_NULL_BITMAP(exc_core_bitmap);
	FREE_NULL_BITMAP(resv_bitmap);

	for (i=0; ; ) {
		FREE_NULL_BITMAP(node_space[i].avail_bitmap);
		if ((i = node_space[i].next) == 0)
			break;
	}
	xfree(node_space);
	list_destroy(job_queue);
	gettimeofday(&bf_time2, NULL);
	_do_diag_stats(&bf_time1, &bf_time2, yield_sleep);
	if (debug_flags & DEBUG_FLAG_BACKFILL) {
		END_TIMER;
		info("backfill: completed testing %d jobs, %s",
		     job_test_count, TIME_STR);
	}
	return rc;
}
Exemple #28
0
static void _do_log_msg (bitstr_t *b, log_f fn, const char *msg)
{
    char buf [65536];
    char *s = bit_set_count (b) == 1 ? "" : "s";
    (*fn) ("task%s %s: %s", s, bit_fmt (buf, sizeof(buf), b), msg);
}
/*
 * _task_layout_lllp_cyclic
 *
 * task_layout_lllp_cyclic creates a cyclic distribution at the
 * lowest level of logical processor which is either socket, core or
 * thread depending on the system architecture. The Cyclic algorithm
 * is the same as the Cyclic distribution performed in srun.
 *
 *  Distribution at the lllp:
 *  -m hostfile|block|cyclic:block|cyclic
 *
 * The first distribution "hostfile|block|cyclic" is computed
 * in srun. The second distribution "block|cyclic" is computed
 * locally by each slurmd.
 *
 * The input to the lllp distribution algorithms is the gids (tasks
 * ids) generated for the local node.
 *
 * The output is a mapping of the gids onto logical processors
 * (thread/core/socket) with is expressed cpu_bind masks.
 *
 * If a task asks for more than one CPU per task, put the tasks as
 * close as possible (fill core rather than going next socket for the
 * extra task)
 *
 */
static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
				    uint32_t node_id, bitstr_t ***masks_p)
{
	int last_taskcount = -1, taskcount = 0;
	uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
	uint16_t offset = 0, p = 0;
	int size, max_tasks = req->tasks_to_launch[(int)node_id];
	int max_cpus = max_tasks * req->cpus_per_task;
	bitstr_t *avail_map;
	bitstr_t **masks = NULL;
	int *socket_last_pu = NULL;
	int core_inx, pu_per_core, *core_tasks = NULL;

	info ("_task_layout_lllp_cyclic ");

	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
	if (!avail_map)
		return SLURM_ERROR;

	size = bit_set_count(avail_map);
	if (size < max_tasks) {
		error("task/affinity: only %d bits in avail_map for %d tasks!",
		      size, max_tasks);
		FREE_NULL_BITMAP(avail_map);
		return SLURM_ERROR;
	}
	if (size < max_cpus) {
		/* Possible result of overcommit */
		i = size / max_tasks;
		info("task/affinity: reset cpus_per_task from %d to %d",
		     req->cpus_per_task, i);
		req->cpus_per_task = i;
	}

	pu_per_core = hw_threads;
	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
	socket_last_pu = xmalloc(hw_sockets * sizeof(int));

	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
	masks = *masks_p;

	size = bit_size(avail_map);

	offset = hw_cores * hw_threads;
	s = 0;
	while (taskcount < max_tasks) {
		if (taskcount == last_taskcount)
			fatal("_task_layout_lllp_cyclic failure");
		last_taskcount = taskcount;
		for (i = 0; i < size; i++) {
			bool already_switched = false;
			uint16_t bit;
			uint16_t orig_s = s;

			while (socket_last_pu[s] >= offset) {
				/* Switch to the next socket we have
				 * ran out here. */

				/* This only happens if the slurmctld
				 * gave us an allocation that made a
				 * task split sockets.  Or if the
				 * entire allocation is on one socket.
				 */
				s = (s + 1) % hw_sockets;
				if (orig_s == s) {
					/* This should rarely happen,
					 * but is here for sanity sake.
					 */
					debug("allocation is full, "
					      "oversubscribing");
					memset(core_tasks, 0,
					       (sizeof(int) *
					        hw_sockets * hw_cores));
					memset(socket_last_pu, 0,
					       sizeof(hw_sockets
						      * sizeof(int)));
				}
			}

			bit = socket_last_pu[s] + (s * offset);

			/* In case hardware and config differ */
			bit %= size;

			/* set up for the next one */
			socket_last_pu[s]++;
			/* skip unrequested threads */
			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
				socket_last_pu[s] += hw_threads - 1;

			if (!bit_test(avail_map, bit))
				continue;

			core_inx = bit / pu_per_core;
			if ((req->ntasks_per_core != 0) &&
			    (core_tasks[core_inx] >= req->ntasks_per_core))
				continue;

			if (!masks[taskcount])
				masks[taskcount] =
					bit_alloc(conf->block_map_size);

			//info("setting %d %d", taskcount, bit);
			bit_set(masks[taskcount], bit);

			if (!already_switched &&
			    (((req->task_dist & SLURM_DIST_STATE_BASE) ==
			     SLURM_DIST_CYCLIC_CFULL) ||
			    ((req->task_dist & SLURM_DIST_STATE_BASE) ==
			     SLURM_DIST_BLOCK_CFULL))) {
				/* This means we are laying out cpus
				 * within a task cyclically as well. */
				s = (s + 1) % hw_sockets;
				already_switched = true;
			}

			if (++p < req->cpus_per_task)
				continue;

			core_tasks[core_inx]++;

			/* Binding to cores, skip remaining of the threads */
			if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
			    && ((req->cpu_bind_type & CPU_BIND_TO_CORES)
				|| (req->ntasks_per_core == 1))) {
				int threads_not_used;
				if (req->cpus_per_task < hw_threads)
					threads_not_used =
						hw_threads - req->cpus_per_task;
				else
					threads_not_used =
						req->cpus_per_task % hw_threads;
				socket_last_pu[s] += threads_not_used;
			}
			p = 0;

			if (!already_switched) {
				/* Now that we have finished a task, switch to
				 * the next socket. */
				s = (s + 1) % hw_sockets;
			}

			if (++taskcount >= max_tasks)
				break;
		}
	}

	/* last step: expand the masks to bind each task
	 * to the requested resource */
	_expand_masks(req->cpu_bind_type, max_tasks, masks,
		      hw_sockets, hw_cores, hw_threads, avail_map);
	FREE_NULL_BITMAP(avail_map);
	xfree(core_tasks);
	xfree(socket_last_pu);

	return SLURM_SUCCESS;
}
Exemple #30
0
int
main(int argc, char *argv[])
{
	note("Testing static decl");
	{
		bitstr_t bit_decl(bs, 65);
		/*bitstr_t *bsp = bs;*/

		bit_set(bs,9);
		bit_set(bs,14);
		TEST(bit_test(bs,9), "bit 9 set"); 
		TEST(!bit_test(bs,12), "bit 12 not set");
		TEST(bit_test(bs,14), "bit 14 set" );
		/*bit_free(bsp);*/	/* triggers TEST in bit_free - OK */
	}
	note("Testing basic vixie functions");
	{
		bitstr_t *bs = bit_alloc(16), *bs2;


		/*bit_set(bs, 42);*/ 	/* triggers TEST in bit_set - OK */
		bit_set(bs,9);
		bit_set(bs,14);
		TEST(bit_test(bs,9), "bit 9 set"); 
		TEST(!bit_test(bs,12), "bit 12 not set" );
		TEST(bit_test(bs,14), "bit 14 set");

		bs2 = bit_copy(bs);
		bit_fill_gaps(bs2);
		TEST(bit_ffs(bs2) == 9, "first bit set = 9 ");
		TEST(bit_fls(bs2) == 14, "last bit set = 14");
		TEST(bit_set_count(bs2) == 6, "bitstring");
		TEST(bit_test(bs2,12), "bitstring");
		TEST(bit_super_set(bs,bs2) == 1, "bitstring");
		TEST(bit_super_set(bs2,bs) == 0, "bitstring");

		bit_clear(bs,14);
		TEST(!bit_test(bs,14), "bitstring");

		bit_nclear(bs,9,14);
		TEST(!bit_test(bs,9), "bitstring");
		TEST(!bit_test(bs,12), "bitstring");
		TEST(!bit_test(bs,14), "bitstring");

		bit_nset(bs,9,14);
		TEST(bit_test(bs,9), "bitstring");
		TEST(bit_test(bs,12), "bitstring");
		TEST(bit_test(bs,14), "bitstring");

		TEST(bit_ffs(bs) == 9, "ffs");
		TEST(bit_ffc(bs) == 0, "ffc");
		bit_nset(bs,0,8);
		TEST(bit_ffc(bs) == 15, "ffc");

		bit_free(bs);
		/*bit_set(bs,9); */	/* triggers TEST in bit_set - OK */
	}
	note("Testing and/or/not");
	{
		bitstr_t *bs1 = bit_alloc(128);
		bitstr_t *bs2 = bit_alloc(128);

		bit_set(bs1, 100);
		bit_set(bs1, 104);
		bit_set(bs2, 100);

		bit_and(bs1, bs2);
		TEST(bit_test(bs1, 100), "and");
		TEST(!bit_test(bs1, 104), "and");

		bit_set(bs2, 110);
		bit_set(bs2, 111);
		bit_set(bs2, 112);
		bit_or(bs1, bs2);
		TEST(bit_test(bs1, 100), "or");
		TEST(bit_test(bs1, 110), "or");
		TEST(bit_test(bs1, 111), "or");
		TEST(bit_test(bs1, 112), "or");

		bit_not(bs1);
		TEST(!bit_test(bs1, 100), "not");
		TEST(bit_test(bs1, 12), "not");

		bit_free(bs1);
		bit_free(bs2);
	}

	note("testing bit selection");
	{
		bitstr_t *bs1 = bit_alloc(128), *bs2;
		bit_set(bs1, 21);
		bit_set(bs1, 100);
		bit_fill_gaps(bs1);
		bs2 = bit_pick_cnt(bs1,20);

		if (bs2) {
			TEST(bit_set_count(bs2) == 20, "pick");
			TEST(bit_ffs(bs2) == 21, "pick");
			TEST(bit_fls(bs2) == 40, "pick");
			bit_free(bs2);
		}
		else
			TEST(0, "alloc fail");

		bit_free(bs1);
	}
	note("Testing realloc");
	{
		bitstr_t *bs = bit_alloc(1);

		TEST(bit_ffs(bs) == -1, "bitstring");
		bit_set(bs,0);
		/*bit_set(bs, 1000);*/	/* triggers TEST in bit_set - OK */
		bs = bit_realloc(bs,1048576);
		bit_set(bs,1000);
		bit_set(bs,1048575);
		TEST(bit_test(bs, 0), "bitstring");
		TEST(bit_test(bs, 1000), "bitstring");
		TEST(bit_test(bs, 1048575), "bitstring");
		TEST(bit_set_count(bs) == 3, "bitstring");
		bit_clear(bs,0);
		bit_clear(bs,1000);
		TEST(bit_set_count(bs) == 1, "bitstring");
		TEST(bit_ffs(bs) == 1048575, "bitstring");
		bit_free(bs);
	}
	note("Testing bit_fmt");
	{
		char tmpstr[1024];
		bitstr_t *bs = bit_alloc(1024);

		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), ""), "bitstring");
		bit_set(bs,42);
		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42"), "bitstring");
		bit_set(bs,102);
		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr),bs), "42,102"), "bitstring");
		bit_nset(bs,9,14);
		TEST(!strcmp(bit_fmt(tmpstr,sizeof(tmpstr), bs), 
					"9-14,42,102"), "bitstring");
	}

	note("Testing bit_nffc/bit_nffs");
	{
		bitstr_t *bs = bit_alloc(1024);

		bit_set(bs, 2);
		bit_set(bs, 6);
		bit_set(bs, 7);
		bit_nset(bs,12,1018); 

		TEST(bit_nffc(bs, 2) == 0, "bitstring");
		TEST(bit_nffc(bs, 3) == 3, "bitstring");
		TEST(bit_nffc(bs, 4) == 8, "bitstring");
		TEST(bit_nffc(bs, 5) == 1019, "bitstring");
		TEST(bit_nffc(bs, 6) == -1, "bitstring");

		TEST(bit_nffs(bs, 1) == 2, "bitstring");
		TEST(bit_nffs(bs, 2) == 6, "bitstring");
		TEST(bit_nffs(bs, 100) == 12, "bitstring");
		TEST(bit_nffs(bs, 1023) == -1, "bitstring");

		bit_free(bs);
	}

	note("Testing bit_unfmt");
	{
		bitstr_t *bs = bit_alloc(1024);
		bitstr_t *bs2 = bit_alloc(1024);
		char tmpstr[4096];

		bit_set(bs,1);
		bit_set(bs,3);
		bit_set(bs,30);
		bit_nset(bs,42,64);
		bit_nset(bs,97,1000);

		bit_fmt(tmpstr, sizeof(tmpstr), bs);
		TEST(bit_unfmt(bs2, tmpstr) != -1, "bitstring");
		TEST(bit_equal(bs, bs2), "bitstring");
	}

	totals();
	return failed;
}