Example no. 1
File: gang.c Project: corburn/slurm
/* Return 1 if job fits in this row, else return 0 */
static int _job_fits_in_active_row(struct job_record *job_ptr,
				   struct gs_part *p_ptr)
{
	job_resources_t *job_res = job_ptr->job_resrcs;
	int count;
	bitstr_t *job_map;
	uint16_t job_gr_type;

	if ((p_ptr->active_resmap == NULL) || (p_ptr->jobs_active == 0))
		return 1;

	job_gr_type = _get_part_gr_type(job_ptr->part_ptr);
	if ((job_gr_type == GS_CPU2) || (job_gr_type == GS_CORE) ||
	    (job_gr_type == GS_SOCKET)) {
		return job_fits_into_cores(job_res, p_ptr->active_resmap,
					   gs_bits_per_node);
	}

	/* job_gr_type == GS_NODE || job_gr_type == GS_CPU */
	job_map = bit_copy(job_res->node_bitmap);
	bit_and(job_map, p_ptr->active_resmap);
	/* any set bits indicate contention for the same resource */
	count = bit_set_count(job_map);
	if (slurmctld_conf.debug_flags & DEBUG_FLAG_GANG)
		info("gang: _job_fits_in_active_row: %d bits conflict", count);
	FREE_NULL_BITMAP(job_map);
	if (count == 0)
		return 1;
	if (job_gr_type == GS_CPU) {
		/* For GS_CPU we check the CPU arrays */
		return _can_cpus_fit(job_ptr, p_ptr);
	}

	return 0;
}
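Every example on this page releases bitmaps through FREE_NULL_BITMAP rather than calling bit_free() directly, so the pointer is also reset and a second free becomes harmless. A minimal sketch of such a free-and-NULL macro follows; the real definition in Slurm's common bitstring code may differ in detail.

/* Sketch only: free a bitstr_t and NULL the pointer in one step,
 * so a later FREE_NULL_BITMAP() on the same variable is a no-op. */
#define FREE_NULL_BITMAP(_X)			\
	do {					\
		if (_X)				\
			bit_free(_X);		\
		_X = NULL;			\
	} while (0)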
Example no. 2
/* Reset the node_bitmap in a job_resources data structure.
 * This is needed after a restart/reconfiguration, since nodes can
 * be added to or removed from the system, changing the bitmap size
 * or bit positions. */
extern int reset_node_bitmap(job_resources_t *job_resrcs_ptr, uint32_t job_id)
{
	int i;

	if (!job_resrcs_ptr)
		return SLURM_SUCCESS;

	if (job_resrcs_ptr->node_bitmap)
		FREE_NULL_BITMAP(job_resrcs_ptr->node_bitmap);

	if (job_resrcs_ptr->nodes &&
	    (node_name2bitmap(job_resrcs_ptr->nodes, false,
			      &job_resrcs_ptr->node_bitmap))) {
		error("Invalid nodes (%s) for job_id %u",
		      job_resrcs_ptr->nodes, job_id);
		return SLURM_ERROR;
	} else if (job_resrcs_ptr->nodes == NULL) {
		job_resrcs_ptr->node_bitmap = bit_alloc(node_record_count);
	}

	i = bit_set_count(job_resrcs_ptr->node_bitmap);
	if (job_resrcs_ptr->nhosts != i) {
		error("Invalid change in resource allocation node count for "
		      "job %u, %u to %d", job_id, job_resrcs_ptr->nhosts, i);
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
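A hedged call-site sketch: after slurmctld restores saved job state, each job's bitmap would be rebuilt roughly like this (job_ptr and its fields are assumed here for illustration).

/* Hypothetical call site, matching the signature shown above */
if (reset_node_bitmap(job_ptr->job_resrcs, job_ptr->job_id) != SLURM_SUCCESS)
	error("job %u: could not rebuild node bitmap", job_ptr->job_id);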
Example no. 3
int powercap_get_job_optimal_cpufreq(uint32_t powercap, int *allowed_freqs)
{
	uint32_t cur_max_watts = 0, *tmp_max_watts_dvfs = NULL;
	int k = 1;
	bitstr_t *tmp_bitmap = NULL;

	if (!_powercap_enabled())
		return 0;

	tmp_max_watts_dvfs = xmalloc(sizeof(uint32_t) * (allowed_freqs[0]+1));
	tmp_bitmap = bit_copy(idle_node_bitmap);
	bit_not(tmp_bitmap);

	cur_max_watts = powercap_get_node_bitmap_maxwatts_dvfs(tmp_bitmap,
				idle_node_bitmap, tmp_max_watts_dvfs,
				allowed_freqs, 0);
	FREE_NULL_BITMAP(tmp_bitmap);

	if (cur_max_watts > powercap) {
		/* check the bound first to avoid reading one element
		 * past the end of tmp_max_watts_dvfs[] */
		while ((k < allowed_freqs[0] + 1) &&
		       (tmp_max_watts_dvfs[k] > powercap)) {
			k++;
		}
		if (k == allowed_freqs[0] + 1)
			k--;
	} else {
		k = 1;
	}
	xfree(tmp_max_watts_dvfs);

	return k;
}
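A worked illustration of the selection loop, under the assumption (not stated in the code) that tmp_max_watts_dvfs[1..allowed_freqs[0]] holds the projected power draw at each allowed frequency, highest frequency first:

/* Illustration with made-up values: powercap = 250, allowed_freqs[0] = 4,
 * tmp_max_watts_dvfs[] = { 300, 290, 270, 240, 200 }, cur_max_watts above
 * the cap.  The loop advances k past 290 and 270 and the function returns
 * k = 3, the first frequency index whose projected draw fits under the cap. */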
Example no. 4
/* If slurmctld crashes, the node state that it recovers could differ
 * from the actual hardware state (e.g. ResumeProgram failed to complete).
 * To address that, when a node that should be powered up for a running
 * job is not responding, run ResumeProgram on it again. */
static void _re_wake(void)
{
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	int i;

	node_ptr = node_record_table_ptr;
	for (i=0; i<node_record_count; i++, node_ptr++) {
		if (IS_NODE_ALLOCATED(node_ptr)   &&
		    IS_NODE_NO_RESPOND(node_ptr)  &&
		    !IS_NODE_POWER_SAVE(node_ptr) &&
		    (bit_test(suspend_node_bitmap, i) == 0) &&
		    (bit_test(resume_node_bitmap,  i) == 0)) {
			if (wake_node_bitmap == NULL) {
				wake_node_bitmap =
					bit_alloc(node_record_count);
			}
			bit_set(wake_node_bitmap, i);
		}
	}

	if (wake_node_bitmap) {
		char *nodes;
		nodes = bitmap2node_name(wake_node_bitmap);
		if (nodes) {
			pid_t pid = _run_prog(resume_prog, nodes, NULL);
			info("power_save: pid %d rewaking nodes %s",
			     (int) pid, nodes);
		} else
			error("power_save: bitmap2nodename");
		xfree(nodes);
		FREE_NULL_BITMAP(wake_node_bitmap);
	}
}
Example no. 5
/*
 * _list_delete_part - delete an entry from the global partition list,
 *	see common/list.h for documentation
 * global: node_record_count - count of nodes in the system
 *         node_record_table_ptr - pointer to global node table
 */
static void _list_delete_part(void *part_entry)
{
	struct part_record *part_ptr;
	struct node_record *node_ptr;
	int i, j, k;

	part_ptr = (struct part_record *) part_entry;
	node_ptr = &node_record_table_ptr[0];
	for (i = 0; i < node_record_count; i++, node_ptr++) {
		for (j=0; j<node_ptr->part_cnt; j++) {
			if (node_ptr->part_pptr[j] != part_ptr)
				continue;
			node_ptr->part_cnt--;
			for (k=j; k<node_ptr->part_cnt; k++) {
				node_ptr->part_pptr[k] =
					node_ptr->part_pptr[k+1];
			}
			break;
		}
	}

	xfree(part_ptr->allow_alloc_nodes);
	xfree(part_ptr->allow_groups);
	xfree(part_ptr->allow_uids);
	xfree(part_ptr->alternate);
	xfree(part_ptr->name);
	xfree(part_ptr->nodes);
	FREE_NULL_BITMAP(part_ptr->node_bitmap);
	xfree(part_entry);
}
Example no. 6
void task_state_print (task_state_t ts, log_f fn)
{
    bitstr_t *unseen;

    if (!ts)	/* Not built yet */
        return;

    unseen = bit_alloc (ts->n_tasks);
    if (bit_set_count (ts->start_failed)) {
        _do_log_msg (ts->start_failed, fn, "failed to start");
        bit_or (unseen, ts->start_failed);
    }
    if (bit_set_count (ts->running)) {
        _do_log_msg (ts->running, fn, "running");
        bit_or (unseen, ts->running);
    }
    if (bit_set_count (ts->abnormal_exit)) {
        _do_log_msg (ts->abnormal_exit, fn, "exited abnormally");
        bit_or (unseen, ts->abnormal_exit);
    }
    if (bit_set_count (ts->normal_exit)) {
        _do_log_msg (ts->normal_exit, fn, "exited");
        bit_or (unseen, ts->normal_exit);
    }
    bit_not (unseen);
    if (bit_set_count (unseen))
        _do_log_msg (unseen, fn, "unknown");
    FREE_NULL_BITMAP(unseen);
}
Example no. 7
/*
 * hostlist2bitmap - given a hostlist, build a bitmap representation
 * IN hl          - hostlist
 * IN best_effort - if set don't return an error on invalid node name entries
 * OUT bitmap     - set to bitmap, may not have all bits set on error
 * RET 0 if no error, otherwise EINVAL
 */
extern int hostlist2bitmap (hostlist_t hl, bool best_effort, bitstr_t **bitmap)
{
	int rc = SLURM_SUCCESS;
	bitstr_t *my_bitmap;
	char *name;
	hostlist_iterator_t hi;

	FREE_NULL_BITMAP(*bitmap);
	my_bitmap = (bitstr_t *) bit_alloc (node_record_count);
	*bitmap = my_bitmap;

	hi = hostlist_iterator_create(hl);
	while ((name = hostlist_next(hi)) != NULL) {
		struct node_record *node_ptr;
		node_ptr = _find_node_record(name, best_effort, true);
		if (node_ptr) {
			bit_set (my_bitmap, (bitoff_t) (node_ptr -
							node_record_table_ptr));
		} else {
			error ("hostlist2bitmap: invalid node specified %s",
			       name);
			if (!best_effort)
				rc = EINVAL;
		}
		free (name);
	}

	hostlist_iterator_destroy(hi);
	return rc;
}
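A usage sketch, assuming the node table is already populated; the host range string is invented for illustration.

hostlist_t hl = hostlist_create("tux[0-3]");
bitstr_t *bitmap = NULL;
char buf[64];

if (hostlist2bitmap(hl, true, &bitmap) == SLURM_SUCCESS) {
	bit_fmt(buf, sizeof(buf), bitmap);	/* e.g. "0-3" */
	info("nodes map to bit indexes %s", buf);
}
FREE_NULL_BITMAP(bitmap);
hostlist_destroy(hl);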
Example no. 8
/* Free all allocated memory */
static void _clear_power_config(void)
{
	xfree(suspend_prog);
	xfree(resume_prog);
	xfree(exc_nodes);
	xfree(exc_parts);
	FREE_NULL_BITMAP(exc_node_bitmap);
}
Example no. 9
static void _destroy_bitmap(void *object)
{
	bitstr_t *bitstr = (bitstr_t *)object;

	if (bitstr) {
		FREE_NULL_BITMAP(bitstr);
	}
}
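Destructors of this shape are normally passed to list_create() so that destroying the list also frees every bitmap it holds; a brief sketch under that assumption.

/* Sketch: the list takes ownership of the bitmaps appended to it */
List bitmap_list = list_create(_destroy_bitmap);
list_append(bitmap_list, bit_alloc(node_record_count));
list_destroy(bitmap_list);	/* runs _destroy_bitmap on each element */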
Example no. 10
/* _list_delete_feature - delete an entry from the feature list,
 *	see list.h for documentation */
static void _list_delete_feature (void *feature_entry)
{
	node_feature_t *feature_ptr = (node_feature_t *) feature_entry;

	xassert(feature_ptr);
	xassert(feature_ptr->magic == FEATURE_MAGIC);
	xfree (feature_ptr->name);
	FREE_NULL_BITMAP (feature_ptr->node_bitmap);
	xfree (feature_ptr);
}
Example no. 11
static void _destroy_local_cluster(void *object)
{
	local_cluster_t *local_cluster = (local_cluster_t *)object;
	if (local_cluster) {
		if (local_cluster->hl)
			hostlist_destroy(local_cluster->hl);
		FREE_NULL_BITMAP(local_cluster->asked_bitmap);
		xfree(local_cluster);
	}
}
Example no. 12
static void _free_node_subgrp(void *object)
{
	node_subgrp_t *subgrp = (node_subgrp_t *)object;
	if (subgrp) {
		FREE_NULL_BITMAP(subgrp->bitmap);
		xfree(subgrp->str);
		xfree(subgrp->inx);
		xfree(subgrp);
	}
}
Example no. 13
int fini(void)
{

#ifdef HAVE_NATIVE_CRAY
    pthread_mutex_lock(&port_mutex);
    FREE_NULL_BITMAP(port_resv);
    pthread_mutex_unlock(&port_mutex);
#endif

    return SLURM_SUCCESS;
}
Example no. 14
static void _delete_gres_list(void *x)
{
	gres_slurmd_conf_t *p = (gres_slurmd_conf_t *) x;
	xfree(p->cpus);
	FREE_NULL_BITMAP(p->cpus_bitmap);
	xfree(p->file);
	xfree(p->links);
	xfree(p->name);
	xfree(p->type_name);
	xfree(p);
}
Example no. 15
static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks)
{
    	int i;
	bitstr_t *bitmask;

	for (i = 0; i < maxtasks; i++) {
		bitmask = masks[i];
		FREE_NULL_BITMAP(bitmask);
	}
	xfree(masks);
}
Example no. 16
int slurm_job_cpus_allocated_str_on_node_id(char *cpus,
					    size_t cpus_len,
					    job_resources_t *job_resrcs_ptr,
					    int node_id)
{
	uint32_t threads = 1;
	int inx = 0;
	bitstr_t *cpu_bitmap;
	int j, k, bit_inx, bit_reps, hi;

	if (!job_resrcs_ptr || node_id < 0)
		slurm_seterrno_ret(EINVAL);

	/* find the index into sock_core_rep_count[] and the first
	 * core bit index for this node id */
	bit_inx = 0;
	hi = node_id + 1;    /* change from 0-origin to 1-origin */
	for (inx = 0; hi; inx++) {
		if (hi > job_resrcs_ptr->sock_core_rep_count[inx]) {
			bit_inx += job_resrcs_ptr->sockets_per_node[inx] *
				   job_resrcs_ptr->cores_per_socket[inx] *
				   job_resrcs_ptr->sock_core_rep_count[inx];
			hi -= job_resrcs_ptr->sock_core_rep_count[inx];
		} else {
			bit_inx += job_resrcs_ptr->sockets_per_node[inx] *
				   job_resrcs_ptr->cores_per_socket[inx] *
				   (hi - 1);
			break;
		}
	}

	bit_reps = job_resrcs_ptr->sockets_per_node[inx] *
		   job_resrcs_ptr->cores_per_socket[inx];

	/* get the number of threads per core on this node
	 */
	if (job_node_ptr)
		threads = job_node_ptr->node_array[node_id].threads;
	cpu_bitmap = bit_alloc(bit_reps * threads);
	for (j = 0; j < bit_reps; j++) {
		if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx)){
			for (k = 0; k < threads; k++)
				bit_set(cpu_bitmap,
					(j * threads) + k);
		}
		bit_inx++;
	}
	bit_fmt(cpus, cpus_len, cpu_bitmap);
	FREE_NULL_BITMAP(cpu_bitmap);

	return SLURM_SUCCESS;
}
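A hedged caller sketch; job_resrcs is assumed to come from a job resources/allocation response.

char cpu_str[128];

if (slurm_job_cpus_allocated_str_on_node_id(cpu_str, sizeof(cpu_str),
					    job_resrcs, 0) == SLURM_SUCCESS)
	printf("CPU ids allocated on node 0: %s\n", cpu_str);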
Example no. 17
/* Configure reserved ports.
 * Call with mpi_params==NULL to free memory */
extern int reserve_port_config(char *mpi_params)
{
	char *tmp_e=NULL, *tmp_p=NULL;
	int i, p_min, p_max;

	if (mpi_params)
		tmp_p = strstr(mpi_params, "ports=");
	if (tmp_p == NULL) {
		if (port_resv_table) {
			info("Clearing port reservations");
			for (i=0; i<port_resv_cnt; i++)
				FREE_NULL_BITMAP(port_resv_table[i]);
			xfree(port_resv_table);
			port_resv_cnt = 0;
			port_resv_min = port_resv_max = 0;
		}
		return SLURM_SUCCESS;
	}

	tmp_p += 6;
	p_min = strtol(tmp_p, &tmp_e, 10);
	if ((p_min < 1) || (tmp_e[0] != '-')) {
		info("invalid MpiParams: %s", mpi_params);
		return SLURM_ERROR;
	}
	tmp_e++;
	p_max = strtol(tmp_e, NULL, 10);
	if (p_max < p_min) {
		info("invalid MpiParams: %s", mpi_params);
		return SLURM_ERROR;
	}

	if ((p_min == port_resv_min) && (p_max == port_resv_max)) {
		_dump_resv_port_info();
		return SLURM_SUCCESS;	/* No change */
	}

	port_resv_min = p_min;
	port_resv_max = p_max;
	port_resv_cnt = p_max - p_min + 1;
	debug("Ports available for reservation %u-%u",
	      port_resv_min, port_resv_max);

	xfree(port_resv_table);
	port_resv_table = xmalloc(sizeof(bitstr_t *) * port_resv_cnt);
	for (i=0; i<port_resv_cnt; i++)
		port_resv_table[i] = bit_alloc(node_record_count);

	_make_all_resv();
	_dump_resv_port_info();
	return SLURM_SUCCESS;
}
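Mirroring the header comment, a usage sketch with an invented port range.

if (reserve_port_config("ports=12000-12999") != SLURM_SUCCESS)
	error("invalid MpiParams port range");
/* ... on shutdown/reconfigure, release the reservation table ... */
reserve_port_config(NULL);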
Example no. 18
extern int good_nodes_from_inx(List local_cluster_list,
			       void **object, char *node_inx,
			       int start)
{
	local_cluster_t **curr_cluster = (local_cluster_t **)object;

	/* check the bitmap to see if this is one of the jobs
	   we are looking for */
	if (*curr_cluster) {
		bitstr_t *job_bitmap = NULL;
		if (!node_inx || !node_inx[0])
			return 0;
		if ((start < (*curr_cluster)->start)
		    || (start > (*curr_cluster)->end)) {
			local_cluster_t *local_cluster = NULL;

			ListIterator itr =
				list_iterator_create(local_cluster_list);
			while ((local_cluster = list_next(itr))) {
				if ((start >= local_cluster->start)
				    && (start <= local_cluster->end)) {
					*curr_cluster = local_cluster;
					break;
				}
			}
			list_iterator_destroy(itr);
			if (!local_cluster)
				return 0;
		}
		job_bitmap = bit_alloc(hostlist_count((*curr_cluster)->hl));
		bit_unfmt(job_bitmap, node_inx);
		if (!bit_overlap((*curr_cluster)->asked_bitmap, job_bitmap)) {
			FREE_NULL_BITMAP(job_bitmap);
			return 0;
		}
		FREE_NULL_BITMAP(job_bitmap);
	}
	return 1;
}
Example no. 19
extern void free_job_resources(job_resources_t **job_resrcs_pptr)
{
	job_resources_t *job_resrcs_ptr = *job_resrcs_pptr;

	if (job_resrcs_ptr) {
		FREE_NULL_BITMAP(job_resrcs_ptr->core_bitmap);
		FREE_NULL_BITMAP(job_resrcs_ptr->core_bitmap_used);
		xfree(job_resrcs_ptr->cores_per_socket);
		xfree(job_resrcs_ptr->cpu_array_reps);
		xfree(job_resrcs_ptr->cpu_array_value);
		xfree(job_resrcs_ptr->cpus);
		xfree(job_resrcs_ptr->cpus_used);
		xfree(job_resrcs_ptr->memory_allocated);
		xfree(job_resrcs_ptr->memory_used);
		FREE_NULL_BITMAP(job_resrcs_ptr->node_bitmap);
		xfree(job_resrcs_ptr->nodes);
		xfree(job_resrcs_ptr->sock_core_rep_count);
		xfree(job_resrcs_ptr->sockets_per_node);
		xfree(job_resrcs_ptr);
		*job_resrcs_pptr = NULL;
	}
}
Example no. 20
/* RET 0 on success, -1 on failure */
extern int	job_requeue_wiki(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr, *tmp_char;
	uint32_t jobid;
	struct job_record *job_ptr;
	static char reply_msg[128];
	int slurm_rc;
	/* Write lock on job and node info */
	slurmctld_lock_t job_write_lock = {
		NO_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK };

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "REQUEUEJOB lacks ARG";
		error("wiki: REQUEUEJOB lacks ARG");
		return -1;
	}
	jobid = strtoul(arg_ptr+4, &tmp_char, 10);
	if ((tmp_char[0] != '\0') && (!isspace(tmp_char[0]))) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: REQUEUEJOB has invalid jobid");
		return -1;
	}

	lock_slurmctld(job_write_lock);
	slurm_rc = job_requeue(0, jobid, NULL, false, 0);
	if (slurm_rc != SLURM_SUCCESS) {
		unlock_slurmctld(job_write_lock);
		*err_code = -700;
		*err_msg = slurm_strerror(slurm_rc);
		error("wiki: Failed to requeue job %u (%m)", jobid);
		return -1;
	}

	/* We need to clear the required node list here.
	 * If the job was submitted with srun and a
	 * required node list, it gets lost here. */
	job_ptr = find_job_record(jobid);
	if (job_ptr && job_ptr->details) {
		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
	}
	info("wiki: requeued job %u", jobid);
	unlock_slurmctld(job_write_lock);
	snprintf(reply_msg, sizeof(reply_msg),
		"job %u requeued successfully", jobid);
	*err_msg = reply_msg;
	return 0;
}
Example no. 21
/* _list_delete_config - delete an entry from the config list,
 *	see list.h for documentation */
static void _list_delete_config (void *config_entry)
{
	struct config_record *config_ptr = (struct config_record *)
					   config_entry;

	xassert(config_ptr);
	xassert(config_ptr->magic == CONFIG_MAGIC);
	xfree(config_ptr->feature);
	xfree(config_ptr->gres);
	build_config_feature_list(config_ptr);
	xfree (config_ptr->nodes);
	FREE_NULL_BITMAP (config_ptr->node_bitmap);
	xfree (config_ptr);
}
Example no. 22
File: gang.c Project: corburn/slurm
static void _destroy_parts(void *x)
{
	int i;
	struct gs_part *gs_part_ptr = (struct gs_part *) x;

	xfree(gs_part_ptr->part_name);
	for (i = 0; i < gs_part_ptr->num_jobs; i++)
		xfree(gs_part_ptr->job_list[i]);
	xfree(gs_part_ptr->shadow);
	FREE_NULL_BITMAP(gs_part_ptr->active_resmap);
	xfree(gs_part_ptr->active_cpus);
	xfree(gs_part_ptr->job_list);
	xfree(gs_part_ptr);
}
Example no. 23
/*
 * _lllp_map_abstract_masks
 *
 * Map an array of abstract block masks to physical machine masks
 *
 * IN- maximum number of tasks
 * IN/OUT- array of masks
 */
static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks)
{
    	int i;
	debug3("_lllp_map_abstract_masks");

	for (i = 0; i < maxtasks; i++) {
		bitstr_t *bitmask = masks[i];
	    	if (bitmask) {
			bitstr_t *newmask = _lllp_map_abstract_mask(bitmask);
			FREE_NULL_BITMAP(bitmask);
			masks[i] = newmask;
		}
	}
}
Example no. 24
/*
 * good_nodes_from_inx - whether node index is within the used nodes
 *   of specified cluster
 */
extern int
good_nodes_from_inx(cluster_nodes_t *cnodes, char *node_inx, int submit)
{
	bitstr_t *job_bitmap = NULL;

	if (! cnodes)
		return 1;

	if (!node_inx || !node_inx[0])
		return 0;

	if (!cnodes->curr_cluster ||
	   (submit < (cnodes->curr_cluster)->start) ||
	   (submit > (cnodes->curr_cluster)->end)) {
		local_cluster_t *local_cluster = NULL;
		ListIterator itr =
			list_iterator_create(cnodes->cluster_list);
		while((local_cluster = list_next(itr))) {
			if ((submit >= local_cluster->start)
			   && (submit <= local_cluster->end)) {
				cnodes->curr_cluster = local_cluster;
				break;
			}
		}
		list_iterator_destroy(itr);
		if (! local_cluster)
			return 0;
	}
	job_bitmap = bit_alloc(hostlist_count((cnodes->curr_cluster)->hl));
	bit_unfmt(job_bitmap, node_inx);
	if (!bit_overlap((cnodes->curr_cluster)->asked_bitmap, job_bitmap)) {
		FREE_NULL_BITMAP(job_bitmap);
		return 0;
	}
	FREE_NULL_BITMAP(job_bitmap);
	return 1;
}
Example no. 25
/* power_job_reboot - Reboot compute nodes for a job from the head node */
extern int power_job_reboot(struct job_record *job_ptr)
{
	int rc = SLURM_SUCCESS;
	int i, i_first, i_last;
	struct node_record *node_ptr;
	bitstr_t *wake_node_bitmap = NULL;
	time_t now = time(NULL);
	char *nodes, *features = NULL;

	wake_node_bitmap = bit_alloc(node_record_count);
	i_first = bit_ffs(job_ptr->node_bitmap);
	i_last = bit_fls(job_ptr->node_bitmap);
	for (i = i_first; i <= i_last; i++) {
		if (!bit_test(job_ptr->node_bitmap, i))
			continue;
		node_ptr = node_record_table_ptr + i;
		resume_cnt++;
		resume_cnt_f++;
		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
		node_ptr->node_state |=   NODE_STATE_POWER_UP;
		node_ptr->node_state |=   NODE_STATE_NO_RESPOND;
		bit_clear(power_node_bitmap, i);
		bit_clear(avail_node_bitmap, i);
		node_ptr->last_response = now + resume_timeout;
		bit_set(wake_node_bitmap,    i);
		bit_set(resume_node_bitmap,  i);
	}

	nodes = bitmap2node_name(wake_node_bitmap);
	if (nodes) {
#if _DEBUG
		info("power_save: reboot nodes %s", nodes);
#else
		verbose("power_save: reboot nodes %s", nodes);
#endif
		if (job_ptr->details && job_ptr->details->features)
			features = xlate_features(job_ptr->details->features);
		_run_prog(resume_prog, nodes, features);
		xfree(features);
	} else {
		error("power_save: bitmap2nodename");
		rc = SLURM_ERROR;
	}
	xfree(nodes);
	FREE_NULL_BITMAP(wake_node_bitmap);
	last_node_update = now;

	return rc;
}
Example no. 26
int fini(void)
{

#ifdef HAVE_NATIVE_CRAY
	pthread_mutex_lock(&port_mutex);
	FREE_NULL_BITMAP(port_resv);
	pthread_mutex_unlock(&port_mutex);
#endif

#if defined(HAVE_NATIVE_CRAY) || defined(HAVE_CRAY_NETWORK)
	cleanup_lease_extender();
#endif

	return SLURM_SUCCESS;
}
Example no. 27
/* Try to start the job on any non-reserved nodes */
static int _start_job(struct job_record *job_ptr, bitstr_t *resv_bitmap)
{
	int rc;
	bitstr_t *orig_exc_nodes = NULL;
	static uint32_t fail_jobid = 0;

	if (job_ptr->details->exc_node_bitmap) {
		orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap);
		bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap);
	} else
		job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap);

	rc = select_nodes(job_ptr, false, NULL);
	FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap);
	job_ptr->details->exc_node_bitmap = orig_exc_nodes;
	if (rc == SLURM_SUCCESS) {
		/* job initiated */
		last_job_update = time(NULL);
		info("backfill: Started JobId=%u on %s",
		     job_ptr->job_id, job_ptr->nodes);
		if (job_ptr->batch_flag == 0)
			srun_allocate(job_ptr->job_id);
		else if (job_ptr->details->prolog_running == 0)
			launch_job(job_ptr);
		backfilled_jobs++;
		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			info("backfill: Jobs backfilled since boot: %d",
			     backfilled_jobs);
		}
	} else if ((job_ptr->job_id != fail_jobid) &&
		   (rc != ESLURM_ACCOUNTING_POLICY)) {
		char *node_list;
		bit_not(resv_bitmap);
		node_list = bitmap2node_name(resv_bitmap);
		/* This happens when a job has sharing disabled and
		 * a selected node is still completing some job,
		 * which should be a temporary situation. */
		verbose("backfill: Failed to start JobId=%u on %s: %s",
			job_ptr->job_id, node_list, slurm_strerror(rc));
		xfree(node_list);
		fail_jobid = job_ptr->job_id;
	} else {
		debug3("backfill: Failed to start JobId=%u: %s",
		       job_ptr->job_id, slurm_strerror(rc));
	}

	return rc;
}
Example no. 28
int slurm_job_cpus_allocated_str_on_node_id(char *cpus,
					    size_t cpus_len,
					    job_resources_t *job_resrcs_ptr,
					    int node_id)
{
	int start_node = -1; /* start at -1 so the array reps
			      * line up correctly */
	uint32_t threads = 1;
	int inx = 0;
	bitstr_t *cpu_bitmap;
	int j, k, bit_inx, bit_reps;

	if (!job_resrcs_ptr || node_id < 0)
		slurm_seterrno_ret(EINVAL);

	/* find index in sock_core_rep_count[] for this node id
	 */
	do {
		start_node += job_resrcs_ptr->sock_core_rep_count[inx];
		inx++;
	} while (start_node < node_id);
	/* back to previous index since inx is always one step further
	 * after previous loop
	 */
	inx--;

	bit_reps = job_resrcs_ptr->sockets_per_node[inx] *
		job_resrcs_ptr->cores_per_socket[inx];

	/* get the number of threads per core on this node
	 */
	if (job_node_ptr)
		threads = job_node_ptr->node_array[node_id].threads;
	bit_inx = 0;
	cpu_bitmap = bit_alloc(bit_reps * threads);
	for (j = 0; j < bit_reps; j++) {
		if (bit_test(job_resrcs_ptr->core_bitmap, bit_inx)){
			for (k = 0; k < threads; k++)
				bit_set(cpu_bitmap,
					(j * threads) + k);
		}
		bit_inx++;
	}
	bit_fmt(cpus, cpus_len, cpu_bitmap);
	FREE_NULL_BITMAP(cpu_bitmap);

	return SLURM_SUCCESS;
}
Example no. 29
/* Free all memory associated with switch_record_table structure */
static void _free_switch_record_table(void)
{
	int i;

	if (switch_record_table) {
		for (i=0; i<switch_record_cnt; i++) {
			xfree(switch_record_table[i].name);
			xfree(switch_record_table[i].nodes);
			xfree(switch_record_table[i].switches);
			xfree(switch_record_table[i].switch_index);
			FREE_NULL_BITMAP(switch_record_table[i].node_bitmap);
		}
		xfree(switch_record_table);
		switch_record_cnt = 0;
		switch_levels = 0;
	}
}
Example no. 30
/* Convert an array of task IDs into a string.
 * RET: the string, caller must xfree() this value
 * NOTE: the taskids array is not necessarily in numeric order,
 *       so we use existing bitmap functions to format */
static char *_task_array_to_string(int ntasks, uint32_t taskids[])
{
	bitstr_t *tasks_bitmap = NULL;
	char *str;
	int i;

	tasks_bitmap = bit_alloc(local_srun_job->ntasks);
	if (!tasks_bitmap) {
		error("bit_alloc: memory allocation failure");
		exit(error_exit);
	}
	for (i=0; i<ntasks; i++)
		bit_set(tasks_bitmap, taskids[i]);
	str = xmalloc(2048);
	bit_fmt(str, 2048, tasks_bitmap);
	FREE_NULL_BITMAP(tasks_bitmap);

	return str;
}
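A sketch of the output, assuming local_srun_job->ntasks is at least 6: because bit_fmt() compresses consecutive ids into ranges, unordered task ids collapse into a compact string.

uint32_t ids[] = { 5, 0, 2, 1 };
char *str = _task_array_to_string(4, ids);	/* yields "0-2,5" */
info("tasks: %s", str);
xfree(str);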