예제 #1
0
/*
 * _sort_step_by_node_list - list comparator ordering job steps by the first
 *	host name of their (sorted) node list
 * IN void1, void2 - opaque list entries, decoded by
 *	_get_step_info_from_void()
 * RET negative, zero or positive value as the first step orders before,
 *	equal to or after the second one; the sign is inverted when
 *	reverse_order is set
 */
static int _sort_step_by_node_list(void *void1, void *void2)
{
	int diff = 0;
	job_step_info_t *step1;
	job_step_info_t *step2;

	hostlist_t hostlist1, hostlist2;
	char *val1, *val2;		/* heap strings from hostlist_shift() */
	const char *ptr1, *ptr2;	/* strings actually compared */
#if	PURE_ALPHA_SORT == 0
	int inx;
#endif

	_get_step_info_from_void(&step1, &step2, void1, void2);

	hostlist1 = hostlist_create(step1->nodes);
	hostlist_sort(hostlist1);
	val1 = hostlist_shift(hostlist1);
	ptr1 = val1 ? val1 : "";
	hostlist_destroy(hostlist1);

	hostlist2 = hostlist_create(step2->nodes);
	hostlist_sort(hostlist2);
	val2 = hostlist_shift(hostlist2);
	ptr2 = val2 ? val2 : "";
	hostlist_destroy(hostlist2);

#if	PURE_ALPHA_SORT
	diff = strcmp(ptr1, ptr2);
#else
	for (inx = 0; ; inx++) {
		if (ptr1[inx] == ptr2[inx]) {
			if (ptr1[inx] == '\0')
				break;
			continue;
		}
		/* First differing position: compare numerically when both
		 * names have a digit there, otherwise alphabetically.
		 * <ctype.h> functions require an unsigned char value. */
		if (isdigit((unsigned char) ptr1[inx]) &&
		    isdigit((unsigned char) ptr2[inx])) {
			int num1, num2;
			num1 = atoi(ptr1 + inx);
			num2 = atoi(ptr2 + inx);
			diff = num1 - num2;
		} else
			diff = strcmp(ptr1, ptr2);
		break;
	}
#endif
	/* Release exactly what hostlist_shift() handed out. Testing the
	 * pointer (rather than the string length) also frees a heap
	 * allocated empty name and never touches the "" literal. */
	free(val1);
	free(val2);

	if (reverse_order)
		diff = -diff;
	return diff;
}
예제 #2
0
파일: sort.c 프로젝트: A1ve5/slurm
/*
 * _sort_by_node_list - list comparator ordering sinfo records by the first
 *	host name currently held in their node hostlist
 * IN void1, void2 - opaque list entries, decoded by _get_sinfo_from_void()
 * RET negative, zero or positive value as the first record orders before,
 *	equal to or after the second one; the sign is inverted when
 *	reverse_order is set
 * NOTE: the host shifted off each hostlist is pushed back and the hostlist
 *	is re-sorted, so the records keep all of their hosts
 */
static int _sort_by_node_list(void *void1, void *void2)
{
	int diff = 0;
	sinfo_data_t *sinfo1;
	sinfo_data_t *sinfo2;
	char *val1, *val2;		/* heap strings from hostlist_shift() */
	const char *ptr1, *ptr2;	/* strings actually compared */
#if	PURE_ALPHA_SORT == 0
	int inx;
#endif

	_get_sinfo_from_void(&sinfo1, &sinfo2, void1, void2);

	val1 = hostlist_shift(sinfo1->nodes);
	if (val1) {
		hostlist_push_host(sinfo1->nodes, val1);
		hostlist_sort(sinfo1->nodes);
		ptr1 = val1;
	} else
		ptr1 = "";

	val2 = hostlist_shift(sinfo2->nodes);
	if (val2) {
		hostlist_push_host(sinfo2->nodes, val2);
		hostlist_sort(sinfo2->nodes);
		ptr2 = val2;
	} else
		ptr2 = "";

#if	PURE_ALPHA_SORT
	diff = xstrcmp(ptr1, ptr2);
#else
	for (inx = 0; ; inx++) {
		if (ptr1[inx] == ptr2[inx]) {
			if (ptr1[inx] == '\0')
				break;
			continue;
		}
		/* First differing position: compare numerically when both
		 * names have a digit there, otherwise alphabetically.
		 * <ctype.h> functions require an unsigned char value. */
		if (isdigit((unsigned char) ptr1[inx]) &&
		    isdigit((unsigned char) ptr2[inx])) {
			int num1, num2;
			num1 = atoi(ptr1 + inx);
			num2 = atoi(ptr2 + inx);
			diff = num1 - num2;
		} else
			diff = xstrcmp(ptr1, ptr2);
		break;
	}
#endif
	/* Release exactly what hostlist_shift() handed out. Testing the
	 * pointer (rather than the string length) also frees a heap
	 * allocated empty name and never touches the "" literal. */
	free(val1);
	free(val2);

	if (reverse_order)
		diff = -diff;

	return diff;
}
예제 #3
0
/*
 * select_admin_front_end - collect the front end nodes selected in a tree
 *	view and hand them, as one ranged host expression, to
 *	_admin_front_end()
 * IN model, iter - passed through unchanged to _admin_front_end()
 * IN display_data - its name field is passed to _admin_front_end()
 * IN treeview - view whose selection is processed; nothing is done if NULL
 */
extern void select_admin_front_end(GtkTreeModel *model, GtkTreeIter *iter,
				   display_data_t *display_data,
				   GtkTreeView *treeview)
{
	front_end_user_data_t selection;
	hostlist_t hosts = NULL;
	char *ranged_names;

	if (!treeview)
		return;

	/* Let the per-row callback accumulate the selected node names */
	memset(&selection, 0, sizeof(front_end_user_data_t));
	gtk_tree_selection_selected_foreach(
		gtk_tree_view_get_selection(treeview),
		_process_each_front_end, &selection);

	/* Drop duplicates and sort before building the ranged string */
	hosts = hostlist_create(selection.node_list);
	hostlist_uniq(hosts);
	hostlist_sort(hosts);
	xfree(selection.node_list);
	ranged_names = hostlist_ranged_string_xmalloc(hosts);
	hostlist_destroy(hosts);

	_admin_front_end(model, iter, display_data->name, ranged_names);
	xfree(ranged_names);
}
예제 #4
0
파일: node_conf.c 프로젝트: jsollom/slurm
/*
 * bitmap2node_name_sortable - translate the bits set in a node bitmap into
 *	a ranged host list expression (e.g. "lx[01-10]")
 * IN bitmap - bitmap pointer, bit i selects node_record_table_ptr[i]
 * IN sort   - if true, sort the host names before building the expression
 * RET pointer to the node list string; an empty string is returned when
 *	bitmap is NULL or has no bit set
 * globals: node_record_table_ptr - pointer to node table
 * NOTE: the caller must xfree the returned string when no longer required
 */
char * bitmap2node_name_sortable (bitstr_t *bitmap, bool sort)
{
	hostlist_t hosts;
	char *node_list;
	int inx, first_bit, last_bit;

	if (!bitmap)
		return xstrdup("");

	if ((first_bit = bit_ffs(bitmap)) == -1)
		return xstrdup("");
	last_bit = bit_fls(bitmap);

	/* Only the span between the first and last set bit is scanned */
	hosts = hostlist_create("");
	for (inx = first_bit; inx <= last_bit; inx++) {
		if (bit_test(bitmap, inx))
			hostlist_push(hosts, node_record_table_ptr[inx].name);
	}

	if (sort)
		hostlist_sort(hosts);
	node_list = hostlist_ranged_string_xmalloc(hosts);
	hostlist_destroy(hosts);

	return node_list;
}
예제 #5
0
/*
 * basil_node_ranking - assign a node_rank to every entry of node_array based
 *	on the order in which nodes appear in the BASIL inventory
 * IN/OUT node_array - node records; node_rank is reset to NO_VAL for all
 *	node_cnt entries before ranks are assigned
 * IN node_cnt - number of entries in node_array
 * RET SLURM_SUCCESS (fatal() is called if the inventory can not be read or
 *	reports no batch nodes)
 */
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = 0;

	inv = get_full_inventory(version);
	if (inv == NULL)
		/* FIXME: should retry here if the condition is transient */
		fatal("failed to get BASIL %s ranking", bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate as a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else
			node_ptr->node_rank = inv->nodes_total - rank_count++;
		/* Every inventory node is recorded so that the complete
		 * ALPS view can be reported if a mismatch is found */
		snprintf(tmp, sizeof(tmp), "nid%05u", node->node_id);
		hostlist_push(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		char *name;

		hostlist_sort(hl);
		name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system.  Here are the nodes alps knows "
		     "about\n%s", name);
		/* The ranged string is xmalloc'ed and owned by us */
		xfree(name);
	}
	hostlist_destroy(hl);

	return SLURM_SUCCESS;
}
예제 #6
0
/*
 * _sort_hostlist - sort the node hostlist of every record in a list
 * IN sinfo_list - list of sinfo_data_t records; each record's nodes
 *	hostlist is sorted in place
 */
static void _sort_hostlist(List sinfo_list)
{
	sinfo_data_t *entry;
	ListIterator iter = list_iterator_create(sinfo_list);

	for (entry = list_next(iter); entry; entry = list_next(iter)) {
		hostlist_sort(entry->nodes);
	}

	list_iterator_destroy(iter);
}
예제 #7
0
/*
 * wrap_hostlist_sort - checked wrapper around hostlist_sort()
 *
 * Asserts that the caller location arguments supplied through
 * WRAPPERS_ARGS are set, reports a NULL hostlist through
 * WRAPPERS_ERR_INVALID_PARAMETERS, then sorts the hostlist.
 */
void 
wrap_hostlist_sort(WRAPPERS_ARGS, hostlist_t hl)
{
  assert(file && function);

  if (hl == NULL)
    WRAPPERS_ERR_INVALID_PARAMETERS("hostlist_sort");

  hostlist_sort(hl);
}
예제 #8
0
파일: node_conf.c 프로젝트: HDOD/slurm
/*
 * bitmap2node_name_sortable - translate a node bitmap into a ranged host
 *	list expression (e.g. "lx[01-10]")
 * IN bitmap - bitmap pointer, converted with bitmap2hostlist()
 * IN sort   - if true, sort the host names before building the expression
 * RET pointer to the node list string; an empty string is returned when
 *	bitmap2hostlist() yields no hostlist
 * NOTE: the caller must xfree the returned string when no longer required
 */
char * bitmap2node_name_sortable (bitstr_t *bitmap, bool sort)
{
	char *node_list;
	hostlist_t hosts = bitmap2hostlist (bitmap);

	if (!hosts)
		return xstrdup("");

	if (sort)
		hostlist_sort(hosts);

	node_list = hostlist_ranged_string_xmalloc(hosts);
	hostlist_destroy(hosts);

	return node_list;
}
예제 #9
0
파일: info_job.c 프로젝트: artpol84/slurm
/*
 * scontrol_encode_hostlist - given a list of hostnames or the pathname
 *	of a file containing hostnames, translate them into a hostlist
 *	expression and print it on stdout
 * IN hostlist - host names, or an absolute pathname (leading '/') of a
 *	file holding them; the file must be smaller than 1MB
 * IN sorted - if true, sort the hosts before printing
 * RET SLURM_SUCCESS, or SLURM_ERROR if the argument is NULL, the file can
 *	not be opened or read, is too large, or the host list is invalid
 */
extern int
scontrol_encode_hostlist(char *hostlist, bool sorted)
{
	char *io_buf = NULL, *tmp_list, *ranged_string;
	int buf_size = 1024 * 1024;
	hostlist_t hl;

	if (!hostlist) {
		fprintf(stderr, "Hostlist is NULL\n");
		return SLURM_ERROR;
	}

	if (hostlist[0] == '/') {
		ssize_t buf_read;
		int fd = open(hostlist, O_RDONLY);
		if (fd < 0) {
			fprintf(stderr, "Can not open %s\n", hostlist);
			return SLURM_ERROR;
		}
		io_buf = xmalloc(buf_size);
		buf_read = read(fd, io_buf, buf_size);
		close(fd);
		if (buf_read < 0) {
			/* read() failed: buf_read must not be used as an
			 * index into io_buf below */
			fprintf(stderr, "Can not read %s\n", hostlist);
			xfree(io_buf);
			return SLURM_ERROR;
		}
		if (buf_read >= buf_size) {
			/* If over 1MB, the file is almost certainly invalid */
			fprintf(stderr, "File %s is too large\n", hostlist);
			xfree(io_buf);
			return SLURM_ERROR;
		}
		io_buf[buf_read] = '\0';
		_reformat_hostlist(io_buf);
		tmp_list = io_buf;
	} else
		tmp_list = hostlist;

	hl = hostlist_create(tmp_list);
	if (hl == NULL) {
		fprintf(stderr, "Invalid hostlist: %s\n", tmp_list);
		xfree(io_buf);
		return SLURM_ERROR;
	}
	if (sorted)
		hostlist_sort(hl);
	ranged_string = hostlist_ranged_string_xmalloc(hl);
	printf("%s\n", ranged_string);
	hostlist_destroy(hl);
	xfree(ranged_string);
	xfree(io_buf);
	return SLURM_SUCCESS;
}
예제 #10
0
파일: start_job.c 프로젝트: VURM/slurm
/* Start a job:
 *	CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list> [COMMENT=<whatever>]
 * IN/OUT cmd_ptr - command text; it is modified in place while the
 *	COMMENT and TASKLIST values are isolated
 * OUT err_code, err_msg - failure code/text, or the success message
 * RET 0 on success, -1 on failure */
extern int	start_job(char *cmd_ptr, int *err_code, char **err_msg)
{
	static char reply_msg[128];
	char *arg_ptr, *comment_ptr, *task_ptr, *tasklist, *tmp_char;
	char *host_string;
	hostlist_t hl = (hostlist_t) NULL;
	uint32_t jobid;
	int i, rc, task_cnt;

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (!arg_ptr) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks ARG";
		error("wiki: STARTJOB lacks ARG");
		return -1;
	}
	jobid = strtoul(arg_ptr + 4, &tmp_char, 10);
	if (!isspace(tmp_char[0])) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: STARTJOB has invalid jobid");
		return -1;
	}

	/* Locate both keywords before the command text gets modified */
	comment_ptr = strstr(cmd_ptr, "COMMENT=");
	task_ptr    = strstr(cmd_ptr, "TASKLIST=");

	if (comment_ptr) {
		char quote;

		comment_ptr[7] = ':';
		comment_ptr += 8;
		quote = comment_ptr[0];
		if ((quote == '\"') || (quote == '\'')) {
			/* Quoted comment: terminate it at the matching
			 * quote, or truncate it at MAX_COMMENT_LEN */
			comment_ptr++;
			for (i = 0; i < MAX_COMMENT_LEN; i++) {
				if ((comment_ptr[i] == '\0') ||
				    (comment_ptr[i] == quote))
					break;
			}
			if (i == MAX_COMMENT_LEN)
				comment_ptr[i-1] = '\0';
			else if (comment_ptr[i] == quote)
				comment_ptr[i] = '\0';
		} else
			null_term(comment_ptr);
	}

	if (!task_ptr) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks TASKLIST";
		error("wiki: STARTJOB lacks TASKLIST");
		return -1;
	}
	task_ptr += 9;	/* skip over "TASKLIST=" */
	if ((task_ptr[0] != '\0') && !isspace(task_ptr[0])) {
		null_term(task_ptr);
		tasklist = moab2slurm_task_list(task_ptr, &task_cnt);
		if (tasklist)
			hl = hostlist_create(tasklist);
		if (!tasklist || !hl) {
			*err_code = -300;
			*err_msg = "STARTJOB TASKLIST is invalid";
			error("wiki: STARTJOB TASKLIST is invalid: %s",
			      task_ptr);
			xfree(tasklist);
			return -1;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		host_string = hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (!host_string) {
			*err_code = -300;
			*err_msg = "STARTJOB has invalid TASKLIST";
			error("wiki: STARTJOB has invalid TASKLIST: %s",
			      tasklist);
			xfree(tasklist);
			return -1;
		}
	} else {
		/* No TASKLIST specification, useful for testing */
		host_string = xstrdup("");
		task_cnt = 0;
		tasklist = NULL;
	}

	rc = _start_job(jobid, task_cnt, host_string, tasklist, comment_ptr,
			err_code, err_msg);
	xfree(host_string);
	xfree(tasklist);
	if (rc == 0) {
		snprintf(reply_msg, sizeof(reply_msg),
			"job %u started successfully", jobid);
		*err_msg = reply_msg;
	}
	return rc;
}
예제 #11
0
파일: sort.c 프로젝트: HPCNow/slurm
/*
 * _sort_job_by_node_list - list comparator ordering jobs by the first host
 *	name of their (sorted) node list
 * IN void1, void2 - opaque list entries, decoded by
 *	_get_job_info_from_void()
 * RET negative, zero or positive value as the first job orders before,
 *	equal to or after the second one; the sign is inverted when
 *	reverse_order is set
 */
static int _sort_job_by_node_list(void *void1, void *void2)
{
	job_info_t *job1, *job2;
	hostlist_t hosts;
	char *first1, *first2;		/* heap strings from hostlist_shift() */
	char *name1 = "", *name2 = "";	/* strings actually compared */
	int diff = 0;
#if	PURE_ALPHA_SORT == 0
	int pos;
#endif

	_get_job_info_from_void(&job1, &job2, void1, void2);

	hosts = hostlist_create(job1->nodes);
	hostlist_sort(hosts);
	first1 = hostlist_shift(hosts);
	hostlist_destroy(hosts);
	if (first1)
		name1 = first1;

	hosts = hostlist_create(job2->nodes);
	hostlist_sort(hosts);
	first2 = hostlist_shift(hosts);
	hostlist_destroy(hosts);
	if (first2)
		name2 = first2;

#if	PURE_ALPHA_SORT
	diff = xstrcmp(name1, name2);
#else
	/* Skip the common prefix of both names */
	pos = 0;
	while ((name1[pos] == name2[pos]) && (name1[pos] != '\0'))
		pos++;

	if (name1[pos] != name2[pos]) {
		/* Numeric comparison when both names have a digit at the
		 * first differing position, alphabetic otherwise */
		if ((isdigit((int)name1[pos])) &&
		    (isdigit((int)name2[pos])))
			diff = atoi(name1 + pos) - atoi(name2 + pos);
		else
			diff = xstrcmp(name1, name2);
	}
#endif
	free(first1);
	free(first2);

	return reverse_order ? -diff : diff;
}
예제 #12
0
/*
 * _job_modify - apply the changes of a wiki MODIFYJOB request to one job
 * IN jobid - id of the job to modify
 * IN bank_ptr - new account, passed to update_job_account(), or NULL
 * IN depend_ptr - new dependency specification or NULL
 * IN new_hostlist - new required hosts in Moab task list format, an empty
 *	string to clear the required hosts, or NULL
 * IN new_node_cnt - new minimum node count or 0
 * IN part_name_ptr - new partition name or NULL
 * IN new_time_limit - new time limit or 0
 * IN name_ptr - new job name or NULL
 * IN start_ptr - new begin time as a decimal string or NULL
 * IN feature_ptr - new required features or NULL
 * IN env_ptr - comma separated NAME=VALUE pairs to add to the job's
 *	supplemental environment, or NULL; the string is modified
 *	temporarily while it is parsed and restored afterwards
 * IN comment_ptr - new job comment or NULL
 * IN gres_ptr - new generic resource specification or NULL
 * IN wckey_ptr - new wckey or NULL
 * RET SLURM_SUCCESS or an error code. Changes are applied in sequence, so
 *	changes made before a failing one are kept.
 */
static int	_job_modify(uint32_t jobid, char *bank_ptr,
			char *depend_ptr, char *new_hostlist,
			uint32_t new_node_cnt, char *part_name_ptr,
			uint32_t new_time_limit, char *name_ptr,
			char *start_ptr, char *feature_ptr, char *env_ptr,
			char *comment_ptr, char *gres_ptr, char *wckey_ptr)
{
	struct job_record *job_ptr;
	time_t now = time(NULL);
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	/* From here on job_ptr->details is known to be non-NULL */
	if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) {
		info("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (comment_ptr) {
		info("wiki: change job %u comment %s", jobid, comment_ptr);
		xfree(job_ptr->comment);
		job_ptr->comment = xstrdup(comment_ptr);
		last_job_update = now;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
				jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
				jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (env_ptr) {
		bool have_equal = false;
		char old_sep;
		int begin = 0, i;

		if (job_ptr->batch_flag == 0) {
			error("wiki: attempt to set environment variables "
			      "for non-batch job %u", jobid);
			return ESLURM_DISABLED;
		}
		for (i=0; ; i++) {
			if (env_ptr[i] == '=') {
				if (have_equal) {
					error("wiki: setting job %u invalid "
					      "environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				have_equal = true;
				if (env_ptr[i+1] == '\"') {
					/* Skip over a double quoted value */
					for (i+=2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u "
							      "invalid environment "
							      "variables: %s",
							      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\"') {
							i++;
							break;
						}
						/* Skip the escaped character,
						 * but never the terminating
						 * NUL */
						if ((env_ptr[i] == '\\') &&
						    (env_ptr[i+1] != '\0')) {
							i++;
						}
					}
				} else if (env_ptr[i+1] == '\'') {
					/* Skip over a single quoted value */
					for (i+=2; ; i++) {
						if (env_ptr[i] == '\0') {
							error("wiki: setting job %u "
							      "invalid environment "
							      "variables: %s",
							      jobid, env_ptr);
							return EINVAL;
						}
						if (env_ptr[i] == '\'') {
							i++;
							break;
						}
						/* Skip the escaped character,
						 * but never the terminating
						 * NUL */
						if ((env_ptr[i] == '\\') &&
						    (env_ptr[i+1] != '\0')) {
							i++;
						}
					}
				}
			}
			/* The end of the string terminates the last variable
			 * just like white space does, so the scan can never
			 * run past the terminating NUL */
			if ((env_ptr[i] == '\0') ||
			    isspace((unsigned char) env_ptr[i]) ||
			    (env_ptr[i] == ',')) {
				if (!have_equal) {
					error("wiki: setting job %u invalid "
					      "environment variables: %s",
					      jobid, env_ptr);
					return EINVAL;
				}
				/* Terminate the variable temporarily so that
				 * it can be copied, then restore the text */
				old_sep = env_ptr[i];
				env_ptr[i] = '\0';
				xrealloc(job_ptr->details->env_sup,
					 sizeof(char *) *
					 (job_ptr->details->env_cnt+1));
				job_ptr->details->env_sup
						[job_ptr->details->env_cnt++] =
						xstrdup(&env_ptr[begin]);
				info("wiki: for job %u add env: %s",
				     jobid, &env_ptr[begin]);
				env_ptr[i] = old_sep;
				if ((old_sep == '\0') ||
				    isspace((unsigned char) old_sep))
					break;
				begin = i + 1;
				have_equal = false;
			}
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
			jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				((job_ptr->time_limit -
				  old_time) * 60);
		last_job_update = now;
	}

	if (bank_ptr &&
	    (update_job_account("wiki", job_ptr, bank_ptr) != SLURM_SUCCESS)) {
		return EINVAL;
	}

	if (feature_ptr) {
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u features to %s",
				jobid, feature_ptr);
			/* Release the previous value before replacing it */
			xfree(job_ptr->details->features);
			job_ptr->details->features = xstrdup(feature_ptr);
			last_job_update = now;
		} else {
			error("wiki: MODIFYJOB features of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (start_ptr) {
		char *end_ptr;
		uint32_t begin_time = strtol(start_ptr, &end_ptr, 10);
		if (IS_JOB_PENDING(job_ptr) && (job_ptr->details)) {
			info("wiki: change job %u begin time to %u",
				jobid, begin_time);
			job_ptr->details->begin_time = begin_time;
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB begin_time of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (name_ptr) {
		if (IS_JOB_PENDING(job_ptr)) {
			info("wiki: change job %u name %s", jobid, name_ptr);
			xfree(job_ptr->name);
			job_ptr->name = xstrdup(name_ptr);
			last_job_update = now;
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB name of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist = NULL;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist[0] == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB hostlist of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		/* An empty host list only clears the required nodes */
		if (new_hostlist[0] == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == NULL) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
				     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	/* The translated task list is only needed to build hl */
		xfree(tasklist);
		if (rc) {
			info("wiki: change job %u invalid hostlist %s",
				jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
				jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}

		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
				part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}

		info("wiki: change job %u partition %s",
			jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = now;
		update_accounting = true;
	}

	if (new_node_cnt) {
		job_desc_msg_t job_desc;
#ifdef HAVE_BG
		uint16_t geometry[SYSTEM_DIMENSIONS] = {(uint16_t) NO_VAL};
		static uint16_t cpus_per_node = 0;
		if (!cpus_per_node) {
			select_g_alter_node_cnt(SELECT_GET_NODE_CPU_CNT,
						&cpus_per_node);
		}
#endif
		if(!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			error("wiki: MODIFYJOB node count of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}
		memset(&job_desc, 0, sizeof(job_desc_msg_t));

		job_desc.min_nodes = new_node_cnt;
		job_desc.max_nodes = NO_VAL;
		job_desc.select_jobinfo = select_g_select_jobinfo_alloc();

		/* Let the select plugin adjust the node count; the
		 * (possibly altered) values of job_desc are used below */
		select_g_alter_node_cnt(SELECT_SET_NODE_CNT, &job_desc);

		select_g_select_jobinfo_free(job_desc.select_jobinfo);

		job_ptr->details->min_nodes = job_desc.min_nodes;
		if (job_ptr->details->max_nodes &&
		    (job_ptr->details->max_nodes < job_desc.min_nodes))
			job_ptr->details->max_nodes = job_desc.min_nodes;
		info("wiki: change job %u min_nodes to %u",
		     jobid, new_node_cnt);
#ifdef HAVE_BG
		job_ptr->details->min_cpus = job_desc.min_cpus;
		job_ptr->details->max_cpus = job_desc.max_cpus;
		job_ptr->details->pn_min_cpus = job_desc.pn_min_cpus;

		new_node_cnt = job_ptr->details->min_cpus;
		if (cpus_per_node)
			new_node_cnt /= cpus_per_node;

		/* This is only set up so accounting is set up correctly */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &new_node_cnt);
		/* reset geo since changing this makes any geo
		   potentially invalid */
		select_g_select_jobinfo_set(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_GEOMETRY,
					    geometry);
#endif
		last_job_update = now;
		update_accounting = true;
	}

	if (gres_ptr) {
		char *orig_gres;

		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB GRES of non-pending job %u",
			      jobid);
			return ESLURM_DISABLED;
		}

		/* Keep the old value so that it can be restored if the new
		 * specification fails validation */
		orig_gres = job_ptr->gres;
		job_ptr->gres = NULL;
		if (gres_ptr[0])
			job_ptr->gres = xstrdup(gres_ptr);
		if (gres_plugin_job_state_validate(job_ptr->gres,
						   &job_ptr->gres_list)) {
			error("wiki: MODIFYJOB Invalid GRES=%s", gres_ptr);
			xfree(job_ptr->gres);
			job_ptr->gres = orig_gres;
			return ESLURM_INVALID_GRES;
		}
		xfree(orig_gres);
	}

	if (wckey_ptr) {
		int rc = update_job_wckey("update_job", job_ptr, wckey_ptr);
		if (rc != SLURM_SUCCESS) {
			error("wiki: MODIFYJOB Invalid WCKEY=%s", wckey_ptr);
			return rc;
		}
	}

	if (update_accounting) {
		if (job_ptr->details && job_ptr->details->begin_time) {
			/* Update job record in accounting to reflect
			 * the changes */
			jobacct_storage_g_job_start(acct_db_conn, job_ptr);
		}
	}

	return SLURM_SUCCESS;
}
예제 #13
0
파일: sstat.c 프로젝트: Cray/slurm
/*
 * _do_stat - fetch the statistics of one job step and print them
 * IN jobid, stepid - identify the job step
 * IN nodelist - passed to slurm_job_step_stat() to select the nodes
 * IN req_cpufreq - stored in the printed step record
 * RET SLURM_SUCCESS or the error code of slurm_job_step_stat()
 *
 * The file scope "job" and "step" records are reset and filled in.  With
 * params.pid_format set, one record is printed per responding node,
 * otherwise the statistics of all nodes are aggregated and printed once.
 */
int _do_stat(uint32_t jobid, uint32_t stepid, char *nodelist,
	     uint32_t req_cpufreq)
{
	job_step_stat_response_msg_t *step_stat_response = NULL;
	int rc = SLURM_SUCCESS;
	ListIterator itr;
	slurmdb_stats_t temp_stats;
	job_step_stat_t *step_stat = NULL;
	int ntasks = 0;
	int tot_tasks = 0;
	hostlist_t hl = NULL;

	debug("requesting info for job %u.%u", jobid, stepid);
	if ((rc = slurm_job_step_stat(jobid, stepid, nodelist,
				      &step_stat_response)) != SLURM_SUCCESS) {
		if (rc == ESLURM_INVALID_JOB_ID) {
			debug("job step %u.%u has already completed",
			      jobid, stepid);
		} else {
			error("problem getting step_layout for %u.%u: %s",
			      jobid, stepid, slurm_strerror(rc));
		}
		return rc;
	}

	memset(&job, 0, sizeof(slurmdb_job_rec_t));
	job.jobid = jobid;

	memset(&step, 0, sizeof(slurmdb_step_rec_t));

	memset(&temp_stats, 0, sizeof(slurmdb_stats_t));
	temp_stats.cpu_min = NO_VAL;
	memset(&step.stats, 0, sizeof(slurmdb_stats_t));
	step.stats.cpu_min = NO_VAL;

	step.job_ptr = &job;
	step.stepid = stepid;
	/* step.nodes is left NULL here: the pid format borrows node names
	 * from the response and the aggregate format allocates its buffer
	 * only once it is needed, so no buffer is orphaned */
	step.req_cpufreq = req_cpufreq;
	step.stepname = NULL;
	step.state = JOB_RUNNING;

	hl = hostlist_create(NULL);
	itr = list_iterator_create(step_stat_response->stats_list);
	while ((step_stat = list_next(itr))) {
		if (!step_stat->step_pids || !step_stat->step_pids->node_name)
			continue;
		if (step_stat->step_pids->pid_cnt > 0 ) {
			int i;
			for(i=0; i<step_stat->step_pids->pid_cnt; i++) {
				if (step.pid_str)
					xstrcat(step.pid_str, ",");
				xstrfmtcat(step.pid_str, "%u",
					   step_stat->step_pids->pid[i]);
			}
		}

		if (params.pid_format) {
			step.nodes = step_stat->step_pids->node_name;
			print_fields(&step);
			xfree(step.pid_str);
		} else {
			hostlist_push(hl, step_stat->step_pids->node_name);
			jobacctinfo_2_stats(&temp_stats, step_stat->jobacct);
			ntasks += step_stat->num_tasks;
			aggregate_stats(&step.stats, &temp_stats);
		}
	}
	list_iterator_destroy(itr);
	/* NOTE(review): a "pids" free routine is applied to a "stat"
	 * response here - confirm that this is the intended function */
	slurm_job_step_pids_response_msg_free(step_stat_response);
	/* we printed it out already */
	if (params.pid_format) {
		/* step.nodes pointed into the response released above */
		step.nodes = NULL;
		hostlist_destroy(hl);
		return rc;
	}

	step.nodes = xmalloc(BUF_SIZE);
	hostlist_sort(hl);
	hostlist_ranged_string(hl, BUF_SIZE, step.nodes);
	hostlist_destroy(hl);
	tot_tasks += ntasks;

	/* The *_ave fields hold sums at this point; turn them into
	 * per task averages */
	if (tot_tasks) {
		step.stats.cpu_ave /= (double)tot_tasks;
		step.stats.rss_ave /= (double)tot_tasks;
		step.stats.vsize_ave /= (double)tot_tasks;
		step.stats.pages_ave /= (double)tot_tasks;
		step.stats.disk_read_ave /= (double)tot_tasks;
		step.stats.disk_write_ave /= (double)tot_tasks;
		step.stats.act_cpufreq /= (double)tot_tasks;
		step.ntasks = tot_tasks;
	}

	print_fields(&step);

	return rc;
}
예제 #14
0
파일: start_job.c 프로젝트: IFCA/slurm
/* Start a job:
 *	CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list>
 * IN/OUT cmd_ptr - command text; the TASKLIST value is terminated in place
 * OUT err_code, err_msg - failure code/text, or the success message
 * RET 0 on success, -1 on failure */
extern int	start_job(char *cmd_ptr, int *err_code, char **err_msg)
{
	static char reply_msg[128];
	char *arg_ptr, *task_ptr, *tasklist, *tmp_char;
	char *host_string;
	hostlist_t hl = (hostlist_t) NULL;
	uint32_t jobid;
	int rc, task_cnt;

	if (!(arg_ptr = strstr(cmd_ptr, "ARG="))) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks ARG";
		error("wiki: STARTJOB lacks ARG");
		return -1;
	}
	jobid = strtoul(arg_ptr + 4, &tmp_char, 10);
	if (!isspace(tmp_char[0])) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: STARTJOB has invalid jobid");
		return -1;
	}

	if (!(task_ptr = strstr(cmd_ptr, "TASKLIST="))) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks TASKLIST";
		error("wiki: STARTJOB lacks TASKLIST");
		return -1;
	}
	task_ptr += 9;	/* skip over "TASKLIST=" */
	null_term(task_ptr);

	/* hl stays NULL when the task list can not be translated, so one
	 * test covers both failure causes */
	tasklist = moab2slurm_task_list(task_ptr, &task_cnt);
	if (tasklist)
		hl = hostlist_create(tasklist);
	if (!hl) {
		*err_code = -300;
		*err_msg = "STARTJOB TASKLIST is invalid";
		error("wiki: STARTJOB TASKLIST is invalid: %s",
			task_ptr);
		xfree(tasklist);
		return -1;
	}

	hostlist_uniq(hl);
	hostlist_sort(hl);
	host_string = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	if (!host_string) {
		*err_code = -300;
		*err_msg = "STARTJOB has invalid TASKLIST";
		error("wiki: STARTJOB has invalid TASKLIST: %s", tasklist);
		xfree(tasklist);
		return -1;
	}

	rc = _start_job(jobid, task_cnt, host_string, tasklist,
			err_code, err_msg);
	xfree(host_string);
	xfree(tasklist);
	if (rc != 0)
		return rc;

	snprintf(reply_msg, sizeof(reply_msg),
		"job %u started successfully", jobid);
	*err_msg = reply_msg;
	return rc;
}
예제 #15
0
/*
 * basil_node_ranking - assign a node_rank to every entry of node_array based
 *	on the order in which nodes appear in the BASIL inventory, and record
 *	the ALPS node name as node_hostname of each ranked node
 * IN/OUT node_array - node records; node_rank is reset to NO_VAL for all
 *	node_cnt entries before ranks are assigned
 * IN node_cnt - number of entries in node_array
 * RET SLURM_SUCCESS (fatal() is called if the inventory can not be read,
 *	reports no batch nodes, or disagrees with the configured CPU count or
 *	memory size while fast_schedule is not 2)
 * globals: node_rank_inv - set to 1 for the duration of the call
 */
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
	enum basil_version version = get_basil_version();
	struct basil_inventory *inv;
	struct basil_node *node;
	int rank_count = 0, i;
	hostlist_t hl = hostlist_create(NULL);
	bool bad_node = 0;

	node_rank_inv = 1;
	/*
	 * When obtaining the initial configuration, we can not allow ALPS to
	 * fail. If there is a problem at this stage it is better to restart
	 * SLURM completely, after investigating (and/or fixing) the cause.
	 */
	inv = get_full_inventory(version);
	if (inv == NULL)
		fatal("failed to get BASIL %s ranking", bv_names_long[version]);
	else if (!inv->batch_total)
		fatal("system has no usable batch compute nodes");
	else if (inv->batch_total < node_cnt)
		info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
		     "check DownNodes", inv->batch_total, node_cnt);

	debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
	      bv_names_long[version], inv->batch_avail, inv->batch_total);

	/*
	 * Node ranking is based on a subset of the inventory: only nodes in
	 * batch allocation mode which are up and not allocated. Assign a
	 * 'NO_VAL' rank to all other nodes, which will translate as a very
	 * high value, (unsigned)-2, to put those nodes last in the ranking.
	 * The rest of the code must ensure that those nodes are never chosen.
	 */
	for (i = 0; i < node_cnt; i++)
		node_array[i].node_rank = NO_VAL;

	for (node = inv->f->node_head; node; node = node->next) {
		struct node_record *node_ptr;
		char tmp[50];

		/* This will ignore interactive nodes when iterating through
		 * the apbasil inventory.  If we don't do this, SLURM is
		 * unable to resolve the ID to a nidXXX name since it's not in
		 * the slurm.conf file.  (Chris North)
		 */
		if (node->role == BNR_INTER)
			continue;

		node_ptr = _find_node_by_basil_id(node->node_id);
		if (node_ptr == NULL) {
			error("nid%05u (%s node in state %s) not in slurm.conf",
			      node->node_id, nam_noderole[node->role],
			      nam_nodestate[node->state]);
			bad_node = 1;
		} else if ((slurmctld_conf.fast_schedule != 2)
			   && (node->cpu_count != node_ptr->config_ptr->cpus)) {
			fatal("slurm.conf: node %s has %u cpus "
			      "but configured as CPUs=%u in your slurm.conf",
			      node_ptr->name, node->cpu_count,
			      node_ptr->config_ptr->cpus);
		} else if ((slurmctld_conf.fast_schedule != 2)
			   && (node->mem_size
			       != node_ptr->config_ptr->real_memory)) {
			fatal("slurm.conf: node %s has RealMemory=%u "
			      "but configured as RealMemory=%u in your "
			      "slurm.conf",
			      node_ptr->name, node->mem_size,
			      node_ptr->config_ptr->real_memory);
		} else {
			node_ptr->node_rank = inv->nodes_total - rank_count++;
			/*
			 * Convention: since we are using SLURM in
			 *             frontend-mode, we use
			 *             NodeHostName as follows.
			 *
			 * NodeHostName:  c#-#c#s#n# using the  NID convention
			 *                <cabinet>-<row><chassis><slot><node>
			 * - each cabinet can accommodate 3 chassis (c1..c3)
			 * - each chassis has 8 slots               (s0..s7)
			 * - each slot contains 2 or 4 nodes        (n0..n3)
			 *   o either 2 service nodes (n0/n3)
			 *   o or 4 compute nodes     (n0..n3)
			 *   o or 2 gemini chips      (g0/g1 serving n0..n3)
			 *
			 * Example: c0-0c1s0n1
			 *          - c0- = cabinet 0
			 *          - 0   = row     0
			 *          - c1  = chassis 1
			 *          - s0  = slot    0
			 *          - n1  = node    1
			 */
			xfree(node_ptr->node_hostname);
			node_ptr->node_hostname = xstrdup(node->name);
		}

		/* Every non-interactive inventory node is recorded so that
		 * the complete ALPS view can be reported on a mismatch */
		snprintf(tmp, sizeof(tmp), "nid%05u", node->node_id);
		hostlist_push_host(hl, tmp);
	}
	free_inv(inv);
	if (bad_node) {
		char *name;

		hostlist_sort(hl);
		name = hostlist_ranged_string_xmalloc(hl);
		info("It appears your slurm.conf nodelist doesn't "
		     "match the alps system.  Here are the nodes alps knows "
		     "about\n%s", name);
		/* The ranged string is xmalloc'ed and owned by us */
		xfree(name);
	}
	hostlist_destroy(hl);
	node_rank_inv = 0;

	return SLURM_SUCCESS;
}
예제 #16
0
static int	_job_modify(uint32_t jobid, char *bank_ptr,
			char *depend_ptr, char *new_hostlist,
			uint32_t new_node_cnt, char *part_name_ptr,
			uint32_t new_time_limit)
{
	struct job_record *job_ptr;
	bool update_accounting = false;

	job_ptr = find_job_record(jobid);
	if (job_ptr == NULL) {
		error("wiki: MODIFYJOB has invalid jobid %u", jobid);
		return ESLURM_INVALID_JOB_ID;
	}
	if (IS_JOB_FINISHED(job_ptr)) {
		error("wiki: MODIFYJOB jobid %u is finished", jobid);
		return ESLURM_DISABLED;
	}

	if (depend_ptr) {
		int rc = update_job_dependency(job_ptr, depend_ptr);
		if (rc == SLURM_SUCCESS) {
			info("wiki: changed job %u dependency to %s",
				jobid, depend_ptr);
		} else {
			error("wiki: changing job %u dependency to %s",
				jobid, depend_ptr);
			return EINVAL;
		}
	}

	if (new_time_limit) {
		time_t old_time = job_ptr->time_limit;
		job_ptr->time_limit = new_time_limit;
		info("wiki: change job %u time_limit to %u",
			jobid, new_time_limit);
		/* Update end_time based upon change
		 * to preserve suspend time info */
		job_ptr->end_time = job_ptr->end_time +
				((job_ptr->time_limit -
				  old_time) * 60);
		last_job_update = time(NULL);
	}

	if (bank_ptr) {
		if (update_job_account("wiki", job_ptr, bank_ptr)
		   != SLURM_SUCCESS)
			return EINVAL;
		else
			update_accounting = true;
	}

	if (new_hostlist) {
		int rc = 0, task_cnt;
		hostlist_t hl;
		char *tasklist;

		if (!IS_JOB_PENDING(job_ptr) || !job_ptr->details) {
			/* Job is done, nothing to reset */
			if (new_hostlist == '\0')
				goto host_fini;
			error("wiki: MODIFYJOB tasklist of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}

		xfree(job_ptr->details->req_nodes);
		FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap);
		if (new_hostlist == '\0')
			goto host_fini;

		tasklist = moab2slurm_task_list(new_hostlist, &task_cnt);
		if (tasklist == NULL) {
			rc = 1;
			goto host_fini;
		}
		hl = hostlist_create(tasklist);
		if (hl == 0) {
			rc = 1;
			goto host_fini;
		}
		hostlist_uniq(hl);
		hostlist_sort(hl);
		job_ptr->details->req_nodes =
			hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);
		if (job_ptr->details->req_nodes == NULL) {
			rc = 1;
			goto host_fini;
		}
		if (node_name2bitmap(job_ptr->details->req_nodes, false,
                                     &job_ptr->details->req_node_bitmap)) {
			rc = 1;
			goto host_fini;
		}

host_fini:	if (rc) {
			info("wiki: change job %u invalid hostlist %s",
			     jobid, new_hostlist);
			xfree(job_ptr->details->req_nodes);
			return EINVAL;
		} else {
			info("wiki: change job %u hostlist %s",
			     jobid, new_hostlist);
			update_accounting = true;
		}
	}

	if (part_name_ptr) {
		struct part_record *part_ptr;
		if (!IS_JOB_PENDING(job_ptr)) {
			error("wiki: MODIFYJOB partition of non-pending "
			      "job %u", jobid);
			return ESLURM_DISABLED;
		}

		part_ptr = find_part_record(part_name_ptr);
		if (part_ptr == NULL) {
			error("wiki: MODIFYJOB has invalid partition %s",
				part_name_ptr);
			return ESLURM_INVALID_PARTITION_NAME;
		}
		info("wiki: change job %u partition %s",
			jobid, part_name_ptr);
		xfree(job_ptr->partition);
		job_ptr->partition = xstrdup(part_name_ptr);
		job_ptr->part_ptr = part_ptr;
		last_job_update = time(NULL);
		update_accounting = true;
	}
	if (new_node_cnt) {
		if (IS_JOB_PENDING(job_ptr) && job_ptr->details) {
			job_ptr->details->min_nodes = new_node_cnt;
			if (job_ptr->details->max_nodes
			&&  (job_ptr->details->max_nodes < new_node_cnt))
				job_ptr->details->max_nodes = new_node_cnt;
			info("wiki: change job %u min_nodes to %u",
				jobid, new_node_cnt);
			last_job_update = time(NULL);
			update_accounting = true;
		} else {
			error("wiki: MODIFYJOB node count of non-pending "
				"job %u", jobid);
			return ESLURM_DISABLED;
		}
	}

	if (update_accounting) {
		/* Update job record in accounting to reflect changes */
		jobacct_storage_job_start_direct(acct_db_conn, job_ptr);
	}

	return SLURM_SUCCESS;
}
예제 #17
0
파일: port_mgr.c 프로젝트: Cray/slurm
/* Reserve ports for a job step
 * IN/OUT step_ptr - step needing step_ptr->resv_port_cnt ports; on success
 *                   resv_ports (printable list) and resv_port_array
 *                   (port numbers) are set
 * NOTE: We keep track of last port reserved and go round-robin through full
 *       set of available ports. This helps avoid re-using busy ports when
 *       restarting job steps.
 * RET SLURM_SUCCESS or an error code (ESLURM_PORTS_INVALID if more ports are
 *     requested than are configured, ESLURM_PORTS_BUSY if too few are free
 *     on the step's nodes) */
extern int resv_port_alloc(struct step_record *step_ptr)
{
	int i, port_inx;
	int *port_array = NULL;
	char port_str[16], *tmp_str;
	hostlist_t hl;
	static int last_port_alloc = 0;

	if (step_ptr->resv_port_cnt > port_resv_cnt) {
		info("step %u.%u needs %u reserved ports, but only %d exist",
		     step_ptr->job_ptr->job_id, step_ptr->step_id,
		     step_ptr->resv_port_cnt, port_resv_cnt);
		return ESLURM_PORTS_INVALID;
	}

	/* Identify available ports.
	 * The request size is tested before every store so that a request
	 * for zero ports never writes into the zero-element port_array. */
	port_array = xmalloc(sizeof(int) * step_ptr->resv_port_cnt);
	port_inx = 0;
	for (i = 0; (i < port_resv_cnt) &&
		    (port_inx < step_ptr->resv_port_cnt); i++) {
		if (++last_port_alloc >= port_resv_cnt)
			last_port_alloc = 0;
		/* Port already in use on one of this step's nodes */
		if (bit_overlap(step_ptr->step_node_bitmap,
				port_resv_table[last_port_alloc]))
			continue;
		port_array[port_inx++] = last_port_alloc;
	}
	if (port_inx < step_ptr->resv_port_cnt) {
		info("insufficient ports for step %u.%u to reserve (%d of %u)",
		     step_ptr->job_ptr->job_id, step_ptr->step_id,
		     port_inx, step_ptr->resv_port_cnt);
		xfree(port_array);
		return ESLURM_PORTS_BUSY;
	}

	/* Reserve selected ports */
	hl = hostlist_create(NULL);
	for (i = 0; i < port_inx; i++) {
		/* NOTE: We give the port a name like "[1234]" rather than
		 * just "1234" to avoid hostlists of the form "1[234-236]" */
		bit_or(port_resv_table[port_array[i]],
		       step_ptr->step_node_bitmap);
		/* Convert the table index into the real port number */
		port_array[i] += port_resv_min;
		snprintf(port_str, sizeof(port_str), "[%d]", port_array[i]);
		hostlist_push(hl, port_str);
	}
	hostlist_sort(hl);
	step_ptr->resv_ports = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	step_ptr->resv_port_array = port_array;

	if (step_ptr->resv_ports[0] == '[') {
		/* Remove brackets from hostlist: cut the last character,
		 * then copy everything after the first one */
		i = strlen(step_ptr->resv_ports);
		step_ptr->resv_ports[i-1] = '\0';
		tmp_str = xmalloc(i);
		strcpy(tmp_str, step_ptr->resv_ports + 1);
		xfree(step_ptr->resv_ports);
		step_ptr->resv_ports = tmp_str;
	}

	debug("reserved ports %s for step %u.%u",
	      step_ptr->resv_ports,
	      step_ptr->job_ptr->job_id, step_ptr->step_id);

	return SLURM_SUCCESS;
}
예제 #18
0
파일: srun.c 프로젝트: supermanue/slurm
/*
 * Some MPI implementations misbehave when the pack_node_list is not ordered
 * across its components. Rewrite the node list in sorted order and permute
 * the per-node task counts and task id arrays so they stay aligned with it.
 *
 * IN/OUT in_node_list - node list expression, replaced on success
 * IN/OUT in_task_cnts - per-node task counts, replaced on success
 * IN/OUT in_tids      - per-node task id arrays, replaced on success
 * IN total_nnodes     - expected number of distinct nodes
 *
 * On any failure an error is logged and the inputs are left untouched.
 */
static void _reorder_pack_recs(char **in_node_list, uint16_t **in_task_cnts,
			       uint32_t ***in_tids, int total_nnodes)
{
	hostlist_t orig_hl, sorted_hl;
	uint16_t *sorted_cnts = NULL;
	uint32_t **sorted_tids = NULL;
	char *node_name;
	int dst, src;

	orig_hl = hostlist_create(*in_node_list);
	if (orig_hl == NULL) {
		error("%s: Invalid hostlist(%s)", __func__, *in_node_list);
		return;
	}

	/* Work on a sorted, duplicate-free copy; the original order is
	 * still needed to look up each node's old position */
	sorted_hl = hostlist_copy(orig_hl);
	hostlist_sort(sorted_hl);
	hostlist_uniq(sorted_hl);
	if (hostlist_count(sorted_hl) != total_nnodes) {
		error("%s: Invalid hostlist(%s) count(%d)", __func__,
		      *in_node_list, total_nnodes);
		goto fini;
	}

	sorted_cnts = xmalloc(sizeof(uint16_t) * total_nnodes);
	sorted_tids = xmalloc(sizeof(uint32_t *) * total_nnodes);

	dst = 0;
	while (dst < total_nnodes) {
		node_name = hostlist_nth(sorted_hl, dst);
		if (node_name == NULL) {
			error("%s: Invalid hostlist(%s) count(%d)", __func__,
			      *in_node_list, total_nnodes);
			break;
		}
		src = hostlist_find(orig_hl, node_name);
		if (src == -1) {
			error("%s: Invalid hostlist(%s) parsing", __func__,
			      *in_node_list);
			free(node_name);
			break;
		}
		sorted_cnts[dst] = (*in_task_cnts)[src];
		sorted_tids[dst] = (*in_tids)[src];
		free(node_name);
		dst++;
	}

	/* Every node was placed: hand the reordered records to the caller */
	if (dst >= total_nnodes) {
		xfree(*in_node_list);
		*in_node_list = hostlist_ranged_string_xmalloc(sorted_hl);
		xfree(*in_task_cnts);
		*in_task_cnts = sorted_cnts;
		sorted_cnts = NULL;
		xfree(*in_tids);
		*in_tids = sorted_tids;
		sorted_tids = NULL;
	}

#if 0
	info("NODE_LIST[%d]:%s", total_nnodes, *in_node_list);
	for (dst = 0; dst < total_nnodes; dst++) {
		info("TASK_CNT[%d]:%u", dst, in_task_cnts[0][dst]);
		for (src = 0; src < in_task_cnts[0][dst]; src++) {
			info("TIDS[%d][%d]: %u", dst, src,
			     in_tids[0][dst][src]);
		}
	}
#endif

fini:	hostlist_destroy(orig_hl);
	hostlist_destroy(sorted_hl);
	/* Non-NULL only when the reordering was abandoned part way */
	xfree(sorted_cnts);
	xfree(sorted_tids);
}
예제 #19
0
/*
 * basil_node_ranking - assign node ranks from the ALPS/BASIL inventory
 * IN/OUT node_array - node records whose node_rank fields are set
 * IN node_cnt       - number of records in node_array
 * RET SLURM_SUCCESS; a missing inventory or one with no batch nodes is
 *     reported through fatal()
 *
 * Every record first gets rank NO_VAL. Each non-interactive inventory node
 * that is found by its BASIL id then receives a rank that decreases in
 * inventory order. Inventory nodes that are not found are logged, followed
 * by the list of all nodes seen in the inventory.
 */
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
    enum basil_version version = get_basil_version();
    struct basil_inventory *inv;
    struct basil_node *node;
    int rank_count = 0, i;
    hostlist_t hl = hostlist_create(NULL);
    bool bad_node = 0;

    /*
     * When obtaining the initial configuration, we can not allow ALPS to
     * fail. If there is a problem at this stage it is better to restart
     * SLURM completely, after investigating (and/or fixing) the cause.
     */
    inv = get_full_inventory(version);
    if (inv == NULL)
        fatal("failed to get BASIL %s ranking", bv_names_long[version]);
    else if (!inv->batch_total)
        fatal("system has no usable batch compute nodes");
    else if (inv->batch_total < node_cnt)
        info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
             "check DownNodes", inv->batch_total, node_cnt);

    debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
          bv_names_long[version], inv->batch_avail, inv->batch_total);

    /*
     * Node ranking is based on a subset of the inventory: only nodes in
     * batch allocation mode which are up and not allocated. Assign a
     * 'NO_VAL' rank to all other nodes, which will translate as a very
     * high value, (unsigned)-2, to put those nodes last in the ranking.
     * The rest of the code must ensure that those nodes are never chosen.
     */
    for (i = 0; i < node_cnt; i++)
        node_array[i].node_rank = NO_VAL;

    for (node = inv->f->node_head; node; node = node->next) {
        struct node_record *node_ptr;
        char tmp[50];

        /* This will ignore interactive nodes when iterating through
         * the apbasil inventory.  If we don't do this, SLURM is
         * unable to resolve the ID to a nidXXX name since it's not in
         * the slurm.conf file.  (Chris North)
         */
        if (node->role == BNR_INTER)
            continue;

        node_ptr = _find_node_by_basil_id(node->node_id);
        if (node_ptr == NULL) {
            error("nid%05u (%s node in state %s) not in slurm.conf",
                  node->node_id, nam_noderole[node->role],
                  nam_nodestate[node->state]);
            bad_node = 1;
        } else
            node_ptr->node_rank = inv->nodes_total - rank_count++;

        /* Record every inventory node so the complete ALPS view can
         * be reported if a mismatch is found */
        snprintf(tmp, sizeof(tmp), "nid%05u", node->node_id);
        hostlist_push(hl, tmp);
    }
    free_inv(inv);
    if (bad_node) {
        char *name;

        hostlist_sort(hl);
        name = hostlist_ranged_string_xmalloc(hl);
        info("It appears your slurm.conf nodelist doesn't "
             "match the alps system.  Here are the nodes alps knows "
             "about\n%s", name);
        /* The ranged string is allocated for the caller; release it */
        xfree(name);
    }
    hostlist_destroy(hl);

    return SLURM_SUCCESS;
}
예제 #20
0
/*
 * _dump_node - build the wiki description string for a node
 * IN node_ptr    - node to describe; NULL yields a NULL result
 * IN hl          - if set, the sorted, de-duplicated ranged form of this
 *                  host list is used as the name instead of node_ptr->name
 * IN update_time - controls how much is reported:
 *                  later than last_node_update: name, state, load, reason
 *                  other non-zero value: also class, arch, OS, features, gres
 *                  zero: also memory, disk and processor counts
 * RET xmalloc'ed string built with xstrcat()
 *
 * NOTE: hl is sorted and uniq'ed in place.
 */
static char *	_dump_node(struct node_record *node_ptr, hostlist_t hl,
			   time_t update_time)
{
	char field[16*1024], *out = NULL;
	char *ch;
	int inx;
	uint32_t cpu_cnt;

	if (node_ptr == NULL)
		return NULL;

	/* Record name: either the single node or the whole host list */
	if (hl == NULL) {
		snprintf(field, sizeof(field), "%s", node_ptr->name);
		xstrcat(out, field);
	} else {
		char *ranged;
		hostlist_sort(hl);
		hostlist_uniq(hl);
		ranged = hostlist_ranged_string_xmalloc(hl);
		xstrcat(out, ranged);
		xfree(ranged);
	}

	snprintf(field, sizeof(field), ":STATE=%s;",
		 _get_node_state(node_ptr));
	xstrcat(out, field);

	if (node_ptr->cpu_load != NO_VAL) {
		snprintf(field, sizeof(field), "CPULOAD=%f;",
			 (node_ptr->cpu_load / 100.0));
		xstrcat(out, field);
	}

	if (node_ptr->reason) {
		/* Moab is confused by quotes, so blank out both kinds in a
		 * private copy of the reason */
		char *clean = xstrdup(node_ptr->reason);
		for (ch = clean; *ch != '\0'; ch++) {
			if ((*ch == '\'') || (*ch == '\"'))
				*ch = ' ';
		}
		snprintf(field, sizeof(field), "CAT=\"%s\";", clean);
		xstrcat(out, field);
		xfree(clean);
	}

	if (update_time > last_node_update)
		return out;

	/* With fast_schedule set use the slurm.conf value,
	 * otherwise the value reported by slurmd */
	cpu_cnt = slurmctld_conf.fast_schedule ?
		  node_ptr->config_ptr->cpus : node_ptr->cpus;

	if (node_ptr->part_cnt > 0) {
		xstrcat(out, "CCLASS=");
		for (inx = 0; inx < node_ptr->part_cnt; inx++) {
			snprintf(field, sizeof(field), "[%s:%u]",
				 node_ptr->part_pptr[inx]->name,
				 cpu_cnt);
			xstrcat(out, field);
		}
		xstrcat(out, ";");
	}

	if (node_ptr->arch) {
		snprintf(field, sizeof(field), "ARCH=%s;", node_ptr->arch);
		xstrcat(out, field);
	}

	if (node_ptr->os) {
		snprintf(field, sizeof(field), "OS=%s;", node_ptr->os);
		xstrcat(out, field);
	}

	if (node_ptr->config_ptr && node_ptr->config_ptr->feature) {
		snprintf(field, sizeof(field), "FEATURE=%s;",
			 node_ptr->config_ptr->feature);
		/* comma separator to colon */
		for (ch = strchr(field, ','); ch != NULL;
		     ch = strchr(ch + 1, ','))
			*ch = ':';
		xstrcat(out, field);
	}

	if (node_ptr->config_ptr && node_ptr->config_ptr->gres) {
		snprintf(field, sizeof(field), "GRES=%s;",
			 node_ptr->config_ptr->gres);
		xstrcat(out, field);
	}

	if (update_time > 0)
		return out;

	if (!slurmctld_conf.fast_schedule) {
		/* config as reported by slurmd */
		snprintf(field, sizeof(field),
			 "CMEMORY=%u;CDISK=%u;CPROC=%u;",
			 node_ptr->real_memory,
			 node_ptr->tmp_disk,
			 node_ptr->cpus);
	} else {
		/* config from slurm.conf */
		snprintf(field, sizeof(field),
			 "CMEMORY=%u;CDISK=%u;CPROC=%u;",
			 node_ptr->config_ptr->real_memory,
			 node_ptr->config_ptr->tmp_disk,
			 node_ptr->config_ptr->cpus);
	}
	xstrcat(out, field);

	return out;
}
예제 #21
0
/*
 * Print every entry of a consolidated-output list to stream.
 *
 * The list is sorted with _pstdout_consolidated_data_compare while
 * whichconsolidatedmutex is held. For each entry the sorted, ranged host
 * string is written between two separator lines, followed by the entry's
 * output text.
 *
 * Returns 0 on success, -1 on failure. pstdout_errnum is set for lock,
 * iterator and host-string failures; an unlock failure is only reported.
 */
static int
_pstdout_output_consolidated(FILE *stream,
                             List whichconsolidatedlist,
                             pthread_mutex_t *whichconsolidatedmutex)
{
    struct pstdout_consolidated_data *entry;
    ListIterator iter = NULL;
    int have_lock = 0;
    int err;
    int status = -1;

    assert(stream);
    assert(stream == stdout || stream == stderr);
    assert(whichconsolidatedlist);
    assert(whichconsolidatedmutex);

    err = pthread_mutex_lock(whichconsolidatedmutex);
    if (err != 0)
    {
        if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
            fprintf(stderr, "pthread_mutex_lock: %s\n", strerror(err));
        pstdout_errnum = PSTDOUT_ERR_INTERNAL;
        goto cleanup;
    }
    have_lock = 1;

    list_sort(whichconsolidatedlist, _pstdout_consolidated_data_compare);

    iter = list_iterator_create(whichconsolidatedlist);
    if (iter == NULL)
    {
        pstdout_errnum = PSTDOUT_ERR_OUTMEM;
        goto cleanup;
    }

    for (entry = list_next(iter); entry != NULL; entry = list_next(iter))
    {
        char hosts[PSTDOUT_BUFLEN];

        memset(hosts, '\0', sizeof(hosts));
        hostlist_sort(entry->h);
        if (hostlist_ranged_string(entry->h, PSTDOUT_BUFLEN, hosts) < 0)
        {
            if (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD)
                fprintf(stderr, "hostlist_ranged_string: %s\n", strerror(errno));
            pstdout_errnum = PSTDOUT_ERR_INTERNAL;
            goto cleanup;
        }

        /* Host header framed by separators, then the captured output */
        fprintf(stream, "----------------\n");
        fprintf(stream, "%s\n", hosts);
        fprintf(stream, "----------------\n");
        fprintf(stream, "%s", entry->output);
    }

    status = 0;

cleanup:
    if (have_lock)
    {
        err = pthread_mutex_unlock(whichconsolidatedmutex);
        /* An unlock failure is reported but does not change the result */
        if (err != 0 && (pstdout_debug_flags & PSTDOUT_DEBUG_STANDARD))
            fprintf(stderr, "pthread_mutex_unlock: %s\n", strerror(err));
    }
    if (iter != NULL)
        list_iterator_destroy(iter);
    return status;
}