Example #1
/*
 * build_all_frontend_info - get an array of slurm_conf_frontend_t structures
 *	from the slurm.conf reader, build table, and set values
 * is_slurmd_context: set to true if run from slurmd
 * RET 0 if no error, error code otherwise
 */
extern int build_all_frontend_info (bool is_slurmd_context)
{
	slurm_conf_frontend_t **ptr_array;
#ifdef HAVE_FRONT_END
	slurm_conf_frontend_t *fe_single, *fe_line;
	int i, count, max_rc = SLURM_SUCCESS;
	bool front_end_debug;

	if (slurm_get_debug_flags() & DEBUG_FLAG_FRONT_END)
		front_end_debug = true;
	else
		front_end_debug = false;
	count = slurm_conf_frontend_array(&ptr_array);
	if (count == 0)
		fatal("No FrontendName information available!");

	for (i = 0; i < count; i++) {
		hostlist_t hl_name, hl_addr;
		char *fe_name, *fe_addr;

		fe_line = ptr_array[i];
		hl_name = hostlist_create(fe_line->frontends);
		if (hl_name == NULL)
			fatal("Invalid FrontendName:%s", fe_line->frontends);
		hl_addr = hostlist_create(fe_line->addresses);
		if (hl_addr == NULL)
			fatal("Invalid FrontendAddr:%s", fe_line->addresses);
		if (hostlist_count(hl_name) != hostlist_count(hl_addr)) {
			fatal("Inconsistent node count between "
			      "FrontendName(%s) and FrontendAddr(%s)",
			      fe_line->frontends, fe_line->addresses);
		}
		while ((fe_name = hostlist_shift(hl_name))) {
			fe_addr = hostlist_shift(hl_addr);
			fe_single = xmalloc(sizeof(slurm_conf_frontend_t));
			if (list_append(front_end_list, fe_single) == NULL)
				fatal("list_append: malloc failure");
			fe_single->frontends = xstrdup(fe_name);
			fe_single->addresses = xstrdup(fe_addr);
			free(fe_name);
			free(fe_addr);
			fe_single->port = fe_line->port;
			if (fe_line->reason && fe_line->reason[0])
				fe_single->reason = xstrdup(fe_line->reason);
			fe_single->node_state = fe_line->node_state;
			if (front_end_debug && !is_slurmd_context)
				_dump_front_end(fe_single);
		}
		hostlist_destroy(hl_addr);
		hostlist_destroy(hl_name);
	}
	return max_rc;
#else
	if (slurm_conf_frontend_array(&ptr_array) != 0)
		fatal("FrontendName information configured!");
	return SLURM_SUCCESS;
#endif
}
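
All of these examples share one ownership contract: hostlist_create() parses a bracketed range expression, hostlist_shift() pops the first host as a malloc'd string that the caller must release with free() (not xfree()), and hostlist_destroy() frees the container. A minimal standalone sketch of the pair-wise expansion above, with illustrative host names:

/* Illustrative sketch, not from the Slurm tree: pair two parallel
 * hostlists the way build_all_frontend_info() pairs names with
 * addresses. Assumes src/common/hostlist.h. */
static void _expand_pairwise(const char *names, const char *addrs)
{
	hostlist_t hl_name = hostlist_create(names);
	hostlist_t hl_addr = hostlist_create(addrs);
	char *name, *addr;

	if (!hl_name || !hl_addr ||
	    (hostlist_count(hl_name) != hostlist_count(hl_addr)))
		goto done;	/* mirrors the fatal() checks above */

	while ((name = hostlist_shift(hl_name))) {
		addr = hostlist_shift(hl_addr);
		printf("%s -> %s\n", name, addr);
		free(name);	/* hostlist_shift() memory: free() */
		free(addr);	/* ... never xfree() */
	}
done:
	if (hl_name)
		hostlist_destroy(hl_name);
	if (hl_addr)
		hostlist_destroy(hl_addr);
}

Called as _expand_pairwise("tux[0-2]", "etux[0-2]"), this prints the three name/address pairs in order.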
Example #2
static int _sort_step_by_node_list(void *void1, void *void2)
{
	int diff = 0;
	job_step_info_t *step1;
	job_step_info_t *step2;

	hostlist_t hostlist1, hostlist2;
	char *val1, *val2;
#if	PURE_ALPHA_SORT == 0
	int inx;
#endif

	_get_step_info_from_void(&step1, &step2, void1, void2);

	hostlist1 = hostlist_create(step1->nodes);
	hostlist_sort(hostlist1);
	val1 = hostlist_shift(hostlist1);
	if (val1 == NULL)
		val1 = "";
	hostlist_destroy(hostlist1);

	hostlist2 = hostlist_create(step2->nodes);
	hostlist_sort(hostlist2);
	val2 = hostlist_shift(hostlist2);
	if (val2 == NULL)
		val2 = "";
	hostlist_destroy(hostlist2);

#if	PURE_ALPHA_SORT
	diff = strcmp(val1, val2);
#else
	for (inx=0; ; inx++) {
		if (val1[inx] == val2[inx]) {
			if (val1[inx] == '\0')
				break;
			continue;
		}
		if ((isdigit((int)val1[inx])) &&
		    (isdigit((int)val2[inx]))) {
			int num1, num2;
			num1 = atoi(val1+inx);
			num2 = atoi(val2+inx);
			diff = num1 - num2;
		} else
			diff = strcmp(val1, val2);
		break;
	}
#endif
	if (strlen(val1))
		free(val1);
	if (strlen(val2))
		free(val2);

	if (reverse_order)
		diff = -diff;
	return diff;
}
Example #3
static int _sort_by_node_list(void *void1, void *void2)
{
	int diff = 0;
	sinfo_data_t *sinfo1;
	sinfo_data_t *sinfo2;
	char *val1, *val2;
#if	PURE_ALPHA_SORT == 0
	int inx;
#endif

	_get_sinfo_from_void(&sinfo1, &sinfo2, void1, void2);

	val1 = hostlist_shift(sinfo1->nodes);
	if (val1) {
		hostlist_push_host(sinfo1->nodes, val1);
		hostlist_sort(sinfo1->nodes);
	} else
		val1 = "";

	val2 = hostlist_shift(sinfo2->nodes);
	if (val2) {
		hostlist_push_host(sinfo2->nodes, val2);
		hostlist_sort(sinfo2->nodes);
	} else
		val2 = "";

#if	PURE_ALPHA_SORT
	diff = xstrcmp(val1, val2);
#else
	for (inx=0; ; inx++) {
		if (val1[inx] == val2[inx]) {
			if (val1[inx] == '\0')
				break;
			continue;
		}
		if ((isdigit((int)val1[inx])) &&
		    (isdigit((int)val2[inx]))) {
			int num1, num2;
			num1 = atoi(val1+inx);
			num2 = atoi(val2+inx);
			diff = num1 - num2;
		} else
			diff = xstrcmp(val1, val2);
		break;
	}
#endif
	if (strlen(val1))
		free(val1);
	if (strlen(val2))
		free(val2);

	if (reverse_order)
		diff = -diff;

	return diff;
}
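
The #else branch in both sort functions above implements a numeric-aware comparison: names compare character by character until the first difference, and if both sides hold a digit at that position the embedded numbers are compared as integers, so "tux9" sorts before "tux10". The rule in isolation (an illustrative standalone helper, not part of the source):

static int _hostname_cmp(const char *val1, const char *val2)
{
	int inx;

	for (inx = 0; ; inx++) {
		if (val1[inx] == val2[inx]) {
			if (val1[inx] == '\0')
				return 0;	/* identical names */
			continue;
		}
		if (isdigit((int)val1[inx]) && isdigit((int)val2[inx]))
			return atoi(val1 + inx) - atoi(val2 + inx);
		return strcmp(val1, val2);	/* alphabetic fallback */
	}
}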
Example #4
extern uint64_t RRD_consolidate(time_t step_starttime, time_t step_endtime,
				bitstr_t* bitmap_of_nodes)
{
	uint64_t consumed_energy = 0;
	uint64_t tmp;
	char *node_name = NULL;
	hostlist_t hl;
	char* path;

	node_name = bitmap2node_name(bitmap_of_nodes);
	hl = hostlist_create(node_name);
	xfree(node_name);
	while ((node_name = hostlist_shift(hl))) {
		if (!(path = _get_node_rrd_path(node_name,
						EXT_SENSORS_VALUE_ENERGY)))
			consumed_energy = (uint64_t)NO_VAL;
		free(node_name);
		if ((tmp = _rrd_consolidate_one(
			     step_starttime, step_endtime, path,
			     ext_sensors_cnf->energy_rra_name, true)) == NO_VAL)
			consumed_energy = (uint64_t)NO_VAL;
		xfree(path);
		if (consumed_energy == (uint64_t)NO_VAL)
			break;
		consumed_energy += tmp;
	}
	hostlist_destroy(hl);

	return consumed_energy;
}
Example #5
/* Builds the job step's resv_port_array based upon resv_ports (a string) */
static void _rebuild_port_array(struct step_record *step_ptr)
{
	int i;
	char *tmp_char;
	hostlist_t hl;

	i = strlen(step_ptr->resv_ports);
	tmp_char = xmalloc(i+3);
	sprintf(tmp_char, "[%s]", step_ptr->resv_ports);
	hl = hostlist_create(tmp_char);
	if (!hl)
		fatal("Invalid reserved ports: %s", step_ptr->resv_ports);
	xfree(tmp_char);

	step_ptr->resv_port_array = xmalloc(sizeof(int) *
					    step_ptr->resv_port_cnt);
	step_ptr->resv_port_cnt = 0;
	while ((tmp_char = hostlist_shift(hl))) {
		i = atoi(tmp_char);
		if (i > 0)
			step_ptr->resv_port_array[step_ptr->resv_port_cnt++]=i;
		free(tmp_char);
	}
	hostlist_destroy(hl);
	if (step_ptr->resv_port_cnt == 0) {
		error("Problem recovering resv_port_array for step %u.%u: %s",
		      step_ptr->job_ptr->job_id, step_ptr->step_id,
		      step_ptr->resv_ports);
		xfree(step_ptr->resv_ports);
	}
}
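
The sprintf of "[%s]" above is the key step: wrapping a pure numeric range in brackets lets hostlist_create() expand it even though there is no hostname prefix. The same trick in isolation (a hedged sketch; the helper name and port values are illustrative):

static int _expand_ports(const char *resv_ports, int *ports, int max_ports)
{
	char *spec = NULL, *tok;
	hostlist_t hl;
	int cnt = 0;

	xstrfmtcat(spec, "[%s]", resv_ports);	/* e.g. "[12000-12002]" */
	hl = hostlist_create(spec);
	xfree(spec);
	if (!hl)
		return -1;
	while ((cnt < max_ports) && (tok = hostlist_shift(hl))) {
		ports[cnt++] = atoi(tok);	/* 12000, 12001, 12002 */
		free(tok);
	}
	hostlist_destroy(hl);
	return cnt;
}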
Example #6
static int _change_state_bps(char *com, int state)
{
	char *host;
	int i = 0;
	uint16_t pos[params.cluster_dims];
	char letter = '.';
	bool used = false;
	char *c_state = "up";
	hostlist_t hl = NULL;
	int rc = 1;

	if (state == NODE_STATE_DOWN) {
		letter = '#';
		used = true;
		c_state = "down";
	}

	while (com[i] && (com[i] != '[') &&
	       ((com[i] < '0') || (com[i] > '9')) &&
	       ((com[i] < 'A') || (com[i] > 'Z')))
		i++;
	if (com[i] == '\0') {
		memset(error_string, 0, 255);
		sprintf(error_string,
			"You didn't specify any nodes to make %s. "
			"in statement '%s'",
			c_state, com);
		return 0;
	}

	if (!(hl = hostlist_create(com+i))) {
		memset(error_string, 0, 255);
		sprintf(error_string, "Bad hostlist given '%s'", com+i);
		return 0;

	}

	while ((host = hostlist_shift(hl))) {
		ba_mp_t *ba_mp;
		smap_node_t *smap_node;

		for (i = 0; i < params.cluster_dims; i++)
			pos[i] = select_char2coord(host[i]);
		if (!(ba_mp = bg_configure_coord2ba_mp(pos))) {
			memset(error_string, 0, 255);
			sprintf(error_string, "Bad host given '%s'", host);
			rc = 0;
			break;
		}
		bg_configure_ba_update_mp_state(ba_mp, state);
		smap_node = smap_system_ptr->grid[ba_mp->index];
		smap_node->color = 0;
		smap_node->letter = letter;
		smap_node->used = used;
		free(host);
	}
	hostlist_destroy(hl);

	return rc;
}
Example #7
/*
 * scontrol_print_node_list - print information about the supplied node list
 *	(or regular expression)
 * IN node_list - print information about the supplied node list
 *	(or regular expression)
 */
extern void
scontrol_print_node_list (char *node_list)
{
	node_info_msg_t *node_info_ptr = NULL;
	hostlist_t host_list;
	int error_code;
	uint16_t show_flags = 0;
	char *this_node_name;

	if (all_flag)
		show_flags |= SHOW_ALL;
	if (detail_flag)
		show_flags |= SHOW_DETAIL;

	error_code = scontrol_load_nodes(&node_info_ptr, show_flags);
	if (error_code) {
		exit_code = 1;
		if (quiet_flag != 1)
			slurm_perror ("slurm_load_node error");
		return;
	}

	if (quiet_flag == -1) {
		char time_str[32];
		slurm_make_time_str ((time_t *)&node_info_ptr->last_update,
			             time_str, sizeof(time_str));
		printf ("last_update_time=%s, records=%d\n",
			time_str, node_info_ptr->record_count);
	}

	if (node_list == NULL) {
		scontrol_print_node (NULL, node_info_ptr);
	} else {
		if ((host_list = hostlist_create (node_list))) {
			while ((this_node_name = hostlist_shift (host_list))) {
				scontrol_print_node(this_node_name,
						    node_info_ptr);
				free(this_node_name);
			}

			hostlist_destroy(host_list);
		} else {
			exit_code = 1;
			if (quiet_flag != 1) {
				if (errno == EINVAL) {
					fprintf(stderr,
					        "unable to parse node list %s\n",
					        node_list);
				 } else if (errno == ERANGE) {
					fprintf(stderr,
					        "too many nodes in supplied range %s\n",
					        node_list);
				} else
					perror("error parsing node list");
			}
		}
	}
	return;
}
Example #8
int p_mpi_hook_slurmstepd_task (const mpi_plugin_client_info_t *job,
				char ***env)
{
	char *nodelist, *task_cnt;

	nodelist = getenvp(*env, "SLURM_NODELIST");
	if (nodelist) {
		char *host_str = NULL, *tmp;
		hostlist_t hl = hostlist_create(nodelist);
		while ((tmp = hostlist_shift(hl))) {
			if (host_str)
				xstrcat(host_str, ",");
			xstrcat(host_str, tmp);
			free(tmp);
		}
		hostlist_destroy(hl);
		env_array_overwrite_fmt(env, "SLURM_MPICH_NODELIST", "%s",
			host_str);
		xfree(host_str);
	}

	task_cnt = getenvp(*env, "SLURM_TASKS_PER_NODE");
	if (task_cnt) {
		char *task_str = NULL, tmp_str[32];
		int i=0, val, reps;
		while (task_cnt[i]) {
			if ((task_cnt[i] >= '0') && (task_cnt[i] <= '9'))
				val = atoi(&task_cnt[i]);
			else
				break;	/* bad parse */
			i++;
			while (task_cnt[i]
			&&     (task_cnt[i] != 'x') && (task_cnt[i] != ','))
				i++;
			if (task_cnt[i] == 'x') {
				i++;
				reps = atoi(&task_cnt[i]);
				while (task_cnt[i] && (task_cnt[i] != ','))
					i++;
			} else
				reps = 1;
			if (task_cnt[i] == ',')
				i++;
			while (reps) {
				if (task_str)
					xstrcat(task_str, ",");
				snprintf(tmp_str, sizeof(tmp_str), "%d", val);
				xstrcat(task_str, tmp_str);
				reps--;
			}
		}
		env_array_overwrite_fmt(env, "SLURM_MPICH_TASKS", "%s",
			task_str);
		xfree(task_str);
	}

	return SLURM_SUCCESS;
}
Example #9
/*
 * Convert all GPU records to new entries in a list where each File is a
 * unique device (i.e. convert a record with "File=nvidia[0-3]" into 4 separate
 * records).
 */
static List _build_gpu_list(List gres_list)
{
	ListIterator itr;
	gres_slurmd_conf_t *gres_record, *gpu_record;
	List gpu_list;
	hostlist_t hl;
	char *f_name;
	bool log_fname = true;

	if (gres_list == NULL)
		return NULL;

	gpu_list = list_create(_delete_gres_list);
	itr = list_iterator_create(gres_list);
	while ((gres_record = list_next(itr))) {
		if (xstrcmp(gres_record->name, "gpu"))
			continue;
		if (!gres_record->file) {
			if (log_fname) {
				error("%s: GPU configuration lacks \"File\" specification",
				      plugin_name);
				log_fname = false;
			}
			continue;
		}
		hl = hostlist_create(gres_record->file);
		while ((f_name = hostlist_shift(hl))) {
			gpu_record = xmalloc(sizeof(gres_slurmd_conf_t));
			gpu_record->config_flags = gres_record->config_flags;
			if (gres_record->type_name) {
				gpu_record->config_flags |=
					GRES_CONF_HAS_TYPE;
			}
			gpu_record->count = 1;
			gpu_record->cpu_cnt = gres_record->cpu_cnt;
			gpu_record->cpus = xstrdup(gres_record->cpus);
			if (gres_record->cpus_bitmap) {
				gpu_record->cpus_bitmap =
					bit_copy(gres_record->cpus_bitmap);
			}
			gpu_record->file = xstrdup(f_name);
			gpu_record->links = xstrdup(gres_record->links);
			gpu_record->name = xstrdup(gres_record->name);
			gpu_record->plugin_id = gres_record->plugin_id;
			gpu_record->type_name = xstrdup(gres_record->type_name);
			list_append(gpu_list, gpu_record);
			free(f_name);
		}
		hostlist_destroy(hl);
		(void) list_delete_item(itr);
	}
	list_iterator_destroy(itr);

	return gpu_list;
}
Example #10
/*
 * route_split_hostlist_treewidth - logic to split an input hostlist into
 *                                  a set of hostlists to forward to.
 *
 * This is the default behavior. It is implemented here as there are cases
 * where the topology version also needs to split the message list based
 * on TreeWidth.
 *
 * IN: hl        - hostlist_t   - list of every node to send message to
 *                                will be empty on return which is same behavior
 *                                as similar code replaced in forward.c
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: the created hostlists must be freed independently by the caller
 *       using hostlist_destroy().
 * Note: the hostlist_t array itself must be released with xfree().
 */
extern int route_split_hostlist_treewidth(hostlist_t hl,
					  hostlist_t** sp_hl,
					  int* count)
{
	int host_count;
	int *span = NULL;
	char *name = NULL;
	char *buf;
	int nhl = 0;
	int j;

	host_count = hostlist_count(hl);
	span = set_span(host_count, tree_width);
	*sp_hl = (hostlist_t*) xmalloc(tree_width * sizeof(hostlist_t));

	while ((name = hostlist_shift(hl))) {
		(*sp_hl)[nhl] = hostlist_create(name);
		free(name);
		for (j = 0; j < span[nhl]; j++) {
			name = hostlist_shift(hl);
			if (!name) {
				break;
			}
			hostlist_push_host((*sp_hl)[nhl], name);
			free(name);
		}
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[nhl]);
			debug("ROUTE: ... sublist[%d] %s", nhl, buf);
			xfree(buf);
		}
		nhl++;
	}
	xfree(span);
	*count = nhl;

	return SLURM_SUCCESS;
}
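
A usage sketch for the splitter (host names are illustrative, and tree_width is assumed to be the globally configured fanout used above):

static void _demo_split(void)
{
	hostlist_t hl = hostlist_create("tux[0-8]");
	hostlist_t *sp_hl = NULL;
	char buf[256];
	int count = 0, i;

	if (route_split_hostlist_treewidth(hl, &sp_hl, &count) ==
	    SLURM_SUCCESS) {
		for (i = 0; i < count; i++) {
			hostlist_ranged_string(sp_hl[i], sizeof(buf), buf);
			info("sublist[%d] = %s", i, buf);
			hostlist_destroy(sp_hl[i]);	/* see Note above */
		}
		xfree(sp_hl);	/* the array itself is xmalloc'd */
	}
	hostlist_destroy(hl);	/* hl is empty on return */
}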
Example #11
/*
 * Sketch of a tree-launch scheduler: the first shifted host becomes the
 * root launcher, and every later host is attached beneath the launcher
 * with the least outstanding work. buf, out_dot, add_launch_dot() and
 * add_launch_std() are helpers defined elsewhere in this file.
 */
void start_msg_tree(hostlist_t hl, int tree_width, int depth)
{
	char *name = NULL;
	char *launcher_names[1024];
	int launcher_work[1024];
	int launcher_cnt = 0;

	name = hostlist_shift(hl);
	launcher_names[0] = name;
	launcher_work[0] = 0;
	launcher_cnt++;
	while (hostlist_count(hl) > 0) {
		int assign = 0, i;
		/* pick the launcher with the least outstanding work */
		for (i = 1; i < launcher_cnt; i++) {
			if (launcher_work[assign] - (i - assign) >
			    launcher_work[i])
				assign = i;
		}
		for (i = 0; i < tree_width; i++) {
			if (!(name = hostlist_shift(hl)))
				break;
			launcher_names[launcher_cnt] = name;
			launcher_work[launcher_cnt++] = 0;
			if (out_dot)
				add_launch_dot(buf, launcher_names[assign],
					       name);
			else
				add_launch_std(buf, launcher_names[assign],
					       name);
			launcher_work[assign]++;
		}
	}
}
Example #12
static void _start_msg_tree_internal(hostlist_t hl, hostlist_t* sp_hl,
				     fwd_tree_t *fwd_tree_in,
				     int hl_count)
{
	int j;
	fwd_tree_t *fwd_tree;

	xassert((hl || sp_hl) && !(hl && sp_hl));
	xassert(fwd_tree_in);
	xassert(fwd_tree_in->p_thr_count);
	xassert(fwd_tree_in->tree_mutex);
	xassert(fwd_tree_in->notify);
	xassert(fwd_tree_in->ret_list);

	if (hl)
		xassert(hl_count == hostlist_count(hl));

	if (fwd_tree_in->timeout <= 0)
		/* convert secs to msec */
		fwd_tree_in->timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		memcpy(fwd_tree, fwd_tree_in, sizeof(fwd_tree_t));

		if (sp_hl) {
			fwd_tree->tree_hl = sp_hl[j];
			sp_hl[j] = NULL;
		} else if (hl) {
			char *name = hostlist_shift(hl);
			fwd_tree->tree_hl = hostlist_create(name);
			free(name);
		}

		/*
		 * Lock and increase thread counter, we need that to protect
		 * the start_msg_tree waiting loop that was originally designed
		 * around a "while ((count < host_count))" loop. In case where a
		 * fwd thread was not able to get all the return codes from
		 * children, the waiting loop was deadlocked.
		 */
		slurm_mutex_lock(fwd_tree->tree_mutex);
		(*fwd_tree->p_thr_count)++;
		slurm_mutex_unlock(fwd_tree->tree_mutex);

		slurm_thread_create_detached(NULL, _fwd_tree_thread, fwd_tree);
	}
}
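
The locked counter above implies a rendezvous on the waiting side. A hedged sketch of what that waiter looks like, assuming each spawned thread decrements the counter and signals notify when it finishes (this helper is an illustration, not copied from forward.c):

static void _wait_for_tree(pthread_mutex_t *tree_mutex,
			   pthread_cond_t *notify, int *p_thr_count)
{
	slurm_mutex_lock(tree_mutex);
	while (*p_thr_count > 0)
		slurm_cond_wait(notify, tree_mutex);
	slurm_mutex_unlock(tree_mutex);
}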
Example #13
static void _forward_msg_internal(hostlist_t hl, hostlist_t* sp_hl,
				  forward_struct_t *fwd_struct,
				  header_t *header, int timeout,
				  int hl_count)
{
	int j;
	forward_msg_t *fwd_msg = NULL;
	char *buf = NULL, *tmp_char = NULL;

	if (timeout <= 0)
		/* convert secs to msec */
		timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		fwd_msg = xmalloc(sizeof(forward_msg_t));

		fwd_msg->fwd_struct = fwd_struct;

		fwd_msg->timeout = timeout;

		memcpy(&fwd_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		fwd_msg->header.version = header->version;
		fwd_msg->header.flags = header->flags;
		fwd_msg->header.msg_type = header->msg_type;
		fwd_msg->header.body_length = header->body_length;
		fwd_msg->header.ret_list = NULL;
		fwd_msg->header.ret_cnt = 0;

		if (sp_hl) {
			buf = hostlist_ranged_string_xmalloc(sp_hl[j]);
			hostlist_destroy(sp_hl[j]);
		} else {
			tmp_char = hostlist_shift(hl);
			buf = xstrdup(tmp_char);
			free(tmp_char);
		}

		forward_init(&fwd_msg->header.forward, NULL);
		fwd_msg->header.forward.nodelist = buf;
		slurm_thread_create_detached(NULL, _forward_thread, fwd_msg);
	}
}
Example #14
/*
 * _node_name2bitmap - given a node name regular expression, build a bitmap
 *	representation, any invalid hostnames are added to a hostlist
 * IN node_names  - set of node names
 * OUT bitmap     - set to bitmap, may not have all bits set on error
 * IN/OUT invalid_hostlist - hostlist of invalid host names, initialize to NULL
 * RET 0 if no error, otherwise EINVAL
 * NOTE: call FREE_NULL_BITMAP(bitmap) and hostlist_destroy(invalid_hostlist)
 *       to free memory when variables are no longer required
 */
static int _node_name2bitmap(char *node_names, bitstr_t **bitmap, 
			     hostlist_t *invalid_hostlist)
{
	char *this_node_name;
	bitstr_t *my_bitmap;
	hostlist_t host_list;

	my_bitmap = (bitstr_t *) bit_alloc(node_record_count);
	*bitmap = my_bitmap;

	if (node_names == NULL) {
		error("_node_name2bitmap: node_names is NULL");
		return EINVAL;
	}

	if ( (host_list = hostlist_create(node_names)) == NULL) {
		/* likely a badly formatted hostlist */
		error("_node_name2bitmap: hostlist_create(%s) error", 
		      node_names);
		return EINVAL;
	}

	while ( (this_node_name = hostlist_shift(host_list)) ) {
		struct node_record *node_ptr;
		node_ptr = find_node_record(this_node_name);
		if (node_ptr) {
			bit_set(my_bitmap, 
				(bitoff_t) (node_ptr - node_record_table_ptr));
		} else {
			debug2("_node_name2bitmap: invalid node specified %s",
			       this_node_name);
			if (*invalid_hostlist) {
				hostlist_push_host(*invalid_hostlist,
						   this_node_name);
			} else {
				*invalid_hostlist = 
					hostlist_create(this_node_name);
			}
		}
		free (this_node_name);
	}
	hostlist_destroy(host_list);

	return SLURM_SUCCESS;
}
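
A usage sketch for the helper above, following its NOTE on memory management (node names are illustrative):

static void _demo_name2bitmap(void)
{
	bitstr_t *bitmap = NULL;
	hostlist_t invalid_hostlist = NULL;	/* must start out NULL */

	if (_node_name2bitmap("tux[0-9]", &bitmap, &invalid_hostlist))
		error("badly formed node name expression");
	if (invalid_hostlist) {
		char buf[256];
		hostlist_ranged_string(invalid_hostlist, sizeof(buf), buf);
		error("unknown nodes: %s", buf);
		hostlist_destroy(invalid_hostlist);
	}
	FREE_NULL_BITMAP(bitmap);
}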
Example #15
/*
 * node_name2bitmap - given a node name regular expression, build a bitmap
 *	representation
 * IN node_names  - list of nodes
 * IN best_effort - if set don't return an error on invalid node name entries
 * OUT bitmap     - set to bitmap, may not have all bits set on error
 * RET 0 if no error, otherwise EINVAL
 * NOTE: call FREE_NULL_BITMAP() to free bitmap memory when no longer required
 */
extern int node_name2bitmap (char *node_names, bool best_effort,
			     bitstr_t **bitmap)
{
	int rc = SLURM_SUCCESS;
	char *this_node_name;
	bitstr_t *my_bitmap;
	hostlist_t host_list;

	my_bitmap = (bitstr_t *) bit_alloc (node_record_count);
	if (my_bitmap == NULL)
		fatal("bit_alloc malloc failure");
	*bitmap = my_bitmap;

	if (node_names == NULL) {
		info("node_name2bitmap: node_names is NULL");
		return rc;
	}

	if ( (host_list = hostlist_create (node_names)) == NULL) {
		/* likely a badly formatted hostlist */
		error ("hostlist_create on %s error:", node_names);
		if (!best_effort)
			rc = EINVAL;
		return rc;
	}

	while ( (this_node_name = hostlist_shift (host_list)) ) {
		struct node_record *node_ptr;
		node_ptr = find_node_record (this_node_name);
		if (node_ptr) {
			bit_set (my_bitmap, (bitoff_t) (node_ptr -
							node_record_table_ptr));
		} else {
			error ("node_name2bitmap: invalid node specified %s",
			       this_node_name);
			if (!best_effort)
				rc = EINVAL;
		}
		free (this_node_name);
	}
	hostlist_destroy (host_list);

	return rc;
}
Example #16
File: xcpu.c Project: IFCA/slurm
/* Identify every XCPU process in a specific node and signal it.
 * Return the process count */
extern int xcpu_signal(int sig, char *nodes)
{
	int procs = 0;
	hostlist_t hl;
	char *node, sig_msg[64], dir_path[128], ctl_path[200];
	DIR *dir;
	struct dirent *sub_dir;

	/* Translate "nodes" to a hostlist */
	hl = hostlist_create(nodes);
	if (hl == NULL) {
		error("hostlist_create: %m");
		return 0;
	}

	/* Plan 9 only takes strings, so we map number to name */
	snprintf(sig_msg, sizeof(sig_msg), "signal %s",
		_sig_name(sig));

	/* For each node, look for processes */
	while ((node = hostlist_shift(hl))) {
		snprintf(dir_path, sizeof(dir_path), 
			"%s/%s/xcpu",
			XCPU_DIR, node);
		free(node);
		if ((dir = opendir(dir_path)) == NULL) {
			error("opendir(%s): %m", dir_path);
			continue;
		}
		while ((sub_dir = readdir(dir))) {
			snprintf(ctl_path, sizeof(ctl_path),
				"%s/%s/ctl",dir_path, 
				sub_dir->d_name);
			procs += _send_sig(ctl_path, sig, sig_msg);
		}
		closedir(dir);
	}

	hostlist_destroy(hl);
	return procs;
}
Example #17
/*
 * scontrol_print_hosts - given a node list expression, return
 *	a list of nodes, one per line
 */
extern void
scontrol_print_hosts (char * node_list)
{
	hostlist_t hl;
	char *host;

	if (!node_list) {
		error("host list is empty");
		return;
	}
	hl = hostlist_create(node_list);
	if (!hl) {
		fprintf(stderr, "Invalid hostlist: %s\n", node_list);
		return;
	}
	while ((host = hostlist_shift(hl))) {
		printf("%s\n", host);
		free(host);
	}
	hostlist_destroy(hl);
}
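
An illustrative call (the driver is assumed, not part of scontrol): expanding a bracketed expression prints one host per line.

int main(void)
{
	/* prints tux1, tux2, tux3 and tux5, one per line */
	scontrol_print_hosts("tux[1-3,5]");
	return 0;
}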
Example #18
static void _start_msg_tree_internal(hostlist_t hl, hostlist_t* sp_hl,
				     fwd_tree_t *fwd_tree_in,
				     int hl_count)
{
	int j;
	fwd_tree_t *fwd_tree;

	xassert((hl || sp_hl) && !(hl && sp_hl));
	xassert(fwd_tree_in);
	xassert(fwd_tree_in->p_thr_count);
	xassert(fwd_tree_in->tree_mutex);
	xassert(fwd_tree_in->notify);
	xassert(fwd_tree_in->ret_list);

	if (hl)
		xassert(hl_count == hostlist_count(hl));

	if (fwd_tree_in->timeout <= 0)
		/* convert secs to msec */
		fwd_tree_in->timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		pthread_attr_t attr_agent;
		pthread_t thread_agent;
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_tree = xmalloc(sizeof(fwd_tree_t));
		memcpy(fwd_tree, fwd_tree_in, sizeof(fwd_tree_t));

		if (sp_hl) {
			fwd_tree->tree_hl = sp_hl[j];
			sp_hl[j] = NULL;
		} else if (hl) {
			char *name = hostlist_shift(hl);
			fwd_tree->tree_hl = hostlist_create(name);
			free(name);
		}

		/*
		 * Lock and increase thread counter, we need that to protect
		 * the start_msg_tree waiting loop that was originally designed
		 * around a "while ((count < host_count))" loop. In case where a
		 * fwd thread was not able to get all the return codes from
		 * children, the waiting loop was deadlocked.
		 */
		slurm_mutex_lock(fwd_tree->tree_mutex);
		(*fwd_tree->p_thr_count)++;
		slurm_mutex_unlock(fwd_tree->tree_mutex);

		while (pthread_create(&thread_agent, &attr_agent,
				      _fwd_tree_thread, (void *)fwd_tree)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(100000);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);

	}
}
Example #19
void *_fwd_tree_thread(void *arg)
{
	fwd_tree_t *fwd_tree = (fwd_tree_t *)arg;
	List ret_list = NULL;
	char *name = NULL;
	char *buf = NULL;
	slurm_msg_t send_msg;

	slurm_msg_t_init(&send_msg);
	send_msg.msg_type = fwd_tree->orig_msg->msg_type;
	send_msg.data = fwd_tree->orig_msg->data;
	send_msg.protocol_version = fwd_tree->orig_msg->protocol_version;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(fwd_tree->tree_hl))) {
		if (slurm_conf_get_addr(name, &send_msg.address)
		    == SLURM_ERROR) {
			error("fwd_tree_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(fwd_tree->tree_mutex);
			mark_as_failed_forward(&fwd_tree->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			free(name);

			continue;
		}

		send_msg.forward.timeout = fwd_tree->timeout;
		if ((send_msg.forward.cnt = hostlist_count(fwd_tree->tree_hl))){
			buf = hostlist_ranged_string_xmalloc(
					fwd_tree->tree_hl);
			send_msg.forward.nodelist = buf;
		} else
			send_msg.forward.nodelist = NULL;

		if (send_msg.forward.nodelist && send_msg.forward.nodelist[0]) {
			debug3("Tree sending to %s along with %s",
			       name, send_msg.forward.nodelist);
		} else
			debug3("Tree sending to %s", name);

		ret_list = slurm_send_addr_recv_msgs(&send_msg, name,
						     fwd_tree->timeout);

		xfree(send_msg.forward.nodelist);

		if (ret_list) {
			int ret_cnt = list_count(ret_list);
			/* This is most common if a slurmd is running
			   an older version of Slurm than the
			   originator of the message.
			*/
			if ((ret_cnt <= send_msg.forward.cnt) &&
			    (errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR)) {
				error("fwd_tree_thread: %s failed to forward "
				      "the message, expecting %d ret got only "
				      "%d",
				      name, send_msg.forward.cnt + 1, ret_cnt);
				if (ret_cnt > 1) { /* not likely */
					ret_data_info_t *ret_data_info = NULL;
					ListIterator itr =
						list_iterator_create(ret_list);
					while ((ret_data_info =
						list_next(itr))) {
						if (xstrcmp(ret_data_info->
							    node_name, name))
							hostlist_delete_host(
								fwd_tree->
								tree_hl,
								ret_data_info->
								node_name);
					}
					list_iterator_destroy(itr);
				}
			}

			slurm_mutex_lock(fwd_tree->tree_mutex);
			list_transfer(fwd_tree->ret_list, ret_list);
			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			FREE_NULL_LIST(ret_list);
			/* try next node */
			if (ret_cnt <= send_msg.forward.cnt) {
				free(name);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_start_msg_tree_internal(
					fwd_tree->tree_hl, NULL,
					fwd_tree,
					hostlist_count(fwd_tree->tree_hl));
				continue;
			}
		} else {
			/* This should never happen (when this was
			 * written slurm_send_addr_recv_msgs always
			 * returned a list). */
			error("fwd_tree_thread: no return list given from "
			      "slurm_send_addr_recv_msgs spawned for %s",
			      name);
			slurm_mutex_lock(fwd_tree->tree_mutex);
			mark_as_failed_forward(
				&fwd_tree->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
 			slurm_cond_signal(fwd_tree->notify);
			slurm_mutex_unlock(fwd_tree->tree_mutex);
			free(name);

			continue;
		}

		free(name);

		/* check for error and try again */
		if (errno == SLURM_COMMUNICATIONS_CONNECTION_ERROR)
 			continue;

		break;
	}

	_destroy_tree_fwd(fwd_tree);

	return NULL;
}
Example #20
/*
 * Convert Moab supplied TASKLIST expression into a SLURM hostlist expression
 *
 * Moab format 1: tux0:tux0:tux1:tux1:tux2   (list host for each cpu)
 * Moab format 2: tux[0-1]*2:tux2            (list cpu count after host name)
 *
 * SLURM format:  tux0,tux0,tux1,tux1,tux2   (if consumable resources enabled)
 * SLURM format:  tux0,tux1,tux2             (if consumable resources disabled)
 *
 * NOTE: returned string must be released with xfree()
 */
extern char * moab2slurm_task_list(char *moab_tasklist, int *task_cnt)
{
	char *slurm_tasklist = NULL, *host = NULL, *tmp1 = NULL,
		*tmp2 = NULL, *tok = NULL, *tok_p = NULL;
	int i, reps;
	hostlist_t hl;
	static uint32_t cr_test = 0, cr_enabled = 0;

	if (cr_test == 0) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
						&cr_enabled);
		cr_test = 1;
	}

	*task_cnt = 0;

	/* Moab format 2 if string contains '*' or '[' */
	tmp1 = strchr(moab_tasklist, (int) '*');
	if (tmp1 == NULL)
		tmp1 = strchr(moab_tasklist, (int) '[');

	if (tmp1 == NULL) {	/* Moab format 1 */
		slurm_tasklist = xstrdup(moab_tasklist);
		if (moab_tasklist[0])
			*task_cnt = 1;
		for (i=0; slurm_tasklist[i]!='\0'; i++) {
			if (slurm_tasklist[i] == ':') {
				slurm_tasklist[i] = ',';
				(*task_cnt)++;
			} else if (slurm_tasklist[i] == ',')
				(*task_cnt)++;
		}
		return slurm_tasklist;
	}

	/* Moab format 2 */
	slurm_tasklist = xstrdup("");
	tmp1 = xstrdup(moab_tasklist);
	tok = strtok_r(tmp1, ":", &tok_p);
	while (tok) {
		/* find task count, assume 1 if no "*" */
		tmp2 = strchr(tok, (int) '*');
		if (tmp2) {
			reps = atoi(tmp2 + 1);
			tmp2[0] = '\0';
		} else
			reps = 1;

		/* find host expression */
		hl = hostlist_create(tok);
		while ((host = hostlist_shift(hl))) {
			for (i=0; i<reps; i++) {
				if (slurm_tasklist[0])
					xstrcat(slurm_tasklist, ",");
				xstrcat(slurm_tasklist, host);
				if (!cr_enabled)
					break;
			}
			free(host);
			(*task_cnt) += reps;
		}
		hostlist_destroy(hl);

		/* get next token */
		tok = strtok_r(NULL, ":", &tok_p);
	}
	xfree(tmp1);
	return slurm_tasklist;
}
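
A worked example for the converter, following the formats in its header comment (values are illustrative):

static void _demo_task_list(void)
{
	int task_cnt;
	char *slurm_list = moab2slurm_task_list("tux[0-1]*2:tux2", &task_cnt);

	/* consumable resources enabled:  "tux0,tux0,tux1,tux1,tux2" */
	/* consumable resources disabled: "tux0,tux1,tux2"           */
	/* task_cnt == 5 in both cases (one task per cpu)            */
	xfree(slurm_list);	/* per the NOTE above */
}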
Example #21
extern bg_record_t *create_small_record(bg_record_t *bg_record,
					bitstr_t *ionodes, int size)
{
	bg_record_t *found_record = NULL;
	ba_mp_t *new_ba_mp = NULL;
	ba_mp_t *ba_mp = NULL;
	char bitstring[BITSIZE];

	found_record = (bg_record_t*) xmalloc(sizeof(bg_record_t));
	found_record->magic = BLOCK_MAGIC;

	found_record->job_running = NO_JOB_RUNNING;
	found_record->user_name = xstrdup(bg_record->user_name);
	found_record->user_uid = bg_record->user_uid;
	found_record->ba_mp_list = list_create(destroy_ba_mp);
	if (bg_record->ba_mp_list)
		ba_mp = list_peek(bg_record->ba_mp_list);
	if (!ba_mp) {
		if (bg_record->mp_str) {
			hostlist_t hl = hostlist_create(bg_record->mp_str);
			char *host = hostlist_shift(hl);
			hostlist_destroy(hl);
			found_record->mp_str = xstrdup(host);
			free(host);
			error("you gave me a list with no ba_mps using %s",
			      found_record->mp_str);
		} else {
			char tmp_char[SYSTEM_DIMENSIONS+1];
			int dim;
			for (dim=0; dim<SYSTEM_DIMENSIONS; dim++)
				tmp_char[dim] =
					alpha_num[found_record->start[dim]];
			tmp_char[dim] = '\0';
			found_record->mp_str = xstrdup_printf(
				"%s%s",
				bg_conf->slurm_node_prefix,
				tmp_char);
			error("you gave me a record with no ba_mps "
			      "and no nodes either using %s",
			      found_record->mp_str);
		}
	} else {
		new_ba_mp = ba_copy_mp(ba_mp);
		/* We need to have this node wrapped in Q to handle
		   wires correctly when creating around the midplane.
		*/
		ba_setup_mp(new_ba_mp, false, true);

		new_ba_mp->used = BA_MP_USED_TRUE;
		list_append(found_record->ba_mp_list, new_ba_mp);
		found_record->mp_count = 1;
		found_record->mp_str = xstrdup_printf(
			"%s%s",
			bg_conf->slurm_node_prefix, new_ba_mp->coord_str);
	}

#ifdef HAVE_BGL
	found_record->node_use = SELECT_COPROCESSOR_MODE;
	found_record->blrtsimage = xstrdup(bg_record->blrtsimage);
#endif
#ifdef HAVE_BG_L_P
	found_record->linuximage = xstrdup(bg_record->linuximage);
	found_record->ramdiskimage = xstrdup(bg_record->ramdiskimage);
#endif
	found_record->mloaderimage = xstrdup(bg_record->mloaderimage);

	process_nodes(found_record, false);

	found_record->conn_type[0] = SELECT_SMALL;

	xassert(bg_conf->cpu_ratio);
	found_record->cpu_cnt = bg_conf->cpu_ratio * size;
	found_record->cnode_cnt = size;

	found_record->ionode_bitmap = bit_copy(ionodes);
	bit_fmt(bitstring, BITSIZE, found_record->ionode_bitmap);
	found_record->ionode_str = xstrdup(bitstring);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("made small block of %s[%s]",
		     found_record->mp_str, found_record->ionode_str);
	return found_record;
}
Example #22
/* Get the next job ID from local variables set up by _is_job_id() */
static char *_next_job_id(void)
{
	static hostlist_t hl = NULL;
	static char *save_ptr = NULL;
	static char *next_job_id = NULL;
	static char *task_id_spec = NULL;
	char *job_id_str = NULL, *bracket_ptr, *under_ptr;
	char *tmp_str, *end_job_str;
	int i;

	/* Clean up from previous calls */
	xfree(next_job_id);

	if (hl) {
		/* Process job ID regular expression using previously
		 * established hostlist data structure */
		tmp_str = hostlist_shift(hl);
		if (tmp_str) {
			next_job_id = xstrdup(tmp_str);
			free(tmp_str);
			if (task_id_spec) {
				xstrcat(next_job_id, "_");
				xstrcat(next_job_id, task_id_spec);
			}
			return next_job_id;
		}
		hostlist_destroy(hl);
		hl = NULL;
	}

	/* Get next token */
	xfree(task_id_spec);
	if (local_job_str && !save_ptr)	/* Get first token */
		job_id_str = strtok_r(local_job_str, "^", &save_ptr);
	else if (save_ptr)		/* Get next token */
		job_id_str = strtok_r(NULL, "^", &save_ptr);

	if (!job_id_str)	/* No more tokens */
		goto fini;

	under_ptr = strchr(job_id_str, '_');
	if (under_ptr) {
		if (under_ptr[1] == '[') {
			/* Strip brackets from job array task ID spec */
			task_id_spec = xstrdup(under_ptr + 2);
			for (i = 0; task_id_spec[i]; i++) {
				if (task_id_spec[i] == ']') {
					task_id_spec[i] = '\0';
					break;
				}
			}
		} else {
			task_id_spec = xstrdup(under_ptr + 1);
		}
	}

	bracket_ptr = strchr(job_id_str, '[');
	if (bracket_ptr && (!under_ptr || (bracket_ptr < under_ptr))) {
		/* Job ID specification uses regular expression */
		tmp_str = xstrdup(job_id_str);
		if ((end_job_str = strchr(tmp_str, '_')))
			end_job_str[0] = '\0';
		hl = hostlist_create(tmp_str);
		if (!hl) {
			error("Invalid job id: %s", job_id_str);
			xfree(tmp_str);
			goto fini;
		}
		xfree(tmp_str);
		tmp_str = hostlist_shift(hl);
		if (!tmp_str) {
			error("Invalid job id: %s", job_id_str);
			hostlist_destroy(hl);
			goto fini;
		}
		next_job_id = xstrdup(tmp_str);
		free(tmp_str);
	} else if (under_ptr) {
		under_ptr[0] = '\0';
		next_job_id = xstrdup(job_id_str);
		under_ptr[0] = '_';
	} else {
		next_job_id = xstrdup(job_id_str);
	}

	if (task_id_spec) {
		xstrcat(next_job_id, "_");
		xstrcat(next_job_id, task_id_spec);
	}

	return next_job_id;

fini:	xfree(local_job_str);
	save_ptr = NULL;
	return NULL;
}
Example #23
/*
 * We could load gres state or validate it using various mechanisms here.
 * This only validates that the configuration was specified in gres.conf.
 * In the general case, no code would need to be changed.
 */
extern int node_config_load(List gres_conf_list)
{
	int i, rc = SLURM_SUCCESS;
	ListIterator iter;
	gres_slurmd_conf_t *gres_slurmd_conf;
	int nb_nic = 0;	/* Number of NICs in the list */
	int available_files_index = 0;

	xassert(gres_conf_list);
	iter = list_iterator_create(gres_conf_list);
	while ((gres_slurmd_conf = list_next(iter))) {
		if (strcmp(gres_slurmd_conf->name, gres_name))
			continue;
		if (gres_slurmd_conf->file)
			nb_nic++;
	}
	list_iterator_destroy(iter);
	xfree(nic_devices);	/* No-op if NULL */
	nb_available_files = -1;
	/* (Re-)Allocate memory if number of files changed */
	if (nb_nic > nb_available_files) {
		nic_devices = (int *) xmalloc(sizeof(int) * nb_nic);
		nb_available_files = nb_nic;
		for (i = 0; i < nb_available_files; i++)
			nic_devices[i] = -1;
	}

	iter = list_iterator_create(gres_conf_list);
	while ((gres_slurmd_conf = list_next(iter))) {
		if ((strcmp(gres_slurmd_conf->name, gres_name) == 0) &&
		    gres_slurmd_conf->file) {
			/* Populate nic_devices array with number
			 * at end of the file name */
			char *bracket, *fname, *tmp_name;
			hostlist_t hl;
			bracket = strrchr(gres_slurmd_conf->file, '[');
			if (bracket)
				tmp_name = xstrdup(bracket);
			else
				tmp_name = xstrdup(gres_slurmd_conf->file);
			hl = hostlist_create(tmp_name);
			xfree(tmp_name);
			if (!hl) {
				rc = EINVAL;
				break;
			}
			while ((fname = hostlist_shift(hl))) {
				if (available_files_index ==
				    nb_available_files) {
					nb_available_files++;
					xrealloc(nic_devices, sizeof(int) *
						 nb_available_files);
					nic_devices[available_files_index] = -1;
				}
				for (i = 0; fname[i]; i++) {
					if (!isdigit(fname[i]))
						continue;
					nic_devices[available_files_index] =
						atoi(fname + i);
					break;
				}
				available_files_index++;
				free(fname);
			}
			hostlist_destroy(hl);
		}
	}
	list_iterator_destroy(iter);

	if (rc != SLURM_SUCCESS)
		fatal("%s failed to load configuration", plugin_name);

	for (i = 0; i < nb_available_files; i++)
		info("nic %d is device number %d", i, nic_devices[i]);

	return rc;
}
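
The bracket-expansion trick appears here too: only the trailing "[...]" range is handed to hostlist_create(), so the shifted tokens are bare numbers. A hedged standalone sketch (the helper name and device path are illustrative):

static void _list_device_numbers(const char *file_spec)
{
	const char *bracket = strrchr(file_spec, '[');
	hostlist_t hl = hostlist_create(bracket ? bracket : file_spec);
	char *fname;
	int i;

	if (!hl)
		return;
	while ((fname = hostlist_shift(hl))) {
		for (i = 0; fname[i]; i++) {
			if (!isdigit((int)fname[i]))
				continue;
			/* "/dev/nvidia[0-3]" yields 0, 1, 2, 3 */
			info("device number %d", atoi(fname + i));
			break;
		}
		free(fname);
	}
	hostlist_destroy(hl);
}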
Example #24
/*
 * _build_part_bitmap - update the total_cpus, total_nodes, and node_bitmap
 *	for the specified partition, also reset the partition pointers in
 *	the node back to this partition.
 * IN part_ptr - pointer to the partition
 * RET 0 if no error, errno otherwise
 * global: node_record_table_ptr - pointer to global node table
 * NOTE: this does not report nodes defined in more than one partition. This
 *	is checked only upon reading the configuration file, not on an update
 */
static int _build_part_bitmap(struct part_record *part_ptr)
{
	char *this_node_name;
	bitstr_t *old_bitmap;
	struct node_record *node_ptr;	/* pointer to node_record */
	hostlist_t host_list;

	part_ptr->total_cpus = 0;
	part_ptr->total_nodes = 0;

	if (part_ptr->node_bitmap == NULL) {
		part_ptr->node_bitmap = bit_alloc(node_record_count);
		old_bitmap = NULL;
	} else {
		old_bitmap = bit_copy(part_ptr->node_bitmap);
		bit_nclear(part_ptr->node_bitmap, 0,
			   node_record_count - 1);
	}

	if (part_ptr->nodes == NULL) {	/* no nodes in partition */
		_unlink_free_nodes(old_bitmap, part_ptr);
		FREE_NULL_BITMAP(old_bitmap);
		return 0;
	}

	if ((host_list = hostlist_create(part_ptr->nodes)) == NULL) {
		FREE_NULL_BITMAP(old_bitmap);
		error("hostlist_create error on %s, %m",
		      part_ptr->nodes);
		return ESLURM_INVALID_NODE_NAME;
	}

	while ((this_node_name = hostlist_shift(host_list))) {
		node_ptr = find_node_record(this_node_name);
		if (node_ptr == NULL) {
			error("_build_part_bitmap: invalid node name %s",
				this_node_name);
			free(this_node_name);
			FREE_NULL_BITMAP(old_bitmap);
			hostlist_destroy(host_list);
			return ESLURM_INVALID_NODE_NAME;
		}
		part_ptr->total_nodes++;
		if (slurmctld_conf.fast_schedule)
			part_ptr->total_cpus += node_ptr->config_ptr->cpus;
		else
			part_ptr->total_cpus += node_ptr->cpus;
		node_ptr->part_cnt++;
		xrealloc(node_ptr->part_pptr, (node_ptr->part_cnt *
			sizeof(struct part_record *)));
		node_ptr->part_pptr[node_ptr->part_cnt-1] = part_ptr;
		if (old_bitmap)
			bit_clear(old_bitmap,
				  (int) (node_ptr -
					 node_record_table_ptr));
		bit_set(part_ptr->node_bitmap,
			(int) (node_ptr - node_record_table_ptr));
		free(this_node_name);
	}
	hostlist_destroy(host_list);

	_unlink_free_nodes(old_bitmap, part_ptr);
	last_node_update = time(NULL);
	FREE_NULL_BITMAP(old_bitmap);
	return 0;
}
Example #25
/*
 * _build_single_nodeline_info - From the slurm.conf reader, build table,
 * 	and set values
 * RET 0 if no error, error code otherwise
 * Note: Operates on common variables
 *	default_node_record - default node configuration values
 */
static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr,
				       struct config_record *config_ptr)
{
	int error_code = SLURM_SUCCESS;
	struct node_record *node_rec = NULL;
	hostlist_t address_list = NULL;
	hostlist_t alias_list = NULL;
	hostlist_t hostname_list = NULL;
	hostlist_t port_list = NULL;
	char *address = NULL;
	char *alias = NULL;
	char *hostname = NULL;
	char *port_str = NULL;
	int state_val = NODE_STATE_UNKNOWN;
	int address_count, alias_count, hostname_count, port_count;
	uint16_t port = 0;

	if (node_ptr->state != NULL) {
		state_val = state_str2int(node_ptr->state, node_ptr->nodenames);
		if (state_val == NO_VAL)
			goto cleanup;
	}

	if ((address_list = hostlist_create(node_ptr->addresses)) == NULL) {
		fatal("Unable to create NodeAddr list from %s",
		      node_ptr->addresses);
		error_code = errno;
		goto cleanup;
	}
	if ((alias_list = hostlist_create(node_ptr->nodenames)) == NULL) {
		fatal("Unable to create NodeName list from %s",
		      node_ptr->nodenames);
		error_code = errno;
		goto cleanup;
	}
	if ((hostname_list = hostlist_create(node_ptr->hostnames)) == NULL) {
		fatal("Unable to create NodeHostname list from %s",
		      node_ptr->hostnames);
		error_code = errno;
		goto cleanup;
	}
	if (node_ptr->port_str && node_ptr->port_str[0] &&
	    (node_ptr->port_str[0] != '[') &&
	    (strchr(node_ptr->port_str, '-') ||
	     strchr(node_ptr->port_str, ','))) {
		xstrfmtcat(port_str, "[%s]", node_ptr->port_str);
		port_list = hostlist_create(port_str);
		xfree(port_str);
	} else {
		port_list = hostlist_create(node_ptr->port_str);
	}
	if (port_list == NULL) {
		error("Unable to create Port list from %s",
		      node_ptr->port_str);
		error_code = errno;
		goto cleanup;
	}

	/* some sanity checks */
	address_count  = hostlist_count(address_list);
	alias_count    = hostlist_count(alias_list);
	hostname_count = hostlist_count(hostname_list);
	port_count     = hostlist_count(port_list);
#ifdef HAVE_FRONT_END
	if ((hostname_count != alias_count) && (hostname_count != 1)) {
		error("NodeHostname count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
#else
#ifdef MULTIPLE_SLURMD
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
#else
	if (address_count < alias_count) {
		error("At least as many NodeAddr are required as NodeName");
		goto cleanup;
	}
	if (hostname_count < alias_count) {
		error("At least as many NodeHostname are required "
		      "as NodeName");
		goto cleanup;
	}
#endif	/* MULTIPLE_SLURMD */
#endif	/* HAVE_FRONT_END */
	if ((port_count != alias_count) && (port_count > 1)) {
		error("Port count must equal that of NodeName "
		      "records or there must be no more than one");
		goto cleanup;
	}

	/* now build the individual node structures */
	while ((alias = hostlist_shift(alias_list))) {
		if (address_count > 0) {
			address_count--;
			if (address)
				free(address);
			address = hostlist_shift(address_list);
		}
		if (hostname_count > 0) {
			hostname_count--;
			if (hostname)
				free(hostname);
			hostname = hostlist_shift(hostname_list);
		}
		if (port_count > 0) {
			int port_int;
			port_count--;
			if (port_str)
				free(port_str);
			port_str = hostlist_shift(port_list);
			port_int = atoi(port_str);
			if ((port_int <= 0) || (port_int > 0xffff))
				fatal("Invalid Port %s", node_ptr->port_str);
			port = port_int;
		}
		/* find_node_record locks this to get the
		 * alias so we need to unlock */
		node_rec = find_node_record(alias);

		if (node_rec == NULL) {
			node_rec = create_node_record(config_ptr, alias);
			if ((state_val != NO_VAL) &&
			    (state_val != NODE_STATE_UNKNOWN))
				node_rec->node_state = state_val;
			node_rec->last_response = (time_t) 0;
			node_rec->comm_name = xstrdup(address);
			node_rec->node_hostname = xstrdup(hostname);
			node_rec->port      = port;
			node_rec->weight    = node_ptr->weight;
			node_rec->features  = xstrdup(node_ptr->feature);
			node_rec->reason    = xstrdup(node_ptr->reason);
		} else {
			/* FIXME - maybe should be fatal? */
			error("Reconfiguration for node %s, ignoring!", alias);
		}
		free(alias);
	}

	/* free allocated storage */
cleanup:
	if (address)
		free(address);
	if (hostname)
		free(hostname);
	if (port_str)
		free(port_str);
	if (address_list)
		hostlist_destroy(address_list);
	if (alias_list)
		hostlist_destroy(alias_list);
	if (hostname_list)
		hostlist_destroy(hostname_list);
	if (port_list)
		hostlist_destroy(port_list);
	return error_code;
}
Example #26
static void _forward_msg_internal(hostlist_t hl, hostlist_t* sp_hl,
				  forward_struct_t *fwd_struct,
				  header_t *header, int timeout,
				  int hl_count)
{
	int j;
	forward_msg_t *fwd_msg = NULL;
	char *buf = NULL, *tmp_char = NULL;
	pthread_attr_t attr_agent;
	pthread_t thread_agent;

	if (timeout <= 0)
		/* convert secs to msec */
		timeout  = slurm_get_msg_timeout() * 1000;

	for (j = 0; j < hl_count; j++) {
		int retries = 0;

		slurm_attr_init(&attr_agent);
		if (pthread_attr_setdetachstate
		    (&attr_agent, PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");

		fwd_msg = xmalloc(sizeof(forward_msg_t));

		fwd_msg->fwd_struct = fwd_struct;

		fwd_msg->timeout = timeout;

		memcpy(&fwd_msg->header.orig_addr,
		       &header->orig_addr,
		       sizeof(slurm_addr_t));

		fwd_msg->header.version = header->version;
		fwd_msg->header.flags = header->flags;
		fwd_msg->header.msg_type = header->msg_type;
		fwd_msg->header.body_length = header->body_length;
		fwd_msg->header.ret_list = NULL;
		fwd_msg->header.ret_cnt = 0;

		if (sp_hl) {
			buf = hostlist_ranged_string_xmalloc(sp_hl[j]);
			hostlist_destroy(sp_hl[j]);
		} else {
			tmp_char = hostlist_shift(hl);
			buf = xstrdup(tmp_char);
			free(tmp_char);
		}

		forward_init(&fwd_msg->header.forward, NULL);
		fwd_msg->header.forward.nodelist = buf;
		while (pthread_create(&thread_agent, &attr_agent,
				     _forward_thread,
				     (void *)fwd_msg)) {
			error("pthread_create error %m");
			if (++retries > MAX_RETRIES)
				fatal("Can't create pthread");
			usleep(100000);	/* sleep and try again */
		}
		slurm_attr_destroy(&attr_agent);
	}
}
Example #27
/*
 * _set_collectors call the split_hostlist API on the all nodes hostlist
 * to set the node to be used as a collector for unsolicited node aggregation.
 *
 * If this node is a forwarding node (first node in any hostlist),
 * then its collector and backup are the ControlMachine and its backup.
 *
 * Otherwise, we find the hostlist containing this node.
 * The forwarding node in that hostlist becomes a collector, the next node
 * which is not this node becomes the backup.
 * That list is split; we iterate through it, searching for a list in
 * which this node is a forwarding node. If found, we set the collector and
 * backup; otherwise this process is repeated.
 */
static void _set_collectors(char *this_node_name)
{
	slurm_ctl_conf_t *conf;
	hostlist_t  nodes;
	hostlist_t* hll = NULL;
	char *parent = NULL, *backup = NULL;
	char addrbuf[32];
	int i, j, f = -1;
	int hl_count = 0;
	uint16_t parent_port;
	uint16_t backup_port;
	bool found = false;
	bool ctldparent = true;

#ifdef HAVE_FRONT_END
	return; /* on a FrontEnd system this would never be useful. */
#endif

	if (!run_in_daemon("slurmd"))
		return; /* Only compute nodes have collectors */

	/* Set the initial iteration, collector is controller,
	 * full list is split */
	xassert(this_node_name);

	conf = slurm_conf_lock();
	nodes = _get_all_nodes();
	parent = strdup(conf->control_addr);
	if (conf->backup_addr) {
		backup = strdup(conf->backup_addr);
	}
	parent_port = conf->slurmctld_port;
	backup_port = parent_port;
	slurm_conf_unlock();
	while (!found) {
		if ( route_g_split_hostlist(nodes, &hll, &hl_count) ) {
			error("unable to split forward hostlist");
			goto clean; /* collector addrs remains null */
		}
		/* Find which hostlist contains this node */
		for (i=0; i < hl_count; i++) {
			f = hostlist_find(hll[i], this_node_name);
			if (f != -1)
				break;
		}
		if (i == hl_count) {
			fatal("ROUTE -- %s not found in node_record_table",
			      this_node_name);
		}
		if (f == 0) {
			/* we are a forwarded-to node,
			 * so our parent is the parent */
			if (hostlist_count(hll[i]) > 1)
				this_is_collector = true;
			xfree(msg_collect_node);
			msg_collect_node = xmalloc(sizeof(slurm_addr_t));
			if (ctldparent)
				slurm_set_addr(msg_collect_node, parent_port,
					       parent);
			else {
				slurm_conf_get_addr(parent, msg_collect_node);
				msg_collect_node->sin_port = htons(parent_port);
			}
			if (debug_flags & DEBUG_FLAG_ROUTE) {
				slurm_print_slurm_addr(msg_collect_node,
						       addrbuf, 32);
				info("ROUTE -- message collector address is %s",
				     addrbuf);
			}
			xfree(msg_collect_backup);
			if (backup) {
				msg_collect_backup =
					xmalloc(sizeof(slurm_addr_t));
				if (ctldparent) {
					slurm_set_addr(msg_collect_backup,
						       backup_port, backup);
				} else {
					slurm_conf_get_addr(backup,
							    msg_collect_backup);
					msg_collect_backup->sin_port =
						htons(backup_port);
				}
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					slurm_print_slurm_addr(
						msg_collect_backup,
						addrbuf, 32);
					info("ROUTE -- message collector backup"
					     " address is %s", addrbuf);
				}
			} else {
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					info("ROUTE -- no message collector "
					     "backup");
				}

			}
			found = true;
			goto clean;
		}

		/* We are not a forwarding node, the first node in this list
		 * will split the forward_list.
		 * We also know that the forwarding node is not a controller.
		 *
		 * clean up parent context */
		ctldparent = false;
		hostlist_destroy(nodes);
		if (parent)
			free(parent);
		if (backup)
			free(backup);
		nodes = hostlist_copy(hll[i]);
		for (j=0; j < hl_count; j++) {
			hostlist_destroy(hll[j]);
		}
		xfree(hll);

		/* set our parent, backup, and continue search */
		parent = hostlist_shift(nodes);
		backup = hostlist_nth(nodes, 0);
		if (strcmp(backup, this_node_name) == 0) {
			free(backup);
			backup = NULL;
			if (hostlist_count(nodes) > 1)
				backup = hostlist_nth(nodes, 1);
		}
		parent_port =  slurm_conf_get_port(parent);
		if (backup) {
			backup_port = slurm_conf_get_port(backup);
		} else
			backup_port = 0;

	}
clean:
	if (debug_flags & DEBUG_FLAG_ROUTE) {
		if (this_is_collector)
			info("ROUTE -- %s is a collector node", this_node_name);
		else
			info("ROUTE -- %s is a leaf node", this_node_name);
	}
	hostlist_destroy(nodes);
	if (parent)
		free(parent);
	if (backup)
		free(backup);
	for (i=0; i < hl_count; i++) {
		hostlist_destroy(hll[i]);
	}
	xfree(hll);
}
Example #28
static void
_xlate_job_step_ids(char **rest)
{
	int buf_size, buf_offset, i;
	long job_id, tmp_l;
	char *next_str;

	opt.job_cnt = 0;

	buf_size   = 0xffff;
	buf_offset = 0;
	opt.array_id = xmalloc(buf_size * sizeof(uint32_t));
	opt.job_id   = xmalloc(buf_size * sizeof(uint32_t));
	opt.step_id  = xmalloc(buf_size * sizeof(uint32_t));

	for (i = 0; rest[i] && (buf_offset < buf_size); i++) {
		job_id = strtol(rest[i], &next_str, 10);
		if (job_id <= 0) {
			error ("Invalid job_id %s", rest[i]);
			exit (1);
		}
		opt.job_id[buf_offset] = job_id;

		if ((next_str[0] == '_') && (next_str[1] == '[')) {
			hostlist_t hl;
			char save_char, *next_elem;
			char *end_char = strchr(next_str + 2, ']');
			if (!end_char || (end_char[1] != '\0')) {
				error ("Invalid job id %s", rest[i]);
				exit (1);
			}
			save_char = end_char[1];
			end_char[1] = '\0';
			hl = hostlist_create(next_str + 1);
			if (!hl) {
				error ("Invalid job id %s", rest[i]);
				exit (1);
			}
			while ((next_elem = hostlist_shift(hl))) {
				tmp_l = strtol(next_elem, &next_str, 10);
				if (tmp_l < 0) {
					error ("Invalid job id %s", rest[i]);
					exit (1);
				}
				opt.job_id[buf_offset]   = job_id;
				opt.array_id[buf_offset] = tmp_l;
				opt.step_id[buf_offset]  = SLURM_BATCH_SCRIPT;
				free(next_elem);
				if (++buf_offset >= buf_size)
					break;
			}
			hostlist_destroy(hl);
			end_char[1] = save_char;
			/* No step ID support for job array range */
			continue;
		} else if ((next_str[0] == '_') && (next_str[1] == '*')) {
			opt.array_id[buf_offset] = INFINITE;
			next_str += 2;
		} else if (next_str[0] == '_') {
			tmp_l = strtol(&next_str[1], &next_str, 10);
			if (tmp_l < 0) {
				error ("Invalid job id %s", rest[i]);
				exit (1);
			}
			opt.array_id[buf_offset] = tmp_l;
		} else {
			opt.array_id[buf_offset] = NO_VAL;
		}


		if (next_str[0] == '.') {
			tmp_l = strtol(&next_str[1], &next_str, 10);
			if (tmp_l < 0) {
				error ("Invalid job id %s", rest[i]);
				exit (1);
			}
			opt.step_id[buf_offset] = tmp_l;
		} else
			opt.step_id[buf_offset] = SLURM_BATCH_SCRIPT;
		buf_offset++;

		if (next_str[0] != '\0') {
			error ("Invalid job ID %s", rest[i]);
			exit (1);
		}
	}
	opt.job_cnt = buf_offset;
}
Example #29
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
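			/* Scale the timeout with the size of the forward
			 * tree: each group of tree_width forwarded nodes
			 * adds one round of message latency on top of the
			 * base timeout. */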
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
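The timeout arithmetic near the end of forward_thread above is easy to miss in context. The following is a minimal standalone sketch of the same scaling, not part of Slurm: the function name forward_timeout and the values in main are hypothetical, and the slurm_get_msg_timeout()/slurm_get_tree_width() lookups are replaced by plain parameters.

#include <stdio.h>

/* Hypothetical helper mirroring the arithmetic in forward_thread:
 * fwd_cnt     - number of nodes still being forwarded to
 * msg_timeout - per-message timeout in ms (slurm_get_msg_timeout() * 1000)
 * base        - the caller's original timeout in ms
 * tree_width  - fan-out of the forwarding tree
 */
static int forward_timeout(int fwd_cnt, int msg_timeout, int base,
			   int tree_width)
{
	int steps = (fwd_cnt + 1) / tree_width;
	int timeout = msg_timeout * steps;

	steps++;	/* one extra hop for this node */
	timeout += base * steps;
	return timeout;
}

int main(void)
{
	/* e.g. 63 forwards, 10 s message timeout, 10 s base, width 16:
	 * steps = 4, so 4 * 10000 + 5 * 10000 = 90000 ms */
	printf("%d ms\n", forward_timeout(63, 10000, 10000, 16));
	return 0;
}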
Example #30
extern int launch_p_create_job_step(srun_job_t *job, bool use_all_cpus,
				    void (*signal_function)(int),
				    sig_atomic_t *destroy_job)
{
	char dname[512], value[32];
	char *protocol = "mpi";
	uint32_t ntasks = opt.ntasks;
	uint32_t nnodes = opt.min_nodes;

	if (opt.launch_cmd) {
		int i;

		xstrfmtcat(poe_cmd_line, "%s", opt.argv[0]);
		for (i = 1; i < opt.argc; i++)
			xstrfmtcat(poe_cmd_line, " %s", opt.argv[i]);
	}

	if (job) {
		/* poe can't accept ranges, so give the actual numbers
		   here; otherwise it can get confused if srun gives the
		   max instead of the min.
		*/
		ntasks = job->ntasks;
		nnodes = job->nhosts;
	}

	/*
	 * In order to support MPMD or job steps smaller than the LoadLeveler
	 * job allocation size, specify a command file using the poe option
	 * -cmdfile or MP_CMDFILE env var. See page 43 here:
	 * http://publib.boulder.ibm.com/epubs/pdf/c2367811.pdf
	 * The command file should contain one or more lines of the following
	 * form:
	 * <cmd>@<step_id>%<total_tasks>%<protocol>:<num_tasks> <args>
	 * IBM is working to eliminate the need to specify protocol, but until
	 * then it might be determined as follows:
	 *
	 * We currently look at 'ldd <program>' and check the names of
	 * the MPI and PAMI libraries, and on x86 we also check whether
	 * the Intel MPI library is used.
	 * This is done at runtime in PMD and, depending on the '-mpilib'
	 * and '-config' options used, LD_LIBRARY_PATH is changed to
	 * properly support the different PE Runtime levels the customer
	 * has installed on their cluster.
	 *
	 * There is a precedence order that matters if multiple
	 * libraries are listed in the ldd output, as long as you know it
	 * is not a mixed-protocol (i.e. OpenSHMEM + MPI, UPC + MPI, etc.)
	 * application:
	 * 1) If MPI library is found (libmpi*.so) -> use 'mpi'
	 * 2) if Openshmem library is found (libshmem.so) -> use 'shmem'
	 * 3) if UPC runtime library is found (libxlpgas.so) -> use 'pgas'
	 * 4) if only PAMI library is found (libpami.so) -> use 'pami'
	 * 5) if only LAPI library is found (liblapi.so) -> use 'lapi'
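	 *
	 * For illustration only (hypothetical programs and values), a
	 * command file for two binaries sharing an 8-task step might
	 * contain:
	 *   ./prog_a@0%8%mpi:4 -opt_a
	 *   ./prog_b@0%8%mpi:4 -opt_b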
	 */
	if (opt.multi_prog) {
		protocol = "multi";
	} else {
		protocol = _get_cmd_protocol(opt.argv[1]);
	}
	debug("cmd:%s protocol:%s", opt.argv[1], protocol);

	if (opt.multi_prog) {
		int fd, k;

		if (opt.launch_cmd) {
			error("--launch_cmd not available "
			      "when using a cmdfile");
			return SLURM_ERROR;
		}
		xassert(job);
		/* NOTE: The command file needs to be in a directory that can
		 * be read from the compute node(s), so /tmp does not work.
		 * We use the user's home directory (based upon the "HOME"
		 * environment variable), otherwise the current working
		 * directory. The file is only created here; it is written
		 * in launch_poe.c. */
		_build_work_dir(dname, sizeof(dname));
		xstrfmtcat(cmd_fname, "%s/slurm_cmdfile.%u",
			   dname, (uint32_t) getpid());
		while ((fd = creat(cmd_fname, 0600)) < 0) {
			if (errno == EINTR)
				continue;
			fatal("creat(%s): %m", cmd_fname);
		}
		(void) close(fd);

		/* Set command file name via MP_CMDFILE and remove it from
		 * the execute line. */
		setenv("MP_NEWJOB", "parallel", 1);
		setenv("MP_CMDFILE", cmd_fname, 1);
		setenv("SLURM_CMDFILE", opt.argv[1], 1);
		if (opt.argc) {
			xfree(opt.argv[1]);
			for (k = 1; k < opt.argc; k++)
				opt.argv[k] = opt.argv[k + 1];
			opt.argc--;
		}
	}

	if (opt.shared != (uint16_t) NO_VAL) {
		char *shared_cpu_use = "multiple";

		if (opt.shared)
			shared_cpu_use = "unique";

		setenv("MP_CPU_USE", shared_cpu_use, 1);

		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -cpu_use %s",
				   shared_cpu_use);
	}
	if (opt.network) {
		bool cau_set = false;
		bool dev_type_set = false;
		bool protocol_set = false;
		char *type_ptr = NULL;
		char *save_ptr = NULL, *token;
		char *network_str = xstrdup(opt.network);
		char *adapter_use = NULL;

		if (strstr(opt.network, "dedicated"))
			adapter_use = "dedicated";
		else if (strstr(opt.network, "shared"))
			adapter_use = "shared";

		if (adapter_use) {
			setenv("MP_ADAPTER_USE", adapter_use, 1);
			if (opt.launch_cmd)
				xstrfmtcat(poe_cmd_line, " -adapter_use %s",
					   adapter_use);
		}

		token = strtok_r(network_str, ",", &save_ptr);
		while (token) {
			/* bulk_xfer options */
			if (!strncasecmp(token, "bulk_xfer", 9)) {
				setenv("MP_USE_BULK_XFER", "yes", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -use_bulk_xfer yes");
			/* device name options */
			} else if (!strncasecmp(token, "devname=", 8)) {
				/* Ignored by POE */

			/* device type options */
			} else if (!strncasecmp(token, "devtype=", 8)) {
				type_ptr = token + 8;
				if (!xstrcasecmp(type_ptr, "ib")) {
					setenv("MP_DEVTYPE", type_ptr, 1);
					if (opt.launch_cmd)
						xstrfmtcat(poe_cmd_line,
							   " -devtype %s",
							   type_ptr);
				} else if (!xstrcasecmp(type_ptr, "hfi")) {
					setenv("MP_DEVTYPE", type_ptr, 1);
					if (opt.launch_cmd)
						xstrfmtcat(poe_cmd_line,
							   " -devtype %s",
							   type_ptr);
				}
				dev_type_set = true;
				/* POE ignores other options */

			/* instances options */
			} else if (!strncasecmp(token, "instances=", 10)) {
				type_ptr = token + 10;
				setenv("MP_INSTANCES", type_ptr, 1);
				if (opt.launch_cmd) {
					xstrfmtcat(poe_cmd_line,
						   " -instances %s",
						   type_ptr);
				}

			/* network options */
			} else if (!xstrcasecmp(token, "ip")   ||
				  !xstrcasecmp(token, "ipv4")  ||
				  !xstrcasecmp(token, "ipv6")) {
				setenv("MP_EUILIB", "ip", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euilib ip");
			} else if (!xstrcasecmp(token, "us")) {
				setenv("MP_EUILIB", "us", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euilib us");
			/* protocol options */
			} else if ((!strncasecmp(token, "lapi", 4)) ||
				   (!strncasecmp(token, "mpi",  3)) ||
				   (!strncasecmp(token, "pami", 4)) ||
				   (!strncasecmp(token, "shmem",5)) ||
				   (!strncasecmp(token, "upc",  3))) {
				if (!protocol_set) {
					protocol_set = true;
					protocol = NULL;
				}
				if (protocol)
					xstrcat(protocol, ",");
				xstrcat(protocol, token);
				setenv("MP_MSG_API", protocol, 0);
			/* adapter options */
			} else if (!xstrcasecmp(token, "sn_all")) {
				setenv("MP_EUIDEVICE", "sn_all", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euidevice sn_all");
			} else if (!xstrcasecmp(token, "sn_single")) {
				setenv("MP_EUIDEVICE", "sn_single", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euidevice sn_single");
			/* Collective Acceleration Units (CAU) */
			} else if (!strncasecmp(token, "cau=", 4)) {
				setenv("MP_COLLECTIVE_GROUPS", token + 4, 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -collective_groups %s",
						   token + 4);
				if (atoi(token + 4))
					cau_set = true;
			/* Immediate Send Slots Per Window */
			} else if (!strncasecmp(token, "immed=", 6)) {
				setenv("MP_IMM_SEND_BUFFERS", token + 6, 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -imm_send_buffers %s",
						   token + 6);
			/* other */
			} else {
				info("switch/nrt: invalid option: %s", token);
			}
			token = strtok_r(NULL, ",", &save_ptr);
		}
		if (cau_set && !dev_type_set) {
			/* If POE is executed directly (not spawned by srun)
			 * it will generate an error if -collective_groups is
			 * non-zero and devtype is not set. Since we do not
			 * know what devices are available at this point, set
			 * the default type to hfi in hopes of avoiding an
			 * error. The user can always specify a devtype in the
			 * --network option to avoid a possibly invalid
			 * value. */
			setenv("MP_DEVTYPE", "hfi", 1);
			if (opt.launch_cmd)
				xstrcat(poe_cmd_line, " -devtype hfi");
		}

		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -msg_api %s", protocol);
		if (protocol_set)
			xfree(protocol);
		else
			setenv("MP_MSG_API", protocol, 0);
	} else {
		if (xstrcmp(protocol, "multi")) {
			setenv("MP_MSG_API", protocol, 0);
			if (opt.launch_cmd)
				xstrfmtcat(poe_cmd_line,
					   " -msg_api %s", protocol);
		}
	}

	if (opt.nodelist &&
	    ((opt.distribution & SLURM_DIST_STATE_BASE)==SLURM_DIST_ARBITRARY)) {
		bool destroy_hostfile = 0;
		if (!opt.hostfile) {
			char *host_name, *host_line;
			pid_t pid = getpid();
			hostlist_t hl;
			int fd, len, offset, wrote;

			destroy_hostfile = 1;

			hl = hostlist_create(opt.nodelist);
			if (!hl)
				fatal("Invalid nodelist: %s", opt.nodelist);
			xstrfmtcat(opt.hostfile, "slurm_hostlist.%u",
				   (uint32_t) pid);
			if ((fd = creat(opt.hostfile, 0600)) < 0)
				fatal("creat(%s): %m", opt.hostfile);
			host_line = NULL;
			while ((host_name = hostlist_shift(hl))) {
				if (host_line)
					xstrcat(host_line, "\n");
				xstrcat(host_line, host_name);
				free(host_name);
			}
			hostlist_destroy(hl);
			len = strlen(host_line) + 1;
			offset = 0;
			while (len > offset) {
				wrote = write(fd, host_line + offset,
					      len - offset);
				if (wrote < 0) {
					if ((errno == EAGAIN) ||
					    (errno == EINTR))
						continue;
					fatal("write(%s): %m", opt.hostfile);
				}
				offset += wrote;
			}
			xfree(host_line);
			close(fd);
		}
		debug2("using hostfile %s", opt.hostfile);
		setenv("MP_HOSTFILE", opt.hostfile, 1);
		if (opt.launch_cmd) {
			xstrfmtcat(poe_cmd_line, " -hfile %s", opt.hostfile);
			if (destroy_hostfile)
				info("WARNING: hostlist file %s was created.  "
				     "User is responsible to remove it when "
				     "done.", opt.hostfile);
		} else if (destroy_hostfile)
			setenv("SRUN_DESTROY_HOSTFILE", opt.hostfile, 1);

		/* MP_RESD has to be set to yes, or for some reason poe
		   thinks things are already set up and then we are
		   screwed.
		*/
		setenv("MP_RESD", "yes", 1);
		if (opt.launch_cmd)
			xstrcat(poe_cmd_line, " -resd yes");
		/* FIXME: This next line is here just for debug
		 * purpose.  It makes it so each task has a separate
		 * line. */
		setenv("MP_STDOUTMODE", "unordered", 1);
		/* Just in case we didn't specify a file in srun. */
		setenv("SLURM_ARBITRARY_NODELIST", opt.nodelist, 1);
	} else {
		/* Since poe doesn't need to know about the partition, and
		   it really needs to have MP_RMPOOL set, just set it to
		   something. This only needs to happen if we don't specify
		   the hostlist like above.
		*/
		setenv("MP_RMPOOL", "SLURM", 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -rmpool slurm");
	}

	if (opt.msg_timeout) {
		snprintf(value, sizeof(value), "%d", opt.msg_timeout);
		setenv("MP_TIMEOUT", value, 1);
		/* There is no equivalent command line option */
	}
	if (opt.immediate) {
		setenv("MP_RETRY", "0", 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -retry 0");
	}
	if (opt.labelio) {
		setenv("MP_LABELIO", "yes", 0);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -labelio yes");
	}
	if (nnodes) {
		snprintf(value, sizeof(value), "%u", nnodes);
		setenv("MP_NODES", value, 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -nodes %s", value);
	}
	if (ntasks) {
		snprintf(value, sizeof(value), "%u", ntasks);
		setenv("MP_PROCS", value, 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -procs %s", value);
	}
	if (opt.cpu_bind_type) {
		/* POE supports a limited subset of CPU binding options */
		opt.cpu_bind_type &= (CPU_BIND_TO_THREADS |
				      CPU_BIND_TO_CORES   |
				      CPU_BIND_RANK);
	}
	if (opt.cpu_bind_type) {
		char *units;
		int count = 1;

		if (opt.cpu_bind_type & CPU_BIND_TO_CORES)
			units = "core";
		else
			units = "cpu";

		if (opt.cpus_per_task)
			count = MAX(opt.cpus_per_task, 1);
		snprintf(value, sizeof(value), "%s:%d", units, count);
		setenv("MP_TASK_AFFINITY", value, 1);
		setenv("MP_BINDPROC", "yes", 1);
		if (opt.launch_cmd) {
			xstrfmtcat(poe_cmd_line, " -task_affinity %s", value);
			xstrfmtcat(poe_cmd_line, " -bindproc yes");
		}
	}
	if (opt.ntasks_per_node != NO_VAL) {
		snprintf(value, sizeof(value), "%u", opt.ntasks_per_node);
		setenv("MP_TASKS_PER_NODE", value, 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -tasks_per_node %s", value);
	}
	if (opt.unbuffered) {
		setenv("MP_STDOUTMODE", "unordered", 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line,
				   " -stdoutmode unordered");
	}

	_propagate_srun_opts(nnodes, ntasks);
	setenv("SLURM_STARTED_STEP", "YES", 1);
	//disable_status = opt.disable_status;
	//quit_on_intr = opt.quit_on_intr;
	//srun_jobid = xstrdup(opt.jobid);

	if (opt.launch_cmd) {
		printf("%s\n", poe_cmd_line);
		xfree(poe_cmd_line);

		exit(0);
	}
	return SLURM_SUCCESS;
}
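The hostfile construction in the example above follows the pattern running through these examples: expand a ranged node list one name at a time with hostlist_shift() and free each returned name. Below is a minimal standalone sketch using the public slurm_hostlist_* wrappers from <slurm/slurm.h> rather than the internal hostlist API; the node list string is a made-up example, and the classic opaque hostlist_t typedef is assumed.

#include <stdio.h>
#include <stdlib.h>
#include <slurm/slurm.h>

int main(void)
{
	/* Hypothetical ranged node list to expand */
	hostlist_t hl = slurm_hostlist_create("node[01-04]");
	char *name;

	if (!hl) {
		fprintf(stderr, "invalid nodelist\n");
		return 1;
	}
	/* Shift names off the front of the list one at a time; each
	 * returned name must be released with free(). */
	while ((name = slurm_hostlist_shift(hl))) {
		printf("%s\n", name);
		free(name);
	}
	slurm_hostlist_destroy(hl);
	return 0;
}

Compiled with -lslurm, this should print one expanded host name per line (node01 through node04).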