Example #1
0
/*
 * _forward_thread - thread body that forwards one message down a branch of
 *	the forwarding tree and collects the responses.
 * arg IN - forward_msg_t describing the message; it is released with
 *	xfree() before this function returns.
 * RET - always NULL
 *
 * The hosts in fwd_msg->header.forward.nodelist are tried one at a time.
 * The first host that accepts the connection and the send is handed the
 * message together with the list of the remaining hosts. Failures are
 * recorded in fwd_struct->ret_list through mark_as_failed_forward() and the
 * next host is tried.
 *
 * Locking: every path that reaches the "cleanup" label holds
 * fwd_struct->forward_mutex; it is released there after signalling
 * fwd_struct->notify.
 */
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		/* Step 1: resolve the address of the candidate host */
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			/* no host left: leave with the mutex still held */
			goto cleanup;
		}
		/* Step 2: open a connection to the candidate host */
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		/*
		 * Step 3: the remaining hosts become the forward list carried
		 * in the header sent to the host we just connected to.
		 */
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			/*
			 * NOTE(review): errno is read below after error() and
			 * slurm_mutex_lock() have run; confirm neither can
			 * overwrite the value set by slurm_msg_sendto().
			 */
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				/* start over with a fresh pack buffer */
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			/* drain the host list: one entry per remaining host */
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		/*
		 * Scale the receive timeout with the number of hosts that
		 * still have to be reached below this one. "steps" is an
		 * integer division of (cnt + 1) by the tree width, so it can
		 * be zero before the increment below.
		 */
		if (fwd_msg->header.forward.cnt > 0) {
			/* cached on first use; shared by all threads */
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		/*
		 * No reply at all, or forwards were requested but at most one
		 * reply came back: treat this host as failed and try the
		 * next one, if any.
		 */
		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			/*
			 * NOTE(review): unlike the other call sites, the
			 * mark_as_failed_forward() calls in this branch run
			 * before forward_mutex is taken (it is only locked
			 * after the loop) -- confirm this is intended.
			 */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			/* flag every forwarded host missing from ret_list */
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					/* an unnamed entry is attributed to
					 * the host we contacted directly */
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	/*
	 * Reached after the "break" above, or when hostlist_shift() returned
	 * NULL: move the collected replies onto the shared list under lock.
	 */
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	/* forward_mutex is held on entry to this label */
	if ((fd >= 0) && close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}
Example #2
0
/*
 * Read a Slurm hostfile specified by "filename".  "filename" must contain
 * a list of Slurm NodeNames, one per line.  Reads up to "n" number of hostnames
 * from the file. Returns a string representing a hostlist ranged string of
 * the contents of the file.  This is a helper function, it does not
 * contact any Slurm daemons.
 *
 * File syntax handled here:
 *   - several names may share a line, separated by commas
 *   - "name*N" adds "name" N times
 *   - '#' starts a comment running to the end of the line; "\#" is a
 *     literal '#'
 *   - a line longer than the read buffer is processed in chunks, split at
 *     the last comma of each chunk
 *
 * Returns a string representing the hostlist.  Returns NULL if there are fewer
 * than "n" hostnames in the file, or if an error occurs.  If "n" ==
 * NO_VAL then the entire file is read in
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(const char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int raw_len;			/* length as read, before editing */
	int cut;			/* set when the line was truncated at
					 * a newline or a comment */
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL, *end_part = NULL;
	char *asterisk, *tmp_text = NULL, *save_ptr = NULL, *host_name;
	int total_file_len = 0;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {

		raw_len = line_size = strlen(in_line);
		cut = 0;
		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				cut = 1;
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				/*
				 * Escaped '#': drop the backslash by shifting
				 * the tail left, terminating NUL included so
				 * that the last character is not duplicated.
				 */
				for (j = i; j <= line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				/*
				 * The '#' now sits at i - 1; step back so the
				 * character that followed it is examined too.
				 */
				i--;
				continue;
			}
			in_line[i] = '\0';
			cut = 1;
			break;
		}

		/*
		 * Get the string length again just to in case it changed from
		 * the above loop
		 */
		line_size = strlen(in_line);
		total_file_len += line_size;

		/*
		 * If there was an end section from before set it up to be on
		 * the front of this next chunk.
		 */
		if (end_part) {
			tmp_text = end_part;
			end_part = NULL;
		}

		/*
		 * The buffer was filled without reaching a newline or a
		 * comment.  The test uses the raw length so that removing an
		 * escape character above cannot hide a partial line.
		 */
		if (!cut && (raw_len == (BUFFER_SIZE - 1))) {
			/*
			 * If we filled up the buffer get the end past the last
			 * comma.  We will tack it on the next pass through.
			 */
			char *last_comma = strrchr(in_line, ',');
			if (!last_comma) {
				error("Line %d, of hostfile %s too long",
				      line_num, filename);
				fclose(fp);
				/* the common exit also releases tmp_text */
				goto cleanup_hostfile;
			}
			end_part = xstrdup(last_comma + 1);
			*last_comma = '\0';
		} else
			line_num++;

		xstrcat(tmp_text, in_line);

		/* Skip this line */
		if (tmp_text[0] == '\0')
			continue;

		/* <ctype.h> functions require an unsigned char value */
		if (!isalpha((unsigned char) tmp_text[0]) &&
		    !isdigit((unsigned char) tmp_text[0])) {
			error("Invalid hostfile %s contents on line %d",
			      filename, line_num);
			fclose(fp);
			goto cleanup_hostfile;
		}

		host_name = strtok_r(tmp_text, ",", &save_ptr);
		while (host_name) {
			if ((asterisk = strchr(host_name, '*')) &&
			    (i = atoi(asterisk + 1))) {
				asterisk[0] = '\0';

				/*
				 * Don't forget the extra space potentially
				 * needed
				 */
				total_file_len += strlen(host_name) * i;

				for (j = 0; j < i; j++)
					hostlist_push_host(hostlist, host_name);
			} else {
				hostlist_push_host(hostlist, host_name);
			}
			host_name = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp_text);

		if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in Slurm Hostfile");
		goto cleanup_hostfile;
	}

	/* the caller releases the result with free(), hence plain malloc() */
	total_file_len += 1024;
	nodelist = (char *)malloc(total_file_len);
	if (!nodelist) {
		error("Nodelist xmalloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	/* nodelist is NULL on every error path that jumps here */
	hostlist_destroy(hostlist);
	xfree(end_part);
	xfree(tmp_text);

	return nodelist;
}
Example #3
0
/*
 * _build_single_nodeline_info - create a node record for every NodeName
 *	listed on one slurm.conf node line.
 * node_ptr IN - parsed node line (names, addresses, hostnames, ports, ...)
 * config_ptr IN - configuration record handed to create_node_record()
 * RET 0 if no error, error code otherwise
 * Note: names that already have a node record are reported and skipped.
 */
static int _build_single_nodeline_info(slurm_conf_node_t *node_ptr,
				       struct config_record *config_ptr)
{
	hostlist_t address_list = NULL, alias_list = NULL;
	hostlist_t hostname_list = NULL, port_list = NULL;
	char *address = NULL, *alias = NULL;
	char *hostname = NULL, *port_str = NULL;
	struct node_record *node_rec = NULL;
	int address_count, alias_count, hostname_count, port_count;
	int state_val = NODE_STATE_UNKNOWN;
	int error_code = SLURM_SUCCESS;
	uint16_t port = 0;

	if (node_ptr->state) {
		state_val = state_str2int(node_ptr->state, node_ptr->nodenames);
		if (state_val == NO_VAL)
			goto cleanup;
	}

	/* expand each expression of the node line into a host list */
	address_list = hostlist_create(node_ptr->addresses);
	if (!address_list) {
		fatal("Unable to create NodeAddr list from %s",
		      node_ptr->addresses);
		error_code = errno;
		goto cleanup;
	}
	alias_list = hostlist_create(node_ptr->nodenames);
	if (!alias_list) {
		fatal("Unable to create NodeName list from %s",
		      node_ptr->nodenames);
		error_code = errno;
		goto cleanup;
	}
	hostname_list = hostlist_create(node_ptr->hostnames);
	if (!hostname_list) {
		fatal("Unable to create NodeHostname list from %s",
		      node_ptr->hostnames);
		error_code = errno;
		goto cleanup;
	}

	/*
	 * A port expression holding a range or a list that is not already
	 * bracketed gets wrapped in "[...]" before it is expanded.
	 */
	if (node_ptr->port_str && node_ptr->port_str[0] &&
	    (node_ptr->port_str[0] != '[') &&
	    (strchr(node_ptr->port_str, '-') ||
	     strchr(node_ptr->port_str, ','))) {
		xstrfmtcat(port_str, "[%s]", node_ptr->port_str);
		port_list = hostlist_create(port_str);
		xfree(port_str);
	} else {
		port_list = hostlist_create(node_ptr->port_str);
	}
	if (!port_list) {
		error("Unable to create Port list from %s",
		      node_ptr->port_str);
		error_code = errno;
		goto cleanup;
	}

	/* compare the sizes of the lists before consuming them */
	address_count  = hostlist_count(address_list);
	alias_count    = hostlist_count(alias_list);
	hostname_count = hostlist_count(hostname_list);
	port_count     = hostlist_count(port_list);
#ifdef HAVE_FRONT_END
	if ((hostname_count != alias_count) && (hostname_count != 1)) {
		error("NodeHostname count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
#else
#ifdef MULTIPLE_SLURMD
	if ((address_count != alias_count) && (address_count != 1)) {
		error("NodeAddr count must equal that of NodeName "
		      "records of there must be no more than one");
		goto cleanup;
	}
#else
	if (address_count < alias_count) {
		error("At least as many NodeAddr are required as NodeName");
		goto cleanup;
	}
	if (hostname_count < alias_count) {
		error("At least as many NodeHostname are required "
		      "as NodeName");
		goto cleanup;
	}
#endif	/* MULTIPLE_SLURMD */
#endif	/* HAVE_FRONT_END */
	if ((port_count != alias_count) && (port_count > 1)) {
		error("Port count must equal that of NodeName "
		      "records or there must be no more than one (%u != %u)",
		      port_count, alias_count);
		goto cleanup;
	}

	/*
	 * One pass per NodeName.  A list shorter than the NodeName list
	 * simply stops being consumed, so its last value is reused for the
	 * remaining names.
	 */
	while ((alias = hostlist_shift(alias_list))) {
		if (address_count > 0) {
			address_count--;
			free(address);
			address = hostlist_shift(address_list);
		}
		if (hostname_count > 0) {
			hostname_count--;
			free(hostname);
			hostname = hostlist_shift(hostname_list);
		}
		if (port_count > 0) {
			int port_int;

			port_count--;
			free(port_str);
			port_str = hostlist_shift(port_list);
			port_int = atoi(port_str);
			if ((port_int <= 0) || (port_int > 0xffff))
				fatal("Invalid Port %s", node_ptr->port_str);
			port = port_int;
		}

		/* an already known name is left untouched */
		node_rec = find_node_record(alias);
		if (node_rec) {
			/* FIXME - maybe should be fatal? */
			error("Reconfiguration for node %s, ignoring!", alias);
		} else {
			node_rec = create_node_record(config_ptr, alias);
			if ((state_val != NO_VAL) &&
			    (state_val != NODE_STATE_UNKNOWN))
				node_rec->node_state = state_val;
			node_rec->last_response = (time_t) 0;
			node_rec->comm_name = xstrdup(address);
			node_rec->node_hostname = xstrdup(hostname);
			node_rec->port      = port;
			node_rec->weight    = node_ptr->weight;
			node_rec->features  = xstrdup(node_ptr->feature);
			node_rec->reason    = xstrdup(node_ptr->reason);
		}
		free(alias);
	}

cleanup:
	/* release whatever was obtained above; free(NULL) is harmless */
	free(address);
	free(hostname);
	free(port_str);
	if (address_list)
		hostlist_destroy(address_list);
	if (alias_list)
		hostlist_destroy(alias_list);
	if (hostname_list)
		hostlist_destroy(hostname_list);
	if (port_list)
		hostlist_destroy(port_list);
	return error_code;
}
Example #4
0
/*
 * Read a SLURM hostfile specified by "filename".  "filename" must contain
 * a list of SLURM NodeNames, one per line.  Reads up to "n" number of hostnames
 * from the file. Returns a string representing a hostlist ranged string of
 * the contents of the file.  This is a helper function, it does not
 * contact any SLURM daemons.
 *
 * File syntax handled here:
 *   - every line must start with a letter or a digit
 *   - several names may share a line, separated by commas
 *   - "name*N" adds "name" N times
 *   - '#' starts a comment running to the end of the line; "\#" is a
 *     literal '#'
 *
 * Returns a string representing the hostlist.  Returns NULL if there are fewer
 * than "n" hostnames in the file, or if an error occurs.  If "n" ==
 * NO_VAL then the entire file is read in
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i, j;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL;
	char *asterisk, *tmp_text, *save_ptr = NULL, *host_name;
	int total_file_len = 0;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
		line_num++;
		/* <ctype.h> functions require an unsigned char value */
		if (!isalpha((unsigned char) in_line[0]) &&
		    !isdigit((unsigned char) in_line[0])) {
			error ("Invalid hostfile %s contents on line %d",
			       filename, line_num);
			fclose (fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		line_size = strlen(in_line);
		total_file_len += line_size;
		/* a completely filled buffer means the line did not fit */
		if (line_size == (BUFFER_SIZE - 1)) {
			error ("Line %d, of hostfile %s too long",
			       line_num, filename);
			fclose (fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				/*
				 * Escaped '#': drop the backslash by shifting
				 * the tail left, terminating NUL included so
				 * that the last character is not duplicated.
				 */
				for (j = i; j <= line_size; j++) {
					in_line[j - 1] = in_line[j];
				}
				line_size--;
				/*
				 * The '#' now sits at i - 1; step back so the
				 * character that followed it is examined too.
				 */
				i--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		tmp_text = xstrdup(in_line);
		host_name = strtok_r(tmp_text, ",", &save_ptr);
		while (host_name) {
			if ((asterisk = strchr(host_name, '*')) &&
			    (i = atoi(asterisk + 1))) {
				asterisk[0] = '\0';

				/*
				 * The repeated name takes more room in the
				 * result than it did in the file; reserve it
				 * so the output buffer is not undersized.
				 */
				total_file_len += strlen(host_name) * i;

				for (j = 0; j < i; j++)
					hostlist_push_host(hostlist, host_name);
			} else {
				hostlist_push_host(hostlist, host_name);
			}
			host_name = strtok_r(NULL, ",", &save_ptr);
		}
		xfree(tmp_text);

		if ((n != (int)NO_VAL) && (hostlist_count(hostlist) == n))
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in SLURM Hostfile");
		goto cleanup_hostfile;
	}

	/* the caller releases the result with free(), hence plain malloc() */
	total_file_len += 1024;
	nodelist = (char *)malloc(total_file_len);
	if (!nodelist) {
		error("Nodelist xmalloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, total_file_len, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);

	return nodelist;
}
Example #5
0
/*
 * _filter_out - Decide whether a node is hidden by the node-name,
 *	responding/non-responding and state filters held in "params".
 * node_ptr IN - node to consider filtering out
 * RET - true if node should not be reported, false otherwise
 */
static bool _filter_out(node_info_t *node_ptr)
{
	/* built from params.nodes on first use and kept afterwards */
	static hostlist_t host_list = NULL;
	node_info_t tmp_node, *tmp_node_ptr = &tmp_node;
	ListIterator iterator;
	uint16_t base_state;
	uint16_t cpus = 0;
	int *node_state;
	bool match = false;

	/* node-name filter */
	if (params.nodes) {
		if (!host_list)
			host_list = hostlist_create(params.nodes);
		if (hostlist_find (host_list, node_ptr->name) == -1)
			return true;
	}

	/* responding / non-responding filters */
	if (params.dead_nodes && !IS_NODE_NO_RESPOND(node_ptr))
		return true;

	if (params.responding_nodes && IS_NODE_NO_RESPOND(node_ptr))
		return true;

	/* without a state filter nothing else can hide the node */
	if (!params.state_list)
		return false;

	/* the node is kept as soon as one requested state matches */
	iterator = list_iterator_create(params.state_list);
	while (!match && (node_state = list_next(iterator))) {
		/* lets the IS_NODE_* macros classify the requested state */
		tmp_node_ptr->node_state = *node_state;
		if (*node_state == NODE_STATE_DRAIN) {
			/* anything with the drain flag set qualifies */
			if (IS_NODE_DRAIN(node_ptr))
				match = true;
		} else if (IS_NODE_DRAINING(tmp_node_ptr)) {
			/* requested state maps to DRAINING */
			if (IS_NODE_DRAINING(node_ptr))
				match = true;
		} else if (IS_NODE_DRAINED(tmp_node_ptr)) {
			/* requested state maps to DRAINED */
			if (IS_NODE_DRAINED(node_ptr))
				match = true;
		} else if (*node_state & NODE_STATE_FLAGS) {
			/* flag states: any shared bit is a match */
			if (*node_state & node_ptr->node_state)
				match = true;
		} else if (*node_state == NODE_STATE_ERROR) {
			slurm_get_select_nodeinfo(
				node_ptr->select_nodeinfo,
				SELECT_NODEDATA_SUBCNT,
				NODE_STATE_ERROR,
				&cpus);
			if (cpus)
				match = true;
		} else if (*node_state == NODE_STATE_ALLOCATED) {
			slurm_get_select_nodeinfo(
				node_ptr->select_nodeinfo,
				SELECT_NODEDATA_SUBCNT,
				NODE_STATE_ALLOCATED,
				&cpus);
			/* with CLUSTER_FLAG_BG set and no sub-count reported,
			 * fall back to the cpu count of an allocated or
			 * completing node */
			if (params.cluster_flags & CLUSTER_FLAG_BG
			    && !cpus &&
			    (IS_NODE_ALLOCATED(node_ptr) ||
			     IS_NODE_COMPLETING(node_ptr)))
				cpus = node_ptr->cpus;
			if (cpus)
				match = true;
		} else if (*node_state == NODE_STATE_IDLE) {
			/* only the "no respond" flag is ignored here */
			base_state = node_ptr->node_state &
				(~NODE_STATE_NO_RESPOND);
			if (base_state == NODE_STATE_IDLE)
				match = true;
		} else {
			base_state = node_ptr->node_state & NODE_STATE_BASE;
			if (base_state == *node_state)
				match = true;
		}
	}
	list_iterator_destroy(iterator);

	return !match;
}
Example #6
0
/*
 * parse_blockreq - build a select_ba_request_t from one block definition
 *	line of the configuration file.
 * dest OUT - set to the newly allocated request when 1 is returned
 * type IN - not used by this function
 * key IN - not used by this function
 * value IN - host expression naming the block; NULL means no request
 * line IN - not used by this function
 * leftover IN/OUT - remaining text of the line; parsed for the block options
 *	and advanced by s_p_parse_line()
 * RET 1 if a request was stored in *dest, 0 if "value" was NULL
 */
extern int parse_blockreq(void **dest, slurm_parser_enum_t type,
			  const char *key, const char *value,
			  const char *line, char **leftover)
{
	s_p_options_t block_options[] = {
		{"Type", S_P_STRING},
		{"32CNBlocks", S_P_UINT16},
		{"128CNBlocks", S_P_UINT16},
#ifdef HAVE_BGL
		{"Nodecards", S_P_UINT16},
		{"Quarters", S_P_UINT16},
		{"BlrtsImage", S_P_STRING},
		{"LinuxImage", S_P_STRING},
		{"RamDiskImage", S_P_STRING},
#else
		{"16CNBlocks", S_P_UINT16},
		{"64CNBlocks", S_P_UINT16},
		{"256CNBlocks", S_P_UINT16},
		{"CnloadImage", S_P_STRING},
		{"IoloadImage", S_P_STRING},
#endif
		{"MloaderImage", S_P_STRING},
		{NULL}
	};
	s_p_hashtbl_t *tbl;
	char *tmp = NULL;
	select_ba_request_t *n = NULL;
	hostlist_t hl = NULL;

	/* the options are always parsed so that *leftover is advanced */
	tbl = s_p_hashtbl_create(block_options);
	s_p_parse_line(tbl, *leftover, leftover);
	if (!value) {
		/* nothing to build: release the table instead of leaking it */
		s_p_hashtbl_destroy(tbl);
		return 0;
	}
	n = xmalloc(sizeof(select_ba_request_t));
	/* store the block name in ranged form */
	hl = hostlist_create(value);
	n->save_name = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
#ifdef HAVE_BGL
	s_p_get_string(&n->blrtsimage, "BlrtsImage", tbl);
	s_p_get_string(&n->linuximage, "LinuxImage", tbl);
	s_p_get_string(&n->ramdiskimage, "RamDiskImage", tbl);
#else
	s_p_get_string(&n->linuximage, "CnloadImage", tbl);
	s_p_get_string(&n->ramdiskimage, "IoloadImage", tbl);
#endif
	s_p_get_string(&n->mloaderimage, "MloaderImage", tbl);

	/* connection type: TORUS when absent, SMALL for unknown values */
	s_p_get_string(&tmp, "Type", tbl);
	if (!tmp || !strcasecmp(tmp,"TORUS"))
		n->conn_type[0] = SELECT_TORUS;
	else if (!strcasecmp(tmp,"MESH"))
		n->conn_type[0] = SELECT_MESH;
	else
		n->conn_type[0] = SELECT_SMALL;
	xfree(tmp);

	/* on BGL "Nodecards" and "Quarters" are accepted as fallback names */
	if (!s_p_get_uint16(&n->small32, "32CNBlocks", tbl)) {
#ifdef HAVE_BGL
		s_p_get_uint16(&n->small32, "Nodecards", tbl);
#else
		;
#endif
	}
	if (!s_p_get_uint16(&n->small128, "128CNBlocks", tbl)) {
#ifdef HAVE_BGL
		s_p_get_uint16(&n->small128, "Quarters", tbl);
#else
		;
#endif
	}

#ifndef HAVE_BGL
	s_p_get_uint16(&n->small16, "16CNBlocks", tbl);
	s_p_get_uint16(&n->small64, "64CNBlocks", tbl);
	s_p_get_uint16(&n->small256, "256CNBlocks", tbl);
#endif

	s_p_hashtbl_destroy(tbl);

	*dest = (void *)n;
	return 1;
}
Example #7
0
/*
 * create_small_record - allocate a small-block record derived from an
 *	existing block record.
 * bg_record IN - block supplying the user, the images and the midplane
 * ionodes IN - ionode bitmap copied into the new record
 * size IN - stored as the cnode count; the cpu count is
 *	bg_conf->cpu_ratio * size
 * RET the newly allocated record
 */
extern bg_record_t *create_small_record(bg_record_t *bg_record,
					bitstr_t *ionodes, int size)
{
	char bitstring[BITSIZE];
	ba_mp_t *src_mp = NULL;
	ba_mp_t *copy_mp = NULL;
	bg_record_t *found_record = NULL;

	found_record = (bg_record_t*) xmalloc(sizeof(bg_record_t));
	found_record->magic = BLOCK_MAGIC;

	found_record->job_running = NO_JOB_RUNNING;
	found_record->user_name = xstrdup(bg_record->user_name);
	found_record->user_uid = bg_record->user_uid;
	found_record->ba_mp_list = list_create(destroy_ba_mp);

	/* the first midplane of the source block, if it has any */
	if (bg_record->ba_mp_list)
		src_mp = list_peek(bg_record->ba_mp_list);

	if (src_mp) {
		copy_mp = ba_copy_mp(src_mp);
		/* We need to have this node wrapped in Q to handle
		   wires correctly when creating around the midplane.
		*/
		ba_setup_mp(copy_mp, false, true);

		copy_mp->used = BA_MP_USED_TRUE;
		list_append(found_record->ba_mp_list, copy_mp);
		found_record->mp_count = 1;
		found_record->mp_str = xstrdup_printf(
			"%s%s",
			bg_conf->slurm_node_prefix, copy_mp->coord_str);
	} else if (bg_record->mp_str) {
		/* no midplane: fall back to the first name in mp_str */
		hostlist_t names = hostlist_create(bg_record->mp_str);
		char *first = hostlist_shift(names);

		hostlist_destroy(names);
		found_record->mp_str = xstrdup(first);
		free(first);
		error("you gave me a list with no ba_mps using %s",
		      found_record->mp_str);
	} else {
		/* neither midplane nor names: build a name from the start
		 * coordinates of the new record */
		char coords[SYSTEM_DIMENSIONS+1];
		int d;

		for (d = 0; d < SYSTEM_DIMENSIONS; d++)
			coords[d] = alpha_num[found_record->start[d]];
		coords[d] = '\0';
		found_record->mp_str = xstrdup_printf(
			"%s%s",
			bg_conf->slurm_node_prefix,
			coords);
		error("you gave me a record with no ba_mps "
		      "and no nodes either using %s",
		      found_record->mp_str);
	}

#ifdef HAVE_BGL
	found_record->node_use = SELECT_COPROCESSOR_MODE;
	found_record->blrtsimage = xstrdup(bg_record->blrtsimage);
#endif
#ifdef HAVE_BG_L_P
	found_record->linuximage = xstrdup(bg_record->linuximage);
	found_record->ramdiskimage = xstrdup(bg_record->ramdiskimage);
#endif
	found_record->mloaderimage = xstrdup(bg_record->mloaderimage);

	process_nodes(found_record, false);

	found_record->conn_type[0] = SELECT_SMALL;

	xassert(bg_conf->cpu_ratio);
	found_record->cnode_cnt = size;
	found_record->cpu_cnt = bg_conf->cpu_ratio * size;

	/* keep the ionodes both as a bitmap and as a formatted string */
	found_record->ionode_bitmap = bit_copy(ionodes);
	bit_fmt(bitstring, BITSIZE, found_record->ionode_bitmap);
	found_record->ionode_str = xstrdup(bitstring);
	if (bg_conf->slurm_debug_flags & DEBUG_FLAG_BG_PICK)
		info("made small block of %s[%s]",
		     found_record->mp_str, found_record->ionode_str);
	return found_record;
}
Example #8
0
/*
 * Return task list in Moab format 2: tux[0-1]*2:tux2
 *
 * Consecutive nodes with the same task count are gathered in one hostlist;
 * each time the count changes the pending hostlist is handed to
 * _append_hl_buf() together with the output buffer and the count.
 */
static char * _task_list_exp(struct job_record *job_ptr)
{
	job_resources_t *job_resrcs_ptr = job_ptr->job_resrcs;
	hostlist_t hl_tmp = (hostlist_t) NULL;
	char *buf = NULL, *host;
	int i, node_inx = 0, reps = -1, task_cnt;

	xassert(job_resrcs_ptr);
#ifdef HAVE_BG
	/* a single task count applies to every node */
	if(job_ptr->node_cnt) {
		task_cnt = job_resrcs_ptr->cpu_array_value[0];
	} else
		task_cnt = 1;
#endif
	for (i=0; i<job_resrcs_ptr->nhosts; i++) {
		/* locate the i-th bit set in the job's node bitmap */
		if (i == 0) {
			xassert(job_resrcs_ptr->cpus &&
				job_resrcs_ptr->node_bitmap);
			node_inx = bit_ffs(job_resrcs_ptr->node_bitmap);
		} else {
			node_inx++;
			while ((node_inx < node_record_count) &&
			       !bit_test(job_resrcs_ptr->node_bitmap,
					 node_inx))
				node_inx++;
			if (node_inx >= node_record_count) {
				error("Improperly formed job_resrcs for %u",
				      job_ptr->job_id);
				break;
			}
		}
		host = node_record_table_ptr[node_inx].name;

#ifndef HAVE_BG
		/* tasks on this node: its cpus divided by cpus per task */
		task_cnt = job_resrcs_ptr->cpus[i];
		if (job_ptr->details && job_ptr->details->cpus_per_task)
			task_cnt /= job_ptr->details->cpus_per_task;
		if (task_cnt < 1) {
			error("Invalid task_cnt for job %u on node %s",
			      job_ptr->job_id, host);
			task_cnt = 1;
		}
#endif
		if (reps != task_cnt) {
			/* count changed: emit what was gathered so far */
			if (hl_tmp)
				_append_hl_buf(&buf, &hl_tmp, &reps);

			/* and open a new group with this host */
			hl_tmp = hostlist_create(host);
			if (!hl_tmp)
				error("hostlist_create failure");
			else
				reps = task_cnt;
		} else if (hostlist_push(hl_tmp, host) == 0) {
			/* same count: the host joins the current group */
			error("hostlist_push failure");
		}
	}
	/* emit the last group */
	if (hl_tmp)
		_append_hl_buf(&buf, &hl_tmp, &reps);
	return buf;
}
Example #9
0
/*
 * Translate a Moab TASKLIST expression into a SLURM hostlist expression
 *
 * Accepted input:
 *   Moab format 1: tux0:tux0:tux1:tux1:tux2   (one entry per cpu)
 *   Moab format 2: tux[0-1]*2:tux2            (cpu count follows host name)
 *
 * Produced output:
 *   tux0,tux0,tux1,tux1,tux2   (consumable resources enabled)
 *   tux0,tux1,tux2             (consumable resources disabled)
 *
 * IN  moab_tasklist - Moab expression, not modified
 * OUT task_cnt      - number of tasks described by the expression
 * RET SLURM formatted list
 *
 * NOTE: returned string must be released with xfree()
 */
extern char * moab2slurm_task_list(char *moab_tasklist, int *task_cnt)
{
	static uint32_t cr_test = 0, cr_enabled = 0;
	char *slurm_tasklist = NULL, *work_copy = NULL, *star = NULL;
	char *node = NULL, *entry = NULL, *save_ptr = NULL, *cp;
	int n, reps, copies;
	hostlist_t hl;

	/* Ask the select plugin only on the first call; result is cached */
	if (!cr_test) {
		select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL,
						&cr_enabled);
		cr_test = 1;
	}

	*task_cnt = 0;

	/* Only Moab format 2 may contain '*' or '[' */
	if (!strchr(moab_tasklist, '*') && !strchr(moab_tasklist, '[')) {
		/* Moab format 1: swap ':' for ',' and count the entries */
		slurm_tasklist = xstrdup(moab_tasklist);
		if (moab_tasklist[0] != '\0')
			*task_cnt = 1;
		for (cp = slurm_tasklist; *cp != '\0'; cp++) {
			if (*cp == ':')
				*cp = ',';
			if (*cp == ',')
				(*task_cnt)++;
		}
		return slurm_tasklist;
	}

	/* Moab format 2: handle each ':' separated entry in turn */
	slurm_tasklist = xstrdup("");
	work_copy = xstrdup(moab_tasklist);
	for (entry = strtok_r(work_copy, ":", &save_ptr); entry;
	     entry = strtok_r(NULL, ":", &save_ptr)) {
		/* repetition count follows "*", defaults to 1 */
		reps = 1;
		star = strchr(entry, '*');
		if (star) {
			reps = atoi(star + 1);
			star[0] = '\0';	/* leave only the host expression */
		}

		/* expand the host expression */
		hl = hostlist_create(entry);
		while ((node = hostlist_shift(hl))) {
			/* without consumable resources a host is listed
			 * at most once, otherwise once per task */
			copies = reps;
			if (!cr_enabled && (copies > 1))
				copies = 1;
			for (n = 0; n < copies; n++) {
				if (slurm_tasklist[0])
					xstrcat(slurm_tasklist, ",");
				xstrcat(slurm_tasklist, node);
			}
			free(node);
			(*task_cnt) += reps;
		}
		hostlist_destroy(hl);
	}
	xfree(work_copy);
	return slurm_tasklist;
}
Example #10
0
/*
 * launch_p_create_job_step - translate the srun options in "opt" into the
 *	POE environment (MP_* variables) and, when opt.launch_cmd is set,
 *	into an equivalent poe command line (poe_cmd_line).
 * IN job - srun job; when non-NULL its ntasks/nhosts replace
 *	opt.ntasks/opt.min_nodes. Must be non-NULL if opt.multi_prog is set.
 * IN use_all_cpus - not referenced by this implementation
 * IN signal_function - not referenced by this implementation
 * IN destroy_job - not referenced by this implementation
 * RET SLURM_SUCCESS, or SLURM_ERROR if opt.launch_cmd is combined with a
 *	command file (opt.multi_prog)
 * NOTE: if opt.launch_cmd is set, the generated command line is printed to
 *	stdout and the process exits with status 0 instead of returning.
 * NOTE: fatal() is called if the command file or the hostfile can not be
 *	created or written.
 */
extern int launch_p_create_job_step(srun_job_t *job, bool use_all_cpus,
				    void (*signal_function)(int),
				    sig_atomic_t *destroy_job)
{
	char dname[512], value[32];
	char *protocol = "mpi";
	uint32_t ntasks = opt.ntasks;
	uint32_t nnodes = opt.min_nodes;

	if (opt.launch_cmd) {
		int i;

		/* Start the command line with the user's own arguments */
		xstrfmtcat(poe_cmd_line, "%s", opt.argv[0]);
		for (i = 1; i < opt.argc; i++)
			xstrfmtcat(poe_cmd_line, " %s", opt.argv[i]);
	}

	if (job) {
		/* poe can't accept ranges so give the actual number
		   here so it doesn't get confused if srun gives the
		   max instead of the min.
		*/
		ntasks = job->ntasks;
		nnodes = job->nhosts;
	}

	/*
	 * In order to support MPMD or job steps smaller than the LoadLeveler
	 * job allocation size, specify a command file using the poe option
	 * -cmdfile or MP_CMDFILE env var. See page 43 here:
	 * http://publib.boulder.ibm.com/epubs/pdf/c2367811.pdf
	 * The command file should contain one or more lines of the following
	 * form:
	 * <cmd>@<step_id>%<total_tasks>%<protocol>:<num_tasks> <args>
	 * IBM is working to eliminate the need to specify protocol, but until
	 * then it might be determined as follows:
	 *
	 * We are currently looking at 'ldd <program>' and checking the name of
	 * the MPI and PAMI libraries and on x86, also checking to see if Intel
	 * MPI library is used.
	 * This is done at runtime in PMD and depending on '-mpilib' option and
	 * '-config' option used, change the LD_LIBRARY_PATH to properly
	 * support the different PE Runtime levels the customer have installed
	 * on their cluster.
	 *
	 * There is precedence order that would be important if multiple
	 * libraries are listed in the 'ldd output' as long as you know it is
	 * not a mixed protocol (i.e. Openshmem + MPI, UPC + MPI, etc)
	 * application.
	 * 1) If MPI library is found (libmpi*.so) -> use 'mpi'
	 * 2) if Openshmem library is found (libshmem.so) -> use 'shmem'
	 * 3) if UPC runtime library is found (libxlpgas.so) -> use 'pgas'
	 * 4) if only PAMI library is found (libpami.so) -> use 'pami'
	 * 5) if only LAPI library is found (liblapi.so) -> use 'lapi'
	 */
	if (opt.multi_prog) {
		protocol = "multi";
	} else {
		protocol = _get_cmd_protocol(opt.argv[1]);
	}
	debug("cmd:%s protcol:%s", opt.argv[1], protocol);

	if (opt.multi_prog) {
		int fd, k;

		if (opt.launch_cmd) {
			error("--launch_cmd not available "
			      "when using a cmdfile");
			return SLURM_ERROR;
		}
		xassert(job);
		/* NOTE: The command file needs to be in a directory that can
		 * be read from the compute node(s), so /tmp does not work.
		 * We use the user's home directory (based upon "HOME"
		 * environment variable) otherwise use current working
		 * directory. The file is only created here, it is written
		 * in launch_poe.c. */
		_build_work_dir(dname, sizeof(dname));
		xstrfmtcat(cmd_fname, "%s/slurm_cmdfile.%u",
			   dname, (uint32_t) getpid());
		/* Retry creat() if interrupted by a signal */
		while ((fd = creat(cmd_fname, 0600)) < 0) {
			if (errno == EINTR)
				continue;
			fatal("creat(%s): %m", cmd_fname);
		}
		(void) close(fd);

		/* Set command file name via MP_CMDFILE and remove it from
		 * the execute line. */
		setenv("MP_NEWJOB", "parallel", 1);
		setenv("MP_CMDFILE", cmd_fname, 1);
		setenv("SLURM_CMDFILE", opt.argv[1], 1);
		if (opt.argc) {
			/* Drop argv[1] and shift the remaining arguments
			 * (including the entry at index argc) down by one */
			xfree(opt.argv[1]);
			for (k = 1; k < opt.argc; k++)
				opt.argv[k] = opt.argv[k + 1];
			opt.argc--;
		}
	}

	if (opt.shared != (uint16_t) NO_VAL) {
		char *shared_cpu_use = "multiple";

		if (opt.shared)
			shared_cpu_use = "unique";

		setenv("MP_CPU_USE", shared_cpu_use, 1);

		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -cpu_use %s",
				   shared_cpu_use);
	}
	if (opt.network) {
		bool cau_set = false;
		bool dev_type_set = false;
		bool protocol_set = false;
		char *type_ptr = NULL;
		char *save_ptr = NULL, *token;
		/* Private copy: strtok_r() below modifies the string */
		char *network_str = xstrdup(opt.network);
		char *adapter_use = NULL;

		if (strstr(opt.network, "dedicated"))
			adapter_use = "dedicated";
		else if (strstr(opt.network, "shared"))
			adapter_use = "shared";

		if (adapter_use) {
			setenv("MP_ADAPTER_USE", adapter_use, 1);
			if (opt.launch_cmd)
				xstrfmtcat(poe_cmd_line, " -adapter_use %s",
					   adapter_use);
		}

		token = strtok_r(network_str, ",", &save_ptr);
		while (token) {
			/* bulk_xfer options */
			if (!strncasecmp(token, "bulk_xfer", 9)) {
				setenv("MP_USE_BULK_XFER", "yes", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -use_bulk_xfer yes");
			/* device name options */
			} else if (!strncasecmp(token, "devname=", 8)) {
				/* Ignored by POE */

			/* device type options */
			} else if (!strncasecmp(token, "devtype=", 8)) {
				type_ptr = token + 8;
				if (!strcasecmp(type_ptr, "ib")) {
					setenv("MP_DEVTYPE", type_ptr, 1);
					if (opt.launch_cmd)
						xstrfmtcat(poe_cmd_line,
							   " -devtype %s",
							   type_ptr);
				} else if (!strcasecmp(type_ptr, "hfi")) {
					setenv("MP_DEVTYPE", type_ptr, 1);
					if (opt.launch_cmd)
						xstrfmtcat(poe_cmd_line,
							   " -devtype %s",
							   type_ptr);
				}
				dev_type_set = true;
				/* POE ignores other options */

			/* instances options */
			} else if (!strncasecmp(token, "instances=", 10)) {
				/* Ignored */

			/* network options */
			} else if (!strcasecmp(token, "ip")   ||
				  !strcasecmp(token, "ipv4")  ||
				  !strcasecmp(token, "ipv6")) {
				setenv("MP_EUILIB", "ip", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euilib ip");
			} else if (!strcasecmp(token, "us")) {
				setenv("MP_EUILIB", "us", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euilib us");
			/* protocol options */
			} else if ((!strncasecmp(token, "lapi", 4)) ||
				   (!strncasecmp(token, "mpi",  3)) ||
				   (!strncasecmp(token, "pami", 4)) ||
				   (!strncasecmp(token, "upc",  3))) {
				/* The first user supplied protocol replaces
				 * the detected default; later ones are
				 * appended as a comma separated list held
				 * in xmalloc'ed memory. */
				if (!protocol_set) {
					protocol_set = true;
					protocol = NULL;
				}
				if (protocol)
					xstrcat(protocol, ",");
				xstrcat(protocol, token);
				setenv("MP_MSG_API", protocol, 0);
			/* adapter options */
			} else if (!strcasecmp(token, "sn_all")) {
				setenv("MP_EUIDEVICE", "sn_all", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euidevice sn_all");
			} else if (!strcasecmp(token, "sn_single")) {
				setenv("MP_EUIDEVICE", "sn_single", 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -euidevice sn_single");
			/* Collective Acceleration Units (CAU) */
			} else if (!strncasecmp(token, "cau=", 4)) {
				setenv("MP_COLLECTIVE_GROUPS", token + 4, 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -collective_groups %s",
						   token + 4);
				if (atoi(token + 4))
					cau_set = true;
			/* Immediate Send Slots Per Window */
			} else if (!strncasecmp(token, "immed=", 6)) {
				setenv("MP_IMM_SEND_BUFFERS", token + 6, 1);
				if (opt.launch_cmd)
					xstrfmtcat(poe_cmd_line,
						   " -imm_send_buffers %s",
						   token + 6);
			/* other */
			} else {
				info("switch/nrt: invalid option: %s", token);
			}
			token = strtok_r(NULL, ",", &save_ptr);
		}
		/* All tokens have been copied into the environment or the
		 * command line, so the working copy can be released. */
		xfree(network_str);

		if (cau_set && !dev_type_set) {
			/* If POE is executed directly (not spawned by srun)
			 * it will generate an error if -collective_groups is
			 * non-zero and devtype is not set. Since we do not
			 * know what devices are available at this point, set
			 * the default type to hfi in hopes of avoiding an
			 * error. User can always specify a devtype in the
			 * --network option to avoid possible invalid value */
			/* No devtype= token was seen on this path, so the
			 * literal default must be used here. */
			setenv("MP_DEVTYPE", "hfi", 1);
			if (opt.launch_cmd)
				xstrcat(poe_cmd_line, " -devtype hfi");
		}

		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -msg_api %s", protocol);
		/* protocol is heap memory only if built from the tokens */
		if (protocol_set)
			xfree(protocol);
		else
			setenv("MP_MSG_API", protocol, 0);
	} else {
		if (!strcmp(protocol, "multi")) {
			setenv("MP_MSG_API", protocol, 0);
			if (opt.launch_cmd)
				xstrfmtcat(poe_cmd_line,
					   " -msg_api %s", protocol);
		}
	}

	if (opt.nodelist && (opt.distribution == SLURM_DIST_ARBITRARY)) {
		bool destroy_hostfile = 0;
		if (!opt.hostfile) {
			/* No hostfile given: write one host per line into
			 * slurm_hostlist.<pid> in the current directory */
			char *host_name, *host_line;
			pid_t pid = getpid();
			hostlist_t hl;
			int fd, len, offset, wrote;

			destroy_hostfile = 1;

			hl = hostlist_create(opt.nodelist);
			if (!hl)
				fatal("Invalid nodelist: %s", opt.nodelist);
			xstrfmtcat(opt.hostfile, "slurm_hostlist.%u",
				   (uint32_t) pid);
			if ((fd = creat(opt.hostfile, 0600)) < 0)
				fatal("creat(%s): %m", opt.hostfile);
			host_line = NULL;
			while ((host_name = hostlist_shift(hl))) {
				if (host_line)
					xstrcat(host_line, "\n");
				xstrcat(host_line, host_name);
				free(host_name);
			}
			hostlist_destroy(hl);
			/* NOTE(review): host_line stays NULL if the hostlist
			 * was empty - confirm callers never pass an empty
			 * nodelist here. */
			/* +1: the terminating NUL is written as well */
			len = strlen(host_line) + 1;
			offset = 0;
			while (len > offset) {
				wrote = write(fd, host_line + offset,
					      len - offset);
				if (wrote < 0) {
					if ((errno == EAGAIN) ||
					    (errno == EINTR))
						continue;
					fatal("write(%s): %m", opt.hostfile);
				}
				offset += wrote;
			}
			xfree(host_line);
			close(fd);
		}
		debug2("using hostfile %s", opt.hostfile);
		setenv("MP_HOSTFILE", opt.hostfile, 1);
		if (opt.launch_cmd) {
			xstrfmtcat(poe_cmd_line, " -hfile %s", opt.hostfile);
			if (destroy_hostfile)
				info("WARNING: hostlist file %s was created.  "
				     "User is responsible to remove it when "
				     "done.", opt.hostfile);
		} else if (destroy_hostfile)
			setenv("SRUN_DESTROY_HOSTFILE", opt.hostfile, 1);

		/* RESD has to be set to yes or for some reason poe
		   thinks things are already set up and then we are
		   screwed.
		*/
		setenv("MP_RESD", "yes", 1);
		if (opt.launch_cmd)
			xstrcat(poe_cmd_line, " -resd yes");
		/* FIXME: This next line is here just for debug
		 * purpose.  It makes it so each task has a separate
		 * line. */
		setenv("MP_STDOUTMODE", "unordered", 1);
		/* Just in case we didn't specify a file in srun. */
		setenv("SLURM_ARBITRARY_NODELIST", opt.nodelist, 1);
	} else {
		/* Since poe doesn't need to know about the partition and it
		   really needs to have RMPOOL set just set it to something.
		   This only needs to happen if we don't specify the
		   hostlist like above.
		*/
		setenv("MP_RMPOOL", "SLURM", 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -rmpool slurm");
	}

	if (opt.msg_timeout) {
		snprintf(value, sizeof(value), "%d", opt.msg_timeout);
		setenv("MP_TIMEOUT", value, 1);
		/* There is no equivalent cmd line option */
	}
	if (opt.immediate) {
		setenv("MP_RETRY", "0", 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -retry 0");
	}
	if (opt.labelio) {
		setenv("MP_LABELIO", "yes", 0);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -labelio yes");
	}
	if (nnodes) {
		snprintf(value, sizeof(value), "%u", nnodes);
		setenv("MP_NODES", value, 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -nodes %s", value);
	}
	if (ntasks) {
		snprintf(value, sizeof(value), "%u", ntasks);
		setenv("MP_PROCS", value, 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -procs %s", value);
	}
	if (opt.cpu_bind_type) {
		/* POE supports a limited subset of CPU binding options */
		opt.cpu_bind_type &= (CPU_BIND_TO_THREADS |
				      CPU_BIND_TO_CORES   |
				      CPU_BIND_RANK);
	}
	if (opt.cpu_bind_type) {
		char *units;
		int count = 1;

		if (opt.cpu_bind_type & CPU_BIND_TO_CORES)
			units = "core";
		else
			units = "cpu";

		if (opt.cpus_per_task)
			count = MAX(opt.cpus_per_task, 1);
		snprintf(value, sizeof(value), "%s:%d", units, count);
		setenv("MP_TASK_AFFINITY", value, 1);
		setenv("MP_BINDPROC", "yes", 1);
		if (opt.launch_cmd) {
			xstrfmtcat(poe_cmd_line, " -task_affinity %s", value);
			xstrfmtcat(poe_cmd_line, " -bindproc yes");
		}
	}
	if (opt.ntasks_per_node != NO_VAL) {
		snprintf(value, sizeof(value), "%u", opt.ntasks_per_node);
		setenv("MP_TASKS_PER_NODE", value, 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line, " -tasks_per_node %s", value);
	}
	if (opt.unbuffered) {
		setenv("MP_STDOUTMODE", "unordered", 1);
		if (opt.launch_cmd)
			xstrfmtcat(poe_cmd_line,
				   " -stdoutmode unordered");
	}

	_propagate_srun_opts(nnodes, ntasks);
	setenv("SLURM_STARTED_STEP", "YES", 1);
	//disable_status = opt.disable_status;
	//quit_on_intr = opt.quit_on_intr;
	//srun_jobid = xstrdup(opt.jobid);

	if (opt.launch_cmd) {
		/* Only report the equivalent poe command, then stop */
		printf("%s\n", poe_cmd_line);
		xfree(poe_cmd_line);

		exit(0);
	}
	return SLURM_SUCCESS;
}
Example #11
0
/*
 * sacctmgr_list_cluster - query accounting storage for clusters and print
 *	them as a table, one row per cluster.
 * IN argc - number of entries in argv
 * IN argv - option words; a "Where" or "Set" keyword is skipped and the
 *	following words are handed to _set_cond() to fill in the query
 *	condition and the requested output format
 * RET SLURM_SUCCESS, or SLURM_ERROR if exit_code is set after option or
 *	format processing, or if the storage query returns no list
 */
extern int sacctmgr_list_cluster(int argc, char *argv[])
{
	int rc = SLURM_SUCCESS;
	slurmdb_cluster_cond_t *cluster_cond =
		xmalloc(sizeof(slurmdb_cluster_cond_t));
	List format_list = list_create(slurm_destroy_char);
	List print_fields_list; /* types are of print_field_t */
	List cluster_list;
	ListIterator cluster_itr = NULL;
	ListIterator field_itr = NULL;
	slurmdb_cluster_rec_t *cluster = NULL;
	print_field_t *field = NULL;
	char *qos_name = NULL;
	int total_fields = 0;
	int arg_inx = 0;

	/* Parse the command line into a query condition and a format */
	slurmdb_init_cluster_cond(cluster_cond, 0);
	cluster_cond->cluster_list = list_create(slurm_destroy_char);
	for (arg_inx = 0; arg_inx < argc; arg_inx++) {
		int arg_len = strlen(argv[arg_inx]);

		if (!strncasecmp(argv[arg_inx], "Where", MAX(arg_len, 5)) ||
		    !strncasecmp(argv[arg_inx], "Set", MAX(arg_len, 3)))
			arg_inx++;
		_set_cond(&arg_inx, argc, argv, cluster_cond, format_list);
	}

	if (exit_code) {
		slurmdb_destroy_cluster_cond(cluster_cond);
		list_destroy(format_list);
		return SLURM_ERROR;
	}

	/* No format requested: fall back to the default column set */
	if (!list_count(format_list)) {
		slurm_addto_char_list(format_list,
				      "Cl,Controlh,Controlp,RPC");
		if (!without_limits)
			slurm_addto_char_list(format_list,
					      "Fa,GrpJ,GrpN,GrpS,MaxJ,MaxN,"
					      "MaxS,MaxW,QOS,DefaultQOS");
	}

	cluster_cond->with_deleted = with_deleted;

	print_fields_list = sacctmgr_process_format_list(format_list);
	list_destroy(format_list);

	if (exit_code) {
		slurmdb_destroy_cluster_cond(cluster_cond);
		list_destroy(print_fields_list);
		return SLURM_ERROR;
	}

	cluster_list = acct_storage_g_get_clusters(db_conn, my_uid,
						   cluster_cond);
	slurmdb_destroy_cluster_cond(cluster_cond);

	if (!cluster_list) {
		exit_code=1;
		fprintf(stderr, " Problem with query.\n");
		list_destroy(print_fields_list);
		return SLURM_ERROR;
	}

	cluster_itr = list_iterator_create(cluster_list);
	field_itr = list_iterator_create(print_fields_list);
	print_fields_header(print_fields_list);

	total_fields = list_count(print_fields_list);

	for (cluster = list_next(cluster_itr); cluster;
	     cluster = list_next(cluster_itr)) {
		slurmdb_association_rec_t *assoc = cluster->root_assoc;
		int curr_inx;

		/* set up the working cluster rec so nodecnt's and node names
		 * are handled correctly */
		working_cluster_rec = cluster;

		/* One pass over the requested columns per cluster */
		for (curr_inx = 1; (field = list_next(field_itr));
		     curr_inx++) {
			/* non-zero only for the final column of the row */
			int last_col = (curr_inx == total_fields);

			switch (field->type) {
			case PRINT_CLUSTER:
				field->print_routine(field, cluster->name,
						     last_col);
				break;
			case PRINT_CHOST:
				field->print_routine(field,
						     cluster->control_host,
						     last_col);
				break;
			case PRINT_CPORT:
				field->print_routine(field,
						     cluster->control_port,
						     last_col);
				break;
			case PRINT_CLASS:
				field->print_routine(
					field,
					get_classification_str(
						cluster->classification),
					last_col);
				break;
			case PRINT_CPUS:
			{
				char cpu_str[9];

				convert_num_unit((float)cluster->cpu_count,
						 cpu_str, sizeof(cpu_str),
						 UNIT_NONE);
				field->print_routine(field, cpu_str,
						     last_col);
				break;
			}
			case PRINT_DQOS:
				/* QOS list is loaded on first use only */
				if (!g_qos_list) {
					g_qos_list = acct_storage_g_get_qos(
						db_conn, my_uid, NULL);
				}
				qos_name = slurmdb_qos_str(g_qos_list,
							   assoc->def_qos_id);
				field->print_routine(field, qos_name,
						     last_col);
				break;
			case PRINT_FAIRSHARE:
				field->print_routine(field,
						     assoc->shares_raw,
						     last_col);
				break;
			case PRINT_FLAGS:
			{
				char *flag_str = slurmdb_cluster_flags_2_str(
					cluster->flags);

				field->print_routine(field, flag_str,
						     last_col);
				xfree(flag_str);
				break;
			}
			case PRINT_GRPC:
				field->print_routine(field, assoc->grp_cpus,
						     last_col);
				break;
			case PRINT_GRPJ:
				field->print_routine(field, assoc->grp_jobs,
						     last_col);
				break;
			case PRINT_GRPMEM:
				field->print_routine(field, assoc->grp_mem,
						     last_col);
				break;
			case PRINT_GRPN:
				field->print_routine(field, assoc->grp_nodes,
						     last_col);
				break;
			case PRINT_GRPS:
				field->print_routine(field,
						     assoc->grp_submit_jobs,
						     last_col);
				break;
			case PRINT_MAXCM:
				field->print_routine(field,
						     assoc->max_cpu_mins_pj,
						     last_col);
				break;
			case PRINT_MAXC:
				field->print_routine(field,
						     assoc->max_cpus_pj,
						     last_col);
				break;
			case PRINT_MAXJ:
				field->print_routine(field, assoc->max_jobs,
						     last_col);
				break;
			case PRINT_MAXN:
				field->print_routine(field,
						     assoc->max_nodes_pj,
						     last_col);
				break;
			case PRINT_MAXS:
				field->print_routine(field,
						     assoc->max_submit_jobs,
						     last_col);
				break;
			case PRINT_MAXW:
				field->print_routine(field,
						     assoc->max_wall_pj,
						     last_col);
				break;
			case PRINT_NODECNT:
			{
				/* Count hosts in the cluster's node
				 * expression; 0 if it can not be parsed */
				int node_cnt = 0;
				hostlist_t node_hl =
					hostlist_create(cluster->nodes);

				if (node_hl) {
					node_cnt = hostlist_count(node_hl);
					hostlist_destroy(node_hl);
				}
				field->print_routine(field, node_cnt,
						     last_col);
				break;
			}
			case PRINT_CLUSTER_NODES:
				field->print_routine(field, cluster->nodes,
						     last_col);
				break;
			case PRINT_QOS:
				/* QOS list is loaded on first use only */
				if (!g_qos_list)
					g_qos_list = acct_storage_g_get_qos(
						db_conn, my_uid, NULL);

				field->print_routine(field, g_qos_list,
						     assoc->qos_list,
						     last_col);
				break;
			case PRINT_QOS_RAW:
				field->print_routine(field, assoc->qos_list,
						     last_col);
				break;
			case PRINT_RPC_VERSION:
				field->print_routine(field,
						     cluster->rpc_version,
						     last_col);
				break;
			case PRINT_SELECT:
				field->print_routine(
					field, cluster->plugin_id_select,
					last_col);
				break;
			default:
				/* unknown column: print an empty value */
				field->print_routine(field, NULL, last_col);
				break;
			}
		}
		/* rewind the column iterator for the next cluster */
		list_iterator_reset(field_itr);
		printf("\n");
	}
	/* clear the working cluster rec */
	working_cluster_rec = NULL;

	list_iterator_destroy(field_itr);
	list_iterator_destroy(cluster_itr);
	list_destroy(cluster_list);
	list_destroy(print_fields_list);

	return rc;
}
Example #12
0
/*
 * _make_agent_info - build the agent_info_t and its array of thd_t records
 *	from an agent request.
 *
 * Hosts are taken from agent_arg_ptr->hostlist (which is consumed) and
 * grouped per thread: each thread gets one host plus up to span[n]
 * further hosts, stored as a ranged node list string.
 * NOTE(review): the layout of the array returned by set_span() is not
 * visible here - it is assumed to hold one entry per thread created.
 *
 * IN agent_arg_ptr - the agent request (message type, addr, hostlist,
 *	node_count, retry, msg_args)
 * RET newly allocated agent_info_t; thread_count is set to the number of
 *	thread records actually filled in
 */
static agent_info_t *_make_agent_info(agent_arg_t *agent_arg_ptr)
{
	agent_info_t *info = NULL;
	thd_t *thd_array = NULL, *thd = NULL;
	hostlist_t thd_hl = NULL;
	char *host = NULL;
	int *span = NULL;
	int hosts_used = 0, extra = 0, thd_inx = 0;
	int send_direct;

	info = xmalloc(sizeof(agent_info_t));
	slurm_mutex_init(&info->thread_mutex);
	if (pthread_cond_init(&info->thread_cond, NULL))
		fatal("pthread_cond_init error %m");
	info->thread_count   = agent_arg_ptr->node_count;
	info->retry          = agent_arg_ptr->retry;
	info->threads_active = 0;
	thd_array = xmalloc(info->thread_count * sizeof(thd_t));
	memset(thd_array, 0, (info->thread_count * sizeof(thd_t)));
	info->thread_struct  = thd_array;
	info->msg_type       = agent_arg_ptr->msg_type;
	info->msg_args_pptr  = &agent_arg_ptr->msg_args;

	/* These message types are never forwarded through other nodes */
	send_direct = (agent_arg_ptr->msg_type == REQUEST_JOB_NOTIFY)	||
		      (agent_arg_ptr->msg_type == REQUEST_SHUTDOWN)	||
		      (agent_arg_ptr->msg_type == REQUEST_RECONFIGURE)	||
		      (agent_arg_ptr->msg_type == SRUN_EXEC)		||
		      (agent_arg_ptr->msg_type == SRUN_TIMEOUT)		||
		      (agent_arg_ptr->msg_type == SRUN_NODE_FAIL)	||
		      (agent_arg_ptr->msg_type == SRUN_REQUEST_SUSPEND)	||
		      (agent_arg_ptr->msg_type == SRUN_USER_MSG)	||
		      (agent_arg_ptr->msg_type == SRUN_STEP_MISSING)	||
		      (agent_arg_ptr->msg_type == SRUN_JOB_COMPLETE);

	if (send_direct) {
		/* Message is going to one node (for srun) or we want
		 * it to get processed ASAP (SHUTDOWN or RECONFIGURE).
		 * Send the message directly to each node. */
		span = set_span(agent_arg_ptr->node_count,
				agent_arg_ptr->node_count);
	} else {
#ifdef HAVE_FRONT_END
		span = set_span(agent_arg_ptr->node_count,
				agent_arg_ptr->node_count);
#else
		/* Sending message to a possibly large number of slurmd.
		 * Push all message forwarding to slurmd in order to
		 * offload as much work from slurmctld as possible. */
		span = set_span(agent_arg_ptr->node_count, 1);
#endif
		info->get_reply = true;
	}

	/* Fill one thread record per pass until all hosts are assigned */
	for (hosts_used = 0; hosts_used < info->thread_count; ) {
		thd = &thd_array[thd_inx];
		thd->state = DSH_NEW;
		thd->addr  = agent_arg_ptr->addr;

		host = hostlist_shift(agent_arg_ptr->hostlist);
		if (host == NULL) {
			debug3("no more nodes to send to");
			break;
		}
		thd_hl = hostlist_create(host);
		if (thd->addr && span[thd_inx]) {
			/* an explicit address overrides any forwarding */
			debug("warning: you will only be sending this to %s",
			      host);
			span[thd_inx] = 0;
		}
		free(host);
		hosts_used++;

		/* add the hosts this thread forwards to */
		for (extra = 0; extra < span[thd_inx]; extra++) {
			host = hostlist_shift(agent_arg_ptr->hostlist);
			if (host == NULL)
				break;
			hostlist_push(thd_hl, host);
			free(host);
			hosts_used++;
		}
		hostlist_uniq(thd_hl);
		thd->nodelist = hostlist_ranged_string_xmalloc(thd_hl);
		hostlist_destroy(thd_hl);
#if 0
		info("sending msg_type %u to nodes %s",
		     agent_arg_ptr->msg_type, thd->nodelist);
#endif
		thd_inx++;
	}
	xfree(span);
	info->thread_count = thd_inx;
	return info;
}
Example #13
0
/*
 * slurm_sprint_job_info - output information about a specific Slurm
 *	job based upon message as loaded using slurm_load_jobs
 * IN job_ptr - an individual job information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *           NULL is returned on failure.
 */
extern char *
slurm_sprint_job_info ( job_info_t * job_ptr, int one_liner )
{
	int i, j;
	char time_str[32], *group_name, *user_name;
	char tmp1[128], tmp2[128], tmp3[128], tmp4[128], tmp5[128], *tmp6_ptr;
	char tmp_line[512];
	char *ionodes = NULL;
	uint16_t exit_status = 0, term_sig = 0;
	job_resources_t *job_resrcs = job_ptr->job_resrcs;
	char *out = NULL;
	time_t run_time;
	uint32_t min_nodes, max_nodes = 0;
	char *nodelist = "NodeList";
	bitstr_t *core_bitmap;
	char *host;
	int sock_inx, sock_reps, last;
	int abs_node_inx, rel_node_inx;
	int bit_inx, bit_reps;
	uint32_t *last_mem_alloc_ptr = NULL;
	uint32_t last_mem_alloc = NO_VAL;
	char *last_hosts;
	hostlist_t hl, hl_last;
	char select_buf[122];
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();

	if (cluster_flags & CLUSTER_FLAG_BG) {
		nodelist = "MidplaneList";
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_IONODES,
					    &ionodes);
	}

	/****** Line 1 ******/
	snprintf(tmp_line, sizeof(tmp_line), "JobId=%u ", job_ptr->job_id);
	out = xstrdup(tmp_line);
	if (job_ptr->array_job_id) {
		snprintf(tmp_line, sizeof(tmp_line), 
			 "ArrayJobId=%u ArrayTaskId=%u ",
			 job_ptr->array_job_id, job_ptr->array_task_id);
		xstrcat(out, tmp_line);
	}
	snprintf(tmp_line, sizeof(tmp_line), "Name=%s", job_ptr->name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 2 ******/
	user_name = uid_to_string((uid_t) job_ptr->user_id);
	group_name = gid_to_string((gid_t) job_ptr->group_id);
	snprintf(tmp_line, sizeof(tmp_line),
		 "UserId=%s(%u) GroupId=%s(%u)",
		 user_name, job_ptr->user_id, group_name, job_ptr->group_id);
	xfree(user_name);
	xfree(group_name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 3 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Priority=%u Account=%s QOS=%s",
		 job_ptr->priority, job_ptr->account, job_ptr->qos);
	xstrcat(out, tmp_line);
	if (slurm_get_track_wckey()) {
		snprintf(tmp_line, sizeof(tmp_line),
			 " WCKey=%s", job_ptr->wckey);
		xstrcat(out, tmp_line);
	}
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 4 ******/
	if (job_ptr->state_desc) {
		/* Replace white space with underscore for easier parsing */
		for (j=0; job_ptr->state_desc[j]; j++) {
			if (isspace((int)job_ptr->state_desc[j]))
				job_ptr->state_desc[j] = '_';
		}
		tmp6_ptr = job_ptr->state_desc;
	} else
		tmp6_ptr = job_reason_string(job_ptr->state_reason);
	snprintf(tmp_line, sizeof(tmp_line),
		 "JobState=%s Reason=%s Dependency=%s",
		 job_state_string(job_ptr->job_state), tmp6_ptr,
		 job_ptr->dependency);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 5 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Requeue=%u Restarts=%u BatchFlag=%u ",
		 job_ptr->requeue, job_ptr->restart_cnt, job_ptr->batch_flag);
	xstrcat(out, tmp_line);
	if (WIFSIGNALED(job_ptr->exit_code))
		term_sig = WTERMSIG(job_ptr->exit_code);
	exit_status = WEXITSTATUS(job_ptr->exit_code);
	snprintf(tmp_line, sizeof(tmp_line),
		 "ExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 5a (optional) ******/
	if (!(job_ptr->show_flags & SHOW_DETAIL))
		goto line6;
	if (WIFSIGNALED(job_ptr->derived_ec))
		term_sig = WTERMSIG(job_ptr->derived_ec);
	else
		term_sig = 0;
	exit_status = WEXITSTATUS(job_ptr->derived_ec);
	snprintf(tmp_line, sizeof(tmp_line),
		 "DerivedExitCode=%u:%u", exit_status, term_sig);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 6 ******/
line6:
	snprintf(tmp_line, sizeof(tmp_line), "RunTime=");
	xstrcat(out, tmp_line);
	if (IS_JOB_PENDING(job_ptr))
		run_time = 0;
	else if (IS_JOB_SUSPENDED(job_ptr))
		run_time = job_ptr->pre_sus_time;
	else {
		time_t end_time;
		if (IS_JOB_RUNNING(job_ptr) || (job_ptr->end_time == 0))
			end_time = time(NULL);
		else
			end_time = job_ptr->end_time;
		if (job_ptr->suspend_time) {
			run_time = (time_t)
				(difftime(end_time, job_ptr->suspend_time)
				 + job_ptr->pre_sus_time);
		} else
			run_time = (time_t)
				difftime(end_time, job_ptr->start_time);
	}
	secs2time_str(run_time, tmp1, sizeof(tmp1));
	sprintf(tmp_line, "%s ", tmp1);
	xstrcat(out, tmp_line);

	snprintf(tmp_line, sizeof(tmp_line), "TimeLimit=");
	xstrcat(out, tmp_line);
	if (job_ptr->time_limit == NO_VAL)
		sprintf(tmp_line, "Partition_Limit");
	else {
		mins2time_str(job_ptr->time_limit, tmp_line,
			      sizeof(tmp_line));
	}
	xstrcat(out, tmp_line);
	snprintf(tmp_line, sizeof(tmp_line), " TimeMin=");
	xstrcat(out, tmp_line);
	if (job_ptr->time_min == 0)
		sprintf(tmp_line, "N/A");
	else {
		mins2time_str(job_ptr->time_min, tmp_line,
			      sizeof(tmp_line));
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 7 ******/
	slurm_make_time_str((time_t *)&job_ptr->submit_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "SubmitTime=%s ", time_str);
	xstrcat(out, tmp_line);

	slurm_make_time_str((time_t *)&job_ptr->eligible_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "EligibleTime=%s", time_str);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 8 (optional) ******/
	if (job_ptr->resize_time) {
		slurm_make_time_str((time_t *)&job_ptr->resize_time, time_str,
				    sizeof(time_str));
		snprintf(tmp_line, sizeof(tmp_line), "ResizeTime=%s", time_str);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
	}

	/****** Line 9 ******/
	slurm_make_time_str((time_t *)&job_ptr->start_time, time_str,
			    sizeof(time_str));
	snprintf(tmp_line, sizeof(tmp_line), "StartTime=%s ", time_str);
	xstrcat(out, tmp_line);

	snprintf(tmp_line, sizeof(tmp_line), "EndTime=");
	xstrcat(out, tmp_line);
	if ((job_ptr->time_limit == INFINITE) &&
	    (job_ptr->end_time > time(NULL)))
		sprintf(tmp_line, "Unknown");
	else {
		slurm_make_time_str ((time_t *)&job_ptr->end_time, time_str,
				     sizeof(time_str));
		sprintf(tmp_line, "%s", time_str);
	}
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 10 ******/
	if (job_ptr->preempt_time == 0)
		sprintf(tmp_line, "PreemptTime=None ");
	else {
		slurm_make_time_str((time_t *)&job_ptr->preempt_time,
				    time_str, sizeof(time_str));
		snprintf(tmp_line, sizeof(tmp_line), "PreemptTime=%s ",
			 time_str);
	}
	xstrcat(out, tmp_line);
	if (job_ptr->suspend_time) {
		slurm_make_time_str ((time_t *)&job_ptr->suspend_time,
				     time_str, sizeof(time_str));
	} else {
		strncpy(time_str, "None", sizeof(time_str));
	}
	snprintf(tmp_line, sizeof(tmp_line),
		 "SuspendTime=%s SecsPreSuspend=%ld",
		 time_str, (long int)job_ptr->pre_sus_time);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 11 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Partition=%s AllocNode:Sid=%s:%u",
		 job_ptr->partition, job_ptr->alloc_node, job_ptr->alloc_sid);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 12 ******/
	snprintf(tmp_line, sizeof(tmp_line), "Req%s=%s Exc%s=%s",
		 nodelist, job_ptr->req_nodes, nodelist, job_ptr->exc_nodes);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 13 ******/
	xstrfmtcat(out, "%s=", nodelist);
	xstrcat(out, job_ptr->nodes);
	if (job_ptr->nodes && ionodes) {
		snprintf(tmp_line, sizeof(tmp_line), "[%s]", ionodes);
		xstrcat(out, tmp_line);
		xfree(ionodes);
	}
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 14 (optional) ******/
	if (job_ptr->batch_host) {
		snprintf(tmp_line, sizeof(tmp_line), "BatchHost=%s",
			 job_ptr->batch_host);
		xstrcat(out, tmp_line);
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
	}

	/****** Line 15 ******/
	if (cluster_flags & CLUSTER_FLAG_BG) {
		select_g_select_jobinfo_get(job_ptr->select_jobinfo,
					    SELECT_JOBDATA_NODE_CNT,
					    &min_nodes);
		if ((min_nodes == 0) || (min_nodes == NO_VAL)) {
			min_nodes = job_ptr->num_nodes;
			max_nodes = job_ptr->max_nodes;
		} else if (job_ptr->max_nodes)
			max_nodes = min_nodes;
	} else {
		min_nodes = job_ptr->num_nodes;
		max_nodes = job_ptr->max_nodes;
	}

	_sprint_range(tmp1, sizeof(tmp1), job_ptr->num_cpus, job_ptr->max_cpus);
	_sprint_range(tmp2, sizeof(tmp2), min_nodes, max_nodes);
	if (job_ptr->sockets_per_node == (uint16_t) NO_VAL)
		strcpy(tmp3, "*");
	else
		snprintf(tmp3, sizeof(tmp3), "%u", job_ptr->sockets_per_node);
	if (job_ptr->cores_per_socket == (uint16_t) NO_VAL)
		strcpy(tmp4, "*");
	else
		snprintf(tmp4, sizeof(tmp4), "%u", job_ptr->cores_per_socket);
	if (job_ptr->threads_per_core == (uint16_t) NO_VAL)
		strcpy(tmp5, "*");
	else
		snprintf(tmp5, sizeof(tmp5), "%u", job_ptr->threads_per_core);
	snprintf(tmp_line, sizeof(tmp_line),
		 "NumNodes=%s NumCPUs=%s CPUs/Task=%u ReqS:C:T=%s:%s:%s",
		 tmp2, tmp1, job_ptr->cpus_per_task, tmp3, tmp4, tmp5);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	if (!job_resrcs)
		goto line15;

	if (cluster_flags & CLUSTER_FLAG_BG) {
		if ((job_resrcs->cpu_array_cnt > 0) &&
		    (job_resrcs->cpu_array_value) &&
		    (job_resrcs->cpu_array_reps)) {
			int length = 0;
			xstrcat(out, "CPUs=");
			length += 10;
			for (i = 0; i < job_resrcs->cpu_array_cnt; i++) {
				if (length > 70) {
					/* skip to last CPU group entry */
					if (i < job_resrcs->cpu_array_cnt - 1) {
						continue;
					}
					/* add ellipsis before last entry */
					xstrcat(out, "...,");
					length += 4;
				}

				snprintf(tmp_line, sizeof(tmp_line), "%d",
					 job_resrcs->cpus[i]);
				xstrcat(out, tmp_line);
				length += strlen(tmp_line);
				if (job_resrcs->cpu_array_reps[i] > 1) {
					snprintf(tmp_line, sizeof(tmp_line),
						 "*%d",
						 job_resrcs->cpu_array_reps[i]);
					xstrcat(out, tmp_line);
					length += strlen(tmp_line);
				}
				if (i < job_resrcs->cpu_array_cnt - 1) {
					xstrcat(out, ",");
					length++;
				}
			}
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
		}
	} else {
		if (!job_resrcs->core_bitmap)
			goto line15;

		last  = bit_fls(job_resrcs->core_bitmap);
		if (last == -1)
			goto line15;

		hl = hostlist_create(job_ptr->nodes);
		if (!hl) {
			error("slurm_sprint_job_info: hostlist_create: %s",
			      job_ptr->nodes);
			return NULL;
		}
		hl_last = hostlist_create(NULL);
		if (!hl_last) {
			error("slurm_sprint_job_info: hostlist_create: NULL");
			hostlist_destroy(hl);
			return NULL;
		}

		bit_inx = 0;
		i = sock_inx = sock_reps = 0;
		abs_node_inx = job_ptr->node_inx[i];

/*	tmp1[] stores the current cpu(s) allocated	*/
		tmp2[0] = '\0';	/* stores last cpu(s) allocated */
		for (rel_node_inx=0; rel_node_inx < job_resrcs->nhosts;
		     rel_node_inx++) {

			if (sock_reps >=
			    job_resrcs->sock_core_rep_count[sock_inx]) {
				sock_inx++;
				sock_reps = 0;
			}
			sock_reps++;

			bit_reps = job_resrcs->sockets_per_node[sock_inx] *
				job_resrcs->cores_per_socket[sock_inx];

			core_bitmap = bit_alloc(bit_reps);
			for (j=0; j < bit_reps; j++) {
				if (bit_test(job_resrcs->core_bitmap, bit_inx))
					bit_set(core_bitmap, j);
				bit_inx++;
			}

			bit_fmt(tmp1, sizeof(tmp1), core_bitmap);
			FREE_NULL_BITMAP(core_bitmap);
			host = hostlist_shift(hl);
/*
 *		If the allocation values for this host are not the same as the
 *		last host, print the report of the last group of hosts that had
 *		identical allocation values.
 */
			if (strcmp(tmp1, tmp2) ||
			    (last_mem_alloc_ptr != job_resrcs->memory_allocated) ||
			    (job_resrcs->memory_allocated &&
			     (last_mem_alloc !=
			      job_resrcs->memory_allocated[rel_node_inx]))) {
				if (hostlist_count(hl_last)) {
					last_hosts = 
						hostlist_ranged_string_xmalloc(
						hl_last);
					snprintf(tmp_line, sizeof(tmp_line),
						 "  Nodes=%s CPU_IDs=%s Mem=%u",
						 last_hosts, tmp2,
						 last_mem_alloc_ptr ?
						 last_mem_alloc : 0);
					xfree(last_hosts);
					xstrcat(out, tmp_line);
					if (one_liner)
						xstrcat(out, " ");
					else
						xstrcat(out, "\n   ");

					hostlist_destroy(hl_last);
					hl_last = hostlist_create(NULL);
				}
				strcpy(tmp2, tmp1);
				last_mem_alloc_ptr = job_resrcs->memory_allocated;
				if (last_mem_alloc_ptr)
					last_mem_alloc = job_resrcs->
						memory_allocated[rel_node_inx];
				else
					last_mem_alloc = NO_VAL;
			}
			hostlist_push_host(hl_last, host);
			free(host);

			if (bit_inx > last)
				break;

			if (abs_node_inx > job_ptr->node_inx[i+1]) {
				i += 2;
				abs_node_inx = job_ptr->node_inx[i];
			} else {
				abs_node_inx++;
			}
		}

		if (hostlist_count(hl_last)) {
			last_hosts = hostlist_ranged_string_xmalloc(hl_last);
			snprintf(tmp_line, sizeof(tmp_line),
				 "  Nodes=%s CPU_IDs=%s Mem=%u",
				 last_hosts, tmp2,
				 last_mem_alloc_ptr ? last_mem_alloc : 0);
			xfree(last_hosts);
			xstrcat(out, tmp_line);
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
		}
		hostlist_destroy(hl);
		hostlist_destroy(hl_last);
	}
	/****** Line 15 ******/
line15:
	if (job_ptr->pn_min_memory & MEM_PER_CPU) {
		job_ptr->pn_min_memory &= (~MEM_PER_CPU);
		tmp6_ptr = "CPU";
	} else
		tmp6_ptr = "Node";

	if (cluster_flags & CLUSTER_FLAG_BG) {
		convert_num_unit((float)job_ptr->pn_min_cpus,
				 tmp1, sizeof(tmp1), UNIT_NONE);
		snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%s",	tmp1);
	} else {
		snprintf(tmp_line, sizeof(tmp_line), "MinCPUsNode=%u",
			 job_ptr->pn_min_cpus);
	}

	xstrcat(out, tmp_line);
	convert_num_unit((float)job_ptr->pn_min_memory, tmp1, sizeof(tmp1),
			 UNIT_MEGA);
	convert_num_unit((float)job_ptr->pn_min_tmp_disk, tmp2, sizeof(tmp2),
			 UNIT_MEGA);
	snprintf(tmp_line, sizeof(tmp_line),
		 " MinMemory%s=%s MinTmpDiskNode=%s",
		 tmp6_ptr, tmp1, tmp2);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 16 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Features=%s Gres=%s Reservation=%s",
		 job_ptr->features, job_ptr->gres, job_ptr->resv_name);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 17 ******/
	snprintf(tmp_line, sizeof(tmp_line),
		 "Shared=%s Contiguous=%d Licenses=%s Network=%s",
		 (job_ptr->shared == 0 ? "0" :
		  job_ptr->shared == 1 ? "1" : "OK"),
		 job_ptr->contiguous, job_ptr->licenses, job_ptr->network);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 18 ******/
	snprintf(tmp_line, sizeof(tmp_line), "Command=%s",
		 job_ptr->command);
	xstrcat(out, tmp_line);
	if (one_liner)
		xstrcat(out, " ");
	else
		xstrcat(out, "\n   ");

	/****** Line 19 ******/
	snprintf(tmp_line, sizeof(tmp_line), "WorkDir=%s",
		 job_ptr->work_dir);
	xstrcat(out, tmp_line);

	if (cluster_flags & CLUSTER_FLAG_BG) {
		/****** Line 20 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_BG_ID);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			snprintf(tmp_line, sizeof(tmp_line),
				 "Block_ID=%s", select_buf);
			xstrcat(out, tmp_line);
		}

		/****** Line 21 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MIXED_SHORT);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			xstrcat(out, select_buf);
		}

		if (cluster_flags & CLUSTER_FLAG_BGL) {
			/****** Line 22 (optional) ******/
			select_g_select_jobinfo_sprint(
				job_ptr->select_jobinfo,
				select_buf, sizeof(select_buf),
				SELECT_PRINT_BLRTS_IMAGE);
			if (select_buf[0] != '\0') {
				if (one_liner)
					xstrcat(out, " ");
				else
					xstrcat(out, "\n   ");
				snprintf(tmp_line, sizeof(tmp_line),
					 "BlrtsImage=%s", select_buf);
				xstrcat(out, tmp_line);
			}
		}
		/****** Line 23 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_LINUX_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			if (cluster_flags & CLUSTER_FLAG_BGL)
				snprintf(tmp_line, sizeof(tmp_line),
					 "LinuxImage=%s", select_buf);
			else
				snprintf(tmp_line, sizeof(tmp_line),
					 "CnloadImage=%s", select_buf);

			xstrcat(out, tmp_line);
		}
		/****** Line 24 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_MLOADER_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			snprintf(tmp_line, sizeof(tmp_line),
				 "MloaderImage=%s", select_buf);
			xstrcat(out, tmp_line);
		}
		/****** Line 25 (optional) ******/
		select_g_select_jobinfo_sprint(job_ptr->select_jobinfo,
					       select_buf, sizeof(select_buf),
					       SELECT_PRINT_RAMDISK_IMAGE);
		if (select_buf[0] != '\0') {
			if (one_liner)
				xstrcat(out, " ");
			else
				xstrcat(out, "\n   ");
			if (cluster_flags & CLUSTER_FLAG_BGL)
				snprintf(tmp_line, sizeof(tmp_line),
					 "RamDiskImage=%s", select_buf);
			else
				snprintf(tmp_line, sizeof(tmp_line),
					 "IoloadImage=%s", select_buf);
			xstrcat(out, tmp_line);
		}
	}

	/****** Line 26 (optional) ******/
	if (job_ptr->comment) {
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		snprintf(tmp_line, sizeof(tmp_line), "Comment=%s ",
			 job_ptr->comment);
		xstrcat(out, tmp_line);
	}

	/****** Line 27 (optional) ******/
	if (job_ptr->batch_script) {
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		xstrcat(out, "BatchScript=\n");
		xstrcat(out, job_ptr->batch_script);
	}

	/****** Line 28 (optional) ******/
	if (job_ptr->req_switch) {
		char time_buf[32];
		if (one_liner)
			xstrcat(out, " ");
		else
			xstrcat(out, "\n   ");
		secs2time_str((time_t) job_ptr->wait4switch, time_buf,
			      sizeof(time_buf));
		snprintf(tmp_line, sizeof(tmp_line), "Switches=%[email protected]%s\n",
			 job_ptr->req_switch, time_buf);
		xstrcat(out, tmp_line);
	}

	/****** Line 29 (optional) ******/
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;

}
Example #14
0
/*
 * main - entry point of the mount helper.
 *
 * Flow, as visible in this function:
 *   1. Parse command line flags into local switches and the option set `o`.
 *   2. If --9nbd-attach or --9nbd-detach was given, hand the remaining
 *      arguments to _nbd_attach() / _nbd_detach() and exit (0).
 *   3. Otherwise require exactly two positional arguments (spec, dir) and
 *      an effective uid of 0.
 *   4. With -oremount, call _diod_remount() and skip to cleanup.
 *   5. Otherwise fill in defaults for trans, msize, version, debug and
 *      rwdepth, obtain the read/write file descriptors (inherited via
 *      -orfdno/-owfdno, a UNIX domain socket when the host part starts with
 *      '/', or the first host in the list that accepts a connection),
 *      assert the resulting option set, and call _diod_mount().
 *
 * The function finishes with exit (0) rather than returning to its caller.
 */
int
main (int argc, char *argv[])
{
    char *dir = NULL;
    char *spec, *host;
    char *nspec = NULL;     /* rebuilt spec naming the host that connected */
    int c, i;               /* i is only a scratch target for opt_scanf */
    int nopt = 0;
    int vopt = 0;
    int fopt = 0;
    int aopt = 0;
    int dopt = 0;
    int rfd = -1, wfd = -1;
    Opt o; 

    diod_log_init (argv[0]);

    o = opt_create ();

    /* opterr is getopt's "print diagnostics" switch; unknown options are
     * routed to usage () by the default case below instead.
     */
    opterr = 0;
    while ((c = GETOPT (argc, argv, OPTIONS, longopts)) != -1) {
        switch (c) {
            case 'f':   /* --fake-mount */
                fopt = 1;
                break;
            case 'n':   /* --no-mtab */
                nopt = 1;
                break;
            case 'v':   /* --verbose */
                vopt++;
                break;
            case 'o':   /* --options OPT[,OPT]... */
                opt_addf (o, "%s", optarg);
                break;
            case 'a':   /* --9nbd-attach */
                aopt++;
                break;
            case 'd':   /* --9nbd-detach */
                dopt++;
                break;
            default:
                usage ();
        }
    }

    /* Take care of 9nbd operations and exit.
     */
    if (aopt) {
        _nbd_attach (o, argc - optind, argv + optind, nopt, vopt);
        exit (0);
    }
    if (dopt) {
        _nbd_detach (o, argc - optind, argv + optind, nopt, vopt);
        exit (0);
    }

    /* Exactly two positional arguments must remain: spec and dir. */
    if (optind != argc - 2)
        usage ();

    if (geteuid () != 0)
        msg_exit ("you must be root");

    spec = argv[optind++];
    dir = argv[optind++];
    /* NOTE(review): _parse_spec presumably splits host from aname and may
     * record the aname in `o` - confirm against its definition.
     */
    host = _parse_spec (spec, o);

    _verify_mountpoint (dir);

    /* Remount - only pass mount flags into the VFS for an existing mount.
     * Take care of it here and exit.
     */
    if (opt_find (o, "remount")) {
        if (opt_check_allowed_csv (o, "ro,rw,aname,remount"))
            msg_exit ("-oremount can only be used with ro,rw");
        _diod_remount (o, spec, dir, vopt, fopt);
        goto done;
    }

    /* Ensure uname and access are set, and to diod-compatible values.
     * The uname user becomes the euid which will be used by munge auth.
     */
    _parse_uname_access (o);
     if (seteuid (_uname2uid (opt_find (o, "uname"))) < 0)
        err_exit ("seteuid");

    /* We require -otrans=fd because auth occurs in user space, then live fd
     * is passed to the kernel via -orfdno,wfdno.
     */
    if (!opt_find (o, "trans"))
        opt_addf (o, "trans=%s", "fd");
    else if (!opt_find (o, "trans=fd"))
        msg_exit ("only -otrans=fd transport is supported");

    /* Set msize if not already set.  Validate it later.
     */
    if (!opt_find (o, "msize"))
        opt_addf (o, "msize=%d", DIOD_DEFAULT_MSIZE);

    /* Only .L version is supported.
     */
    if (!opt_find (o, "version"))
        opt_addf (o, "version=%s", "9p2000.L");
    else if (!opt_find (o, "version=9p2000.L"))
        msg_exit ("only -oversion=9p2000.L is supported (little p, big L)");

    /* Set debug level.
     */
    if (!opt_find (o, "debug"))
        opt_addf (o, "debug=%d", 0x1); /* send errors to dmesg */

    /* Set rwdepth (number of concurrent reads with buffer > msize).
     * N.B. this option is not upstream yet but unknown options are ignored.
     */
    if (!opt_find (o, "rwdepth"))
        opt_addf (o, "rwdepth=%d", 1);

    /* Server is on an inherited file descriptor.
     * For testing, we start server on a socketpair duped to fd 0.
     */
    if (opt_find (o, "rfdno") || opt_find (o, "wfdno")) {
        /* Both halves are required; either one alone is rejected. */
        if (!opt_scanf (o, "rfdno=%d", &rfd) || !opt_scanf (o, "wfdno=%d",&wfd))
            msg_exit ("-orfdno,wfdno must be used together");
        nopt = 1; /* force no mtab */

    /* Connect to server on UNIX domain socket
     */
    } else if (host[0] == '/') {
        if (opt_find (o, "port"))
            msg_exit ("-oport won't work with UNIX domain socket");
        if ((rfd = diod_sock_connect_unix (host, 0)) < 0)
            exit (1);
        wfd = rfd;      /* one socket serves both directions */

        opt_addf (o, "rfdno=%d", rfd);
        opt_addf (o, "wfdno=%d", wfd);

    /* Connect to server on IANA port (or user-specified) and host.
     */
    } else {
        char *port = opt_find (o, "port");
        hostlist_iterator_t hi;
        hostlist_t hl; 
        char *h;

        if (!port)
            port = "564";
        if (!(hl = hostlist_create (host)))
            msg_exit ("error parsing host string: %s", host);
        if (!(hi = hostlist_iterator_create (hl)))
            msg_exit ("out of memory");
        /* Try each host in list order; stop at the first that connects. */
        while ((h = hostlist_next (hi))) {
            if (vopt)
                msg ("trying to connect to %s:%s", h, port);
            if ((rfd = diod_sock_connect_inet (h, port, DIOD_SOCK_QUIET)) >= 0)
                break;
        }
        if (h) { /* create new 'spec' string identifying successful host */
            /* p keeps everything from the first ':' of the original spec */
            char *p = strchr (spec , ':');
            int len = strlen (h) + (p ? strlen (p) : 0) + 1;

            if (!(nspec = malloc (len)))
                msg_exit ("out of memory");
            snprintf (nspec, len, "%s%s", h, p ? p : "");
        }
        hostlist_destroy (hl);
        if (rfd < 0)
            msg_exit ("could not connect to server(s), giving up");
        wfd = rfd;
        
        /* The connection is already made, so "port" is replaced by the
         * descriptor numbers in the option set.
         */
        opt_delete (o, "port");
        opt_addf (o, "rfdno=%d", rfd);
        opt_addf (o, "wfdno=%d", wfd);
    }

    /* Sanity-check the option set built above before mounting. */
    NP_ASSERT (opt_find (o, "trans=fd"));
    NP_ASSERT (opt_scanf (o, "msize=%d", &i));
    NP_ASSERT (opt_find (o, "version=9p2000.L"));
    NP_ASSERT (opt_scanf (o, "debug=%d", &i) || opt_scanf (o, "debug=%x", &i));
    NP_ASSERT (opt_scanf (o, "wfdno=%d", &i) && opt_scanf (o, "rfdno=%d", &i));
    NP_ASSERT ((opt_find (o, "access=user") && opt_find(o, "uname=root"))
         || (opt_scanf (o, "access=%d", &i) && opt_find(o, "uname")));

    NP_ASSERT (!opt_find (o, "port"));

    /* Use the rebuilt spec when a specific host from the list connected. */
    _diod_mount (o, rfd, wfd, nspec ? nspec : spec, dir, vopt, fopt, nopt);

done:
    opt_destroy (o);
    exit (0);
}
Example #15
0
/*
 * _setup_stepd_tree_info - populate tree_info for this stepd.
 *
 * Resets tree_info, records this node's name and its parent in the reverse
 * communication tree, and stores srun's address taken from the step
 * environment.
 *
 * IN job - not referenced by this function
 * IN/OUT env - step environment; PMI2_TREE_WIDTH_ENV and
 *	SLURM_SRUN_COMM_HOST are read from it, PMI2_SRUN_PORT_ENV is read and
 *	then removed
 * RET SLURM_SUCCESS, or SLURM_ERROR when the srun host or port variable is
 *	missing from env
 *
 * NOTE(review): tree_info and job_info are defined outside this function
 * (presumably file/module scope) - job_info must already be filled in.
 */
static int
_setup_stepd_tree_info(const stepd_step_rec_t *job, char ***env)
{
	hostlist_t hl;
	char srun_host[64];
	uint16_t port;
	char *p;
	int tree_width;

	/* job info available */

	memset(&tree_info, 0, sizeof(tree_info));

	hl = hostlist_create(job_info.step_nodelist);
	p = hostlist_nth(hl, job_info.nodeid); /* strdup-ed */
	tree_info.this_node = xstrdup(p);
	free(p);

	/* this only controls the upward communication tree width */
	p = getenvp(*env, PMI2_TREE_WIDTH_ENV);
	if (p) {
		tree_width = atoi(p);
		if (tree_width < 2) {
			info("invalid PMI2 tree width value (%d) detected. "
			     "fallback to default value.", tree_width);
			tree_width = slurm_get_tree_width();
		}
	} else {
		tree_width = slurm_get_tree_width();
	}

	/* TODO: cannot launch 0 tasks on node */

	/*
	 * In tree position calculation, root of the tree is srun with id 0.
	 * Stepd's id will be its nodeid plus 1.
	 */
	reverse_tree_info(job_info.nodeid + 1, job_info.nnodes + 1,
			  tree_width, &tree_info.parent_id,
			  &tree_info.num_children, &tree_info.depth,
			  &tree_info.max_depth);
	tree_info.parent_id --;	       /* restore real nodeid */
	if (tree_info.parent_id < 0) {	/* parent is srun */
		tree_info.parent_node = NULL;
	} else {
		p = hostlist_nth(hl, tree_info.parent_id);
		tree_info.parent_node = xstrdup(p);
		free(p);
	}
	hostlist_destroy(hl);

	tree_info.pmi_port = 0;	/* not used */

	p = getenvp(*env, "SLURM_SRUN_COMM_HOST");
	if (!p) {
		error("mpi/pmi2: unable to find srun comm ifhn in env");
		return SLURM_ERROR;
	} else {
		/*
		 * strncpy() does not write a terminator when the source is
		 * at least as long as the limit, so copy one byte less than
		 * the buffer holds and terminate explicitly.
		 */
		strncpy(srun_host, p, sizeof(srun_host) - 1);
		srun_host[sizeof(srun_host) - 1] = '\0';
	}
	p = getenvp(*env, PMI2_SRUN_PORT_ENV);
	if (!p) {
		error("mpi/pmi2: unable to find srun pmi2 port in env");
		return SLURM_ERROR;
	} else {
		port = atoi(p);
		unsetenvp(*env, PMI2_SRUN_PORT_ENV);
	}
	tree_info.srun_addr = xmalloc(sizeof(slurm_addr_t));
	slurm_set_addr(tree_info.srun_addr, port, srun_host);

	/* init kvs seq to 0. TODO: reduce array size */
	tree_info.children_kvs_seq = xmalloc(sizeof(uint32_t) *
					     job_info.nnodes);

	return SLURM_SUCCESS;
}
Example #16
0
/*
 * Read a SLURM hostfile specified by "filename".  "filename" must contain
 * a list of SLURM NodeNames, one per line.  Reads up to "n" number of hostnames
 * from the file. Returns a string representing a hostlist ranged string of
 * the contents of the file.  This is a helper function, it does not
 * contact any SLURM daemons.
 *
 * Line syntax: an unescaped '#' starts a comment that runs to the end of the
 * line; the two-character sequence "\#" yields a literal '#'.  A line that
 * fills the whole read buffer (BUFFER_SIZE - 1 characters) is rejected as
 * too long.
 *
 * IN filename - path of the hostfile; NULL or "" yields NULL
 * IN n - number of hostnames wanted, or NO_VAL to read the entire file
 * RET a string representing the hostlist.  Returns NULL if there are fewer
 * than "n" hostnames in the file, or if an error occurs.
 *
 * Returned string must be freed with free().
 */
char *slurm_read_hostfile(char *filename, int n)
{
	FILE *fp = NULL;
	char in_line[BUFFER_SIZE];	/* input line */
	int i;
	int line_size;
	int line_num = 0;
	hostlist_t hostlist = NULL;
	char *nodelist = NULL;

	if (filename == NULL || strlen(filename) == 0)
		return NULL;

	if ((fp = fopen(filename, "r")) == NULL) {
		error("slurm_allocate_resources error opening file %s, %m",
		      filename);
		return NULL;
	}

	hostlist = hostlist_create(NULL);
	if (hostlist == NULL) {
		fclose(fp);
		return NULL;
	}

	while (fgets(in_line, BUFFER_SIZE, fp) != NULL) {
		line_num++;
		line_size = strlen(in_line);
		if (line_size == (BUFFER_SIZE - 1)) {
			error ("Line %d, of hostfile %s too long",
			       line_num, filename);
			fclose (fp);
			hostlist_destroy(hostlist);
			return NULL;
		}

		/* Cut the line at the newline or at the first unescaped '#' */
		for (i = 0; i < line_size; i++) {
			if (in_line[i] == '\n') {
				in_line[i] = '\0';
				break;
			}
			if (in_line[i] == '\0')
				break;
			if (in_line[i] != '#')
				continue;
			if ((i > 0) && (in_line[i - 1] == '\\')) {
				/*
				 * Escaped '#': drop the backslash by sliding
				 * the rest of the line one place to the left.
				 * The terminator is moved as well, otherwise
				 * the last character would be duplicated on a
				 * line that has no trailing newline.
				 */
				memmove(&in_line[i - 1], &in_line[i],
					(size_t)(line_size - i) + 1);
				line_size--;
				/*
				 * The character that followed the '#' now
				 * sits at index i; step back so the loop
				 * increment examines it instead of skipping
				 * it (it may be the newline or a comment).
				 */
				i--;
				continue;
			}
			in_line[i] = '\0';
			break;
		}

		hostlist_push(hostlist, in_line);
		if (n != (int)NO_VAL && hostlist_count(hostlist) == n)
			break;
	}
	fclose(fp);

	if (hostlist_count(hostlist) <= 0) {
		error("Hostlist is empty!");
		goto cleanup_hostfile;
	}
	if (hostlist_count(hostlist) < n) {
		error("Too few NodeNames in SLURM Hostfile");
		goto cleanup_hostfile;
	}

	/* plain malloc() so that the caller can release it with free() */
	nodelist = (char *)malloc(0xffff);
	if (!nodelist) {
		error("Nodelist xmalloc failed");
		goto cleanup_hostfile;
	}

	if (hostlist_ranged_string(hostlist, 0xffff, nodelist) == -1) {
		error("Hostlist is too long for the allocate RPC!");
		free(nodelist);
		nodelist = NULL;
		goto cleanup_hostfile;
	}

	debug2("Hostlist from SLURM_HOSTFILE = %s", nodelist);

cleanup_hostfile:
	hostlist_destroy(hostlist);

	return nodelist;
}
Example #17
0
/*
 * Create job description structure based off srun options
 * (see opt.h)
 *
 * Allocates a job_desc_msg_t with xmalloc, initializes it with
 * slurm_init_job_desc_msg() and then copies values from the global "opt".
 * Many fields are only overwritten when the matching option differs from
 * its "unset" marker (NO_VAL, 0 or NULL), so the initialized default is
 * kept otherwise.
 *
 * Ownership, as visible here: argv[0], acctg_freq and req_nodes are fresh
 * copies; the other string fields (features, gres, name, reservation, ...)
 * are assigned by pointer and therefore alias the strings held in "opt".
 *
 * Side effects on "opt": opt.nodelist is replaced by its ranged-string
 * form, and CPU_BIND_ONE_THREAD_PER_CORE is or-ed into opt.cpu_bind_type
 * when threads_per_core is 1.
 *
 * RET the new descriptor, or NULL when an arbitrary task distribution was
 * requested without a node list.
 */
job_desc_msg_t *
job_desc_msg_create_from_opts (void)
{
	job_desc_msg_t *j = xmalloc(sizeof(*j));
	hostlist_t hl = NULL;

	slurm_init_job_desc_msg(j);
#if defined HAVE_ALPS_CRAY && defined HAVE_REAL_CRAY
	uint64_t pagg_id = job_getjid(getpid());
	/*
	 * Interactive sessions require pam_job.so in /etc/pam.d/common-session
	 * since creating sgi_job containers requires root permissions. This is
	 * the only exception where we allow the fallback of using the SID to
	 * confirm the reservation (caught later, in do_basil_confirm).
	 */
	if (pagg_id == (uint64_t)-1) {
		error("No SGI job container ID detected - please enable the "
		      "Cray job service via /etc/init.d/job");
	} else {
		if (!j->select_jobinfo)
			j->select_jobinfo = select_g_select_jobinfo_alloc();

		select_g_select_jobinfo_set(j->select_jobinfo,
					    SELECT_JOBDATA_PAGG_ID, &pagg_id);
	}
#endif

	j->contiguous     = opt.contiguous;
	if (opt.core_spec != (uint16_t) NO_VAL)
		j->core_spec      = opt.core_spec;
	j->features       = opt.constraints;
	/* a gres value of "NONE" (any case) is treated as not set */
	if (opt.gres && xstrcasecmp(opt.gres, "NONE"))
		j->gres   = opt.gres;
	if (opt.immediate == 1)
		j->immediate = opt.immediate;
	if (opt.job_name)
		j->name   = opt.job_name;
	else
		j->name   = opt.cmd_name;
	/* only the command name (argv[0]) is carried in the descriptor */
	if (opt.argc > 0) {
		j->argc    = 1;
		j->argv    = (char **) xmalloc(sizeof(char *) * 2);
		j->argv[0] = xstrdup(opt.argv[0]);
	}
	if (opt.acctg_freq)
		j->acctg_freq     = xstrdup(opt.acctg_freq);
	j->reservation    = opt.reservation;
	j->wckey          = opt.wckey;

	j->req_nodes      = xstrdup(opt.nodelist);

	/* simplify the job allocation nodelist,
	 * not laying out tasks until step */
	if (j->req_nodes) {
		hl = hostlist_create(j->req_nodes);
		/* opt.nodelist keeps duplicates: it is rebuilt before
		 * hostlist_uniq() runs, j->req_nodes after it */
		xfree(opt.nodelist);
		opt.nodelist = hostlist_ranged_string_xmalloc(hl);
		hostlist_uniq(hl);
		xfree(j->req_nodes);
		j->req_nodes = hostlist_ranged_string_xmalloc(hl);
		hostlist_destroy(hl);

	}

	if (((opt.distribution & SLURM_DIST_STATE_BASE) == SLURM_DIST_ARBITRARY)
	   && !j->req_nodes) {
		error("With Arbitrary distribution you need to "
		      "specify a nodelist or hostfile with the -w option");
		/* NOTE(review): j and the copies made above (argv,
		 * acctg_freq) are not released on this path - confirm
		 * whether callers exit immediately */
		return NULL;
	}
	j->exc_nodes      = opt.exc_nodes;
	j->partition      = opt.partition;
	j->min_nodes      = opt.min_nodes;
	if (opt.sockets_per_node != NO_VAL)
		j->sockets_per_node    = opt.sockets_per_node;
	if (opt.cores_per_socket != NO_VAL)
		j->cores_per_socket      = opt.cores_per_socket;
	if (opt.threads_per_core != NO_VAL) {
		j->threads_per_core    = opt.threads_per_core;
		/* if 1 always make sure affinity knows about it */
		if (j->threads_per_core == 1)
			opt.cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;
	}
	j->user_id        = opt.uid;
	j->dependency     = opt.dependency;
	if (opt.nice != NO_VAL)
		j->nice   = NICE_OFFSET + opt.nice;
	if (opt.priority)
		j->priority = opt.priority;

	if (opt.cpu_bind)
		j->cpu_bind       = opt.cpu_bind;
	/* read after the threads_per_core block above, so the flag it may
	 * have added is included */
	if (opt.cpu_bind_type)
		j->cpu_bind_type  = opt.cpu_bind_type;
	if (opt.mem_bind)
		j->mem_bind       = opt.mem_bind;
	if (opt.mem_bind_type)
		j->mem_bind_type  = opt.mem_bind_type;
	if (opt.plane_size != NO_VAL)
		j->plane_size     = opt.plane_size;
	j->task_dist      = opt.distribution;

	j->group_id       = opt.gid;
	j->mail_type      = opt.mail_type;

	if (opt.ntasks_per_node != NO_VAL)
		j->ntasks_per_node   = opt.ntasks_per_node;
	if (opt.ntasks_per_socket != NO_VAL)
		j->ntasks_per_socket = opt.ntasks_per_socket;
	if (opt.ntasks_per_core != NO_VAL)
		j->ntasks_per_core   = opt.ntasks_per_core;

	if (opt.mail_user)
		j->mail_user = opt.mail_user;
	if (opt.burst_buffer)
		j->burst_buffer = opt.burst_buffer;
	if (opt.begin)
		j->begin_time = opt.begin;
	if (opt.deadline)
		j->deadline = opt.deadline;
	if (opt.licenses)
		j->licenses = opt.licenses;
	if (opt.network)
		j->network = opt.network;
	if (opt.profile)
		j->profile = opt.profile;
	if (opt.account)
		j->account = opt.account;
	if (opt.comment)
		j->comment = opt.comment;
	if (opt.qos)
		j->qos = opt.qos;
	if (opt.cwd)
		j->work_dir = opt.cwd;

	/* hold overrides any priority copied from opt.priority above */
	if (opt.hold)
		j->priority     = 0;
	if (opt.jobid != NO_VAL)
		j->job_id	= opt.jobid;
#ifdef HAVE_BG
	if (opt.geometry[0] > 0) {
		int i;
		for (i = 0; i < SYSTEM_DIMENSIONS; i++)
			j->geometry[i] = opt.geometry[i];
	}
#endif

	memcpy(j->conn_type, opt.conn_type, sizeof(j->conn_type));

	if (opt.reboot)
		j->reboot = 1;
	if (opt.no_rotate)
		j->rotate = 0;

	if (opt.blrtsimage)
		j->blrtsimage = opt.blrtsimage;
	if (opt.linuximage)
		j->linuximage = opt.linuximage;
	if (opt.mloaderimage)
		j->mloaderimage = opt.mloaderimage;
	if (opt.ramdiskimage)
		j->ramdiskimage = opt.ramdiskimage;

	if (opt.max_nodes)
		j->max_nodes    = opt.max_nodes;
	else if (opt.nodes_set) {
		/* On an allocation if the max nodes isn't set set it
		 * to do the same behavior as with salloc or sbatch.
		 */
		j->max_nodes    = opt.min_nodes;
	}
	if (opt.pn_min_cpus != NO_VAL)
		j->pn_min_cpus    = opt.pn_min_cpus;
	/* a per-node memory request takes precedence; a per-CPU request is
	 * stored in the same field, tagged with the MEM_PER_CPU flag */
	if (opt.pn_min_memory != NO_VAL)
		j->pn_min_memory = opt.pn_min_memory;
	else if (opt.mem_per_cpu != NO_VAL)
		j->pn_min_memory = opt.mem_per_cpu | MEM_PER_CPU;
	if (opt.pn_min_tmp_disk != NO_VAL)
		j->pn_min_tmp_disk = opt.pn_min_tmp_disk;
	/* min_cpus: node count when overcommitting, otherwise the task
	 * count, scaled by cpus_per_task when that was given explicitly */
	if (opt.overcommit) {
		j->min_cpus    = opt.min_nodes;
		j->overcommit  = opt.overcommit;
	} else if (opt.cpus_set)
		j->min_cpus    = opt.ntasks * opt.cpus_per_task;
	else
		j->min_cpus    = opt.ntasks;
	if (opt.ntasks_set)
		j->num_tasks   = opt.ntasks;

	if (opt.cpus_set)
		j->cpus_per_task = opt.cpus_per_task;

	if (opt.no_kill)
		j->kill_on_node_fail   = 0;
	if (opt.time_limit != NO_VAL)
		j->time_limit          = opt.time_limit;
	if (opt.time_min != NO_VAL)
		j->time_min            = opt.time_min;
	if (opt.shared != (uint16_t) NO_VAL)
		j->shared = opt.shared;

	if (opt.warn_signal)
		j->warn_signal = opt.warn_signal;
	if (opt.warn_time)
		j->warn_time = opt.warn_time;
	if (opt.job_flags)
		j->bitflags = opt.job_flags;

	if (opt.cpu_freq_min != NO_VAL)
		j->cpu_freq_min = opt.cpu_freq_min;
	if (opt.cpu_freq_max != NO_VAL)
		j->cpu_freq_max = opt.cpu_freq_max;
	if (opt.cpu_freq_gov != NO_VAL)
		j->cpu_freq_gov = opt.cpu_freq_gov;

	if (opt.req_switch >= 0)
		j->req_switch = opt.req_switch;
	if (opt.wait4switch >= 0)
		j->wait4switch = opt.wait4switch;

	/* srun uses the same listening port for the allocation response
	 * message as all other messages */
	j->alloc_resp_port = slurmctld_comm_addr.port;
	j->other_port = slurmctld_comm_addr.port;

	if (opt.spank_job_env_size) {
		j->spank_job_env      = opt.spank_job_env;
		j->spank_job_env_size = opt.spank_job_env_size;
	}

	if (opt.power_flags)
		j->power_flags = opt.power_flags;
	if (opt.mcs_label)
		j->mcs_label = opt.mcs_label;

	return j;
}
Example #18
0
/*
 * switch_p_build_jobinfo - translate a step's "network" option string into
 * the arguments of nrt_build_jobinfo() and call it.
 *
 * The string is split on ',' and each token is matched case-insensitively
 * against the known options (bulk_xfer[=N], devname=, devtype=, instances=,
 * ip/ipv4/ipv6, us, protocol names, sn_all/sn_single, cau=, immed=).  The
 * numeric options accept a k/K suffix (bulk_xfer= also m/M and g/G) that
 * multiplies by powers of 1024.  Every unrecognized or negative-valued
 * token is reported with info() and makes the call fail.
 *
 * IN/OUT switch_job - job info record passed to nrt_build_jobinfo()
 * IN step_layout - supplies node_list, tasks and tids
 * IN network - comma separated option string, may be NULL
 * RET SLURM_ERROR if any token was invalid (nrt_build_jobinfo() is then not
 *	called), otherwise the return value of nrt_build_jobinfo()
 */
extern int switch_p_build_jobinfo(switch_jobinfo_t *switch_job,
				  slurm_step_layout_t *step_layout,
				  char *network)
{
	hostlist_t list = NULL;
	bool bulk_xfer = false, ip_v4 = true, user_space = false;
	uint32_t bulk_xfer_resources = 0;
	bool sn_all = true;	/* default to sn_all */
	int cau = 0, immed = 0, instances = 1;
	int dev_type = NRT_MAX_ADAPTER_TYPES;
	int err = SLURM_SUCCESS;
	char *adapter_name = NULL;
	char *protocol = NULL;
	char *network_str = NULL, *token = NULL, *save_ptr = NULL;
	DEF_TIMERS;

	if (debug_flags & DEBUG_FLAG_SWITCH) {
		START_TIMER;
		info("switch_p_build_jobinfo(): nodelist:%s network:%s",
		     step_layout->node_list, network);
	} else {
		debug3("network = \"%s\"", network);
	}

	list = hostlist_create(step_layout->node_list);
	if (!list)
		fatal("hostlist_create(%s): %m", step_layout->node_list);

	/* strtok_r() modifies its input, so tokenize a private copy */
	if (network) {
		network_str = xstrdup(network);
		token = strtok_r(network_str, ",", &save_ptr);
	}
	while (token) {
		/* bulk_xfer options */
		if (!strncasecmp(token, "bulk_xfer=", 10)) {
			long int resources;
			char *end_ptr = NULL;
			bulk_xfer = true;
			resources = strtol(token+10, &end_ptr, 10);
			if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K'))
				resources *= 1024;
			else if ((end_ptr[0] == 'm') || (end_ptr[0] == 'M'))
				resources *= (1024 * 1024);
			else if ((end_ptr[0] == 'g') || (end_ptr[0] == 'G'))
				resources *= (1024 * 1024 * 1024);
			if (resources >= 0)
				bulk_xfer_resources = resources;
			else {
				info("switch/nrt: invalid option: %s", token);
				err = SLURM_ERROR;
			}
		} else if (!strcasecmp(token, "bulk_xfer")) {
			bulk_xfer = true;

		/* device name options */
		} else if (!strncasecmp(token, "devname=", 8)) {
			char *name_ptr = token + 8;
			if (nrt_adapter_name_check(name_ptr, list)) {
				debug("switch/nrt: Found adapter %s in "
				      "network string", token);
				/*
				 * Release the name stored by an earlier
				 * devname= token so that a repeated option
				 * does not leak it; the last one wins.
				 */
				xfree(adapter_name);
				adapter_name = xstrdup(name_ptr);
				sn_all = false;
			} else if (!strcasecmp(name_ptr, "sn_all")) {
				sn_all = true;
			} else if (!strcasecmp(name_ptr, "sn_single")) {
				sn_all = false;
			} else {
				info("switch/nrt: invalid devname: %s",
				     name_ptr);
				err = SLURM_ERROR;
			}

		/* device type options */
		} else if (!strncasecmp(token, "devtype=", 8)) {
			char *type_ptr = token + 8;
			if (!strcasecmp(type_ptr, "ib")) {
				dev_type = NRT_IB;
			} else if (!strcasecmp(type_ptr, "hfi")) {
				dev_type = NRT_HFI;
			} else if (!strcasecmp(type_ptr, "iponly")) {
				dev_type = NRT_IPONLY;
			} else if (!strcasecmp(type_ptr, "hpce")) {
				dev_type = NRT_HPCE;
			} else if (!strcasecmp(type_ptr, "kmux")) {
				dev_type = NRT_KMUX;
			} else if (!strcasecmp(type_ptr, "sn_all")) {
				sn_all = true;
			} else if (!strcasecmp(type_ptr, "sn_single")) {
				sn_all = false;
			} else {
				info("switch/nrt: invalid option: %s", token);
				err = SLURM_ERROR;
			}

		/* instances options */
		} else if (!strncasecmp(token, "instances=", 10)) {
			long int count;
			char *end_ptr = NULL;
			count = strtol(token+10, &end_ptr, 10);
			if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K'))
				count *= 1024;
			if (count >= 0)
				instances = count;
			else {
				info("switch/nrt: invalid option: %s", token);
				err = SLURM_ERROR;
			}

		/* network options */
		} else if (!strcasecmp(token, "ip")) {
			ip_v4 = true;
		} else if (!strcasecmp(token, "ipv4")) {
			ip_v4 = true;
		} else if (!strcasecmp(token, "ipv6")) {
			ip_v4 = false;
		} else if (!strcasecmp(token, "us")) {
			user_space = true;

		/* protocol options */
		} else if ((!strncasecmp(token, "lapi",  4)) ||
			   (!strncasecmp(token, "mpi",   3)) ||
			   (!strncasecmp(token, "pami",  4)) ||
			   (!strncasecmp(token, "shmem", 5)) ||
			   (!strncasecmp(token, "upc",   3))) {
			/* protocols accumulate into one comma separated list */
			if (protocol)
				xstrcat(protocol, ",");
			xstrcat(protocol, token);

		/* adapter options */
		} else if (!strcasecmp(token, "sn_all")) {
			sn_all = true;
		} else if (!strcasecmp(token, "sn_single")) {
			sn_all = false;

		/* Collective Acceleration Units (CAU) */
		} else if (!strncasecmp(token, "cau=", 4)) {
			long int count;
			char *end_ptr = NULL;
			count = strtol(token+4, &end_ptr, 10);
			if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K'))
				count *= 1024;
			if (count >= 0)
				cau = count;
			else {
				info("switch/nrt: invalid option: %s", token);
				err = SLURM_ERROR;
			}

		/* Immediate Send Slots Per Window */
		} else if (!strncasecmp(token, "immed=", 6)) {
			long int count;
			char *end_ptr = NULL;
			count = strtol(token+6, &end_ptr, 10);
			if ((end_ptr[0] == 'k') || (end_ptr[0] == 'K'))
				count *= 1024;
			if (count >= 0)
				immed = count;
			else {
				info("switch/nrt: invalid option: %s", token);
				err = SLURM_ERROR;
			}

		/* other */
		} else {
			info("switch/nrt: invalid option: %s", token);
			err = SLURM_ERROR;
		}
		token = strtok_r(NULL, ",", &save_ptr);
	}

	/* no protocol named in the option string: fall back to "mpi" */
	if (protocol == NULL)
		xstrcat(protocol, "mpi");
	if (!user_space) {
		/* Bulk transfer only supported with user space */
		bulk_xfer = false;
		bulk_xfer_resources = 0;
	}

	/* a single bad token suppresses the build, but cleanup still runs */
	if (err == SLURM_SUCCESS) {
		err = nrt_build_jobinfo((slurm_nrt_jobinfo_t *)switch_job,
					list, step_layout->tasks,
					step_layout->tids, sn_all,
					adapter_name, dev_type,
					bulk_xfer, bulk_xfer_resources,
					ip_v4, user_space, protocol,
					instances, cau, immed);
	}

	nrt_need_state_save = true;
	xfree(adapter_name);
	xfree(protocol);
	hostlist_destroy(list);
	xfree(network_str);
	if (debug_flags & DEBUG_FLAG_SWITCH) {
		END_TIMER;
		info("switch_p_build_jobinfo() ending %s", TIME_STR);
	}

	return err;
}
Example #19
0
/*
 * _slurmdb_create_job_rec - build a slurmdb job record from a job record
 *	parsed out of the accounting text file.
 * filetxt_job IN - parsed text-file job record to convert
 * job_cond IN - optional filter (may be NULL); if it holds a non-empty
 *	state_list, the job is only converted when one of the list entries,
 *	converted with atoi(), equals filetxt_job->status
 * RET record obtained from slurmdb_create_job_rec() and filled in here, or
 *	NULL if the job was filtered out by state
 */
static slurmdb_job_rec_t *_slurmdb_create_job_rec(
	filetxt_job_rec_t *filetxt_job, slurmdb_job_cond_t *job_cond)
{
	slurmdb_job_rec_t *job_rec = NULL;
	ListIterator iter = NULL;
	filetxt_step_rec_t *step_in = NULL;

	/* Apply the state filter, if one was supplied */
	if (job_cond && job_cond->state_list &&
	    list_count(job_cond->state_list)) {
		char *state_str = NULL;
		int state_matched = 0;

		iter = list_iterator_create(job_cond->state_list);
		while ((state_str = list_next(iter))) {
			if (atoi(state_str) == filetxt_job->status) {
				state_matched = 1;
				break;
			}
		}
		list_iterator_destroy(iter);
		if (!state_matched)
			return NULL;	/* no match */
	}

	job_rec = slurmdb_create_job_rec();

	/* Identity and descriptive strings */
	job_rec->associd   = 0;
	job_rec->account   = xstrdup(filetxt_job->account);
	job_rec->blockid   = xstrdup(filetxt_job->header.blockid);
	job_rec->cluster   = NULL;

	/* Timing and exit information; "end" is the header timestamp */
	job_rec->elapsed   = filetxt_job->elapsed;
	job_rec->eligible  = filetxt_job->header.job_submit;
	job_rec->end       = filetxt_job->header.timestamp;
	job_rec->exitcode  = filetxt_job->exitcode;

	job_rec->gid       = filetxt_job->header.gid;
	job_rec->jobid     = filetxt_job->header.jobnum;
	job_rec->jobname   = xstrdup(filetxt_job->jobname);
	job_rec->partition = xstrdup(filetxt_job->header.partition);

	/* The text file carries a single CPU count, used for both fields */
	job_rec->req_cpus   = filetxt_job->ncpus;
	job_rec->alloc_cpus = filetxt_job->ncpus;

	/* Node count is derived by expanding the node expression */
	if (filetxt_job->nodes) {
		hostlist_t node_hl = hostlist_create(filetxt_job->nodes);
		job_rec->alloc_nodes = hostlist_count(node_hl);
		hostlist_destroy(node_hl);
	}
	job_rec->nodes    = xstrdup(filetxt_job->nodes);
	job_rec->priority = filetxt_job->priority;
	job_rec->requid   = filetxt_job->requid;
	memcpy(&job_rec->stats, &filetxt_job->stats,
	       sizeof(slurmdb_stats_t));
	job_rec->show_full = filetxt_job->show_full;
	/* Start time is reconstructed from the end timestamp */
	job_rec->start = filetxt_job->header.timestamp - job_rec->elapsed;
	job_rec->state = filetxt_job->status;

	/* Convert every step and link it back to its job */
	job_rec->steps = list_create(slurmdb_destroy_step_rec);
	if (filetxt_job->steps) {
		iter = list_iterator_create(filetxt_job->steps);
		while ((step_in = list_next(iter))) {
			slurmdb_step_rec_t *step_out =
				_slurmdb_create_step_rec(step_in);
			if (!step_out)
				continue;
			step_out->job_ptr = job_rec;
			if (!job_rec->first_step_ptr)
				job_rec->first_step_ptr = step_out;
			list_append(job_rec->steps, step_out);
		}
		list_iterator_destroy(iter);
	}
	job_rec->submit = filetxt_job->header.job_submit;

	/* CPU time accounting */
	job_rec->sys_cpu_sec   = filetxt_job->rusage.ru_stime.tv_sec;
	job_rec->sys_cpu_usec  = filetxt_job->rusage.ru_stime.tv_usec;
	job_rec->tot_cpu_sec   = filetxt_job->tot_cpu_sec;
	job_rec->tot_cpu_usec  = filetxt_job->tot_cpu_usec;
	job_rec->track_steps   = filetxt_job->track_steps;
	job_rec->uid           = filetxt_job->header.uid;
	job_rec->user          = NULL;
	job_rec->user_cpu_sec  = filetxt_job->rusage.ru_utime.tv_sec;
	job_rec->user_cpu_usec = filetxt_job->rusage.ru_utime.tv_usec;

	return job_rec;
}
Example #20
0
/*
 * _dump_all_nodes - build a single string holding one record per reported
 *	node (or per group of nodes), with records separated by '#'.
 *	Nodes without a name, FUTURE nodes and hidden nodes are skipped.
 *	When use_host_exp is 2, consecutive nodes for which _same_info()
 *	returns 0 are collected in one hostlist and dumped as one record.
 * node_cnt OUT - number of records appended to the returned string
 * update_time IN - passed through to _same_info() and _dump_node()
 * RET string built with xstrcat(), NULL if no record was written
 */
static char *	_dump_all_nodes(int *node_cnt, time_t update_time)
{
	int inx, rec_cnt = 0;
	struct node_record *node_table = node_record_table_ptr;
	struct node_record *cur_node;
	struct node_record *group_node = NULL;	/* representative of group */
	char *rec_str = NULL, *out_str = NULL;
	hostlist_t group_hl = NULL;		/* members of current group */

	for (inx = 0; inx < node_record_count; inx++) {
		cur_node = node_table + inx;
		if ((cur_node->name == NULL) || IS_NODE_FUTURE(cur_node) ||
		    _hidden_node(cur_node))
			continue;

		if (use_host_exp != 2) {
			/* One record per node */
			rec_str = _dump_node(cur_node, group_hl, update_time);
		} else if (_same_info(group_node, cur_node,
				      update_time) == 0) {
			/* Node joins the current group; nothing to emit yet */
			group_node = cur_node;
			if (group_hl) {
				hostlist_push(group_hl, cur_node->name);
			} else {
				group_hl = hostlist_create(cur_node->name);
				if (!group_hl) {
					fatal("Invalid node_name: %s",
					      cur_node->name);
				}
			}
			continue;
		} else {
			/* Group is complete: emit it, then start a new group
			 * with the current node as its first member */
			rec_str = _dump_node(group_node, group_hl,
					     update_time);
			hostlist_destroy(group_hl);
			group_hl = hostlist_create(cur_node->name);
			if (!group_hl) {
				fatal("Invalid node_name: %s",
				      cur_node->name);
			}
			group_node = cur_node;
		}

		if (rec_cnt > 0)
			xstrcat(out_str, "#");
		xstrcat(out_str, rec_str);
		xfree(rec_str);
		rec_cnt++;
	}

	/* Emit the group still pending when the table was exhausted */
	if (group_hl) {
		rec_str = _dump_node(group_node, group_hl, update_time);
		hostlist_destroy(group_hl);
		if (rec_cnt > 0)
			xstrcat(out_str, "#");
		xstrcat(out_str, rec_str);
		xfree(rec_str);
		rec_cnt++;
	}

	*node_cnt = rec_cnt;
	return out_str;
}
Example #21
0
/*
 * Fill the node/host related fields of _pmixp_job_info from the job
 * environment: step and job hostlists, this node's hostname, job-wide node
 * count and node id, task/cpu counts and the packed task-to-node mapping.
 *
 * env IN - pointer to the environment array searched with getenvp()
 * RET SLURM_SUCCESS, or SLURM_ERROR if a required variable is missing (in
 *     which case both hostlists are destroyed and hostname is released)
 */
static int _resources_set(char ***env)
{
	char *val = NULL;

	/* Start from empty hostlists and no hostname so that the error exit
	 * always has well-defined objects to release
	 */
	_pmixp_job_info.job_hl = hostlist_create("");
	_pmixp_job_info.step_hl = hostlist_create("");
	_pmixp_job_info.hostname = NULL;

	/* Step host list */
	val = getenvp(*env, PMIXP_STEP_NODES_ENV);
	if (val == NULL) {
		PMIXP_ERROR_NO(ENOENT, "Environment variable %s not found",
				PMIXP_STEP_NODES_ENV);
		goto err_exit;
	}
	hostlist_push(_pmixp_job_info.step_hl, val);

	/* Our own node name is the node_id-th entry of the step list.
	 * hostlist_nth() hands back a string released with free(), so keep
	 * an xstrdup'ed copy instead
	 */
	val = hostlist_nth(_pmixp_job_info.step_hl, _pmixp_job_info.node_id);
	_pmixp_job_info.hostname = xstrdup(val);
	free(val);

	/* Job-wide node list: preferred variable first, then the deprecated
	 * one as a fallback
	 */
	val = getenvp(*env, PMIXP_JOB_NODES_ENV);
	if (!val)
		val = getenvp(*env, PMIXP_JOB_NODES_ENV_DEP);
	if (!val) {
		/* shouldn't happen if we are under SLURM! */
		PMIXP_ERROR_NO(ENOENT, "Neither of nodelist environment variables: %s OR %s was found!",
				PMIXP_JOB_NODES_ENV, PMIXP_JOB_NODES_ENV_DEP);
		goto err_exit;
	}
	hostlist_push(_pmixp_job_info.job_hl, val);
	_pmixp_job_info.nnodes_job = hostlist_count(_pmixp_job_info.job_hl);
	_pmixp_job_info.node_id_job = hostlist_find(_pmixp_job_info.job_hl,
			_pmixp_job_info.hostname);

	/* FIXME!! ------------------------------------------------------------- */
	/* TODO: _get_task_count not always works well, so the block below is
	 * disabled and the step-level task count is used for the job instead.
	 if (_get_task_count(env, &_pmixp_job_info.ntasks_job, &_pmixp_job_info.ncpus_job) < 0) {
	 _pmixp_job_info.ntasks_job  = _pmixp_job_info.ntasks;
	 _pmixp_job_info.ncpus_job  = _pmixp_job_info.ntasks;
	 }
	 xassert(_pmixp_job_info.ntasks <= _pmixp_job_info.ntasks_job);
	 */
	_pmixp_job_info.ntasks_job = _pmixp_job_info.ntasks;
	_pmixp_job_info.ncpus_job = _pmixp_job_info.ntasks;

	/* Task-to-node mapping */
	val = getenvp(*env, PMIXP_SLURM_MAPPING_ENV);
	if (!val) {
		/* Direct modex won't work */
		PMIXP_ERROR_NO(ENOENT, "No %s environment variable found!",
				PMIXP_SLURM_MAPPING_ENV);
		goto err_exit;
	}
	_pmixp_job_info.task_map_packed = xstrdup(val);

	return SLURM_SUCCESS;

err_exit:
	hostlist_destroy(_pmixp_job_info.job_hl);
	hostlist_destroy(_pmixp_job_info.step_hl);
	if (_pmixp_job_info.hostname != NULL) {
		xfree(_pmixp_job_info.hostname);
	}
	return SLURM_ERROR;
}
Example #22
0
/*
 * parse_blockreq - parser callback that builds a select_ba_request_t from a
 *	block definition line.
 * dest OUT - set to the newly xmalloc'd select_ba_request_t on success
 * type IN - not used by this function
 * key IN - not used by this function
 * value IN - midplane expression for the block; NULL means there is nothing
 *	to build
 * line IN - not used by this function
 * leftover IN/OUT - remaining text of the line; it is parsed for the block
 *	options and updated by s_p_parse_line()
 * RET 1 if a request was stored in *dest, 0 if value was NULL
 */
extern int parse_blockreq(void **dest, slurm_parser_enum_t type,
			  const char *key, const char *value,
			  const char *line, char **leftover)
{
	s_p_options_t block_options[] = {
		{"Type", S_P_STRING},
		{"32CNBlocks", S_P_UINT16},
		{"128CNBlocks", S_P_UINT16},
#ifdef HAVE_BGL
		{"Nodecards", S_P_UINT16},
		{"Quarters", S_P_UINT16},
		{"BlrtsImage", S_P_STRING},
		{"LinuxImage", S_P_STRING},
		{"RamDiskImage", S_P_STRING},
#else
#ifdef HAVE_BGP
		{"16CNBlocks", S_P_UINT16},
		{"CnloadImage", S_P_STRING},
		{"IoloadImage", S_P_STRING},
#endif
		{"64CNBlocks", S_P_UINT16},
		{"256CNBlocks", S_P_UINT16},
#endif
		{"MloaderImage", S_P_STRING},
		{NULL}
	};
	s_p_hashtbl_t *tbl;
	char *tmp = NULL;
	select_ba_request_t *n = NULL;
	hostlist_t hl = NULL;

	/* The options are always consumed from the line, even when there is
	 * no value, so that *leftover is updated for the caller */
	tbl = s_p_hashtbl_create(block_options);
	s_p_parse_line(tbl, *leftover, leftover);
	if (!value) {
		/* Nothing to build: release the option table instead of
		 * leaking it on this early return */
		s_p_hashtbl_destroy(tbl);
		return 0;
	}
	n = xmalloc(sizeof(select_ba_request_t));
	/* Store the midplane expression in ranged form */
	hl = hostlist_create(value);
	n->save_name = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
#ifdef HAVE_BGL
	s_p_get_string(&n->blrtsimage, "BlrtsImage", tbl);
	s_p_get_string(&n->linuximage, "LinuxImage", tbl);
	s_p_get_string(&n->ramdiskimage, "RamDiskImage", tbl);
#elif defined HAVE_BGP
	/* On BGP the Cnload/Ioload images are stored in the linux/ramdisk
	 * image fields */
	s_p_get_string(&n->linuximage, "CnloadImage", tbl);
	s_p_get_string(&n->ramdiskimage, "IoloadImage", tbl);
#endif
	s_p_get_string(&n->mloaderimage, "MloaderImage", tbl);

	s_p_get_string(&tmp, "Type", tbl);
	if (tmp) {
		verify_conn_type(tmp, n->conn_type);
		xfree(tmp);
	}

	/* On BGL "Nodecards"/"Quarters" are accepted as fallbacks for the
	 * 32/128 compute node block counts */
	if (!s_p_get_uint16(&n->small32, "32CNBlocks", tbl)) {
#ifdef HAVE_BGL
		s_p_get_uint16(&n->small32, "Nodecards", tbl);
#else
		;
#endif
	}
	if (!s_p_get_uint16(&n->small128, "128CNBlocks", tbl)) {
#ifdef HAVE_BGL
		s_p_get_uint16(&n->small128, "Quarters", tbl);
#else
		;
#endif
	}

#ifndef HAVE_BGL
#ifdef HAVE_BGP
	s_p_get_uint16(&n->small16, "16CNBlocks", tbl);
#endif
	s_p_get_uint16(&n->small64, "64CNBlocks", tbl);
	s_p_get_uint16(&n->small256, "256CNBlocks", tbl);
#endif
	if (n->small16 || n->small32 || n->small64
	    || n->small128 || n->small256) {
		/* Small blocks requested: the connection type must be small */
		if (n->conn_type[0] < SELECT_SMALL) {
			error("Block def on midplane(s) %s is "
			      "asking for small blocks but given "
			      "TYPE=%s, setting it to Small",
			      n->save_name, conn_type_string(n->conn_type[0]));
			n->conn_type[0] = SELECT_SMALL;
		}
	} else {
		/* No small blocks: fall back to the configured default when
		 * the type is unset or was given as a small type */
		if (n->conn_type[0] == (uint16_t)NO_VAL) {
			n->conn_type[0] = bg_conf->default_conn_type[0];
		} else if (n->conn_type[0] >= SELECT_SMALL) {
			error("Block def on midplane(s) %s is given "
			      "TYPE=%s but isn't asking for any small "
			      "blocks.  Giving it %s.",
			      n->save_name, conn_type_string(n->conn_type[0]),
			      conn_type_string(
				      bg_conf->default_conn_type[0]));
			n->conn_type[0] = bg_conf->default_conn_type[0];
		}
#ifndef HAVE_BG_L_P
		int i;

		/* Apply the same rule to the remaining dimensions */
		for (i=1; i<SYSTEM_DIMENSIONS; i++) {
			if (n->conn_type[i] == (uint16_t)NO_VAL)
				n->conn_type[i] = bg_conf->default_conn_type[i];
			else if (n->conn_type[i] >= SELECT_SMALL) {
				error("Block def on midplane(s) %s dim %d "
				      "is given TYPE=%s but isn't asking "
				      "for any small blocks.  Giving it %s.",
				      n->save_name, i,
				      conn_type_string(n->conn_type[i]),
				      conn_type_string(
					      bg_conf->default_conn_type[i]));
				n->conn_type[i] = bg_conf->default_conn_type[i];
			}
		}
#endif
	}
	s_p_hashtbl_destroy(tbl);

	*dest = (void *)n;
	return 1;
}
Example #23
0
/*
 * _build_sinfo_data - make a sinfo_data entry for each unique node
 *	configuration and add it to the sinfo_list for later printing.
 *	When the node message holds exactly one node, partitions are handled
 *	inline; otherwise one detached thread per partition is started and
 *	this function waits until sinfo_cnt drops back to zero.
 * sinfo_list IN/OUT - list of unique sinfo_data records to report
 * partition_msg IN - partition info message
 * node_msg IN - node info message (names of filtered-out nodes are freed)
 * RET SLURM_SUCCESS
 */
static int _build_sinfo_data(List sinfo_list,
			     partition_info_msg_t *partition_msg,
			     node_info_msg_t *node_msg)
{
	pthread_attr_t thread_attr;
	pthread_t thread_id;
	build_part_info_t *thread_args;
	node_info_t *node_ptr = NULL;
	partition_info_t *part_ptr = NULL;
	int inx;

	g_node_scaling = node_msg->node_scaling;

	/* by default every partition is shown, even if no nodes */
	if (!params.node_flag && params.match_flags.partition_flag) {
		for (inx = 0; inx < partition_msg->record_count; inx++) {
			part_ptr = partition_msg->partition_array + inx;
			if (params.partition &&
			    (_strcmp(params.partition, part_ptr->name) != 0))
				continue;
			list_append(sinfo_list,
				    _create_sinfo(part_ptr, (uint16_t) inx,
						  NULL,
						  node_msg->node_scaling));
		}
	}

	/* Drop the name of every node rejected by the filter */
	if (params.filtering) {
		for (inx = 0; inx < node_msg->record_count; inx++) {
			node_ptr = node_msg->node_array + inx;
			if (node_ptr->name && _filter_out(node_ptr))
				xfree(node_ptr->name);
		}
	}

	/* make sinfo_list entries for every node in every partition */
	for (inx = 0; inx < partition_msg->record_count; inx++) {
		part_ptr = partition_msg->partition_array + inx;

		if (params.filtering && params.partition &&
		    _strcmp(part_ptr->name, params.partition))
			continue;

		if (node_msg->record_count == 1) { /* node_name_single */
			uint16_t subgrp_size = 0;
			hostlist_t part_hl;
			int pos = -1;
			int get_rc;

			node_ptr = node_msg->node_array;
			if (!node_ptr->name || !part_ptr->nodes)
				continue;

			/* Is the single node a member of this partition? */
			part_hl = hostlist_create(part_ptr->nodes);
			pos = hostlist_find(part_hl, node_ptr->name);
			hostlist_destroy(part_hl);
			if (pos < 0)
				continue;

			get_rc = select_g_select_nodeinfo_get(
					node_ptr->select_nodeinfo,
					SELECT_NODEDATA_SUBGRP_SIZE,
					0, &subgrp_size);
			if ((get_rc == SLURM_SUCCESS) && subgrp_size) {
				_handle_subgrps(sinfo_list, (uint16_t) inx,
						part_ptr, node_ptr,
						node_msg->node_scaling);
			} else {
				_insert_node_ptr(sinfo_list, (uint16_t) inx,
						 part_ptr, node_ptr,
						 node_msg->node_scaling);
			}
			continue;
		}

		/* Process each partition using a separate thread */
		thread_args = xmalloc(sizeof(build_part_info_t));
		thread_args->node_msg   = node_msg;
		thread_args->part_num   = (uint16_t) inx;
		thread_args->part_ptr   = part_ptr;
		thread_args->sinfo_list = sinfo_list;

		/* Count the worker before it is started so the wait below
		 * cannot miss it */
		slurm_mutex_lock(&sinfo_cnt_mutex);
		sinfo_cnt++;
		slurm_mutex_unlock(&sinfo_cnt_mutex);

		slurm_attr_init(&thread_attr);
		if (pthread_attr_setdetachstate(&thread_attr,
						PTHREAD_CREATE_DETACHED))
			error("pthread_attr_setdetachstate error %m");
		while (pthread_create(&thread_id, &thread_attr,
				      _build_part_info,
				      (void *) thread_args)) {
			error("pthread_create error %m");
			usleep(10000);	/* sleep and retry */
		}
		slurm_attr_destroy(&thread_attr);
	}

	/* Wait for all partition threads to report completion */
	slurm_mutex_lock(&sinfo_cnt_mutex);
	while (sinfo_cnt) {
		pthread_cond_wait(&sinfo_cnt_cond, &sinfo_cnt_mutex);
	}
	slurm_mutex_unlock(&sinfo_cnt_mutex);

	_sort_hostlist(sinfo_list);
	return SLURM_SUCCESS;
}
Example #24
0
/*
 * basil_node_ranking - set node_rank on the slurm node records from the
 *	order of nodes in the full BASIL inventory.
 * node_array IN/OUT - node records; every node_rank is first reset to NO_VAL
 * node_cnt IN - number of entries in node_array
 * RET SLURM_SUCCESS (failure to obtain a usable inventory is fatal)
 */
extern int basil_node_ranking(struct node_record *node_array, int node_cnt)
{
    enum basil_version version = get_basil_version();
    struct basil_inventory *inv;
    struct basil_node *node;
    int rank_count = 0, i;
    hostlist_t hl = hostlist_create(NULL);
    bool bad_node = false;

    /*
     * When obtaining the initial configuration, we can not allow ALPS to
     * fail. If there is a problem at this stage it is better to restart
     * SLURM completely, after investigating (and/or fixing) the cause.
     */
    inv = get_full_inventory(version);
    if (inv == NULL)
        fatal("failed to get BASIL %s ranking", bv_names_long[version]);
    else if (!inv->batch_total)
        fatal("system has no usable batch compute nodes");
    else if (inv->batch_total < node_cnt)
        info("Warning: ALPS sees only %d/%d slurm.conf nodes, "
             "check DownNodes", inv->batch_total, node_cnt);

    debug("BASIL %s RANKING INVENTORY: %d/%d batch nodes",
          bv_names_long[version], inv->batch_avail, inv->batch_total);

    /*
     * Node ranking is based on a subset of the inventory: only nodes in
     * batch allocation mode which are up and not allocated. Assign a
     * 'NO_VAL' rank to all other nodes, which will translate as a very
     * high value, (unsigned)-2, to put those nodes last in the ranking.
     * The rest of the code must ensure that those nodes are never chosen.
     */
    for (i = 0; i < node_cnt; i++)
        node_array[i].node_rank = NO_VAL;

    for (node = inv->f->node_head; node; node = node->next) {
        struct node_record *node_ptr;
        char tmp[50];

        /* This will ignore interactive nodes when iterating through
         * the apbasil inventory.  If we don't do this, SLURM is
         * unable to resolve the ID to a nidXXX name since it's not in
         * the slurm.conf file.  (Chris North)
         */
        if (node->role == BNR_INTER)
            continue;

        node_ptr = _find_node_by_basil_id(node->node_id);
        if (node_ptr == NULL) {
            error("nid%05u (%s node in state %s) not in slurm.conf",
                  node->node_id, nam_noderole[node->role],
                  nam_nodestate[node->state]);
            bad_node = true;
        } else
            node_ptr->node_rank = inv->nodes_total - rank_count++;

        /* Remember every non-interactive ALPS node so the full list can
         * be reported if a mismatch with slurm.conf was detected */
        snprintf(tmp, sizeof(tmp), "nid%05u", node->node_id);
        hostlist_push(hl, tmp);
    }
    free_inv(inv);
    if (bad_node) {
        hostlist_sort(hl);
        char *name = hostlist_ranged_string_xmalloc(hl);
        info("It appears your slurm.conf nodelist doesn't "
             "match the alps system.  Here are the nodes alps knows "
             "about\n%s", name);
        /* The ranged string is xmalloc'd for us; release it */
        xfree(name);
    }
    hostlist_destroy(hl);

    return SLURM_SUCCESS;
}
Example #25
0
/*
 * _next_job_id - get the next job ID from local variables set up by
 *	_is_job_id().
 *
 * local_job_str is split on '^'. Each token may be a plain job ID, a job ID
 * followed by "_<task spec>" (surrounding brackets of the task spec are
 * stripped), or a bracketed job ID expression that is expanded through a
 * hostlist and returned one ID per call.
 *
 * RET pointer to the next job ID string, or NULL when there are no more
 *	tokens or a token is invalid. The string is kept in a static
 *	variable and released on the next call, so the caller must not free
 *	it or keep it across calls.
 */
static char *_next_job_id(void)
{
	static hostlist_t hl = NULL;
	static char *save_ptr = NULL;
	static char *next_job_id = NULL;
	static char *task_id_spec = NULL;
	char *job_id_str = NULL, *bracket_ptr, *under_ptr;
	char *tmp_str, *end_job_str;
	int i;

	/* Clean up from previous calls */
	xfree(next_job_id);

	if (hl) {
		/* Process job ID regular expression using previously
		 * established hostlist data structure */
		tmp_str = hostlist_shift(hl);
		if (tmp_str) {
			next_job_id = xstrdup(tmp_str);
			free(tmp_str);
			if (task_id_spec) {
				xstrcat(next_job_id, "_");
				xstrcat(next_job_id, task_id_spec);
			}
			return next_job_id;
		}
		/* Expression exhausted, move on to the next token */
		hostlist_destroy(hl);
		hl = NULL;
	}

	/* Get next token */
	xfree(task_id_spec);
	if (local_job_str && !save_ptr)	/* Get first token */
		job_id_str = strtok_r(local_job_str, "^", &save_ptr);
	else if (save_ptr)		/* Get next token */
		job_id_str = strtok_r(NULL, "^", &save_ptr);

	if (!job_id_str)	/* No more tokens */
		goto fini;

	under_ptr = strchr(job_id_str, '_');
	if (under_ptr) {
		if (under_ptr[1] == '[') {
			/* Strip brackets from job array task ID spec */
			task_id_spec = xstrdup(under_ptr + 2);
			for (i = 0; task_id_spec[i]; i++) {
				if (task_id_spec[i] == ']') {
					task_id_spec[i] = '\0';
					break;
				}
			}
		} else {
			task_id_spec = xstrdup(under_ptr + 1);
		}
	}

	bracket_ptr = strchr(job_id_str, '[');
	if (bracket_ptr && (!under_ptr || (bracket_ptr < under_ptr))) {
		/* Job ID specification uses regular expression: expand the
		 * part before any '_' through a hostlist */
		tmp_str = xstrdup(job_id_str);
		if ((end_job_str = strchr(tmp_str, '_')))
			end_job_str[0] = '\0';
		hl = hostlist_create(tmp_str);
		if (!hl) {
			error("Invalid job id: %s", job_id_str);
			xfree(tmp_str);
			goto fini;
		}
		xfree(tmp_str);
		tmp_str = hostlist_shift(hl);
		if (!tmp_str) {
			error("Invalid job id: %s", job_id_str);
			hostlist_destroy(hl);
			/* hl is static: clear it so that the next call does
			 * not shift from a destroyed hostlist */
			hl = NULL;
			goto fini;
		}
		next_job_id = xstrdup(tmp_str);
		free(tmp_str);
	} else if (under_ptr) {
		/* Temporarily terminate at the '_' to copy only the job ID */
		under_ptr[0] = '\0';
		next_job_id = xstrdup(job_id_str);
		under_ptr[0] = '_';
	} else {
		next_job_id = xstrdup(job_id_str);
	}

	if (task_id_spec) {
		xstrcat(next_job_id, "_");
		xstrcat(next_job_id, task_id_spec);
	}

	return next_job_id;

fini:	xfree(local_job_str);
	save_ptr = NULL;
	return NULL;
}
Example #26
0
/**
 * Enable or disable the HTTP and CCCP configuration providers of an
 * instance and hand each enabled provider its node list.
 *
 * Both providers start enabled. A non-empty transport list in the options
 * selects exactly the listed ones; the LCB_NO_CCCP / LCB_NO_HTTP environment
 * switches can then disable either one.
 *
 * @param obj the instance whose confmon providers are configured
 * @param e_options creation options (transports and mchosts are read)
 * @return LCB_EINVAL for an unknown transport, LCB_BAD_ENVIRONMENT when both
 *         providers end up disabled, LCB_CLIENT_ENOMEM when the node list
 *         cannot be created, the error from parsing mchosts, or LCB_SUCCESS
 */
lcb_error_t lcb_init_providers(lcb_t obj,
                               const struct lcb_create_st2 *e_options)
{
    hostlist_t cccp_nodes;
    lcb_error_t rc;
    const char *mchosts;
    int use_http = 1;
    int use_cccp = 1;

    clconfig_provider *http_provider =
            lcb_confmon_get_provider(obj->confmon, LCB_CLCONFIG_HTTP);

    clconfig_provider *cccp_provider =
            lcb_confmon_get_provider(obj->confmon, LCB_CLCONFIG_CCCP);

    if (e_options->transports) {
        int saw_cccp = 0;
        int saw_http = 0;
        const lcb_config_transport_t *tp = e_options->transports;

        while (*tp != LCB_CONFIG_TRANSPORT_LIST_END) {
            if (*tp == LCB_CONFIG_TRANSPORT_CCCP) {
                saw_cccp = 1;
            } else if (*tp == LCB_CONFIG_TRANSPORT_HTTP) {
                saw_http = 1;
            } else {
                return LCB_EINVAL;
            }
            tp++;
        }

        /* An empty list leaves the defaults untouched */
        if (saw_http || saw_cccp) {
            use_cccp = saw_cccp;
            use_http = saw_http;
        }
    }

    if (lcb_getenv_boolean("LCB_NO_CCCP")) {
        use_cccp = 0;
    }

    if (lcb_getenv_boolean("LCB_NO_HTTP")) {
        use_http = 0;
    }

    /** The only way we can get to here is if one of the vars are set */
    if (!use_cccp && !use_http) {
        return LCB_BAD_ENVIRONMENT;
    }

    if (!use_http) {
        lcb_confmon_set_provider_active(obj->confmon, LCB_CLCONFIG_HTTP, 0);
    } else {
        lcb_clconfig_http_enable(http_provider);
        lcb_clconfig_http_set_nodes(http_provider, obj->usernodes);
    }

    if (!use_cccp) {
        lcb_confmon_set_provider_active(obj->confmon, LCB_CLCONFIG_CCCP, 0);
        return LCB_SUCCESS;
    }

    /* Build the CCCP node list: explicit mchosts if given, otherwise the
     * hosts of the user-supplied nodes with the memcached port */
    mchosts = get_nonempty_string(e_options->mchosts);
    cccp_nodes = hostlist_create();
    if (!cccp_nodes) {
        return LCB_CLIENT_ENOMEM;
    }

    if (mchosts) {
        rc = hostlist_add_stringz(cccp_nodes, mchosts, LCB_CONFIG_MCD_PORT);
        if (rc != LCB_SUCCESS) {
            hostlist_destroy(cccp_nodes);
            return rc;
        }
    } else {
        lcb_size_t idx;
        for (idx = 0; idx < obj->usernodes->nentries; idx++) {
            lcb_host_t *entry = &obj->usernodes->entries[idx];
            hostlist_add_stringz(cccp_nodes, entry->host,
                                 LCB_CONFIG_MCD_PORT);
        }
    }

    lcb_clconfig_cccp_enable(cccp_provider, obj);
    lcb_clconfig_cccp_set_nodes(cccp_provider, cccp_nodes);
    hostlist_destroy(cccp_nodes);
    return LCB_SUCCESS;
}
Example #27
0
/*
 * Push every host of hl2 onto *hl1, creating *hl1 as an empty hostlist
 * first when it is still NULL.
 */
static void _append_hostlist (hostlist_t *hl1, hostlist_t hl2)
{
    hostlist_t target = *hl1;

    if (!target) {
        target = hostlist_create ("");
        *hl1 = target;
    }
    hostlist_push_list (target, hl2);
}
Example #28
0
/**
 * Allocate and initialize a new library instance.
 *
 * The options are first normalized into a local v2 structure. Missing host
 * and bucket default to "localhost" and "default". The instance is then
 * allocated, its settings are filled with the LCB_DEFAULT_* values, the
 * socket pool, configuration monitor and user node list are created, the
 * configuration providers are set up and the bookkeeping containers are
 * allocated.
 *
 * @param instance receives the new instance; written only on success
 * @param options creation options as passed to normalize_options()
 * @return LCB_SUCCESS, or the error of the step that failed. Failures after
 *         the settings have been filled in release the partially built
 *         instance with lcb_destroy().
 */
LIBCOUCHBASE_API
lcb_error_t lcb_create(lcb_t *instance,
                       const struct lcb_create_st *options)
{
    const char *host = NULL;
    const char *user = NULL;
    const char *passwd = NULL;
    const char *bucket = NULL;

    struct lcb_io_opt_st *io = NULL;
    struct lcb_create_st options_container;
    struct lcb_create_st2 *e_options = &options_container.v.v2;

    lcb_type_t type = LCB_TYPE_BUCKET;
    lcb_t obj;
    lcb_error_t err;
    lcb_settings *settings;

    /* Work on a normalized copy; e_options points into it */
    err = normalize_options(&options_container, options);

    if (err != LCB_SUCCESS) {
        return err;
    }

    host = get_nonempty_string(e_options->host);
    user = get_nonempty_string(e_options->user);
    passwd = get_nonempty_string(e_options->passwd);
    bucket = get_nonempty_string(e_options->bucket);
    io = e_options->io;
    type = e_options->type;

    /* A cluster-type instance is rejected when neither a user nor a
     * password was supplied */
    if (type == LCB_TYPE_CLUSTER && user == NULL && passwd == NULL) {
        return LCB_EINVAL;
    }

    if (host == NULL) {
        host = "localhost";
    }

    if (bucket == NULL) {
        bucket = "default";
    }

    /* Do not allow people use Administrator account for data access */
    if (type == LCB_TYPE_BUCKET && user && strcmp(user, bucket) != 0) {
        return LCB_INVALID_USERNAME;
    }

    /* calloc: every field not set below starts out zeroed */
    if ((obj = calloc(1, sizeof(*obj))) == NULL) {
        return LCB_CLIENT_ENOMEM;
    }

    obj->type = type;
    obj->compat.type = (lcb_compat_t)0xdead;

    /* No I/O handler supplied: create one here and flag it via
     * need_cleanup (presumably so it is released with the instance --
     * confirm against lcb_destroy) */
    if (io == NULL) {
        lcb_io_opt_t ops;
        if ((err = lcb_create_io_ops(&ops, NULL)) != LCB_SUCCESS) {
            /* You can't initialize the library without a io-handler! */
            free(obj);
            return err;
        }
        io = ops;
        io->v.v0.need_cleanup = 1;
    }

    /* Populate the settings with their defaults */
    settings = &obj->settings;
    settings->randomize_bootstrap_nodes = 1;
    settings->bummer = 0;
    settings->io = io;
    obj->syncmode = LCB_ASYNCHRONOUS;
    settings->ipv6 = LCB_IPV6_DISABLED;

    settings->operation_timeout = LCB_DEFAULT_TIMEOUT;
    settings->config_timeout = LCB_DEFAULT_CONFIGURATION_TIMEOUT;
    settings->config_node_timeout = LCB_DEFAULT_NODECONFIG_TIMEOUT;
    settings->views_timeout = LCB_DEFAULT_VIEW_TIMEOUT;
    settings->rbufsize = LCB_DEFAULT_RBUFSIZE;
    settings->wbufsize = LCB_DEFAULT_WBUFSIZE;
    settings->durability_timeout = LCB_DEFAULT_DURABILITY_TIMEOUT;
    settings->durability_interval = LCB_DEFAULT_DURABILITY_INTERVAL;
    settings->http_timeout = LCB_DEFAULT_HTTP_TIMEOUT;
    settings->weird_things_threshold = LCB_DEFAULT_CONFIG_ERRORS_THRESHOLD;
    settings->weird_things_delay = LCB_DEFAULT_CONFIG_ERRORS_DELAY;
    settings->max_redir = LCB_DEFAULT_CONFIG_MAXIMUM_REDIRECTS;
    settings->grace_next_cycle = LCB_DEFAULT_CLCONFIG_GRACE_CYCLE;
    settings->grace_next_provider = LCB_DEFAULT_CLCONFIG_GRACE_NEXT;
    settings->bc_http_stream_time = LCB_DEFAULT_BC_HTTP_DISCONNTMO;
    settings->bucket = strdup(bucket);
    settings->logger = lcb_init_console_logger();
    settings->iid = lcb_instance_index++;


    /* Without an explicit user the bucket name doubles as the username */
    if (user) {
        settings->username = strdup(user);
    } else {
        settings->username = strdup(settings->bucket);
    }

    /* password stays NULL (from calloc) when none was given */
    if (passwd) {
        settings->password = strdup(passwd);
    }

    lcb_initialize_packet_handlers(obj);

    obj->memd_sockpool = connmgr_create(settings, io);
    obj->memd_sockpool->max_idle = 1;
    obj->memd_sockpool->idle_timeout = 10000000;

    obj->confmon = lcb_confmon_create(settings);
    obj->usernodes = hostlist_create();

    /** We might want to sanitize this a bit more later on.. */
    /* Reject a host string that contains a URI scheme separator unless
     * "http://" appears in it */
    if (strstr(host, "://") != NULL && strstr(host, "http://") == NULL) {
        lcb_destroy(obj);
        return LCB_INVALID_HOST_FORMAT;
    }


    err = hostlist_add_string(obj->usernodes, host, -1, LCB_CONFIG_HTTP_PORT);
    if (err != LCB_SUCCESS) {
        lcb_destroy(obj);
        return err;
    }

    err = lcb_init_providers(obj, e_options);
    if (err != LCB_SUCCESS) {
        lcb_destroy(obj);
        return err;
    }

    /* NOTE(review): second call to lcb_initialize_packet_handlers() in
     * this function; looks redundant -- confirm before removing */
    lcb_initialize_packet_handlers(obj);

    obj->timers = hashset_create();
    obj->http_requests = hashset_create();
    obj->durability_polls = hashset_create();
    /* No error has occurred yet. */
    obj->last_error = LCB_SUCCESS;
    if ((obj->cmdht = lcb_hashtable_szt_new(32)) == NULL) {
        lcb_destroy(obj);
        return LCB_CLIENT_ENOMEM;
    }


    if (!ringbuffer_initialize(&obj->purged_buf, 4096)) {
        lcb_destroy(obj);
        return LCB_CLIENT_ENOMEM;
    }
    if (!ringbuffer_initialize(&obj->purged_cookies, 4096)) {
        lcb_destroy(obj);
        return LCB_CLIENT_ENOMEM;
    }

    /* Hand the fully built instance to the caller */
    *instance = obj;
    return LCB_SUCCESS;
}
Example #29
0
/*
 * build_all_frontend_info - get an array of slurm_conf_frontend_t structures
 *	from the slurm.conf reader, build table, and set values.
 *	With HAVE_FRONT_END, every FrontendName/FrontendAddr expression is
 *	expanded and one record per front end node is appended to
 *	front_end_list. Without it, any configured front end is fatal.
 * is_slurmd_context: set to true if run from slurmd
 * RET 0 if no error, error code otherwise
 */
extern int build_all_frontend_info (bool is_slurmd_context)
{
	slurm_conf_frontend_t **ptr_array;
#ifdef HAVE_FRONT_END
	slurm_conf_frontend_t *new_fe, *conf_fe;
	int inx, fe_cnt, max_rc = SLURM_SUCCESS;
	bool front_end_debug;

	front_end_debug =
		(slurm_get_debug_flags() & DEBUG_FLAG_FRONT_END) ? true : false;

	fe_cnt = slurm_conf_frontend_array(&ptr_array);
	if (fe_cnt == 0)
		fatal("No FrontendName information available!");

	for (inx = 0; inx < fe_cnt; inx++) {
		hostlist_t name_hl, addr_hl;
		char *one_name, *one_addr;

		conf_fe = ptr_array[inx];

		/* Expand both expressions; they must pair up one to one */
		name_hl = hostlist_create(conf_fe->frontends);
		if (!name_hl)
			fatal("Invalid FrontendName:%s", conf_fe->frontends);
		addr_hl = hostlist_create(conf_fe->addresses);
		if (!addr_hl)
			fatal("Invalid FrontendAddr:%s", conf_fe->addresses);
		if (hostlist_count(name_hl) != hostlist_count(addr_hl)) {
			fatal("Inconsistent node count between "
			      "FrontendName(%s) and FrontendAddr(%s)",
			      conf_fe->frontends, conf_fe->addresses);
		}

		for (one_name = hostlist_shift(name_hl); one_name;
		     one_name = hostlist_shift(name_hl)) {
			one_addr = hostlist_shift(addr_hl);

			new_fe = xmalloc(sizeof(slurm_conf_frontend_t));
			list_append(front_end_list, new_fe);
			new_fe->frontends = xstrdup(one_name);
			new_fe->addresses = xstrdup(one_addr);
			free(one_name);
			free(one_addr);

			/* Optional strings are copied only when non-empty */
			if (conf_fe->allow_groups &&
			    (conf_fe->allow_groups[0] != '\0')) {
				new_fe->allow_groups =
					xstrdup(conf_fe->allow_groups);
			}
			if (conf_fe->allow_users &&
			    (conf_fe->allow_users[0] != '\0')) {
				new_fe->allow_users =
					xstrdup(conf_fe->allow_users);
			}
			if (conf_fe->deny_groups &&
			    (conf_fe->deny_groups[0] != '\0')) {
				new_fe->deny_groups =
					xstrdup(conf_fe->deny_groups);
			}
			if (conf_fe->deny_users &&
			    (conf_fe->deny_users[0] != '\0')) {
				new_fe->deny_users =
					xstrdup(conf_fe->deny_users);
			}
			new_fe->port = conf_fe->port;
			if (conf_fe->reason && (conf_fe->reason[0] != '\0'))
				new_fe->reason = xstrdup(conf_fe->reason);
			new_fe->node_state = conf_fe->node_state;

			if (front_end_debug && !is_slurmd_context)
				_dump_front_end(new_fe);
		}
		hostlist_destroy(addr_hl);
		hostlist_destroy(name_hl);
	}
	return max_rc;
#else
	/* Built without front end support: configuring one is an error */
	if (slurm_conf_frontend_array(&ptr_array) != 0)
		fatal("FrontendName information configured!");
	return SLURM_SUCCESS;
#endif
}
Example #30
0
/*
 * start_job - start a job:
 *	CMD=STARTJOB ARG=<jobid> TASKLIST=<node_list>
 * cmd_ptr IN - command string; it is modified in place (the TASKLIST value
 *	is passed to null_term())
 * err_code OUT - set to -300 on a parsing failure detected here
 * err_msg OUT - set to a message describing the failure, or to a success
 *	message; it points to a string literal or to a static buffer and must
 *	not be freed by the caller
 * RET 0 on success, -1 on failure
 */
extern int	start_job(char *cmd_ptr, int *err_code, char **err_msg)
{
	char *arg_ptr, *task_ptr, *tasklist, *tmp_char;
	int rc, task_cnt;
	uint32_t jobid;
	hostlist_t hl = (hostlist_t) NULL;
	char *host_string;
	static char reply_msg[128];

	arg_ptr = strstr(cmd_ptr, "ARG=");
	if (arg_ptr == NULL) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks ARG";
		error("wiki: STARTJOB lacks ARG");
		return -1;
	}
	jobid = strtoul(arg_ptr+4, &tmp_char, 10);
	/* The job id must be followed by white space. The cast keeps the
	 * argument of isspace() within the unsigned char range, as the
	 * <ctype.h> functions require. */
	if (!isspace((unsigned char) tmp_char[0])) {
		*err_code = -300;
		*err_msg = "Invalid ARG value";
		error("wiki: STARTJOB has invalid jobid");
		return -1;
	}

	task_ptr = strstr(cmd_ptr, "TASKLIST=");
	if (task_ptr == NULL) {
		*err_code = -300;
		*err_msg = "STARTJOB lacks TASKLIST";
		error("wiki: STARTJOB lacks TASKLIST");
		return -1;
	}
	task_ptr += 9;	/* skip over "TASKLIST=" */
	null_term(task_ptr);
	tasklist = moab2slurm_task_list(task_ptr, &task_cnt);
	if (tasklist)
		hl = hostlist_create(tasklist);
	if ((tasklist == NULL) || (hl == NULL)) {
		*err_code = -300;
		*err_msg = "STARTJOB TASKLIST is invalid";
		error("wiki: STARTJOB TASKLIST is invalid: %s",
			task_ptr);
		xfree(tasklist);
		return -1;
	}
	/* Reduce the task list to a sorted, duplicate-free node expression */
	hostlist_uniq(hl);
	hostlist_sort(hl);
	host_string = hostlist_ranged_string_xmalloc(hl);
	hostlist_destroy(hl);
	if (host_string == NULL) {
		*err_code = -300;
		*err_msg = "STARTJOB has invalid TASKLIST";
		error("wiki: STARTJOB has invalid TASKLIST: %s", tasklist);
		xfree(tasklist);
		return -1;
	}

	rc = _start_job(jobid, task_cnt, host_string, tasklist,
			err_code, err_msg);
	xfree(host_string);
	xfree(tasklist);
	if (rc == 0) {
		/* Success text is returned through err_msg as well */
		snprintf(reply_msg, sizeof(reply_msg),
			"job %u started successfully", jobid);
		*err_msg = reply_msg;
	}
	return rc;
}