Example #1
/*
 * _send_slurmctld_register_req - request registration from slurmctld
 * IN cluster_rec: cluster record holding the control host, control port,
 *	and rpc version of the cluster
 * RET: error code
 */
static int _send_slurmctld_register_req(slurmdb_cluster_rec_t *cluster_rec)
{
	slurm_addr_t ctld_address;
	slurm_fd_t fd;
	int rc = SLURM_SUCCESS;

	slurm_set_addr_char(&ctld_address, cluster_rec->control_port,
			    cluster_rec->control_host);
	fd = slurm_open_msg_conn(&ctld_address);
	if (fd < 0) {
		rc = SLURM_ERROR;
	} else {
		slurm_msg_t out_msg;
		slurm_msg_t_init(&out_msg);
		out_msg.msg_type = ACCOUNTING_REGISTER_CTLD;
		out_msg.flags = SLURM_GLOBAL_AUTH_KEY;
		out_msg.protocol_version
			= slurmdbd_translate_rpc(cluster_rec->rpc_version);
		slurm_send_node_msg(fd, &out_msg);
		/* We probably need to add a matching recv_msg function
		 * for an arbitrary fd, or should these be fire
		 * and forget?  For this one, we can probably
		 * forget about it. */
		slurm_close_stream(fd);
	}
	return rc;
}
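
All of the examples in this collection build on the same connect-and-send
sequence. The sketch below distills it, using only calls that appear in the
examples themselves; _send_one_way_msg is a hypothetical name, and real
callers choose the msg_type and attach a payload via msg.data as in
Example #4.

static int _send_one_way_msg(char *host, uint16_t port, uint16_t msg_type)
{
	slurm_addr_t addr;
	slurm_msg_t msg;
	int fd;

	/* Resolve host:port into an address */
	slurm_set_addr_char(&addr, port, host);

	/* Open a Slurm message connection to that address */
	fd = slurm_open_msg_conn(&addr);
	if (fd < 0)
		return SLURM_ERROR;

	/* Build and send a payload-less, fire-and-forget message */
	slurm_msg_t_init(&msg);
	msg.msg_type = msg_type;
	slurm_send_node_msg(fd, &msg);

	close(fd);
	return SLURM_SUCCESS;
}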
Example #2
/* Open event_fd as needed
 * RET 0 on success, -1 on failure */
static int _open_fd(time_t now)
{
	if (event_fd != -1)
		return 0;

	/* Identify address for socket connection.
	 * Done only on first call, then cached. */
	if (event_addr_set == 0) {
		slurm_set_addr(&moab_event_addr, e_port, e_host);
		event_addr_set = 1;
		if (e_host_bu[0] != '\0') {
			slurm_set_addr(&moab_event_addr_bu, e_port,
				e_host_bu);
			event_addr_set = 2;
		}
	}

	/* Open the event port on moab as needed */
	if (event_fd == -1) {
		event_fd = slurm_open_msg_conn(&moab_event_addr);
		if (event_fd == -1) {
			error("Unable to open primary wiki "
				"event port %s:%u: %m",
				e_host, e_port);
		}
	}
	if ((event_fd == -1) && (event_addr_set == 2)) {
		event_fd = slurm_open_msg_conn(&moab_event_addr_bu);
		if (event_fd == -1) {
			error("Unable to open backup wiki "
				"event port %s:%u: %m",
				e_host_bu, e_port);
		}
	}
	if (event_fd == -1)
		return -1;

	/* We can't have the controller block on the following write() */
	fd_set_nonblocking(event_fd);
	return 0;
}
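
A caller of _open_fd might look like the sketch below. Both _notify_moab()
and _write_event() are hypothetical names used only for illustration, as is
the reset-on-failure handling.

/* Hypothetical caller sketch: reopen the event socket on demand and
 * reset event_fd on a failed write so the next call reconnects. */
static int _notify_moab(time_t now, char *event_msg)
{
	if (_open_fd(now) < 0)
		return -1;
	if (_write_event(event_fd, event_msg) < 0) {	/* hypothetical helper */
		close(event_fd);
		event_fd = -1;	/* force a reopen on the next call */
		return -1;
	}
	return 0;
}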
Example #3
extern int slurm_persist_conn_open_without_init(
	slurm_persist_conn_t *persist_conn)
{
	slurm_addr_t addr;

	xassert(persist_conn);
	xassert(persist_conn->rem_host);
	xassert(persist_conn->rem_port);
	xassert(persist_conn->cluster_name);

	if (persist_conn->fd > 0)
		_close_fd(&persist_conn->fd);
	else
		persist_conn->fd = -1;

	if (!persist_conn->inited)
		persist_conn->inited = true;

	if (!persist_conn->version) {
		/* Set to MIN_PROTOCOL so that a higher version controller can
		 * talk to a lower protocol version controller. When talking to
		 * the DBD, the protocol version should be set to the current
		 * protocol version prior to calling this. */
		persist_conn->version = SLURM_MIN_PROTOCOL_VERSION;
	}
	if (persist_conn->timeout < 0)
		persist_conn->timeout = slurm_get_msg_timeout() * 1000;

	slurm_set_addr_char(&addr, persist_conn->rem_port,
			    persist_conn->rem_host);
	if ((persist_conn->fd = slurm_open_msg_conn(&addr)) < 0) {
		if (_comm_fail_log(persist_conn)) {
			char *s = xstrdup_printf("%s: failed to open persistent connection to %s:%d: %m",
						 __func__,
						 persist_conn->rem_host,
						 persist_conn->rem_port);
			if (persist_conn->flags & PERSIST_FLAG_SUPPRESS_ERR)
				debug2("%s", s);
			else
				error("%s", s);
			xfree(s);
		}
		return SLURM_ERROR;
	}
	fd_set_nonblocking(persist_conn->fd);
	fd_set_close_on_exec(persist_conn->fd);

	return SLURM_SUCCESS;
}
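
A minimal setup sketch for the function above, with illustrative host, port,
and cluster values; it fills only the fields the function asserts or reads
and leaves version and timeout to the defaults applied inside the call.

/* Illustrative only: values and error handling are placeholders. */
slurm_persist_conn_t persist_conn;

memset(&persist_conn, 0, sizeof(persist_conn));
persist_conn.rem_host = "dbd-host";		/* illustrative host */
persist_conn.rem_port = 6819;			/* illustrative port */
persist_conn.cluster_name = "cluster1";		/* illustrative cluster */
persist_conn.timeout = -1;	/* < 0: function substitutes the msg timeout */

if (slurm_persist_conn_open_without_init(&persist_conn) != SLURM_SUCCESS)
	error("could not open persistent connection");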
Example #4
/*
 * cluster_first_reg - ask the controller to send nodes in a down state
 *    and jobs pending or running on first registration.
 *
 * IN host: controller host
 * IN port: controller port
 * IN rpc_version: controller rpc version
 * RET: error code
 */
extern int cluster_first_reg(char *host, uint16_t port, uint16_t rpc_version)
{
	slurm_addr_t ctld_address;
	int fd;
	int rc = SLURM_SUCCESS;

	info("First time to register cluster requesting "
	     "running jobs and system information.");

	slurm_set_addr_char(&ctld_address, port, host);
	fd = slurm_open_msg_conn(&ctld_address);
	if (fd < 0) {
		error("can not open socket back to slurmctld "
		      "%s(%u): %m", host, port);
		rc = SLURM_ERROR;
	} else {
		slurm_msg_t out_msg;
		accounting_update_msg_t update;
		/* We have to put this update message here so
		   we can tell the sender to send the correct
		   RPC version.
		*/
		memset(&update, 0, sizeof(accounting_update_msg_t));
		update.rpc_version = rpc_version;
		slurm_msg_t_init(&out_msg);
		out_msg.msg_type = ACCOUNTING_FIRST_REG;
		out_msg.flags = SLURM_GLOBAL_AUTH_KEY;
		out_msg.data = &update;
		slurm_send_node_msg(fd, &out_msg);
		/* We probably need to add a matching recv_msg function
		 * for an arbitrary fd, or should these be fire
		 * and forget?  For this one, we can probably
		 * forget about it. */
		close(fd);
	}
	return rc;
}
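
An invocation sketch for cluster_first_reg(); the host name and port are
illustrative (6817 is only the conventional slurmctld port), and
SLURM_PROTOCOL_VERSION stands in for the remote cluster's actual
rpc_version.

/* Illustrative call; real callers take host, port and rpc_version
 * from the registering cluster's record. */
char host[] = "ctld-host";

if (cluster_first_reg(host, 6817, SLURM_PROTOCOL_VERSION) != SLURM_SUCCESS)
	error("first registration request failed");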
Example #5
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
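			/* Worked example with illustrative numbers: with
			 * cnt = 9 forwards and tree_width = 5,
			 * steps = (9+1)/5 = 2, so timeout becomes
			 * message_timeout*2 plus, after steps++,
			 * start_timeout*3. */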
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* This should never be reached since the check
			   above catches the failed forwards and pipes
			   them back down, but it is here so we never
			   have to worry about a locked mutex. */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}