Esempio n. 1
0
extern int
spawn_resp_send_to_fd(spawn_resp_t *resp, int fd)
{
	Buf buf;
	int rc;

	buf = init_buf(1024);

	/* sync with spawn_req_send_to_srun */
/* 	cmd = TREE_CMD_SPAWN_RESP; */
/* 	pack16(cmd, buf); */
	spawn_resp_pack(resp, buf);
	rc = slurm_msg_sendto(fd, get_buf_data(buf), get_buf_offset(buf),
			      SLURM_PROTOCOL_NO_SEND_RECV_FLAGS);
	free_buf(buf);

	return rc;
}
Esempio n. 2
0
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	forward_struct_t *fwd_struct = fwd_msg->fwd_struct;
	Buf buffer = init_buf(BUF_SIZE);	/* probably enough for header */
	List ret_list = NULL;
	int fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(
				&fwd_struct->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
#if 0
		info("sending %d forwards (%s) to %s",
		     fwd_msg->header.forward.cnt,
		     fwd_msg->header.forward.nodelist, name);
#endif
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_struct->buf_len) {
			int new_size = buffer->processed + fwd_struct->buf_len;
			new_size += 1024; /* padded for paranoia */
			xrealloc_nz(buffer->head, new_size);
			buffer->size = new_size;
		}
		if (fwd_struct->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_struct->buf, fwd_struct->buf_len);
			buffer->processed += fwd_struct->buf_len;
		}

		/*
		 * forward message
		 */
		if (slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				/* Abandon tree. This way if all the
				 * nodes in the branch are down we
				 * don't have to time out for each
				 * node serially.
				 */
				_forward_msg_internal(hl, NULL, fwd_struct,
						      &fwd_msg->header, 0,
						      hostlist_count(hl));
				continue;
			}
			goto cleanup;
		}

		/* These messages don't have a return message, but if
		 * we got here things worked out so make note of the
		 * list of nodes as success.
		 */
		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_struct->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_struct->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			if (!fwd_msg->header.forward.tree_width)
				fwd_msg->header.forward.tree_width =
					slurm_get_tree_width();
			steps = (fwd_msg->header.forward.cnt+1) /
					fwd_msg->header.forward.tree_width;
			fwd_msg->timeout = (message_timeout*steps);
			/* info("got %d * %d = %d", message_timeout, */
			/*      steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
			/* info("now  + %d*%d = %d", start_timeout, */
			/*      steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
		/*      fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				  && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(&fwd_struct->forward_mutex);
			mark_as_failed_forward(&fwd_struct->ret_list, name,
					       errno);
			free(name);
			FREE_NULL_LIST(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_struct->buf_len);
				slurm_mutex_unlock(&fwd_struct->forward_mutex);
				slurm_close(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!xstrcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_struct->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(
					&fwd_struct->ret_list,
					name,
					SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(&fwd_struct->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_struct->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		FREE_NULL_LIST(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	slurm_cond_signal(&fwd_struct->notify);
	slurm_mutex_unlock(&fwd_struct->forward_mutex);
	xfree(fwd_msg);

	return (NULL);
}