예제 #1
0
파일: spawn.c 프로젝트: Xarthisius/slurm
extern int
spawn_resp_send_to_fd(spawn_resp_t *resp, int fd)
{
	Buf buf;
	int rc;
	
	buf = init_buf(1024);

	/* sync with spawn_req_send_to_srun */
/* 	cmd = TREE_CMD_SPAWN_RESP; */
/* 	pack16(cmd, buf); */
	spawn_resp_pack(resp, buf);
	rc = _slurm_msg_sendto(fd, get_buf_data(buf), get_buf_offset(buf),
			       SLURM_PROTOCOL_NO_SEND_RECV_FLAGS);
	free_buf(buf);

	return rc;
}
예제 #2
0
파일: forward.c 프로젝트: IFCA/slurm
void *_forward_thread(void *arg)
{
	forward_msg_t *fwd_msg = (forward_msg_t *)arg;
	Buf buffer = init_buf(fwd_msg->buf_len);
	List ret_list = NULL;
	slurm_fd_t fd = -1;
	ret_data_info_t *ret_data_info = NULL;
	char *name = NULL;
	hostlist_t hl = hostlist_create(fwd_msg->header.forward.nodelist);
	slurm_addr_t addr;
	char *buf = NULL;
	int steps = 0;
	int start_timeout = fwd_msg->timeout;

	/* repeat until we are sure the message was sent */
	while ((name = hostlist_shift(hl))) {
		if (slurm_conf_get_addr(name, &addr) == SLURM_ERROR) {
			error("forward_thread: can't find address for host "
			      "%s, check slurm.conf", name);
			slurm_mutex_lock(fwd_msg->forward_mutex);
			mark_as_failed_forward(&fwd_msg->ret_list, name,
					       SLURM_UNKNOWN_FORWARD_ADDR);
 			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(fwd_msg->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		if ((fd = slurm_open_msg_conn(&addr)) < 0) {
			error("forward_thread to %s: %m", name);

			slurm_mutex_lock(fwd_msg->forward_mutex);
			mark_as_failed_forward(
				&fwd_msg->ret_list, name,
				SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			free(name);
			if (hostlist_count(hl) > 0) {
				slurm_mutex_unlock(fwd_msg->forward_mutex);
				continue;
			}
			goto cleanup;
		}
		buf = hostlist_ranged_string_xmalloc(hl);

		xfree(fwd_msg->header.forward.nodelist);
		fwd_msg->header.forward.nodelist = buf;
		fwd_msg->header.forward.cnt = hostlist_count(hl);
		/* info("sending %d forwards (%s) to %s", */
/* 		     fwd_msg->header.forward.cnt, */
/* 		     fwd_msg->header.forward.nodelist, name); */
		if (fwd_msg->header.forward.nodelist[0]) {
			debug3("forward: send to %s along with %s",
			       name, fwd_msg->header.forward.nodelist);
		} else
			debug3("forward: send to %s ", name);

		pack_header(&fwd_msg->header, buffer);

		/* add forward data to buffer */
		if (remaining_buf(buffer) < fwd_msg->buf_len) {
			buffer->size += (fwd_msg->buf_len + BUF_SIZE);
			xrealloc(buffer->head, buffer->size);
		}
		if (fwd_msg->buf_len) {
			memcpy(&buffer->head[buffer->processed],
			       fwd_msg->buf, fwd_msg->buf_len);
			buffer->processed += fwd_msg->buf_len;
		}

		/*
		 * forward message
		 */
		if (_slurm_msg_sendto(fd,
				     get_buf_data(buffer),
				     get_buf_offset(buffer),
				     SLURM_PROTOCOL_NO_SEND_RECV_FLAGS ) < 0) {
			error("forward_thread: slurm_msg_sendto: %m");

			slurm_mutex_lock(fwd_msg->forward_mutex);
			mark_as_failed_forward(&fwd_msg->ret_list, name,
					       errno);
			free(name);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_msg->buf_len);
				slurm_mutex_unlock(fwd_msg->forward_mutex);
				slurm_close_accepted_conn(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		}

		if ((fwd_msg->header.msg_type == REQUEST_SHUTDOWN) ||
		    (fwd_msg->header.msg_type == REQUEST_RECONFIGURE) ||
		    (fwd_msg->header.msg_type == REQUEST_REBOOT_NODES)) {
			slurm_mutex_lock(fwd_msg->forward_mutex);
			ret_data_info = xmalloc(sizeof(ret_data_info_t));
			list_push(fwd_msg->ret_list, ret_data_info);
			ret_data_info->node_name = xstrdup(name);
			free(name);
			while ((name = hostlist_shift(hl))) {
				ret_data_info =
					xmalloc(sizeof(ret_data_info_t));
				list_push(fwd_msg->ret_list, ret_data_info);
				ret_data_info->node_name = xstrdup(name);
				free(name);
			}
			goto cleanup;
		}

		if (fwd_msg->header.forward.cnt > 0) {
			static int message_timeout = -1;
			if (message_timeout < 0)
				message_timeout =
					slurm_get_msg_timeout() * 1000;
			steps = (fwd_msg->header.forward.cnt+1) /
				slurm_get_tree_width();
			fwd_msg->timeout = (message_timeout*steps);
/* 			info("got %d * %d = %d", message_timeout, steps, fwd_msg->timeout); */
			steps++;
			fwd_msg->timeout += (start_timeout*steps);
/* 			info("now  + %d*%d = %d", start_timeout, steps, fwd_msg->timeout); */
		}

		ret_list = slurm_receive_msgs(fd, steps, fwd_msg->timeout);
		/* info("sent %d forwards got %d back", */
/* 		     fwd_msg->header.forward.cnt, list_count(ret_list)); */

		if (!ret_list || (fwd_msg->header.forward.cnt != 0
				 && list_count(ret_list) <= 1)) {
			slurm_mutex_lock(fwd_msg->forward_mutex);
			mark_as_failed_forward(&fwd_msg->ret_list, name,
					       errno);
			free(name);
			if (ret_list)
				list_destroy(ret_list);
			if (hostlist_count(hl) > 0) {
				free_buf(buffer);
				buffer = init_buf(fwd_msg->buf_len);
				slurm_mutex_unlock(fwd_msg->forward_mutex);
				slurm_close_accepted_conn(fd);
				fd = -1;
				continue;
			}
			goto cleanup;
		} else if ((fwd_msg->header.forward.cnt+1)
			  != list_count(ret_list)) {
			/* this should never be called since the above
			   should catch the failed forwards and pipe
			   them back down, but this is here so we
			   never have to worry about a locked
			   mutex */
			ListIterator itr = NULL;
			char *tmp = NULL;
			int first_node_found = 0;
			hostlist_iterator_t host_itr
				= hostlist_iterator_create(hl);
			error("We shouldn't be here.  We forwarded to %d "
			      "but only got %d back",
			      (fwd_msg->header.forward.cnt+1),
			      list_count(ret_list));
			while ((tmp = hostlist_next(host_itr))) {
				int node_found = 0;
				itr = list_iterator_create(ret_list);
				while ((ret_data_info = list_next(itr))) {
					if (!ret_data_info->node_name) {
						first_node_found = 1;
						ret_data_info->node_name =
							xstrdup(name);
					}
					if (!strcmp(tmp,
						   ret_data_info->node_name)) {
						node_found = 1;
						break;
					}
				}
				list_iterator_destroy(itr);
				if (!node_found) {
					mark_as_failed_forward(
						&fwd_msg->ret_list,
						tmp,
						SLURM_COMMUNICATIONS_CONNECTION_ERROR);
				}
				free(tmp);
			}
			hostlist_iterator_destroy(host_itr);
			if (!first_node_found) {
				mark_as_failed_forward(&fwd_msg->ret_list,
						       name,
						       SLURM_COMMUNICATIONS_CONNECTION_ERROR);
			}
		}
		break;
	}
	slurm_mutex_lock(fwd_msg->forward_mutex);
	if (ret_list) {
		while ((ret_data_info = list_pop(ret_list)) != NULL) {
			if (!ret_data_info->node_name) {
				ret_data_info->node_name = xstrdup(name);
			}
			list_push(fwd_msg->ret_list, ret_data_info);
			debug3("got response from %s",
			       ret_data_info->node_name);
		}
		list_destroy(ret_list);
	}
	free(name);
cleanup:
	if ((fd >= 0) && slurm_close_accepted_conn(fd) < 0)
		error ("close(%d): %m", fd);
	hostlist_destroy(hl);
	destroy_forward(&fwd_msg->header.forward);
	free_buf(buffer);
	pthread_cond_signal(fwd_msg->notify);
	slurm_mutex_unlock(fwd_msg->forward_mutex);

	return (NULL);
}