Example #1
File: eio.c Project: Poshi/slurm
int eio_message_socket_accept(eio_obj_t *obj, List objs)
{
	int fd;
	unsigned char *uc;
	unsigned short port;
	struct sockaddr_in addr;
	slurm_msg_t *msg = NULL;
	socklen_t len = sizeof(addr);

	debug3("Called eio_msg_socket_accept");

	xassert(obj);
	xassert(obj->ops->handle_msg);

	while ((fd = accept(obj->fd, (struct sockaddr *)&addr, &len)) < 0) {
		if (errno == EINTR)
			continue;
		if (errno == EAGAIN       ||
		    errno == ECONNABORTED ||
		    errno == EWOULDBLOCK) {
			return SLURM_SUCCESS;
		}
		error("Error on msg accept socket: %m");
		obj->shutdown = true;
		return SLURM_SUCCESS;
	}

	fd_set_close_on_exec(fd);
	fd_set_blocking(fd);

	/* Should not call slurm_get_addr() because the IP may not be
	   in /etc/hosts. */
	uc = (unsigned char *)&addr.sin_addr.s_addr;
	port = addr.sin_port;
	debug2("got message connection from %u.%u.%u.%u:%hu %d",
	       uc[0], uc[1], uc[2], uc[3], ntohs(port), fd);
	fflush(stdout);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);
again:
	if (slurm_receive_msg(fd, msg, obj->ops->timeout) != 0) {
		if (errno == EINTR) {
			goto again;
		}
		error("slurm_receive_msg[%u.%u.%u.%u]: %m",
		      uc[0],uc[1],uc[2],uc[3]);
		goto cleanup;
	}

	/* handle_msg should free msg->data */
	(*obj->ops->handle_msg)(obj->arg, msg);
cleanup:
	if ((msg->conn_fd >= 0) && slurm_close_accepted_conn(msg->conn_fd) < 0)
		error ("close(%d): %m", msg->conn_fd);
	slurm_free_msg(msg);

	return SLURM_SUCCESS;
}
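The fd_set_blocking() and fd_set_close_on_exec() helpers used above are not shown in this listing. The sketch below is a minimal fcntl(2)-based version of what such helpers typically look like; the names match the call sites, but the bodies are an assumption, not Slurm's actual implementation.

/* Sketch only: plausible fcntl-based helpers matching the call sites above;
 * the real Slurm versions may add error reporting. */
#include <fcntl.h>

static void fd_set_blocking(int fd)
{
	int flags = fcntl(fd, F_GETFL, 0);	/* current file status flags */
	if (flags >= 0)
		(void) fcntl(fd, F_SETFL, flags & ~O_NONBLOCK); /* clear O_NONBLOCK */
}

static void fd_set_close_on_exec(int fd)
{
	int flags = fcntl(fd, F_GETFD, 0);	/* current descriptor flags */
	if (flags >= 0)
		(void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); /* set close-on-exec */
}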
Example #2
static void init_open_commandpipe(void) {
    if (!(g_fprdcmdpipe = fopen(ZTEBLADEFM_PIPE_CMD, "r")))
        panic("[%s:init_open_commandpipe(...) @ %d] - Could not open %s for read!", __FILE__, __LINE__, ZTEBLADEFM_PIPE_CMD);
    g_rdcmdpipe = fileno(g_fprdcmdpipe);
    if (!fd_set_blocking(g_rdcmdpipe, 0)) {
        panic("[%s:init_open_commandpipe(...) @ %d] - Could not set command pipe to non-block!", __FILE__, __LINE__);
    } else {
        logmsg("Set command pipe to non-block OK!\n");
    }
    if (setvbuf(g_fprdcmdpipe, NULL, _IONBF, CMDBUF_LEN) != 0)
        panic("[%s:init_open_commandpipe(...) @ %d] - Could not set buffer for command pipe!", __FILE__, __LINE__);
    FD_ZERO(&g_fdset_r);
    FD_SET(g_rdcmdpipe, &g_fdset_r);
}
Example #3
static void init_open_statuspipe(void) {
    int nPipeFlags = 0;
    if (!(g_fpwrstatpipe = fopen(ZTEBLADEFM_PIPE_STATUS,"w")))
        panic("[%s:init_open_statuspipe(...) @ %d] - Could not open %s for writing!", __FILE__, __LINE__, ZTEBLADEFM_PIPE_STATUS);
    g_wrstatpipe = fileno(g_fpwrstatpipe);
    if (!fd_set_blocking(g_wrstatpipe, 0)) {
        panic("[%s:init_open_statuspipe(...) @ %d] - Could not set status pipe to non-block!", __FILE__, __LINE__);
    } else {
        logmsg("Set status pipe to non-block OK!\n");
    }
    if (setvbuf(g_fpwrstatpipe, NULL, _IONBF, CMDBUF_LEN) != 0)
        panic("[%s:init_open_statuspipe(...) @ %d] - Could not set buffer for status pipe!", __FILE__, __LINE__);
    FD_ZERO(&g_fdset_w);
    FD_SET(g_wrstatpipe, &g_fdset_w);
}
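Examples #2 and #3 call a two-argument fd_set_blocking(fd, 0) and check its return value, so this project's helper differs from the Slurm one above: the second argument selects blocking versus non-blocking mode and a nonzero return indicates success. Below is a sketch of such a variant, inferred only from those call sites (the project's real definition is not shown).

/* Sketch only: two-argument variant inferred from the call sites in
 * Examples #2 and #3; returns nonzero on success, 0 on failure. */
#include <fcntl.h>

static int fd_set_blocking(int fd, int blocking)
{
    int flags = fcntl(fd, F_GETFL, 0);
    if (flags < 0)
        return 0;
    if (blocking)
        flags &= ~O_NONBLOCK;   /* blocking: clear O_NONBLOCK */
    else
        flags |= O_NONBLOCK;    /* non-blocking: set O_NONBLOCK */
    return fcntl(fd, F_SETFL, flags) != -1;
}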
Example #4
size_t pmixp_read_buf(int sd, void *buf, size_t count, int *shutdown,
		bool blocking)
{
	ssize_t ret, offs = 0;

	*shutdown = 0;

	if (!blocking && !pmixp_fd_read_ready(sd, shutdown)) {
		return 0;
	}

	if (blocking) {
		fd_set_blocking(sd);
	}

	while (count - offs > 0) {
		ret = read(sd, (char *)buf + offs, count - offs);
		if (ret > 0) {
			offs += ret;
			continue;
		} else if (ret == 0) {
			/* connection closed. */
			*shutdown = 1;
			return offs;
		}
		switch (errno) {
		case EINTR:
			continue;
		case EWOULDBLOCK:
			/* we can get here in non-blocking mode only */
			return offs;
		default:
			PMIXP_ERROR_STD("blocking=%d", blocking);
			*shutdown = -errno;
			return offs;
		}
	}

	if (blocking) {
		fd_set_nonblocking(sd);
	}
	return offs;
}
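The read loop above follows a common pattern: temporarily switch the descriptor to blocking mode, retry on EINTR and short reads, and restore the previous mode before returning. The standalone sketch below (a hypothetical read_full() helper, no PMIx plumbing) shows the same pattern with plain fcntl(2) and read(2).

/* Sketch only: the temporarily-blocking read pattern of pmixp_read_buf,
 * reduced to plain POSIX calls. Returns the number of bytes read. */
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t read_full(int fd, void *buf, size_t count)
{
	int flags = fcntl(fd, F_GETFL, 0);
	ssize_t ret, offs = 0;

	if (flags >= 0)		/* switch to blocking mode */
		(void) fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);

	while ((size_t) offs < count) {
		ret = read(fd, (char *) buf + offs, count - offs);
		if (ret > 0)
			offs += ret;	/* short read: keep going */
		else if (ret == 0)
			break;		/* peer closed the connection */
		else if (errno != EINTR)
			break;		/* real error; offs holds partial data */
	}

	if (flags >= 0)		/* restore the original flags */
		(void) fcntl(fd, F_SETFL, flags);
	return offs;
}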
Example #5
size_t pmixp_write_buf(int sd, void *buf, size_t count, int *shutdown,
		bool blocking)
{
	ssize_t ret, offs = 0;

	*shutdown = 0;

	if (!blocking && !pmixp_fd_write_ready(sd, shutdown)) {
		return 0;
	}

	if (blocking) {
		fd_set_blocking(sd);
	}

	while (count - offs > 0) {
		ret = write(sd, (char *)buf + offs, count - offs);
		if (ret > 0) {
			offs += ret;
			continue;
		}
		switch (errno) {
		case EINTR:
			continue;
		case EWOULDBLOCK:
			return offs;
		default:
			*shutdown = -errno;
			return offs;
		}
	}

	if (blocking) {
		fd_set_nonblocking(sd);
	}

	return offs;
}
Example #6
int _slurm_set_stream_blocking(slurm_fd_t fd)
{
	fd_set_blocking(fd);
	return SLURM_SUCCESS;
}
Example #7
File: req.c Project: VURM/slurm
static int
_msg_socket_accept(eio_obj_t *obj, List objs)
{
	slurmd_job_t *job = (slurmd_job_t *)obj->arg;
	int fd;
	struct sockaddr_un addr;
	socklen_t len = sizeof(addr);
	struct request_params *param = NULL;
	pthread_attr_t attr;
	pthread_t id;
	int retries = 0;

	debug3("Called _msg_socket_accept");

	while ((fd = accept(obj->fd, (struct sockaddr *)&addr, &len)) < 0) {
		if (errno == EINTR)
			continue;
		if (errno == EAGAIN
		    || errno == ECONNABORTED
		    || errno == EWOULDBLOCK) {
			return SLURM_SUCCESS;
		}
		error("Error on msg accept socket: %m");
		obj->shutdown = true;
		return SLURM_SUCCESS;
	}

	pthread_mutex_lock(&message_lock);
	message_connections++;
	pthread_mutex_unlock(&message_lock);

	fd_set_close_on_exec(fd);
	fd_set_blocking(fd);

	slurm_attr_init(&attr);
	if (pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED) != 0) {
		error("Unable to set detachstate on attr: %m");
		slurm_attr_destroy(&attr);
		close(fd);
		return SLURM_ERROR;
	}

	param = xmalloc(sizeof(struct request_params));
	param->fd = fd;
	param->job = job;
	while (pthread_create(&id, &attr, &_handle_accept, (void *)param)) {
		error("stepd_api message engine pthread_create: %m");
		if (++retries > MAX_RETRIES) {
			error("running handle_accept without "
			      "starting a thread stepd will be "
			      "unresponsive until done");
			_handle_accept((void *)param);
			info("stepd should be responsive now");
			break;
		}
		usleep(10);	/* sleep briefly and retry */
	}

	slurm_attr_destroy(&attr);
	param = NULL;

	debug3("Leaving _msg_socket_accept");
	return SLURM_SUCCESS;
}
Example #8
/* Wait for barrier and get full PMI Keyval space data */
int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
		int pmi_rank, int pmi_size)
{
	int rc, srun_fd, retries = 0, timeout = 0;
	slurm_msg_t msg_send, msg_rcv;
	slurm_addr_t slurm_addr, srun_reply_addr;
	char hostname[64];
	uint16_t port;
	kvs_get_msg_t data;
	char *env_pmi_ifhn;

	if (kvs_set_ptr == NULL)
		return EINVAL;
	*kvs_set_ptr = NULL;	/* initialization */

	if ((rc = _get_addr()) != SLURM_SUCCESS) {
		error("_get_addr: %m");
		return rc;
	}

	_set_pmi_time();

	if (pmi_fd < 0) {
		if ((pmi_fd = slurm_init_msg_engine_port(0)) < 0) {
			error("slurm_init_msg_engine_port: %m");
			return SLURM_ERROR;
		}
		fd_set_blocking(pmi_fd);
	}
	if (slurm_get_stream_addr(pmi_fd, &slurm_addr) < 0) {
		error("slurm_get_stream_addr: %m");
		return SLURM_ERROR;
	}
	/* hostname is not set here, so slurm_get_addr fails:
	 * slurm_get_addr(&slurm_addr, &port, hostname, sizeof(hostname)); */
	port = ntohs(slurm_addr.sin_port);
	if ((env_pmi_ifhn = getenv("SLURM_PMI_RESP_IFHN"))) {
		strncpy(hostname, env_pmi_ifhn, sizeof(hostname));
		hostname[sizeof(hostname)-1] = 0;
	} else
		gethostname_short(hostname, sizeof(hostname));

	data.task_id = pmi_rank;
	data.size = pmi_size;
	data.port = port;
	data.hostname = hostname;
	slurm_msg_t_init(&msg_send);
	slurm_msg_t_init(&msg_rcv);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_GET_REQ;
	msg_send.data = &data;

	/* Send the RPC to the local srun communication manager.
	 * Since the srun can be sent thousands of messages at
	 * the same time and refuse some connections, retry as
	 * needed. Wait until all key-pairs have been sent by
	 * all tasks then spread out messages by task's rank.
	 * Also increase the message timeout if many tasks
	 * since the srun command can get very overloaded (the
	 * default timeout is 10 secs).
	 */
	_delay_rpc(pmi_rank, pmi_size);
	if      (pmi_size > 4000)	/* 240 secs */
		timeout = slurm_get_msg_timeout() * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_get_msg_timeout() * 12000;
	else if (pmi_size > 100)	/* 60 secs */
		timeout = slurm_get_msg_timeout() * 6000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_get_msg_timeout() * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_get_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("get kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}
	if (rc != SLURM_SUCCESS) {
		error("slurm_get_kvs_comm_set error_code=%d", rc);
		return rc;
	}

	/* get the message after all tasks reach the barrier */
	srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr);
	if (srun_fd < 0) {
		error("slurm_accept_msg_conn: %m");
		return errno;
	}

	while ((rc = slurm_receive_msg(srun_fd, &msg_rcv, timeout)) != 0) {
		if (errno == EINTR)
			continue;
		error("slurm_receive_msg: %m");
		slurm_close(srun_fd);
		return errno;
	}
	if (msg_rcv.auth_cred)
		(void)g_slurm_auth_destroy(msg_rcv.auth_cred);

	if (msg_rcv.msg_type != PMI_KVS_GET_RESP) {
		error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type);
		slurm_close(srun_fd);
		return SLURM_UNEXPECTED_MSG_ERROR;
	}
	if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0)
		error("slurm_send_rc_msg: %m");

	slurm_close(srun_fd);
	*kvs_set_ptr = msg_rcv.data;

	rc = _forward_comm_set(*kvs_set_ptr);
	return rc;
}
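The tiered timeouts in the comment above can be isolated into a small helper. The sketch below (a hypothetical _kvs_timeout_msec(), assuming slurm_get_msg_timeout() returns seconds, 10 by default as the comment notes) reproduces the same arithmetic in milliseconds.

/* Sketch only: the timeout scaling used above. With the default 10-second
 * message timeout this yields 240 s, 120 s, 60 s and 20 s respectively. */
static int _kvs_timeout_msec(int pmi_size, int msg_timeout_sec)
{
	if (pmi_size > 4000)
		return msg_timeout_sec * 24000;	/* e.g. 10 * 24000 ms = 240 s */
	if (pmi_size > 1000)
		return msg_timeout_sec * 12000;	/* 120 s */
	if (pmi_size > 100)
		return msg_timeout_sec * 6000;	/* 60 s */
	if (pmi_size > 10)
		return msg_timeout_sec * 2000;	/* 20 s */
	return 0;	/* small jobs: caller keeps the default timeout */
}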