Example #1
0
/* Wait until a file is writeable,
 * RET false if can not be written to within 5 seconds */
extern bool fd_writeable(slurm_fd_t fd)
{
	struct pollfd ufds;
	int msg_timeout = 5000;
	int rc, time_left;
	struct timeval tstart;
	char temp[2];

	ufds.fd     = fd;
	ufds.events = POLLOUT;
	gettimeofday(&tstart, NULL);
	while (shutdown_time == 0) {
		time_left = msg_timeout - _tot_wait(&tstart);
		rc = poll(&ufds, 1, time_left);
		if (shutdown_time)
			return false;
		if (rc == -1) {
			if ((errno == EINTR) || (errno == EAGAIN))
				continue;
			error("poll: %m");
			return false;
		}
		if (rc == 0) {
			debug2("write timeout");
			return false;
		}

		/*
		 * Check here to make sure the socket really is there.
		 * If not then exit out and notify the sender.  This
 		 * is here since a write doesn't always tell you the
		 * socket is gone, but getting 0 back from a
		 * nonblocking read means just that.
		 */
		if (ufds.revents & POLLHUP || (recv(fd, &temp, 1, 0) == 0)) {
			debug3("Write connection %d closed", fd);
			return false;
		}
		if (ufds.revents & POLLNVAL) {
			error("Connection %d is invalid", fd);
			return false;
		}
		if (ufds.revents & POLLERR) {
			error("Connection %d experienced an error", fd);
			return false;
		}
		if ((ufds.revents & POLLOUT) == 0) {
			error("Connection %d events %d", fd, ufds.revents);
			return false;
		}
		break;
	}
	errno = 0;
	return true;
}
Example #2
0
/* Wait until a file is readable,
 * RET false if can not be read */
static bool _conn_readable(slurm_persist_conn_t *persist_conn)
{
	struct pollfd ufds;
	int rc, time_left;

	xassert(persist_conn->shutdown);

	ufds.fd     = persist_conn->fd;
	ufds.events = POLLIN;
	while (!(*persist_conn->shutdown)) {
		if (persist_conn->timeout) {
			struct timeval tstart;
			gettimeofday(&tstart, NULL);
			time_left = persist_conn->timeout - _tot_wait(&tstart);
		} else
			time_left = -1;
		rc = poll(&ufds, 1, time_left);
		if (*persist_conn->shutdown)
			return false;
		if (rc == -1) {
			if ((errno == EINTR) || (errno == EAGAIN))
				continue;
			error("poll: %m");
			return false;
		}
		if (rc == 0)
			return false;
		if ((ufds.revents & POLLHUP) &&
		    ((ufds.revents & POLLIN) == 0)) {
			debug2("persistent connection closed");
			return false;
		}
		if (ufds.revents & POLLNVAL) {
			error("persistent connection is invalid");
			return false;
		}
		if (ufds.revents & POLLERR) {
			error("persistent connection experienced an error");
			return false;
		}
		if ((ufds.revents & POLLIN) == 0) {
			error("persistent connection %d events %d",
			      persist_conn->fd, ufds.revents);
			return false;
		}
		/* revents == POLLIN */
		errno = 0;
		return true;
	}
	return false;
}
Example #3
0
/* Wait until a file is writeable,
 * RET 1 if file can be written now,
 *     0 if can not be written to within 5 seconds
 *     -1 if file has been closed POLLHUP
 */
extern int slurm_persist_conn_writeable(slurm_persist_conn_t *persist_conn)
{
	struct pollfd ufds;
	int write_timeout = 5000;
	int rc, time_left;
	struct timeval tstart;
	char temp[2];

	xassert(persist_conn->shutdown);

	if (persist_conn->fd < 0)
		return -1;

	ufds.fd     = persist_conn->fd;
	ufds.events = POLLOUT;
	gettimeofday(&tstart, NULL);
	while ((*persist_conn->shutdown) == 0) {
		time_left = write_timeout - _tot_wait(&tstart);
		rc = poll(&ufds, 1, time_left);
		if (rc == -1) {
			if ((errno == EINTR) || (errno == EAGAIN))
				continue;
			error("poll: %m");
			return -1;
		}
		if (rc == 0)
			return 0;
		/*
		 * Check here to make sure the socket really is there.
		 * If not then exit out and notify the conn.  This
		 * is here since a write doesn't always tell you the
		 * socket is gone, but getting 0 back from a
		 * nonblocking read means just that.
		 */
		if (ufds.revents & POLLHUP ||
		    (recv(persist_conn->fd, &temp, 1, 0) == 0)) {
			debug2("persistent connection is closed");
			if (persist_conn->trigger_callbacks.dbd_fail)
				(persist_conn->trigger_callbacks.dbd_fail)();
			return -1;
		}
		if (ufds.revents & POLLNVAL) {
			error("persistent connection is invalid");
			return 0;
		}
		if (ufds.revents & POLLERR) {
			if (_comm_fail_log(persist_conn)) {
				error("persistent connection experienced an error: %m");
			}
			if (persist_conn->trigger_callbacks.dbd_fail)
				(persist_conn->trigger_callbacks.dbd_fail)();
			return 0;
		}
		if ((ufds.revents & POLLOUT) == 0) {
			error("persistent connection %d events %d",
			      persist_conn->fd, ufds.revents);
			return 0;
		}
		/* revents == POLLOUT */
		errno = 0;
		return 1;
	}
	return 0;
}
/* Get slurm message with timeout
 * RET message size (as specified in argument) or SLURM_ERROR on error */
int _slurm_recv_timeout(slurm_fd_t fd, char *buffer, size_t size,
			uint32_t flags, int timeout )
{
	int rc;
	int recvlen = 0;
	int fd_flags;
	struct pollfd  ufds;
	struct timeval tstart;
	int timeleft = timeout;

	ufds.fd     = fd;
	ufds.events = POLLIN;

	fd_flags = _slurm_fcntl(fd, F_GETFL);
	fd_set_nonblocking(fd);

	gettimeofday(&tstart, NULL);

	while (recvlen < size) {
		timeleft = timeout - _tot_wait(&tstart);
		if (timeleft <= 0) {
			debug("_slurm_recv_timeout at %d of %zd, timeout",
				recvlen, size);
			slurm_seterrno(SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT);
			recvlen = SLURM_ERROR;
			goto done;
		}

		if ((rc = poll(&ufds, 1, timeleft)) <= 0) {
			if ((errno == EINTR) || (errno == EAGAIN) || (rc == 0))
				continue;
			else {
				debug("_slurm_recv_timeout at %d of %zd, "
					"poll error: %s",
					recvlen, size, strerror(errno));
 				slurm_seterrno(
					SLURM_COMMUNICATIONS_RECEIVE_ERROR);
 				recvlen = SLURM_ERROR;
  				goto done;
			}
		}

		if (ufds.revents & POLLERR) {
			debug("_slurm_recv_timeout: Socket POLLERR");
			slurm_seterrno(ENOTCONN);
			recvlen = SLURM_ERROR;
			goto done;
		}
		if ((ufds.revents & POLLNVAL) ||
		    ((ufds.revents & POLLHUP) &&
		     ((ufds.revents & POLLIN) == 0))) {
			debug2("_slurm_recv_timeout: Socket no longer there");
			slurm_seterrno(ENOTCONN);
			recvlen = SLURM_ERROR;
			goto done;
		}
		if ((ufds.revents & POLLIN) != POLLIN) {
			error("_slurm_recv_timeout: Poll failure, revents:%d",
			      ufds.revents);
			continue;
		}

		rc = _slurm_recv(fd, &buffer[recvlen], (size - recvlen), flags);
		if (rc < 0)  {
			if (errno == EINTR)
				continue;
			else {
				debug("_slurm_recv_timeout at %d of %zd, "
					"recv error: %s",
					recvlen, size, strerror(errno));
				slurm_seterrno(
					SLURM_COMMUNICATIONS_RECEIVE_ERROR);
				recvlen = SLURM_ERROR;
				goto done;
			}
		}
		if (rc == 0) {
			debug("_slurm_recv_timeout at %d of %zd, "
				"recv zero bytes", recvlen, size);
			slurm_seterrno(SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT);
			recvlen = SLURM_ERROR;
			goto done;
		}
		recvlen += rc;
	}


    done:
	/* Reset fd flags to prior state, preserve errno */
	if (fd_flags != SLURM_PROTOCOL_ERROR) {
		int slurm_err = slurm_get_errno();
		_slurm_fcntl(fd , F_SETFL , fd_flags);
		slurm_seterrno(slurm_err);
	}

	return recvlen;
}
/* Send slurm message with timeout
 * RET message size (as specified in argument) or SLURM_ERROR on error */
int _slurm_send_timeout(slurm_fd_t fd, char *buf, size_t size,
			uint32_t flags, int timeout)
{
	int rc;
	int sent = 0;
	int fd_flags;
	struct pollfd ufds;
	struct timeval tstart;
	int timeleft = timeout;
	char temp[2];

	ufds.fd     = fd;
	ufds.events = POLLOUT;

	fd_flags = _slurm_fcntl(fd, F_GETFL);
	fd_set_nonblocking(fd);

	gettimeofday(&tstart, NULL);

	while (sent < size) {
		timeleft = timeout - _tot_wait(&tstart);
		if (timeleft <= 0) {
			debug("_slurm_send_timeout at %d of %zd, timeout",
				sent, size);
			slurm_seterrno(SLURM_PROTOCOL_SOCKET_IMPL_TIMEOUT);
			sent = SLURM_ERROR;
			goto done;
		}

		if ((rc = poll(&ufds, 1, timeleft)) <= 0) {
			if ((rc == 0) || (errno == EINTR) || (errno == EAGAIN))
 				continue;
			else {
				debug("_slurm_send_timeout at %d of %zd, "
					"poll error: %s",
					sent, size, strerror(errno));
				slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
				sent = SLURM_ERROR;
				goto done;
			}
		}

		/*
		 * Check here to make sure the socket really is there.
		 * If not then exit out and notify the sender.  This
 		 * is here since a write doesn't always tell you the
		 * socket is gone, but getting 0 back from a
		 * nonblocking read means just that.
		 */
		if (ufds.revents & POLLERR) {
			debug("_slurm_send_timeout: Socket POLLERR");
			slurm_seterrno(ENOTCONN);
			sent = SLURM_ERROR;
			goto done;
		}
		if ((ufds.revents & POLLHUP) || (ufds.revents & POLLNVAL) ||
		    (_slurm_recv(fd, &temp, 1, flags) == 0)) {
			debug2("_slurm_send_timeout: Socket no longer there");
			slurm_seterrno(ENOTCONN);
			sent = SLURM_ERROR;
			goto done;
		}
		if ((ufds.revents & POLLOUT) != POLLOUT) {
			error("_slurm_send_timeout: Poll failure, revents:%d",
			      ufds.revents);
		}

		rc = _slurm_send(fd, &buf[sent], (size - sent), flags);
		if (rc < 0) {
 			if (errno == EINTR)
				continue;
			debug("_slurm_send_timeout at %d of %zd, "
				"send error: %s",
				sent, size, strerror(errno));
 			if (errno == EAGAIN) {	/* poll() lied to us */
				usleep(10000);
				continue;
			}
 			slurm_seterrno(SLURM_COMMUNICATIONS_SEND_ERROR);
			sent = SLURM_ERROR;
			goto done;
		}
		if (rc == 0) {
			debug("_slurm_send_timeout at %d of %zd, "
				"sent zero bytes", sent, size);
			slurm_seterrno(SLURM_PROTOCOL_SOCKET_ZERO_BYTES_SENT);
			sent = SLURM_ERROR;
			goto done;
		}

		sent += rc;
	}

    done:
	/* Reset fd flags to prior state, preserve errno */
	if (fd_flags != SLURM_PROTOCOL_ERROR) {
		int slurm_err = slurm_get_errno();
		_slurm_fcntl(fd , F_SETFL , fd_flags);
		slurm_seterrno(slurm_err);
	}

	return sent;

}
Example #6
0
/* Execute a script, wait for termination and return its stdout.
 * script_type IN - Type of program being run (e.g. "StartStageIn")
 * script_path IN - Fully qualified pathname of the program to execute
 * script_args IN - Arguments to the script
 * max_wait IN - Maximum time to wait in milliseconds,
 *		 -1 for no limit (asynchronous)
 * tid IN - thread we are called from
 * status OUT - Job exit code
 * Return stdout+stderr of spawned program, value must be xfreed. */
extern char *run_command(char *script_type, char *script_path,
			 char **script_argv, int max_wait,
			 pthread_t tid, int *status)
{
	int i, new_wait, resp_size = 0, resp_offset = 0;
	pid_t cpid;
	char *resp = NULL;
	int pfd[2] = { -1, -1 };

	if ((script_path == NULL) || (script_path[0] == '\0')) {
		error("%s: no script specified", __func__);
		*status = 127;
		resp = xstrdup("Run command failed - configuration error");
		return resp;
	}
	if (script_path[0] != '/') {
		error("%s: %s is not fully qualified pathname (%s)",
		      __func__, script_type, script_path);
		*status = 127;
		resp = xstrdup("Run command failed - configuration error");
		return resp;
	}
	if (access(script_path, R_OK | X_OK) < 0) {
		error("%s: %s can not be executed (%s) %m",
		      __func__, script_type, script_path);
		*status = 127;
		resp = xstrdup("Run command failed - configuration error");
		return resp;
	}
	if (max_wait != -1) {
		if (pipe(pfd) != 0) {
			error("%s: pipe(): %m", __func__);
			*status = 127;
			resp = xstrdup("System error");
			return resp;
		}
	}
	slurm_mutex_lock(&proc_count_mutex);
	child_proc_count++;
	slurm_mutex_unlock(&proc_count_mutex);
	if ((cpid = fork()) == 0) {
		int cc;

		cc = sysconf(_SC_OPEN_MAX);
		if (max_wait != -1) {
			dup2(pfd[1], STDERR_FILENO);
			dup2(pfd[1], STDOUT_FILENO);
			for (i = 0; i < cc; i++) {
				if ((i != STDERR_FILENO) &&
				    (i != STDOUT_FILENO))
					close(i);
			}
		} else {
			for (i = 0; i < cc; i++)
				close(i);
			if ((cpid = fork()) < 0)
				exit(127);
			else if (cpid > 0)
				exit(0);
		}
		setpgid(0, 0);
		execv(script_path, script_argv);
		error("%s: execv(%s): %m", __func__, script_path);
		exit(127);
	} else if (cpid < 0) {
		if (max_wait != -1) {
			close(pfd[0]);
			close(pfd[1]);
		}
		error("%s: fork(): %m", __func__);
		slurm_mutex_lock(&proc_count_mutex);
		child_proc_count--;
		slurm_mutex_unlock(&proc_count_mutex);
	} else if (max_wait != -1) {
		struct pollfd fds;
		struct timeval tstart;
		resp_size = 1024;
		resp = xmalloc(resp_size);
		close(pfd[1]);
		gettimeofday(&tstart, NULL);
		if (tid)
			track_script_reset_cpid(tid, cpid);
		while (1) {
			if (shutdown) {
				error("%s: killing %s operation on shutdown",
				      __func__, script_type);
				break;
			}
			fds.fd = pfd[0];
			fds.events = POLLIN | POLLHUP | POLLRDHUP;
			fds.revents = 0;
			if (max_wait <= 0) {
				new_wait = MAX_POLL_WAIT;
			} else {
				new_wait = max_wait - _tot_wait(&tstart);
				if (new_wait <= 0) {
					error("%s: %s poll timeout @ %d msec",
					      __func__, script_type, max_wait);
					break;
				}
				new_wait = MIN(new_wait, MAX_POLL_WAIT);
			}
			i = poll(&fds, 1, new_wait);
			if (i == 0) {
				continue;
			} else if (i < 0) {
				error("%s: %s poll:%m", __func__, script_type);
				break;
			}
			if ((fds.revents & POLLIN) == 0)
				break;
			i = read(pfd[0], resp + resp_offset,
				 resp_size - resp_offset);
			if (i == 0) {
				break;
			} else if (i < 0) {
				if (errno == EAGAIN)
					continue;
				error("%s: read(%s): %m", __func__,
				      script_path);
				break;
			} else {
				resp_offset += i;
				if (resp_offset + 1024 >= resp_size) {
					resp_size *= 2;
					resp = xrealloc(resp, resp_size);
				}
			}
		}
		killpg(cpid, SIGTERM);
		usleep(10000);
		killpg(cpid, SIGKILL);
		waitpid(cpid, status, 0);
		close(pfd[0]);
		slurm_mutex_lock(&proc_count_mutex);
		child_proc_count--;
		slurm_mutex_unlock(&proc_count_mutex);
	} else {
		if (tid)
			track_script_reset_cpid(tid, cpid);
		waitpid(cpid, status, 0);
	}

	return resp;
}
Example #7
0
/* Run a script and return its stdout plus exit status */
static char *_run_script(char **script_argv, int *status)
{
	int cc, i, new_wait, resp_size = 0, resp_offset = 0;
	pid_t cpid;
	char *resp = NULL;
	int pfd[2] = { -1, -1 };

	if (access(capmc_path, R_OK | X_OK) < 0) {
		error("%s: Can not execute: %s", prog_name, capmc_path);
		*status = 127;
		resp = xstrdup("Slurm node_features/knl_cray configuration error");
		return resp;
	}
	if (pipe(pfd) != 0) {
		error("%s: pipe(): %s", prog_name,
		      slurm_strerror(slurm_get_errno()));
		*status = 127;
		resp = xstrdup("System error");
		return resp;
	}

	if ((cpid = fork()) == 0) {
		cc = sysconf(_SC_OPEN_MAX);
		dup2(pfd[1], STDERR_FILENO);
		dup2(pfd[1], STDOUT_FILENO);
		for (i = 0; i < cc; i++) {
			if ((i != STDERR_FILENO) && (i != STDOUT_FILENO))
				close(i);
		}
#ifdef SETPGRP_TWO_ARGS
		setpgrp(0, 0);
#else
		setpgrp();
#endif
		execv(capmc_path, script_argv);
		error("%s: execv(): %s", prog_name,
		      slurm_strerror(slurm_get_errno()));
		exit(127);
	} else if (cpid < 0) {
		close(pfd[0]);
		close(pfd[1]);
		error("%s: fork(): %s", prog_name,
		      slurm_strerror(slurm_get_errno()));
		*status = 127;
		resp = xstrdup("System error");
		return resp;
	} else {
		struct pollfd fds;
		struct timeval tstart;
		resp_size = 1024;
		resp = xmalloc(resp_size);
		close(pfd[1]);
		gettimeofday(&tstart, NULL);
		while (1) {
			fds.fd = pfd[0];
			fds.events = POLLIN | POLLHUP | POLLRDHUP;
			fds.revents = 0;
			new_wait = capmc_timeout - _tot_wait(&tstart);
			if (new_wait <= 0) {
				error("%s: poll() timeout @ %d msec", prog_name,
				      capmc_timeout);
				break;
			}
			new_wait = MIN(new_wait, MAX_POLL_WAIT);
			i = poll(&fds, 1, new_wait);
			if (i == 0) {
				continue;
			} else if (i < 0) {
				error("%s: poll(): %s", prog_name,
				      slurm_strerror(slurm_get_errno()));
				break;
			}
			if ((fds.revents & POLLIN) == 0)
				break;
			i = read(pfd[0], resp + resp_offset,
				 resp_size - resp_offset);
			if (i == 0) {
				break;
			} else if (i < 0) {
				if (errno == EAGAIN)
					continue;
				error("%s: read(): %s", prog_name,
				      slurm_strerror(slurm_get_errno()));
				break;
			} else {
				resp_offset += i;
				if (resp_offset + 1024 >= resp_size) {
					resp_size *= 2;
					resp = xrealloc(resp, resp_size);
				}
			}
		}
		killpg(cpid, SIGTERM);
		usleep(10000);
		killpg(cpid, SIGKILL);
		waitpid(cpid, status, 0);
		close(pfd[0]);
	}
	return resp;
}