Example #1
static void *_pty_thread(void *arg)
{
	int fd = -1;
	srun_job_t *job = (srun_job_t *) arg;
	slurm_addr_t client_addr;

	xsignal_unblock(pty_sigarray);
	xsignal(SIGWINCH, _handle_sigwinch);

	if ((fd = slurm_accept_msg_conn(job->pty_fd, &client_addr)) < 0) {
		error("pty: accept failure: %m");
		return NULL;
	}

	while (job->state <= SRUN_JOB_RUNNING) {
		debug2("waiting for SIGWINCH");
		poll(NULL, 0, -1);
		if (winch) {
			set_winsize(job);
			_notify_winsize_change(fd, job);
		}
		winch = 0;
	}
	return NULL;
}
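Example #1 only works if the SIGWINCH handler does nothing more than record the event, letting poll() return with EINTR. A minimal sketch of what _handle_sigwinch and the winch flag could look like (the names come from the example above; the bodies are assumptions, not the project's verified implementation):

static volatile sig_atomic_t winch = 0;

static void _handle_sigwinch(int sig)
{
	/* Record the window-size change and re-arm; the pty
	 * thread's poll() is interrupted and checks the flag. */
	winch = 1;
	xsignal(SIGWINCH, _handle_sigwinch);
}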
Example #2
static void *_msg_thr_internal(void *arg)
{
	slurm_addr_t cli_addr;
	int newsockfd;
	slurm_msg_t msg;
	int *slurmctld_fd_ptr = (int *)arg;

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	while (!srun_shutdown) {
		newsockfd = slurm_accept_msg_conn(*slurmctld_fd_ptr, &cli_addr);
		if (newsockfd == SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}
		slurm_msg_t_init(&msg);
		if (slurm_receive_msg(newsockfd, &msg, 0) != 0) {
			error("slurm_receive_msg: %m");
			/* close the new socket */
			close(newsockfd);
			continue;
		}
		_handle_msg(&msg);
		slurm_free_msg_members(&msg);
		close(newsockfd);
	}
	return NULL;
}
Example #3
static void
_msg_engine(void)
{
	slurm_addr_t *cli;
	slurm_fd_t sock;

	msg_pthread = pthread_self();
	slurmd_req(NULL);	/* initialize timer */
	while (!_shutdown) {
		if (_reconfig) {
			verbose("got reconfigure request");
			_wait_for_all_threads(5); /* Wait for RPCs to finish */
			_reconfigure();
		}

		cli = xmalloc (sizeof (slurm_addr_t));
		if ((sock = slurm_accept_msg_conn(conf->lfd, cli)) >= 0) {
			_handle_connection(sock, cli);
			continue;
		}
		/*
		 *  Otherwise, accept() failed.
		 */
		xfree (cli);
		if (errno == EINTR)
			continue;
		error("accept: %m");
	}
	verbose("got shutdown request");
	slurm_shutdown_msg_engine(conf->lfd);
	return;
}
Example #4
void *_recv_msg_proc(void *no_data)
{
	slurm_fd_t sock_fd, new_sock_fd;
	slurm_addr_t client_addr;
	connection_arg_t *conn_arg = NULL;

	sock_fd = slurm_init_msg_engine_addrname_port(
			slurmctld_conf.control_addr,
			slurmctld_conf.slurmctld_port);

	if (sock_fd == SLURM_SOCKET_ERROR)
	{
		fatal("slurm_init_msg_engine_addrname_port error %m");
	}

	while (1)
	{
		if ((new_sock_fd = slurm_accept_msg_conn(sock_fd,
					&client_addr))== SLURM_SOCKET_ERROR)
		{
			error("slurm_accept_msg_conn: %m");
			continue;
		}

		/* Allocate a fresh argument structure for each connection
		 * so concurrent service threads do not race on shared
		 * state. */
		conn_arg = (connection_arg_t *) malloc(sizeof(connection_arg_t));
		conn_arg->newsockfd = new_sock_fd;

		pthread_t serv_thread;
		while (pthread_create(&serv_thread, NULL,
				      _service_connection, (void *) conn_arg))
		{
			error("pthread_create error: %m");
			sleep(1);
		}
		/* Detach so thread resources are reclaimed on exit */
		pthread_detach(serv_thread);
	}

	return NULL;
}
Example #5
/* Process incoming RPCs. Meant to execute as a pthread */
extern void *rpc_mgr(void *no_data)
{
	int sockfd, newsockfd;
	int i;
	uint16_t port;
	slurm_addr_t cli_addr;
	slurmdbd_conn_t *conn_arg = NULL;

	master_thread_id = pthread_self();

	/* initialize port for RPCs */
	if ((sockfd = slurm_init_msg_engine_port(get_dbd_port()))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_port error %m");

	slurm_persist_conn_recv_server_init();

	/*
	 * Process incoming RPCs until told to shutdown
	 */
	while (!shutdown_time &&
	       (i = slurm_persist_conn_wait_for_thread_loc()) >= 0) {
		/*
		 * accept needed for stream implementation is a no-op in
		 * message implementation that just passes sockfd to newsockfd
		 */
		if ((newsockfd = slurm_accept_msg_conn(sockfd,
						       &cli_addr)) ==
		    SLURM_SOCKET_ERROR) {
			slurm_persist_conn_free_thread_loc(i);
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}
		fd_set_nonblocking(newsockfd);

		conn_arg = xmalloc(sizeof(slurmdbd_conn_t));
		conn_arg->conn = xmalloc(sizeof(slurm_persist_conn_t));
		conn_arg->conn->fd = newsockfd;
		conn_arg->conn->flags = PERSIST_FLAG_DBD;
		conn_arg->conn->callback_proc = proc_req;
		conn_arg->conn->callback_fini = _connection_fini_callback;
		conn_arg->conn->shutdown = &shutdown_time;
		conn_arg->conn->version = SLURM_MIN_PROTOCOL_VERSION;
		conn_arg->conn->rem_host = xmalloc_nz(sizeof(char) * 16);
		/* Don't fill in the rem_port here.  It will be filled in
		 * later if it is a slurmctld connection. */
		slurm_get_ip_str(&cli_addr, &port,
				 conn_arg->conn->rem_host, sizeof(char) * 16);

		slurm_persist_conn_recv_thread_init(
			conn_arg->conn, i, conn_arg);
	}

	debug("rpc_mgr shutting down");
	(void) slurm_shutdown_msg_engine(sockfd);
	pthread_exit((void *) 0);
	return NULL;
}
Example #6
/*****************************************************************************\
 * message handler thread
\*****************************************************************************/
static void *_msg_thread(void *no_data)
{
	slurm_fd_t sock_fd = -1, new_fd;
	slurm_addr_t cli_addr;
	char *msg;
	int i;

	/* If JobSubmitDynAllocPort is already taken, keep trying to open it
	 * once per minute. Slurmctld will continue to function
	 * during this interval even if nothing can be scheduled. */
	for (i=0; (!thread_shutdown); i++) {
		if (i > 0)
			sleep(60);
		sock_fd = slurm_init_msg_engine_port(sched_port);
		if (sock_fd != SLURM_SOCKET_ERROR)
			break;
		error("dynalloc: slurm_init_msg_engine_port %u %m",
			sched_port);
		error("dynalloc: Unable to communicate with ORTE RAS");
	}

	/* Process incoming RPCs until told to shutdown */
	while (!thread_shutdown) {
		if ((new_fd = slurm_accept_msg_conn(sock_fd, &cli_addr))
				== SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("dyalloc: slurm_accept_msg_conn %m");
			continue;
		}

		if (thread_shutdown) {
			close(new_fd);
			break;
		}

		err_code = 0;
		err_msg = "";
		msg = _recv_msg(new_fd);
		if (msg) {
			_proc_msg(new_fd, msg);
			xfree(msg);
		}
		slurm_close_accepted_conn(new_fd);
	}
	verbose("dynalloc: message engine shutdown");
	if (sock_fd > 0)
		(void) slurm_shutdown_msg_engine(sock_fd);
	pthread_exit((void *) 0);
	return NULL;
}
Example #7
static void *_msg_thread(void *no_data)
{
	int sock_fd = -1, new_fd;
	slurm_addr_t cli_addr;
	char *msg;
	int i;

	/* If Port is already taken, keep trying to open it 10 secs */
	for (i = 0; (!thread_shutdown); i++) {
		if (i > 0)
			sleep(10);
		sock_fd = slurm_init_msg_engine_port(nonstop_comm_port);
		if (sock_fd != SLURM_SOCKET_ERROR)
			break;
		error("slurmctld/nonstop: can not open port: %hu %m",
		      nonstop_comm_port);
	}

	/* Process incoming RPCs until told to shutdown */
	while (!thread_shutdown) {
		new_fd = slurm_accept_msg_conn(sock_fd, &cli_addr);
		if (new_fd == SLURM_SOCKET_ERROR) {
			if (errno != EINTR) {
				info("slurmctld/nonstop: "
				     "slurm_accept_msg_conn %m");
			}
			continue;
		}
		if (thread_shutdown) {
			close(new_fd);
			break;
		}
		/* It would be nice to create a pthread for each new
		 * RPC, but that leaks memory on some systems when
		 * done from a plugin. Alternately, we could maintain
		 * a pool of pthreads and reuse them (see the sketch
		 * after this example). */
		msg = _recv_msg(new_fd);
		if (msg) {
			_proc_msg(new_fd, msg, cli_addr);
			xfree(msg);
		}
		slurm_close(new_fd);
	}
	debug("slurmctld/nonstop: message engine shutdown");
	if (sock_fd > 0)
		(void) slurm_shutdown_msg_engine(sock_fd);
	pthread_exit((void *) 0);
	return NULL;
}
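The comment in the loop above suggests a pool of reusable pthreads. A minimal sketch of that idea, with hypothetical names (_pool_worker, _service_fd, pending_fd) and a single-slot hand-off queue kept deliberately small:

#define POOL_SIZE 4

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  pool_cond = PTHREAD_COND_INITIALIZER;
static int pending_fd = -1;	/* single-slot queue for brevity */

static void *_pool_worker(void *no_data)
{
	int fd;

	while (!thread_shutdown) {
		pthread_mutex_lock(&pool_lock);
		while ((pending_fd < 0) && !thread_shutdown)
			pthread_cond_wait(&pool_cond, &pool_lock);
		fd = pending_fd;
		pending_fd = -1;
		pthread_mutex_unlock(&pool_lock);
		if (fd >= 0)
			_service_fd(fd);	/* hypothetical per-RPC handler */
	}
	return NULL;
}

The accept loop would then store new_fd under pool_lock, signal pool_cond, and move on; the POOL_SIZE workers are created once at startup and reused, avoiding the per-RPC pthread_create the comment warns about (a shutdown path would also need a pthread_cond_broadcast to wake blocked workers).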
Example #8
/* Accept RPC from slurmctld and process it.
 * IN listen_fd: file descriptor for slurmctld communications
 * OUT resp: resource allocation response message
 * RET 1 if resp is filled in, 0 otherwise (a caller sketch
 * follows this example) */
static int
_accept_msg_connection(int listen_fd,
		       resource_allocation_response_msg_t **resp)
{
	int	     conn_fd;
	slurm_msg_t  *msg = NULL;
	slurm_addr_t   cli_addr;
	char         host[256];
	uint16_t     port;
	int          rc = 0;

	conn_fd = slurm_accept_msg_conn(listen_fd, &cli_addr);
	if (conn_fd < 0) {
		error("Unable to accept connection: %m");
		return rc;
	}

	slurm_get_addr(&cli_addr, &port, host, sizeof(host));
	debug2("got message connection from %s:%hu", host, port);

	msg = xmalloc(sizeof(slurm_msg_t));
	slurm_msg_t_init(msg);

	if ((rc = slurm_receive_msg(conn_fd, msg, 0)) != 0) {
		slurm_free_msg(msg);

		if (errno == EINTR) {
			slurm_close_accepted_conn(conn_fd);
			*resp = NULL;
			return 0;
		}

		error("_accept_msg_connection[%s]: %m", host);
		slurm_close_accepted_conn(conn_fd);
		return SLURM_ERROR;
	}

	rc = _handle_msg(msg, resp);
	slurm_free_msg(msg);

	slurm_close_accepted_conn(conn_fd);
	return rc;
}
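A hypothetical caller of Example #8 would loop until the return code says resp was filled in; a rough sketch, assuming listen_fd is an already-initialized message engine:

	resource_allocation_response_msg_t *resp = NULL;
	int rc;

	/* 0 = no response yet, 1 = resp filled in, SLURM_ERROR = failure */
	while ((rc = _accept_msg_connection(listen_fd, &resp)) == 0)
		;
	if (rc == 1)
		debug("resource allocation response received");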
Example #9
/* _background_rpc_mgr - Read and process incoming RPCs to the background
 *	controller (that's us) */
static void *_background_rpc_mgr(void *no_data)
{
	slurm_fd_t newsockfd;
	slurm_fd_t sockfd;
	slurm_addr_t cli_addr;
	slurm_msg_t *msg = NULL;
	int error_code;
	char* node_addr = NULL;

	/* Read configuration only */
	slurmctld_lock_t config_read_lock = {
		READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK };
	int sigarray[] = {SIGUSR1, 0};

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
	debug3("_background_rpc_mgr pid = %lu", (unsigned long) getpid());

	/* initialize port for RPCs */
	lock_slurmctld(config_read_lock);

	/* set node_addr to bind to (NULL means any) */
	if (strcmp(slurmctld_conf.backup_controller,
		   slurmctld_conf.backup_addr) != 0) {
		node_addr = slurmctld_conf.backup_addr;
	}

	if ((sockfd = slurm_init_msg_engine_addrname_port(
			node_addr, slurmctld_conf.slurmctld_port))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_addrname_port error %m");
	unlock_slurmctld(config_read_lock);

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmctld signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals (a minimal handler sketch follows this
	 * example). */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	/*
	 * Process incoming RPCs indefinitely
	 */
	while (slurmctld_config.shutdown_time == 0) {
		/* accept needed for stream implementation
		 * is a no-op in message implementation that just passes
		 * sockfd to newsockfd */
		if ((newsockfd = slurm_accept_msg_conn(sockfd, &cli_addr))
		    == SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}

		msg = xmalloc(sizeof(slurm_msg_t));
		slurm_msg_t_init(msg);
		if (slurm_receive_msg(newsockfd, msg, 0) != 0) {
			error("slurm_receive_msg: %m");
		} else {
			error_code = _background_process_msg(msg);
			if ((error_code == SLURM_SUCCESS) &&
			    (msg->msg_type == REQUEST_SHUTDOWN_IMMEDIATE) &&
			    (slurmctld_config.shutdown_time == 0))
				slurmctld_config.shutdown_time = time(NULL);
		}

		slurm_free_msg_data(msg->msg_type, msg->data);
		slurm_free_msg(msg);

		slurm_close(newsockfd);	/* close new socket */
	}

	debug3("_background_rpc_mgr shutting down");
	slurm_close(sockfd);	/* close the main socket */
	pthread_exit((void *) 0);
	return NULL;
}
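Examples #9 and #11 install a SIGUSR1 handler only so that a blocking accept() fails with EINTR and the loop can re-check its shutdown flag. A plausible sketch of _sig_handler under that assumption:

static void _sig_handler(int signum)
{
	/* Intentionally empty: mere delivery of SIGUSR1 interrupts
	 * the blocking slurm_accept_msg_conn() with EINTR, letting
	 * the accept loop re-test its shutdown condition. */
}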
Example #10
File: msg.c Project: IFCA/slurm
/*****************************************************************************\
 * message handler thread
\*****************************************************************************/
static void *_msg_thread(void *no_data)
{
	slurm_fd_t sock_fd = -1, new_fd;
	slurm_addr_t cli_addr;
	char *msg;
	slurm_ctl_conf_t *conf;
	int i;
	/* Locks: Write configuration, job, node, and partition */
	slurmctld_lock_t config_write_lock = {
		WRITE_LOCK, WRITE_LOCK, WRITE_LOCK, WRITE_LOCK };

	conf = slurm_conf_lock();
	sched_port = conf->schedport;
	slurm_conf_unlock();

	/* Wait until configuration is completely loaded */
	lock_slurmctld(config_write_lock);
	unlock_slurmctld(config_write_lock);

	/* If SchedulerPort is already taken, keep trying to open it
	 * once per minute. Slurmctld will continue to function
	 * during this interval even if nothing can be scheduled. */
	for (i=0; (!thread_shutdown); i++) {
		if (i > 0)
			sleep(60);
		sock_fd = slurm_init_msg_engine_port(sched_port);
		if (sock_fd != SLURM_SOCKET_ERROR)
			break;
		error("wiki: slurm_init_msg_engine_port %u %m",
			sched_port);
		error("wiki: Unable to communicate with Moab");
	}

	/* Process incoming RPCs until told to shutdown */
	while (!thread_shutdown) {
		if ((new_fd = slurm_accept_msg_conn(sock_fd, &cli_addr))
				== SLURM_SOCKET_ERROR) {
			if (errno != EINTR)
				error("wiki: slurm_accept_msg_conn %m");
			continue;
		}
		if (thread_shutdown) {
			close(new_fd);
			break;
		}
		/* It would be nice to create a pthread for each new
		 * RPC, but that leaks memory on some systems when
		 * done from a plugin.
		 * FIXME: Maintain a pool of pthreads and reuse them. */
		err_code = 0;
		err_msg = "";
		msg = _recv_msg(new_fd);
		if (msg) {
			_proc_msg(new_fd, msg);
			xfree(msg);
		}
		slurm_close_accepted_conn(new_fd);
	}
	if (sock_fd > 0)
		(void) slurm_shutdown_msg_engine(sock_fd);
	pthread_exit((void *) 0);
	return NULL;
}
Example #11
/* Process incoming RPCs. Meant to execute as a pthread */
extern void *rpc_mgr(void *no_data)
{
	pthread_attr_t thread_attr_rpc_req;
	slurm_fd_t sockfd, newsockfd;
	int i, retry_cnt, sigarray[] = {SIGUSR1, 0};
	slurm_addr_t cli_addr;
	slurmdbd_conn_t *conn_arg = NULL;

	slurm_mutex_lock(&thread_count_lock);
	master_thread_id = pthread_self();
	slurm_mutex_unlock(&thread_count_lock);

	(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
	(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

	/* threads to process individual RPC's are detached */
	slurm_attr_init(&thread_attr_rpc_req);
	if (pthread_attr_setdetachstate
	    (&thread_attr_rpc_req, PTHREAD_CREATE_DETACHED))
		fatal("pthread_attr_setdetachstate %m");

	/* initialize port for RPCs */
	if ((sockfd = slurm_init_msg_engine_port(get_dbd_port()))
	    == SLURM_SOCKET_ERROR)
		fatal("slurm_init_msg_engine_port error %m");

	/* Prepare to catch SIGUSR1 to interrupt accept().
	 * This signal is generated by the slurmdbd signal
	 * handler thread upon receipt of SIGABRT, SIGINT,
	 * or SIGTERM. That thread does all processing of
	 * all signals. */
	xsignal(SIGUSR1, _sig_handler);
	xsignal_unblock(sigarray);

	/*
	 * Process incoming RPCs until told to shutdown
	 */
	while ((i = _wait_for_server_thread()) >= 0) {
		/*
		 * accept needed for stream implementation is a no-op in
		 * message implementation that just passes sockfd to newsockfd
		 */
		if ((newsockfd = slurm_accept_msg_conn(sockfd,
						       &cli_addr)) ==
		    SLURM_SOCKET_ERROR) {
			_free_server_thread((pthread_t) 0);
			if (errno != EINTR)
				error("slurm_accept_msg_conn: %m");
			continue;
		}
		fd_set_nonblocking(newsockfd);

		conn_arg = xmalloc(sizeof(slurmdbd_conn_t));
		conn_arg->newsockfd = newsockfd;
		slurm_get_ip_str(&cli_addr, &conn_arg->orig_port,
				 conn_arg->ip, sizeof(conn_arg->ip));
		retry_cnt = 0;
		while (pthread_create(&slave_thread_id[i],
				      &thread_attr_rpc_req,
				      _service_connection,
				      (void *) conn_arg)) {
			if (retry_cnt > 0) {
				error("pthread_create failure, "
				      "aborting RPC: %m");
				close(newsockfd);
				break;
			}
			error("pthread_create failure: %m");
			retry_cnt++;
			usleep(1000);	/* retry in 1 msec */
		}
	}

	debug3("rpc_mgr shutting down");
	slurm_attr_destroy(&thread_attr_rpc_req);
	(void) slurm_shutdown_msg_engine(sockfd);
	_wait_for_thread_fini();
	pthread_exit((void *) 0);
	return NULL;
}
Example #12
/* Wait for barrier and get full PMI Keyval space data */
int  slurm_get_kvs_comm_set(struct kvs_comm_set **kvs_set_ptr,
		int pmi_rank, int pmi_size)
{
	int rc, srun_fd, retries = 0, timeout = 0;
	slurm_msg_t msg_send, msg_rcv;
	slurm_addr_t slurm_addr, srun_reply_addr;
	char hostname[64];
	uint16_t port;
	kvs_get_msg_t data;
	char *env_pmi_ifhn;

	if (kvs_set_ptr == NULL)
		return EINVAL;
	*kvs_set_ptr = NULL;	/* initialization */

	if ((rc = _get_addr()) != SLURM_SUCCESS) {
		error("_get_addr: %m");
		return rc;
	}

	_set_pmi_time();

	if (pmi_fd < 0) {
		if ((pmi_fd = slurm_init_msg_engine_port(0)) < 0) {
			error("slurm_init_msg_engine_port: %m");
			return SLURM_ERROR;
		}
		fd_set_blocking(pmi_fd);
	}
	if (slurm_get_stream_addr(pmi_fd, &slurm_addr) < 0) {
		error("slurm_get_stream_addr: %m");
		return SLURM_ERROR;
	}
	/* hostname is not set here, so slurm_get_addr fails
	slurm_get_addr(&slurm_addr, &port, hostname, sizeof(hostname)); */
	port = ntohs(slurm_addr.sin_port);
	if ((env_pmi_ifhn = getenv("SLURM_PMI_RESP_IFHN"))) {
		strncpy(hostname, env_pmi_ifhn, sizeof(hostname));
		hostname[sizeof(hostname)-1] = 0;
	} else
		gethostname_short(hostname, sizeof(hostname));

	data.task_id = pmi_rank;
	data.size = pmi_size;
	data.port = port;
	data.hostname = hostname;
	slurm_msg_t_init(&msg_send);
	slurm_msg_t_init(&msg_rcv);
	msg_send.address = srun_addr;
	msg_send.msg_type = PMI_KVS_GET_REQ;
	msg_send.data = &data;

	/* Send the RPC to the local srun communication manager.
	 * Since the srun can be sent thousands of messages at
	 * the same time and refuse some connections, retry as
	 * needed. Wait until all key-pairs have been sent by
	 * all tasks then spread out messages by task's rank.
	 * Also increase the message timeout if many tasks
	 * since the srun command can get very overloaded (the
	 * default timeout is 10 secs).
	 */
	_delay_rpc(pmi_rank, pmi_size);
	if      (pmi_size > 4000)	/* 240 secs */
		timeout = slurm_get_msg_timeout() * 24000;
	else if (pmi_size > 1000)	/* 120 secs */
		timeout = slurm_get_msg_timeout() * 12000;
	else if (pmi_size > 100)	/* 60 secs */
		timeout = slurm_get_msg_timeout() * 6000;
	else if (pmi_size > 10)		/* 20 secs */
		timeout = slurm_get_msg_timeout() * 2000;

	while (slurm_send_recv_rc_msg_only_one(&msg_send, &rc, timeout) < 0) {
		if (retries++ > MAX_RETRIES) {
			error("slurm_get_kvs_comm_set: %m");
			return SLURM_ERROR;
		} else
			debug("get kvs retry %d", retries);
		_delay_rpc(pmi_rank, pmi_size);
	}
	if (rc != SLURM_SUCCESS) {
		error("slurm_get_kvs_comm_set error_code=%d", rc);
		return rc;
	}

	/* get the message after all tasks reach the barrier */
	srun_fd = slurm_accept_msg_conn(pmi_fd, &srun_reply_addr);
	if (srun_fd < 0) {
		error("slurm_accept_msg_conn: %m");
		return errno;
	}

	while ((rc = slurm_receive_msg(srun_fd, &msg_rcv, timeout)) != 0) {
		if (errno == EINTR)
			continue;
		error("slurm_receive_msg: %m");
		slurm_close(srun_fd);
		return errno;
	}
	if (msg_rcv.auth_cred)
		(void)g_slurm_auth_destroy(msg_rcv.auth_cred);

	if (msg_rcv.msg_type != PMI_KVS_GET_RESP) {
		error("slurm_get_kvs_comm_set msg_type=%d", msg_rcv.msg_type);
		slurm_close(srun_fd);
		return SLURM_UNEXPECTED_MSG_ERROR;
	}
	if (slurm_send_rc_msg(&msg_rcv, SLURM_SUCCESS) < 0)
		error("slurm_send_rc_msg: %m");

	slurm_close(srun_fd);
	*kvs_set_ptr = msg_rcv.data;

	rc = _forward_comm_set(*kvs_set_ptr);
	return rc;
}