Exemplo n.º 1
0
/*
 * main loop of agent thread
 */
static void *
_agent(void * unused)
{
	eio_handle_t *pmi2_handle;
	eio_obj_t *tree_listen_obj, *task_obj;
	int i;
	
	pmi2_handle = eio_handle_create();

	//fd_set_nonblocking(tree_sock);
	tree_listen_obj = eio_obj_create(tree_sock, &tree_listen_ops,
					 (void *)(-1));
	eio_new_initial_obj(pmi2_handle, tree_listen_obj);
	
	/* for stepd, add the sockets to tasks */
	if (in_stepd()) {
		for (i = 0; i < job_info.ltasks; i ++) {
			task_obj = eio_obj_create(STEPD_PMI_SOCK(i), &task_ops,
						  (void*)(long)(i));
			eio_new_initial_obj(pmi2_handle, task_obj);
		}
		initialized = xmalloc(job_info.ltasks * sizeof(int));
		finalized = xmalloc(job_info.ltasks * sizeof(int));
	}
	
	eio_handle_mainloop(pmi2_handle);

	debug("mpi/pmi2: agent thread exit");

	eio_handle_destroy(pmi2_handle);
	return NULL;
}
Exemplo n.º 2
0
Arquivo: kvs.c Projeto: SchedMD/slurm
/*
 * Reset the temporary KVS buffer and pre-load it with the tree command
 * header, so that the buffer can later be sent as a complete message.
 *
 * Header layout (packed through a temporary Buf):
 *   in stepd: TREE_CMD_KVS_FENCE, from_nodeid, from_node, num_children,
 *             kvs_seq
 *   otherwise: TREE_CMD_KVS_FENCE_RESP, kvs_seq
 *
 * Also resets tasks_to_wait and children_to_wait to 0.
 *
 * Returns SLURM_SUCCESS.
 */
extern int
temp_kvs_init(void)
{
	uint16_t cmd;
	uint32_t nodeid, num_children, size;
	Buf buf = NULL;

	/* drop any previous content and start from one size increment */
	xfree(temp_kvs_buf);
	temp_kvs_cnt = 0;
	temp_kvs_size = TEMP_KVS_SIZE_INC;
	temp_kvs_buf = xmalloc(temp_kvs_size);

	/* put the tree cmd here to simplify message sending */
	buf = init_buf(1024);
	if (in_stepd()) {
		cmd = TREE_CMD_KVS_FENCE;
		nodeid = job_info.nodeid;
		/* XXX: TBC */
		num_children = tree_info.num_children + 1;

		pack16(cmd, buf);
		pack32(nodeid, buf); /* from_nodeid */
		packstr(tree_info.this_node, buf); /* from_node */
		pack32(num_children, buf); /* num_children */
		pack32(kvs_seq, buf);
	} else {
		cmd = TREE_CMD_KVS_FENCE_RESP;

		pack16(cmd, buf);
		pack32(kvs_seq, buf);
	}

	size = get_buf_offset(buf);
	if (temp_kvs_cnt + size > temp_kvs_size) {
		/*
		 * Grow in TEMP_KVS_SIZE_INC steps until the header fits; a
		 * single step is not guaranteed to be enough because the
		 * header contains a variable-length string.
		 */
		while (temp_kvs_cnt + size > temp_kvs_size)
			temp_kvs_size += TEMP_KVS_SIZE_INC;
		/*
		 * NOTE(review): the result of xrealloc is not assigned, so it
		 * presumably updates temp_kvs_buf in place -- confirm against
		 * its definition.
		 */
		xrealloc(temp_kvs_buf, temp_kvs_size);
	}
	memcpy(&temp_kvs_buf[temp_kvs_cnt], get_buf_data(buf), size);
	temp_kvs_cnt += size;
	free_buf(buf);

	tasks_to_wait = 0;
	children_to_wait = 0;

	return SLURM_SUCCESS;
}
Exemplo n.º 3
0
/*
 * Handle one tree request arriving on fd.
 *
 * In a stepd a 32-bit uid in network byte order precedes the command; it is
 * read, logged and otherwise ignored.  The command itself is processed by
 * handle_tree_cmd().
 *
 * fd - connected socket to read the request from
 * Returns the result of handle_tree_cmd(), or SLURM_ERROR via the rwfail
 * label (presumably jumped to by safe_read when the read fails -- confirm
 * against the macro definition).
 */
static int
_handle_tree_request(int fd)
{
	uint32_t uid;

	if (!in_stepd())
		return handle_tree_cmd(fd);

	/* skip uid passed from slurmd */
	safe_read(fd, &uid, sizeof(uint32_t));
	uid = ntohl(uid);
	debug3("mpi/pmi2: _handle_tree_request: req from uid %u", uid);

	return handle_tree_cmd(fd);

rwfail:
	return SLURM_ERROR;
}
Exemplo n.º 4
0
/*
 * Read handler for the tree listening socket.
 *
 * While the listening fd is reported ready, accepts one connection at a
 * time, handles the request on it with _handle_tree_request() and closes
 * the connection.
 *
 * obj  - eio object whose fd is the listening socket
 * objs - not used by this handler
 * Returns 0 in all cases, including accept() failures (errors other than
 * EAGAIN/EWOULDBLOCK/ECONNABORTED are logged first).
 */
static int
_tree_listen_read(eio_obj_t *obj, List objs)
{
	int sd;
	struct sockaddr addr;
	struct sockaddr_in *sin;
	socklen_t size;
	char buf[INET_ADDRSTRLEN];

	debug2("mpi/pmi2: _tree_listen_read");

	while (1) {
		/*
		 * Return early if fd is not now ready
		 */
		if (!_is_fd_ready(obj->fd))
			return 0;

		do {
			/*
			 * The length argument is value-result: accept()
			 * overwrites it with the peer address length, so it
			 * must be reset to the buffer size before every call.
			 */
			size = sizeof(addr);
			sd = accept(obj->fd, &addr, &size);
		} while ((sd < 0) && (errno == EINTR));

		if (sd < 0) {
			/* no more connections, or the peer already gave up */
			if ((errno != EAGAIN) &&
			    (errno != EWOULDBLOCK) &&
			    (errno != ECONNABORTED))
				error("mpi/pmi2: unable to accept new connection: %m");
			return 0;
		}

		if (! in_stepd()) {
			/* the peer address is formatted as IPv4 for the log */
			sin = (struct sockaddr_in *) &addr;
			inet_ntop(AF_INET, &sin->sin_addr, buf, INET_ADDRSTRLEN);
			debug3("mpi/pmi2: accepted tree connection: ip=%s sd=%d",
			       buf, sd);
		}

		/*
		 * read command from socket and handle it; the handler's
		 * return value is not checked here
		 */
		_handle_tree_request(sd);
		close(sd);
	}
	return 0;
}
Exemplo n.º 5
0
Arquivo: kvs.c Projeto: SchedMD/slurm
/*
 * Send the accumulated temporary KVS buffer, then reset it.
 *
 * Destination:
 *   - not in stepd (srun): forwarded to job_info.step_nodelist
 *   - stepd with a parent node: forwarded to tree_info.parent_node
 *   - stepd without a parent node: sent to srun via tree_msg_to_srun()
 *
 * kvs_seq is incremented before the first attempt.  A failed send is
 * retried until MAX_RETRIES attempts have been made, sleeping between
 * attempts with a delay that starts at 1 second and doubles each time.
 * temp_kvs_init() is called afterwards whether or not the send succeeded.
 *
 * Returns the result of the last send attempt.
 */
extern int
temp_kvs_send(void)
{
	int rc = SLURM_ERROR;
	int attempts = 0;
	unsigned int wait_secs = 1;
	char *dest = NULL;

	if (in_stepd()) {
		if (tree_info.parent_node)
			dest = xstrdup(tree_info.parent_node);
	} else {
		/* srun */
		dest = xstrdup(job_info.step_nodelist);
	}

	/* cmd included in temp_kvs_buf */
	kvs_seq++; /* expecting new kvs after now */

	for (;;) {
		/* logged once only, just before the second attempt */
		if (attempts == 1)
			verbose("failed to send temp kvs, rc=%d, retrying", rc);

		/*
		 * dest set: srun or non-first-level stepds
		 * dest NULL: first level stepds
		 */
		rc = dest ? slurm_forward_data(&dest, tree_sock_addr,
					       temp_kvs_cnt, temp_kvs_buf)
			  : tree_msg_to_srun(temp_kvs_cnt, temp_kvs_buf);

		if ((rc == SLURM_SUCCESS) || (++attempts >= MAX_RETRIES))
			break;

		/* wait, in case parent stepd / srun not ready */
		sleep(wait_secs);
		wait_secs *= 2;
	}

	temp_kvs_init();	/* clear old temp kvs */
	xfree(dest);

	return rc;
}
Exemplo n.º 6
0
Arquivo: kvs.c Projeto: BYUHPC/slurm
/*
 * Send the accumulated temporary KVS buffer, then reset it.
 *
 * Destination, chosen anew on every attempt:
 *   - not in stepd (srun): the stepds in job_info.step_nodelist
 *   - stepd with a parent node: the stepd on tree_info.parent_node
 *   - stepd without a parent node: srun, via tree_msg_to_srun()
 *
 * kvs_seq is incremented before the first attempt.  A failed send is
 * retried until MAX_RETRIES attempts have been made, sleeping between
 * attempts with a delay that starts at 1 second and doubles each time.
 * temp_kvs_init() is called afterwards whether or not the send succeeded.
 *
 * Returns the result of the last send attempt.
 */
extern int
temp_kvs_send(void)
{
	int rc = SLURM_ERROR;
	int attempts = 0;
	unsigned int wait_secs = 1;

	/* cmd included in temp_kvs_buf */
	kvs_seq ++; /* expecting new kvs after now */

	for (;;) {
		/* logged once only, just before the second attempt */
		if (attempts == 1)
			verbose("failed to send temp kvs, rc=%d, retrying", rc);

		if (in_stepd()) {
			if (tree_info.parent_node != NULL) {
				/* non-first-level stepds */
				rc = tree_msg_to_stepds(tree_info.parent_node,
							temp_kvs_cnt,
							temp_kvs_buf);
			} else {
				/* first level stepds */
				rc = tree_msg_to_srun(temp_kvs_cnt,
						      temp_kvs_buf);
			}
		} else {
			/* srun */
			rc = tree_msg_to_stepds(job_info.step_nodelist,
						temp_kvs_cnt,
						temp_kvs_buf);
		}

		if ((rc == SLURM_SUCCESS) || (++attempts >= MAX_RETRIES))
			break;

		/* wait, in case parent stepd / srun not ready */
		sleep(wait_secs);
		wait_secs *= 2;
	}

	temp_kvs_init();	/* clear old temp kvs */
	return rc;
}