Example #1
static int _serv_write(eio_obj_t *obj, List objs)
{
	/* sanity check */
	xassert(NULL != obj);
	if (obj->shutdown) {
		/* corresponding connection will be
		 * cleaned up during plugin finalize
		 */
		return 0;
	}

	PMIXP_DEBUG("fd = %d", obj->fd);
	pmixp_conn_t *conn = (pmixp_conn_t *)obj->arg;

	/* debug stub */
	pmixp_debug_hang(0);

	/* progress sends */
	pmixp_conn_progress_snd(conn);

	/* if we are done with this connection - remove it */
	if (!pmixp_conn_is_alive(conn)) {
		obj->shutdown = true;
		PMIXP_DEBUG("Connection finalized fd = %d", obj->fd);
		/* cleanup after this connection */
		eio_remove_obj(obj, objs);
		pmixp_conn_return(conn);
	}
	return 0;
}
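Every example in this list calls pmixp_debug_hang(0) before doing real work. For reference, here is a minimal sketch of that debug stub, assuming the usual SLURM PMIx plugin idiom (the exact body may differ between versions): with a non-zero argument the call spins until a developer attaches a debugger and zeroes the local variable, while pmixp_debug_hang(0) is a no-op left in place for convenience.

/* Minimal sketch of the debug stub - an assumption based on the
 * common SLURM idiom, not verbatim plugin code. To use it, call
 * pmixp_debug_hang(1) at the spot of interest, attach gdb to the
 * slurmstepd process, set `delay = 0` and continue.
 */
static inline void pmixp_debug_hang(int delay)
{
	while (delay) {
		sleep(1);	/* wait for a debugger to clear `delay` */
	}
}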
Example #2
static int _serv_read(eio_obj_t *obj, List objs)
{
	/* sanity check */
	xassert(NULL != obj);
	if (obj->shutdown) {
		/* corresponding connection will be
		 * cleaned up during plugin finalize
		 */
		return 0;
	}

	PMIXP_DEBUG("fd = %d", obj->fd);
	pmixp_conn_t *conn = (pmixp_conn_t *)obj->arg;
	bool proceed = true;

	/* debug stub */
	pmixp_debug_hang(0);

	/* Read and process all received messages */
	while (proceed) {
		if (!pmixp_conn_progress_rcv(conn)) {
			proceed = false;
		}
		if (!pmixp_conn_is_alive(conn)) {
			obj->shutdown = true;
			PMIXP_DEBUG("Connection closed fd = %d", obj->fd);
			/* cleanup after this connection */
			eio_remove_obj(obj, objs);
			pmixp_conn_return(conn);
			proceed = false;
		}
	}
	return 0;
}
Example #3
static bool _serv_writable(eio_obj_t *obj)
{
	/* sanity check */
	xassert(NULL != obj);
	if (obj->shutdown) {
		/* corresponding connection will be
		 * cleaned up during plugin finalize
		 */
		return false;
	}

	/* get I/O engine */
	pmixp_conn_t *conn = (pmixp_conn_t *)obj->arg;
	pmixp_io_engine_t *eng = conn->eng;

	/* debug stub */
	pmixp_debug_hang(0);

	/* Invoke cleanup callbacks if any */
	pmixp_io_send_cleanup(eng);

	/* check if we have something to send */
	if (pmixp_io_send_pending(eng)) {
		return true;
	}
	return false;
}
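The three handlers above only take effect once they are registered in an eio operations table. A hedged sketch of that wiring follows, using the callback names of SLURM's struct io_operations; the _serv_readable helper and the exact set of fields are assumptions, not verbatim plugin code:

/* Sketch: plausible registration of the handlers with the eio engine.
 * Field names follow SLURM's struct io_operations; _serv_readable is
 * a hypothetical readability check not shown in these examples.
 */
static struct io_operations slurm_peer_ops = {
	.readable	= _serv_readable,	/* is there data to read? */
	.handle_read	= _serv_read,		/* drain and process messages */
	.writable	= _serv_writable,	/* anything pending to send? */
	.handle_write	= _serv_write,		/* progress outstanding sends */
};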
Example #4
/*
 * TODO: we need to keep track of the "me"
 * structures created here, because we need to
 * free them in "pmixp_stepd_finalize"
 */
void pmixp_server_slurm_conn(int fd)
{
	eio_obj_t *obj;
	pmixp_conn_t *conn = NULL;

	PMIXP_DEBUG("Request from fd = %d", fd);
	pmixp_debug_hang(0);

	/* Set nonblocking */
	fd_set_nonblocking(fd);
	fd_set_close_on_exec(fd);
	conn = pmixp_conn_new_temp(PMIXP_PROTO_SLURM, fd, _slurm_new_msg);

	/* try to process right here */
	pmixp_conn_progress_rcv(conn);
	if (!pmixp_conn_is_alive(conn)) {
		/* success, don't need this connection anymore */
		pmixp_conn_return(conn);
		return;
	}

	/* If it is a blocking operation: create an eio object
	 * to handle it asynchronously */
	obj = eio_obj_create(fd, &slurm_peer_ops, (void *)conn);
	eio_new_obj(pmixp_info_io(), obj);
}
Example #5
static int _pmix_p2p_send_core(const char *nodename, const char *address,
			       const char *data, uint32_t len)
{
	int rc, timeout;
	slurm_msg_t msg;
	forward_data_msg_t req;
	List ret_list;
	ret_data_info_t *ret_data_info = NULL;

	pmixp_debug_hang(0);

	slurm_msg_t_init(&msg);

	PMIXP_DEBUG("nodelist=%s, address=%s, len=%u", nodename, address, len);
	req.address = (char *)address;
	req.len = len;
	/* there is not much we can do - just cast away the const */
	req.data = (char *)data;

	msg.msg_type = REQUEST_FORWARD_DATA;
	msg.data = &req;

	if (slurm_conf_get_addr(nodename, &msg.address) == SLURM_ERROR) {
		PMIXP_ERROR("Can't find address for host "
			    "%s, check slurm.conf", nodename);
		return SLURM_ERROR;
	}

	timeout = slurm_get_msg_timeout() * 1000;
	msg.forward.timeout = timeout;
	msg.forward.cnt = 0;
	msg.forward.nodelist = NULL;
	ret_list = slurm_send_addr_recv_msgs(&msg, (char*)nodename, timeout);
	if (!ret_list) {
		/* This should never happen (when this was
		 * written slurm_send_addr_recv_msgs always
		 * returned a list) */
		PMIXP_ERROR("No return list given from "
			    "slurm_send_addr_recv_msgs spawned for %s",
			    nodename);
		return SLURM_ERROR;
	} else if ((errno != SLURM_COMMUNICATIONS_CONNECTION_ERROR) &&
		   !list_count(ret_list)) {
		PMIXP_ERROR("failed to send to %s, errno=%d", nodename, errno);
		return SLURM_ERROR;
	}

	rc = SLURM_SUCCESS;
	while ((ret_data_info = list_pop(ret_list))) {
		int temp_rc = slurm_get_return_code(ret_data_info->type,
						    ret_data_info->data);
		if (temp_rc != SLURM_SUCCESS)
			rc = temp_rc;
		destroy_data_info(ret_data_info);
	}

	FREE_NULL_LIST(ret_list);

	return rc;
}
Example #6
int pmixp_p2p_send(const char *nodename, const char *address, const char *data,
		   uint32_t len, unsigned int start_delay,
		   unsigned int retry_cnt, int silent)
{
	int retry = 0, rc;
	unsigned int delay = start_delay; /* in milliseconds */

	pmixp_debug_hang(0);

	while (1) {
		if (!silent && retry >= 1) {
			PMIXP_DEBUG("send failed, rc=%d, try #%d", rc, retry);
		}

		rc = _pmix_p2p_send_core(nodename, address, data, len);

		if (rc == SLURM_SUCCESS)
			break;

		retry++;
		if (retry >= retry_cnt) {
			PMIXP_ERROR("send failed, rc=%d, exceeded the retry limit", rc);
			break;
		}

		/* wait with exponentially increasing delay
		 * (it doubles after every retry) */
		struct timespec ts = { delay / 1000,
				       (delay % 1000) * 1000000 };
		nanosleep(&ts, NULL);
		delay *= 2;
	}

	return rc;
}
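As a usage illustration (the buffer and parameter values here are hypothetical, not taken from the plugin): with start_delay = 100 and retry_cnt = 5, a persistently failing send sleeps 100, 200, 400 and 800 ms between the attempts before giving up.

/* Hypothetical call site: retry a send up to 5 times with exponential
 * backoff starting at 100 ms. msg_buf/msg_len are illustrative only.
 */
int rc = pmixp_p2p_send(nodename, pmixp_info_srv_addr(), msg_buf,
			msg_len, 100, 5, 0 /* not silent */);
if (SLURM_SUCCESS != rc)
	PMIXP_ERROR("Direct send to %s failed after retries", nodename);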
Example #7
static int _serv_read(eio_obj_t *obj, List objs)
{
	PMIXP_DEBUG("fd = %d", obj->fd);
	pmixp_io_engine_t *me = (pmixp_io_engine_t *)obj->arg;
	bool proceed = true;

	pmixp_debug_hang(0);

	/* Read and process all received messages */
	while (proceed) {
		switch (_process_message(me)) {
		case 2:
			obj->shutdown = true;
			PMIXP_DEBUG("Connection finalized fd = %d", obj->fd);
			/* cleanup after this connection */
			eio_remove_obj(obj, objs);
			xfree(me);
			/* fall-thru: a finalized connection also ends the loop */
		case 0:
			proceed = false;
			/* fall-thru */
		case 1:
			break;
		}
	}
	return 0;
}
Example #8
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
	char *nodename = NULL;
	int rc;

	if (hdr->nodeid != _ring_prev_id(coll)) {
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected is %d",
			    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
		xfree(nodename);
		return SLURM_ERROR;
	}
	rc = pmixp_coll_check(coll, hdr->seq);
	if (PMIXP_COLL_REQ_FAILURE == rc) {
		/* this is an unacceptable event: either something went
		 * really wrong or the state machine is incorrect.
		 * This will 100% lead to application hang.
		 */
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		pmixp_debug_hang(0); /* enable hang to debug this! */
		slurm_kill_job_step(pmixp_info_jobid(),
				    pmixp_info_stepid(), SIGKILL);
		xfree(nodename);
		return SLURM_SUCCESS;
	} else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Wrong collective seq. #%d from %s:%u, current is %d, skip this message",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		xfree(nodename);
#endif
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
Example #9
pmix_status_t fencenb_fn(const pmix_proc_t procs[], size_t nprocs,
			 const pmix_info_t info[], size_t ninfo,
			 char *data, size_t ndata,
			 pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	PMIXP_DEBUG("called");
	pmixp_coll_t *coll;
	pmixp_coll_type_t type = PMIXP_COLL_TYPE_FENCE;
	pmix_status_t status = PMIX_SUCCESS;

	pmixp_debug_hang(0);

	coll = pmixp_state_coll_get(type, procs, nprocs);
	pmixp_coll_set_callback(coll, cbfunc, cbdata);
	if (SLURM_SUCCESS != pmixp_coll_contrib_local(coll, data, ndata)) {
		status = PMIX_ERROR;
		goto error;
	}
	return PMIX_SUCCESS;
error:
	cbfunc(status, NULL, 0, cbdata, NULL, NULL);
	return status;
}
Example #10
/*
 * For this to work the following conditions must be satisfied:
 * - SLURM has to be configured with the `--enable-debug` option
 * - the job step needs to have at least two nodes
 * In this case the communication exchange will be done between
 * the first two nodes.
 */
void pmixp_server_run_cperf()
{
	int size;
	size_t start, end, bound;

	pmixp_debug_hang(0);

	start = 1 << _pmixp_cperf_low;
	end = 1 << _pmixp_cperf_up;
	bound = 1 << _pmixp_cperf_bound;

	for (size = start; size <= end; size *= 2) {
		int j, iters = _pmixp_cperf_siter;
		struct timeval tv1, tv2;
		if (size >= bound) {
			iters = _pmixp_cperf_liter;
		}
		double times[iters];
		char *data = xmalloc(size);

		PMIXP_ERROR("coll perf %d", size);

		for (j = 0; j < iters; j++) {
			gettimeofday(&tv1, NULL);
			_pmixp_server_cperf_iter(data, size);
			gettimeofday(&tv2, NULL);
			times[j] = tv2.tv_sec + 1E-6 * tv2.tv_usec -
					(tv1.tv_sec + 1E-6 * tv1.tv_usec);
		}

		for (j = 0; j < iters; j++) {
			/* Output measurements to the slurmd.log */
			PMIXP_ERROR("\t%d %d: %.9lf", j, size, times[j]);
		}
		xfree(data);
	}
}
Example #11
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
				     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application hang.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain encapsulation - in general `buf` is
		 * not a pointer, but an opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* if the pingpong mode was activated -
		 * node 0 sends ping requests
		 * and the receiver is assumed to respond back to node 0
		 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	free_buf(buf);
}
Example #12
int pmixp_coll_contrib_local(pmixp_coll_t *coll, char *data, size_t size,
			     pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	int ret = SLURM_SUCCESS;

	pmixp_debug_hang(0);

	/* sanity check */
	pmixp_coll_sanity_check(coll);

	/* lock the structure */
	slurm_mutex_lock(&coll->lock);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: contrib/loc: seqnum=%u, state=%s, size=%zd",
		    coll, coll->seq, pmixp_coll_state2str(coll->state), size);
#endif

	switch (coll->state) {
	case PMIXP_COLL_SYNC:
		/* change the state */
		coll->ts = time(NULL);
		/* fall-thru */
	case PMIXP_COLL_COLLECT:
		/* sanity check */
		break;
	case PMIXP_COLL_DOWNFWD:
		/* We are waiting for some send requests
		 * to finish, but the local node has already
		 * started the next contribution.
		 * This is an OK situation; go ahead and store it,
		 * since the buffer with the contribution is not
		 * in use right now.
		 */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: next coll!", coll);
#endif
		break;
	case PMIXP_COLL_UPFWD:
	case PMIXP_COLL_UPFWD_WSC:
	case PMIXP_COLL_UPFWD_WPC:
		/* this is not a correct behavior, respond with an error. */
#ifdef PMIXP_COLL_DEBUG
		PMIXP_DEBUG("%p: contrib/loc: before prev coll is finished!",
			    coll);
#endif
		ret = SLURM_ERROR;
		goto exit;
	default:
		/* FATAL: should not happen in normal workflow */
		PMIXP_ERROR("%p: local contrib while active collective, "
			    "state = %s",
			    coll, pmixp_coll_state2str(coll->state));
		xassert(0);
		abort();
	}

	if (coll->contrib_local) {
		/* Double contribution - reject */
		ret = SLURM_ERROR;
		goto exit;
	}

	/* save & mark local contribution */
	coll->contrib_local = true;
	pmixp_server_buf_reserve(coll->ufwd_buf, size);
	memcpy(get_buf_data(coll->ufwd_buf) + get_buf_offset(coll->ufwd_buf),
	       data, size);
	set_buf_offset(coll->ufwd_buf, get_buf_offset(coll->ufwd_buf) + size);

	/* setup callback info */
	coll->cbfunc = cbfunc;
	coll->cbdata = cbdata;

	/* check if the collective is ready to progress */
	_progress_coll(coll);

#ifdef PMIXP_COLL_DEBUG
	PMIXP_DEBUG("%p: finish, state=%s",
		    coll, pmixp_coll_state2str(coll->state));
#endif

exit:
	/* unlock the structure */
	slurm_mutex_unlock(&coll->lock);
	return ret;
}
Example #13
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s", nodename);
			free_buf(buf);
			xfree(nodename);
			return;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
			    nodename, (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something
			 * went really wrong or the state machine is
			 * incorrect. This will 100% lead to application hang.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
					    SIGKILL);

			free_buf(buf);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK: {
		/* this is just health ping.
		 * TODO: can we do something more sophisticated?
		 */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}
	xfree(nodename);
}
Example #14
int pmixp_libpmix_job_set(void)
{
	List lresp;
	pmix_info_t *info;
	int ninfo;
	ListIterator it;
	pmix_info_t *kvp;

	int i, rc;
	uid_t uid = pmixp_info_jobuid();
	gid_t gid = pmixp_info_jobgid();
	_register_caddy_t *register_caddy;

	register_caddy = xmalloc(sizeof(_register_caddy_t) *
				 (pmixp_info_tasks_loc() + 1));
	pmixp_debug_hang(0);

	/* Use list to safely expand/reduce key-value pairs. */
	lresp = list_create(pmixp_xfree_xmalloced);

	_general_proc_info(lresp);

	_set_tmpdirs(lresp);

	_set_procdatas(lresp);

	_set_sizeinfo(lresp);

	if (SLURM_SUCCESS != _set_mapsinfo(lresp)) {
		list_destroy(lresp);
		xfree(register_caddy);
		PMIXP_ERROR("Can't build nodemap");
		return SLURM_ERROR;
	}

	_set_localinfo(lresp);

	ninfo = list_count(lresp);
	PMIX_INFO_CREATE(info, ninfo);
	it = list_iterator_create(lresp);
	i = 0;
	while (NULL != (kvp = list_next(it))) {
		info[i] = *kvp;
		i++;
	}
	list_iterator_destroy(it);
	list_destroy(lresp);

	register_caddy[0].active = 1;
	rc = PMIx_server_register_nspace(pmixp_info_namespace(),
			pmixp_info_tasks_loc(), info, ninfo, _release_cb,
			&register_caddy[0]);

	if (PMIX_SUCCESS != rc) {
		PMIXP_ERROR("Cannot register namespace %s, nlocalproc=%d, "
			    "ninfo = %d", pmixp_info_namespace(),
			    pmixp_info_tasks_loc(), ninfo);
		PMIX_INFO_FREE(info, ninfo);
		xfree(register_caddy);
		return SLURM_ERROR;
	}

	PMIXP_DEBUG("task initialization");
	for (i = 0; i < pmixp_info_tasks_loc(); i++) {
		pmix_proc_t proc;
		register_caddy[i+1].active = 1;
		strncpy(proc.nspace, pmixp_info_namespace(), PMIX_MAX_NSLEN);
		proc.rank = pmixp_info_taskid(i);
		rc = PMIx_server_register_client(&proc, uid, gid, NULL,
				_release_cb, &register_caddy[i + 1]);
		if (PMIX_SUCCESS != rc) {
			PMIXP_ERROR("Cannot register client %d(%d) in namespace %s",
				    pmixp_info_taskid(i), i,
				    pmixp_info_namespace());
			PMIX_INFO_FREE(info, ninfo);
			xfree(register_caddy);
			return SLURM_ERROR;
		}
	}

	/* wait for all registration actions to finish */
	while (1) {
		int exit_flag = 1;
		struct timespec ts;
		ts.tv_sec = 0;
		ts.tv_nsec = 100;

		for (i = 0; i < pmixp_info_tasks_loc() + 1; i++) {
			if (register_caddy[i].active) {
				exit_flag = 0;
			}
		}
		if (exit_flag) {
			break;
		}
		nanosleep(&ts, NULL);
	}
	PMIX_INFO_FREE(info, ninfo);
	xfree(register_caddy);

	return SLURM_SUCCESS;
}
Example #15
static void _progress_fan_in(pmixp_coll_t *coll)
{
	pmixp_srv_cmd_t type;
	const char *addr = pmixp_info_srv_addr();
	char *hostlist = NULL;
	int rc, is_p2p = 0;
	Buf root_buf;

	PMIXP_DEBUG("%s:%d: start, local=%d, child_cntr=%d",
			pmixp_info_namespace(), pmixp_info_nodeid(),
			coll->contrib_local, coll->contrib_cntr);

	/* lock the collective */
	slurm_mutex_lock(&coll->lock);

	pmixp_coll_sanity_check(coll);

	if (PMIXP_COLL_FAN_IN != coll->state) {
		/* In case of race condition between libpmix and
		 * slurm threads progress_fan_in can be called
		 * after we moved to the next step. */
		goto unlock;
	}

	if (!coll->contrib_local || coll->contrib_cntr != coll->children_cnt) {
		/* Not yet ready to go to the next step */
		goto unlock;
	}

	/* The root of the collective will have parent_host == NULL */
	if (NULL != coll->parent_host) {
		hostlist = xstrdup(coll->parent_host);
		type = PMIXP_MSG_FAN_IN;
		PMIXP_DEBUG("%s:%d: switch to PMIXP_COLL_FAN_OUT state",
			    pmixp_info_namespace(), pmixp_info_nodeid());
		is_p2p = 1;
	} else {
		if (0 < hostlist_count(coll->all_children)) {
			hostlist = hostlist_ranged_string_xmalloc(
					coll->all_children);
			type = PMIXP_MSG_FAN_OUT;
			pmixp_debug_hang(0);
		}
		rc = _copy_payload(coll->buf, coll->serv_offs, &root_buf);
		xassert(0 == rc);
		PMIXP_DEBUG("%s:%d: finish with this collective (I am the root)",
			    pmixp_info_namespace(), pmixp_info_nodeid());
	}

	PMIXP_DEBUG("%s:%d: send data to %s", pmixp_info_namespace(),
			pmixp_info_nodeid(), hostlist);

	/* Check for the singleton case */
	if (NULL != hostlist) {
		if (0 == coll->seq && NULL != coll->parent_host) {
			/* This is the first message sent to the parent.
			 * There might be a race condition where parent
			 * is not ready to receive the messages.
			 * Use zero-size message to check parent status first
			 * and then send the full message.
			 */
			pmixp_server_health_chk(hostlist, addr);
		}
		rc = pmixp_server_send(hostlist, type, coll->seq, addr,
				get_buf_data(coll->buf),
				get_buf_offset(coll->buf), is_p2p);

		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Cannot send data (size = %lu), to hostlist:\n%s",
				    (uint64_t) get_buf_offset(coll->buf),
				    hostlist);
			/* return an error indication to PMIx. Nodes that
			 * haven't received the data will exit by a timeout.
			 * FIXME: do we need to do something with
			 * successfully finished nodes?
			 */
			goto unlock;
		}
	}

	/* transit to the next state */
	_fan_in_finished(coll);

	/* if we are root - push data to PMIx here.
	 * Originally there was a homogeneous solution: the root nodename
	 * was in the hostlist. However this may lead to undesired side
	 * effects: we are blocked here sending data and cannot receive
	 * (receiving will be triggered in this thread only after we
	 * leave this callback), so we would have to rely on buffering
	 * on the SLURM side. Better not to do so. */
	if (NULL == coll->parent_host) {
		/* if I am the root - pass the data to PMIx and reset collective here */
		/* copy payload excluding reserved server header */
		_progres_fan_out(coll, root_buf);
	}

unlock:
	if (NULL != hostlist) {
		xfree(hostlist);
	}

	/* unlock the collective */
	slurm_mutex_unlock(&coll->lock);
}