Пример #1
0
/*
 * Send a one-byte health-check message to a step daemon.
 *
 * The message consists of a packed send_header_t (type
 * PMIXP_MSG_HEALTH_CHK, msgsize 1, seq 0) followed by a single '\n'
 * payload byte.
 *
 * hostlist - passed through to pmixp_stepd_send() as the destination
 * addr     - passed through to pmixp_stepd_send() as the address
 *
 * Returns the status reported by pmixp_stepd_send(); an error is logged
 * when it is not SLURM_SUCCESS.
 */
int pmixp_server_health_chk(char *hostlist, const char *addr)
{
	send_header_t hdr;
	char nhdr[sizeof(send_header_t)];
	size_t hsize;
	Buf buf = pmixp_server_new_buf();
	char *data;
	int rc;

	hdr.magic = PMIX_SERVER_MSG_MAGIC;
	hdr.type = PMIXP_MSG_HEALTH_CHK;
	/* the payload is the single byte packed below */
	hdr.msgsize = 1;
	hdr.seq = 0;
	/* Store global nodeid that is
	 *  independent from exact collective */
	hdr.nodeid = pmixp_info_nodeid_job();
	hsize = _send_pack_hdr(&hdr, nhdr);
	/* place the packed header at the very beginning of the buffer */
	memcpy(get_buf_data(buf), nhdr, hsize);

	grow_buf(buf, sizeof(char));
	pack8('\n', buf);

	/*
	 * Fetch the data pointer only now: growing/packing may have moved
	 * the underlying storage, so a pointer taken earlier could be stale.
	 */
	data = get_buf_data(buf);

	/* NOTE(review): the trailing 4, 14, 1 arguments are presumably
	 * retry/timeout/silence settings - confirm against pmixp_stepd_send() */
	rc = pmixp_stepd_send(hostlist, addr, data, get_buf_offset(buf), 4, 14, 1);
	if (SLURM_SUCCESS != rc) {
		PMIXP_ERROR("Was unable to wait for the parent %s to become alive on addr %s",
			    hostlist, addr);
	}

	/*
	 * Release the message buffer on every path. Assumes the send routine
	 * does not keep `data` after it returns, as the other senders in this
	 * file free their buffer right after sending.
	 */
	free_buf(buf);

	return rc;
}
Пример #2
0
/*
 * Callback that ships a direct modex response back to the requester.
 *
 * status - status value placed into the response header
 * data   - blob to be packed into the response
 * sz     - size of `data` in bytes
 * cbdata - dmdx_caddy_t describing the original request (requested proc,
 *          sender host/namespace and sequence number); it is released
 *          here via _dmdx_free_caddy()
 *
 * A send failure is only logged.
 */
static void _dmdx_pmix_cb(pmix_status_t status, char *data, size_t sz,
		void *cbdata)
{
	dmdx_caddy_t *req_info = (dmdx_caddy_t *)cbdata;
	Buf reply = pmixp_server_new_buf();
	char *usock;
	int send_rc;

	/* header first, then the payload blob */
	_setup_header(reply, DMDX_RESPONSE, req_info->proc.nspace,
			req_info->proc.rank, status);
	packmem(data, sz, reply);

	/* address for the sender's namespace */
	usock = pmixp_info_nspace_usock(req_info->sender_ns);

	/* reply carries the sequence number taken from the caddy */
	send_rc = pmixp_server_send(req_info->sender_host, PMIXP_MSG_DMDX,
			req_info->seq_num, usock, get_buf_data(reply),
			get_buf_offset(reply), 1);
	if (send_rc != SLURM_SUCCESS) {
		/* nothing more to do on our side: the caller is expected to
		 * notice the missing reply by timeout */
		PMIXP_ERROR("Cannot send direct modex response to %s",
				req_info->sender_host);
	}

	/* release everything acquired above plus the caddy itself */
	xfree(usock);
	free_buf(reply);
	_dmdx_free_caddy(req_info);
}
Пример #3
0
/*
 * Issue a direct modex request for (nspace, rank).
 *
 * The target host is resolved from the namespace/rank pair, the request is
 * recorded in _dmdx_requests together with the caller's callback, and a
 * DMDX_REQUEST message is sent to that host.
 *
 * nspace - namespace of the requested process
 * rank   - rank of the requested process
 * cbfunc - callback stored with the tracked request; it is also called
 *          right away with PMIX_ERROR if the message cannot be sent
 * cbdata - opaque pointer handed back to cbfunc
 *
 * Returns SLURM_SUCCESS when the request was sent, SLURM_ERROR when the
 * host cannot be resolved or the send fails.
 *
 * NOTE(review): on a send failure the request stays in _dmdx_requests even
 * though cbfunc has already been invoked - confirm that whatever expires
 * tracked requests cannot invoke it a second time.
 */
int pmixp_dmdx_get(const char *nspace, int rank,
		   pmix_modex_cbfunc_t cbfunc, void *cbdata)
{
	dmdx_req_info_t *tracked;
	char *usock, *host;
	Buf msg;
	uint32_t cur_seq;
	int send_rc;

	/* find out which host the request has to go to */
	host = pmixp_nspace_resolve(nspace, rank);
	xassert(NULL != host);
	if (!host)
		return SLURM_ERROR;

	/* build the request message */
	msg = pmixp_server_new_buf();
	_setup_header(msg, DMDX_REQUEST, nspace, rank, SLURM_SUCCESS);

	/* namespace usocket name */
	usock = pmixp_info_nspace_usock(nspace);

	/* take the current sequence number and advance the counter */
	cur_seq = _dmdx_seq_num;
	_dmdx_seq_num++;

	/* remember the request before it is sent */
	tracked = xmalloc(sizeof(dmdx_req_info_t));
	tracked->cbdata = cbdata;
	tracked->cbfunc = cbfunc;
	tracked->seq_num = cur_seq;
	tracked->ts = time(NULL);
#ifndef NDEBUG
	strncpy(tracked->nspace, nspace, PMIX_MAX_NSLEN);
	tracked->rank = rank;
#endif
	list_append(_dmdx_requests, tracked);

	send_rc = pmixp_server_send(host, PMIXP_MSG_DMDX, cur_seq, usock,
			get_buf_data(msg), get_buf_offset(msg), 1);

	/* the message and the address are no longer needed either way */
	xfree(usock);
	free_buf(msg);

	if (send_rc == SLURM_SUCCESS)
		return send_rc;

	/* delivery failed: report it and notify the requester immediately */
	PMIXP_ERROR("Cannot send direct modex request to %s", host);
	cbfunc(PMIX_ERROR, NULL, 0, cbdata, NULL, NULL);
	return SLURM_ERROR;
}
Пример #4
0
/*
 * Based on ideas provided by Hongjia Cao <*****@*****.**> in PMI2 plugin
 *
 * Initialize a collective descriptor.
 *
 * Copies the participating process ranges into `coll`, derives the host
 * list from them and positions this node in the reverse tree: the parent
 * host name, the hostlist of direct children and, for the root only, the
 * hostlist of all other participants. Finally the collective buffer is
 * created and the ranges are packed into it.
 *
 * coll   - descriptor to fill in
 * procs  - array of process ranges taking part in the collective
 * nprocs - number of elements in `procs`
 * type   - collective type stored in the descriptor
 *
 * Returns SLURM_SUCCESS, or SLURM_ERROR when the ranges cannot be turned
 * into a host list, when the local host is not found in that list, or when
 * the ranges cannot be packed. On failure, members already assigned in
 * `coll` (e.g. coll->procs) are left as they are; nothing is released here
 * except the temporary hostlist in the "host not found" case.
 */
int pmixp_coll_init(pmixp_coll_t *coll, const pmix_proc_t *procs,
		size_t nprocs, pmixp_coll_type_t type)
{
	hostlist_t hl;
	uint32_t nodeid = 0, nodes = 0;
	int parent_id, depth, max_depth, tmp;
	int width, my_nspace = -1;
	int my_idx;
	char *p;
	int i, *ch_nodeids = NULL;

#ifndef NDEBUG
	coll->magic = PMIXP_COLL_STATE_MAGIC;
#endif
	coll->type = type;
	coll->state = PMIXP_COLL_SYNC;
	/* keep a private copy of the ranges */
	coll->procs = xmalloc(sizeof(*procs) * nprocs);
	memcpy(coll->procs, procs, sizeof(*procs) * nprocs);
	coll->nprocs = nprocs;
	coll->my_nspace = my_nspace;

	if (SLURM_SUCCESS != _hostset_from_ranges(procs, nprocs, &hl)) {
		/* TODO: provide ranges output routine */
		PMIXP_ERROR("Bad ranges information");
		goto err_exit;
	}

	width = slurm_get_tree_width();
	nodes = hostlist_count(hl);

	/*
	 * Look up our own position in a signed variable first: a negative
	 * "not found" result must not be stored into the unsigned nodeid,
	 * where it would turn into a bogus node index.
	 */
	my_idx = hostlist_find(hl, pmixp_info_hostname());
	if (my_idx < 0) {
		PMIXP_ERROR("Cannot find local host %s in the collective hostlist",
			    pmixp_info_hostname());
		/* hl is still owned by this function at this point */
		hostlist_destroy(hl);
		goto err_exit;
	}
	nodeid = my_idx;

	/* tmp receives a children count that is not used: only the number of
	 * direct children (computed below) is stored in the descriptor */
	reverse_tree_info(nodeid, nodes, width, &parent_id, &tmp, &depth,
			&max_depth);
	coll->nodeid = nodeid;

	/* We are interested in the amount of direct children */
	coll->seq = 0;
	coll->contrib_cntr = 0;
	coll->contrib_local = false;
	ch_nodeids = xmalloc(sizeof(int) * width);
	coll->ch_contribs = xmalloc(sizeof(int) * width);
	coll->children_cnt = reverse_tree_direct_children(nodeid, nodes, width,
			depth, ch_nodeids);

	/* create the hostlist with the direct children's hostnames */
	coll->ch_hosts = hostlist_create("");
	for (i = 0; i < coll->children_cnt; i++) {
		char *hname = hostlist_nth(hl, ch_nodeids[i]);
		hostlist_push(coll->ch_hosts, hname);
		/* the name returned by hostlist_nth() belongs to us (the
		 * parent branch below frees it the same way) */
		free(hname);
	}
	/* just in case, shouldn't be needed */
	hostlist_uniq(coll->ch_hosts);
	xfree(ch_nodeids);

	if (parent_id == -1) {
		/* if we are the root of the tree:
		 * - we don't have a parent;
		 * - we keep the list of all participants except ourselves
		 *   (hl is handed over to the descriptor)
		 */
		coll->parent_host = NULL;
		hostlist_delete_host(hl, pmixp_info_hostname());
		coll->all_children = hl;
	} else if (parent_id >= 0) {
		/* for all other nodes in the tree we need to know:
		 * - nodename of our parent;
		 * - we don't need the list of all children and hl anymore
		 */
		p = hostlist_nth(hl, parent_id);
		coll->parent_host = xstrdup(p);
		/* use empty hostlist here */
		coll->all_children = hostlist_create("");
		free(p);
		hostlist_destroy(hl);
	}
	/* NOTE(review): a parent_id below -1 matches neither branch and would
	 * leave parent_host/all_children unset - confirm reverse_tree_info()
	 * never reports such a value. */

	/* Collective data */
	coll->buf = pmixp_server_new_buf();
	/* remember the buffer offset right after creation */
	coll->serv_offs = get_buf_offset(coll->buf);

	if (SLURM_SUCCESS != _pack_ranges(coll)) {
		PMIXP_ERROR("Cannot pack ranges to coll message header!");
		goto err_exit;
	}

	/* Callback information */
	coll->cbdata = NULL;
	coll->cbfunc = NULL;

	/* init fine grained lock */
	slurm_mutex_init(&coll->lock);

	return SLURM_SUCCESS;
err_exit:
	return SLURM_ERROR;
}