Example #1
0
/*
 * Validate the header of an incoming ring-collective contribution.
 *
 * Two checks are made:
 *  1. the sender (hdr->nodeid) must be the ring predecessor reported by
 *     _ring_prev_id(coll);
 *  2. the sequence number (hdr->seq) must be accepted by pmixp_coll_check().
 *
 * Returns SLURM_ERROR when the sender is unexpected or when
 * pmixp_coll_check() reports PMIXP_COLL_REQ_SKIP. Returns SLURM_SUCCESS
 * otherwise, including the PMIXP_COLL_REQ_FAILURE case, in which the job
 * step has already been sent SIGKILL.
 *
 * Host name strings obtained from pmixp_info_job_host() are released with
 * xfree() on every path that requests one.
 */
int pmixp_coll_ring_check(pmixp_coll_t *coll, pmixp_coll_ring_msg_hdr_t *hdr)
{
	char *nodename = NULL;
	int rc;

	if (hdr->nodeid != _ring_prev_id(coll)) {
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("%p: unexpected contrib from %s:%u, expected is %d",
			    coll, nodename, hdr->nodeid, _ring_prev_id(coll));
		/* the host name is only needed for the message above */
		xfree(nodename);
		return SLURM_ERROR;
	}
	rc = pmixp_coll_check(coll, hdr->seq);
	if (PMIXP_COLL_REQ_FAILURE == rc) {
		/* this is an unacceptable event: either something went
		 * really wrong or the state machine is incorrect.
		 * This will 100% lead to application hang.
		 */
		nodename = pmixp_info_job_host(hdr->nodeid);
		PMIXP_ERROR("Bad collective seq. #%d from %s:%u, current is %d",
			    hdr->seq, nodename, hdr->nodeid, coll->seq);
		pmixp_debug_hang(0); /* enable hang to debug this! */
		slurm_kill_job_step(pmixp_info_jobid(),
				    pmixp_info_stepid(), SIGKILL);
		xfree(nodename);
		return SLURM_SUCCESS;
	} else if (PMIXP_COLL_REQ_SKIP == rc) {
#ifdef PMIXP_COLL_DEBUG
		/* The message reports the node id only, so no host name
		 * string is looked up (and none has to be released).
		 */
		PMIXP_ERROR("Wrong collective seq. #%d from nodeid %u, current is %d, skip this message",
			    hdr->seq, hdr->nodeid, coll->seq);
#endif
		return SLURM_ERROR;
	}
	return SLURM_SUCCESS;
}
Example #2
0
/*
 * Error callback: logs the reported status together with the number of
 * entries in proc[], then sends SIGKILL to the current job step.
 *
 * The contents of proc[] and info[] are not examined.
 */
static void errhandler(pmix_status_t status,
		       pmix_proc_t proc[], size_t nproc,
		       pmix_info_t info[], size_t ninfo)
{
	/* FIXME: use proper specificator for nranges */
	int nranges = (int) nproc;

	(void) proc;
	(void) info;
	(void) ninfo;

	/* TODO: do something more sophisticated here */
	PMIXP_ERROR_STD("Error handler invoked: status = %d, nranges = %d",
			status, nranges);

	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
			    SIGKILL);
}
Example #3
0
/*
 * Abort callback: logs the request, sends SIGKILL to the current job step
 * and, when a completion callback was supplied, reports PMIX_SUCCESS
 * through it.
 *
 * proc, server_object, procs and nprocs are not examined: the whole step
 * is killed regardless of which processes were named.
 *
 * Always returns PMIX_SUCCESS.
 */
static pmix_status_t abort_fn(const pmix_proc_t *proc, void *server_object,
			      int status, const char msg[], pmix_proc_t procs[],
			      size_t nprocs, pmix_op_cbfunc_t cbfunc, void *cbdata)
{
	(void) proc;
	(void) server_object;
	(void) procs;
	(void) nprocs;

	/* Just kill this stepid for now. Think what we can do for FT here? */
	/* Passing a NULL pointer for "%s" is undefined behaviour, so a
	 * missing abort message is replaced by a placeholder before logging.
	 */
	PMIXP_DEBUG("called: status = %d, msg = %s", status,
		    (NULL != msg) ? msg : "(null)");
	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(), SIGKILL);

	if (NULL != cbfunc) {
		cbfunc(PMIX_SUCCESS, cbdata);
	}
	return PMIX_SUCCESS;
}
Example #4
0
/*
 * Event-style error callback: logs the reported status and sends SIGKILL
 * to the current job step.
 *
 * Only `status` is used; every other argument is ignored.
 * NOTE(review): cbfunc is never invoked here - confirm whether the PMIx
 * library expects the handler to call it.
 */
static void _errhandler(size_t evhdlr_registration_id,
			pmix_status_t status,
			const pmix_proc_t *source,
			pmix_info_t info[], size_t ninfo,
			pmix_info_t *results, size_t nresults,
			pmix_event_notification_cbfunc_fn_t cbfunc,
			void *cbdata)
{
	(void) evhdlr_registration_id;
	(void) source;
	(void) info;
	(void) ninfo;
	(void) results;
	(void) nresults;
	(void) cbfunc;
	(void) cbdata;

	/* TODO: do something more sophisticated here */
	PMIXP_ERROR_STD("Error handler invoked: status = %d", status);

	slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
			    SIGKILL);
}
Example #5
0
/*
 * Populate the file-level _pmixp_job_info record from a step record.
 *
 * The record is zeroed first, then filled with the uid/gid, the job/step/
 * node identifiers, the task and node counts, a copy of the per-node task
 * counts and the global task ids of the tasks on this node. After that
 * _resources_set() and _env_set() are run (in that order) and finally the
 * namespace string "slurm.pmix.<jobid>.<stepid>" is generated.
 *
 * job - step record to read from
 * env - environment passed on to _resources_set() and _env_set()
 *
 * Returns SLURM_SUCCESS, or the non-zero code of whichever of the two
 * helpers failed first (nothing set up before the failure is undone).
 */
int pmixp_info_set(const stepd_step_rec_t *job, char ***env)
{
	int idx, rc;
	size_t nbytes;

	memset(&_pmixp_job_info, 0, sizeof(_pmixp_job_info));
#ifndef NDEBUG
	_pmixp_job_info.magic = PMIX_INFO_MAGIC;
#endif

	/* Credentials of the step owner */
	_pmixp_job_info.uid = job->uid;
	_pmixp_job_info.gid = job->gid;

	/* Identification of this node within the step */
	_pmixp_job_info.jobid = job->jobid;
	_pmixp_job_info.stepid = job->stepid;
	_pmixp_job_info.node_id = job->nodeid;
	_pmixp_job_info.node_tasks = job->node_tasks;

	/* Step-wide counters */
	_pmixp_job_info.ntasks = job->ntasks;
	_pmixp_job_info.nnodes = job->nnodes;

	/* Private copy of the per-node task counts */
	nbytes = job->nnodes * sizeof(*_pmixp_job_info.task_cnts);
	_pmixp_job_info.task_cnts = xmalloc(nbytes);
	for (idx = 0; idx < job->nnodes; ++idx)
		_pmixp_job_info.task_cnts[idx] = job->task_cnts[idx];

	/* Global task ids of the tasks running on this node */
	nbytes = _pmixp_job_info.node_tasks * sizeof(uint32_t);
	_pmixp_job_info.gtids = xmalloc(nbytes);
	for (idx = 0; idx < job->node_tasks; ++idx)
		_pmixp_job_info.gtids[idx] = job->task[idx]->gtid;

	/* Setup hostnames and job-wide info */
	rc = _resources_set(env);
	if (rc)
		return rc;

	rc = _env_set(env);
	if (rc)
		return rc;

	snprintf(_pmixp_job_info.nspace, PMIX_MAX_NSLEN, "slurm.pmix.%d.%d",
		 pmixp_info_jobid(), pmixp_info_stepid());

	return SLURM_SUCCESS;
}
Example #6
0
/*
 * Dispatch one server-to-server message according to hdr->type.
 *
 * hdr - base header of the message (type, sender node id, sequence number)
 * buf - message payload
 *
 * Every path ends at the `exit` label, where free_buf() is called on `buf`.
 * In the DMDX case `buf` is first replaced by an empty buffer, so the
 * payload handed to pmixp_dmdx_process() is not released here.
 */
static void _process_server_request(pmixp_base_hdr_t *hdr, Buf buf)
{
	int rc;

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmixp_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;
		int c_nodeid;

		/* Extract the collective description from the payload */
		rc = pmixp_coll_unpack_info(buf, &type, &c_nodeid,
					    &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad message header from node %s",
				    nodename);
			xfree(nodename);
			goto exit;
		}
		/* Look up the collective; procs is released right after */
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from nodeid = %u, "
			    "type = %s, seq = %d",
			    hdr->nodeid,
			    ((PMIXP_MSG_FAN_IN == hdr->type) ?
				     "fan-in" : "fan-out"),
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something went
			 * really wrong or the state machine is incorrect.
			 * This will 100% lead to application hang.
			 */
			char *nodename = pmixp_info_job_host(hdr->nodeid);
			PMIXP_ERROR("Bad collective seq. #%d from %s, current"
				    " is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(),
					    pmixp_info_stepid(), SIGKILL);
			xfree(nodename);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			/* The message is dropped without contributing */
			PMIXP_DEBUG("Wrong collective seq. #%d from"
				    " nodeid %u, current is %d, skip "
				    "this message",
				    hdr->seq, hdr->nodeid, coll->seq);
			goto exit;
		}

		/* Fan-in goes to the child handler, fan-out to the parent one */
		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_child(coll, hdr->nodeid,
						 hdr->seq, buf);
		} else {
			pmixp_coll_contrib_parent(coll, hdr->nodeid,
						  hdr->seq, buf);
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		pmixp_dmdx_process(buf, hdr->nodeid, hdr->seq);
		/* buf will be free'd by the PMIx callback so
		 * protect the data by voiding the buffer.
		 * Use the statement below instead of (buf = NULL)
		 * to maintain encapsulation - in general `buf` is
		 * not a pointer, but an opaque type.
		 */
		buf = create_buf(NULL, 0);
		break;
	}
	case PMIXP_MSG_INIT_DIRECT:
		/* Nothing to do besides logging */
		PMIXP_DEBUG("Direct connection init from %d", hdr->nodeid);
		break;
#ifndef NDEBUG
	case PMIXP_MSG_PINGPONG: {
		/* if the pingpong mode was activated -
		 * node 0 sends ping requests
		 * and receiver assumed to respond back to node 0
		 */
		int msize = remaining_buf(buf);

		if (pmixp_info_nodeid()) {
			/* Non-zero node id: answer node 0 with the same size */
			pmixp_server_pp_send(0, msize);
		} else {
			if (pmixp_server_pp_same_thread()) {
				/* Start once the warm-up count is reached */
				if (pmixp_server_pp_count() ==
				    pmixp_server_pp_warmups()) {
					pmixp_server_pp_start();
				}
				/* Send the next ping to node 1 unless finished */
				if (!pmixp_server_pp_check_fini(msize)) {
					pmixp_server_pp_send(1, msize);
				}
			}
		}
		pmixp_server_pp_inc();
		break;
	}
#endif
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		break;
	}

exit:
	/* Common release point for the (possibly replaced) buffer */
	free_buf(buf);
}
Example #7
0
/*
 * Fill the environment-driven fields of _pmixp_job_info from the job
 * environment `*env`:
 *  - lib_tmpdir: "<base>/pmix.<jobid>.<stepid>/", where <base> is the
 *    process variable PMIXP_OS_TMPDIR_ENV (overridden beforehand by
 *    PMIXP_TMPDIR_SRV from the job environment when present), or
 *    PMIXP_TMPDIR_DEFAULT if that variable is unset;
 *  - cli_tmpdir: PMIXP_TMPDIR_CLI from the job environment, otherwise the
 *    value of slurm_get_tmp_fs() when it is non-NULL;
 *  - timeout: PMIXP_TIMEOUT_DEFAULT unless PMIXP_TIMEOUT parses to a
 *    positive integer.
 * The PMIx library debug setting is also forwarded into this process's
 * environment when the job requests it.
 *
 * Always returns SLURM_SUCCESS.
 */
static int _env_set(char ***env)
{
	char *val = NULL;
	char *base = NULL;
	int requested;

	/* ----------- Temp directories settings ------------- */

	/*
	 * FIXME: This is dangerous to set this from the user environment.
	 * It was used for debugging in linux containers.
	 * On real hardware each node has its own separate /tmp directory
	 */

	/* set server temp directory - change this process environment */
	val = getenvp(*env, PMIXP_TMPDIR_SRV);
	if (val)
		setenv(PMIXP_OS_TMPDIR_ENV, val, 1);

	base = getenv(PMIXP_OS_TMPDIR_ENV);
	if (!base)
		base = PMIXP_TMPDIR_DEFAULT;
	_pmixp_job_info.lib_tmpdir =
		xstrdup_printf("%s/pmix.%d.%d/", base,
			       pmixp_info_jobid(), pmixp_info_stepid());

	/* save client temp directory if requested
	 * TODO: We want to get TmpFS value as well if exists.
	 * Need to sync with SLURM developers.
	 */
	val = getenvp(*env, PMIXP_TMPDIR_CLI);
	if (val) {
		_pmixp_job_info.cli_tmpdir = xstrdup(val);
	} else {
		char *tmpfs = slurm_get_tmp_fs();

		/* the returned string is stored as is, without copying */
		if (tmpfs)
			_pmixp_job_info.cli_tmpdir = tmpfs;
	}

	/* ----------- Timeout setting ------------- */
	/* TODO: also would be nice to have a cluster-wide setting in SLURM */
	_pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT;
	val = getenvp(*env, PMIXP_TIMEOUT);
	if (val) {
		/* only positive values replace the default */
		requested = atoi(val);
		if (requested > 0)
			_pmixp_job_info.timeout = requested;
	}

	/* ----------- Forward PMIX settings ------------- */
	/* FIXME: this may be intrusive as well as PMIx library will create
	 * lots of output files in /tmp by default.
	 * somebody can use this for annoyance */
	val = getenvp(*env, PMIXP_PMIXLIB_DEBUG);
	if (val) {
		setenv(PMIXP_PMIXLIB_DEBUG, val, 1);
		/* output into the file since we are in slurmstepd
		 * and stdout is muted.
		 * One needs to check TMPDIR for the results */
		setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1);
	}

	return SLURM_SUCCESS;
}
/*
 * Dispatch one server-to-server message according to its type.
 *
 * _hdr    - receive header; the embedded send header carries the message
 *           type, sender node id, payload size and sequence number
 * payload - raw message body; it is wrapped into a Buf by create_buf()
 *
 * Resource handling:
 *  - the sender's host name is looked up once and released with xfree()
 *    on every path, including the error paths;
 *  - the buffer is released with free_buf() on every path that does not
 *    hand it over to pmixp_coll_bcast() or pmixp_dmdx_process().
 */
static void _process_server_request(recv_header_t *_hdr, void *payload)
{
	send_header_t *hdr = &_hdr->send_hdr;
	char *nodename = pmixp_info_job_host(hdr->nodeid);
	Buf buf;
	int rc;

	buf = create_buf(payload, hdr->msgsize);

	switch (hdr->type) {
	case PMIXP_MSG_FAN_IN:
	case PMIXP_MSG_FAN_OUT: {
		pmixp_coll_t *coll;
		pmix_proc_t *procs = NULL;
		size_t nprocs = 0;
		pmixp_coll_type_t type = 0;

		rc = pmixp_coll_unpack_ranges(buf, &type, &procs, &nprocs);
		if (SLURM_SUCCESS != rc) {
			PMIXP_ERROR("Bad message header from node %s", nodename);
			/* Leave through the common exit so that nodename is
			 * released; the buffer is of no further use either.
			 */
			free_buf(buf);
			break;
		}
		coll = pmixp_state_coll_get(type, procs, nprocs);
		xfree(procs);

		PMIXP_DEBUG("FENCE collective message from node \"%s\", type = %s, seq = %d",
			    nodename, (PMIXP_MSG_FAN_IN == hdr->type) ? "fan-in" : "fan-out",
			    hdr->seq);
		rc = pmixp_coll_check_seq(coll, hdr->seq, nodename);
		if (PMIXP_COLL_REQ_FAILURE == rc) {
			/* this is an unacceptable event: either something went
			 * really wrong or the state machine is incorrect.
			 * This will 100% lead to application hang.
			 */
			PMIXP_ERROR("Bad collective seq. #%d from %s, current is %d",
				    hdr->seq, nodename, coll->seq);
			pmixp_debug_hang(0); /* enable hang to debug this! */
			slurm_kill_job_step(pmixp_info_jobid(), pmixp_info_stepid(),
					    SIGKILL);
			/* nobody else has taken the buffer over */
			free_buf(buf);
			break;
		} else if (PMIXP_COLL_REQ_SKIP == rc) {
			PMIXP_DEBUG("Wrong collective seq. #%d from %s, current is %d, skip this message",
				    hdr->seq, nodename, coll->seq);
			free_buf(buf);
			break;
		}

		if (PMIXP_MSG_FAN_IN == hdr->type) {
			pmixp_coll_contrib_node(coll, nodename, buf);
			/* we don't need this buffer anymore */
			free_buf(buf);
		} else {
			pmixp_coll_bcast(coll, buf);
			/* buf will be free'd by the PMIx callback */
		}

		break;
	}
	case PMIXP_MSG_DMDX: {
		/* the buffer is handed over and is not released here */
		pmixp_dmdx_process(buf, nodename, hdr->seq);
		break;
	}
	case PMIXP_MSG_HEALTH_CHK: {
		/* this is just health ping.
		 * TODO: can we do something more sophisticated?
		 */
		free_buf(buf);
		break;
	}
	default:
		PMIXP_ERROR("Unknown message type %d", hdr->type);
		/* unknown messages are dropped together with their payload */
		free_buf(buf);
		break;
	}
	xfree(nodename);
}
Example #9
0
/*
 * Fill the path, timeout and debug related fields of _pmixp_job_info.
 *
 * Requires _pmixp_job_info.hostname to be set already (asserted).
 *
 * Paths are derived in this order, each step building on the previous one:
 *  1. server_addr_unfmt <- slurm_get_slurmd_spooldir(NULL)
 *  2. lib_tmpdir        <- server_addr_unfmt expanded for this host
 *  3. server_addr_unfmt gets "/stepd.slurm.pmix.<jobid>.<stepid>" appended
 *  4. spool_dir         <- copy of lib_tmpdir (before step 5)
 *  5. lib_tmpdir gets "/pmix.<jobid>.<stepid>/" appended
 *
 * cli_tmpdir_base is PMIXP_TMPDIR_CLI from the job environment `*env`, or
 * else the value of slurm_get_tmp_fs() for this host; cli_tmpdir is
 * "<cli_tmpdir_base>/spmix_appdir_<jobid>.<stepid>".
 *
 * Always returns SLURM_SUCCESS.
 */
static int _env_set(char ***env)
{
	char *val = NULL;
	int requested;

	xassert(_pmixp_job_info.hostname);

	/* Steps 1-5 above; the order of these statements matters */
	_pmixp_job_info.server_addr_unfmt = slurm_get_slurmd_spooldir(NULL);
	_pmixp_job_info.lib_tmpdir =
		slurm_conf_expand_slurmd_path(_pmixp_job_info.server_addr_unfmt,
					      _pmixp_job_info.hostname);
	xstrfmtcat(_pmixp_job_info.server_addr_unfmt, "/stepd.slurm.pmix.%d.%d",
		   pmixp_info_jobid(), pmixp_info_stepid());
	_pmixp_job_info.spool_dir = xstrdup(_pmixp_job_info.lib_tmpdir);

	/* ----------- Temp directories settings ------------- */
	xstrfmtcat(_pmixp_job_info.lib_tmpdir, "/pmix.%d.%d/",
		   pmixp_info_jobid(), pmixp_info_stepid());

	/* save client temp directory if requested
	 * TODO: We want to get TmpFS value as well if exists.
	 * Need to sync with SLURM developers.
	 */
	val = getenvp(*env, PMIXP_TMPDIR_CLI);
	if (val) {
		_pmixp_job_info.cli_tmpdir_base = xstrdup(val);
	} else {
		_pmixp_job_info.cli_tmpdir_base =
			slurm_get_tmp_fs(_pmixp_job_info.hostname);
	}

	_pmixp_job_info.cli_tmpdir = xstrdup_printf(
		"%s/spmix_appdir_%d.%d", _pmixp_job_info.cli_tmpdir_base,
		pmixp_info_jobid(), pmixp_info_stepid());

	/* ----------- Timeout setting ------------- */
	/* TODO: also would be nice to have a cluster-wide setting in SLURM */
	_pmixp_job_info.timeout = PMIXP_TIMEOUT_DEFAULT;
	val = getenvp(*env, PMIXP_TIMEOUT);
	if (val) {
		/* only positive values replace the default */
		requested = atoi(val);
		if (requested > 0)
			_pmixp_job_info.timeout = requested;
	}

	/* ----------- Forward PMIX settings ------------- */
	/* FIXME: this may be intrusive as well as PMIx library will create
	 * lots of output files in /tmp by default.
	 * somebody can use this for annoyance */
	val = getenvp(*env, PMIXP_PMIXLIB_DEBUG);
	if (val) {
		setenv(PMIXP_PMIXLIB_DEBUG, val, 1);
		/* output into the file since we are in slurmstepd
		 * and stdout is muted.
		 * One needs to check TMPDIR for the results */
		setenv(PMIXP_PMIXLIB_DEBUG_REDIR, "file", 1);
	}

	return SLURM_SUCCESS;
}
Example #10
0
/*
 * information about relative ranks as assigned by the RM
 *
 * Appends to `lresp`:
 *  - one PMIX_JOBID key holding "<jobid>.<stepid>",
 *  - one PMIX_NODEID key holding the node id of the local namespace,
 *  - one PMIX_PROC_DATA key per task index in [0, pmixp_info_tasks()),
 *    whose value is an info array with that task's rank attributes.
 */
static void _set_procdatas(List lresp)
{
	pmixp_namespace_t *nsptr = pmixp_nspaces_local();
	pmix_info_t *kvp, *tkvp;
	char *p = NULL;
	int i;

	/* (char*) jobid assigned by scheduler */
	xstrfmtcat(p, "%d.%d", pmixp_info_jobid(), pmixp_info_stepid());
	PMIXP_ALLOC_KEY(kvp, PMIX_JOBID);
	PMIX_VAL_SET(&kvp->value, string, p);
	/* p is released right after being set - presumably PMIX_VAL_SET
	 * stores its own copy of the string; TODO confirm
	 */
	xfree(p);
	list_append(lresp, kvp);

	PMIXP_ALLOC_KEY(kvp, PMIX_NODEID);
	PMIX_VAL_SET(&kvp->value, uint32_t, nsptr->node_id);
	list_append(lresp, kvp);

	/* store information about every task; local-only keys are added
	 * below for tasks that have a local id on this node
	 */
	for (i = 0; i < pmixp_info_tasks(); i++) {
		List rankinfo;
		ListIterator it;
		int count, j, localid, nodeid;
		char *nodename;
		pmix_info_t *info;

		/* temporary list collecting this task's keys */
		rankinfo = list_create(pmixp_xfree_xmalloced);

		PMIXP_ALLOC_KEY(kvp, PMIX_RANK);
		PMIX_VAL_SET(&kvp->value, int, i);
		list_append(rankinfo, kvp);

		/* TODO: always use 0 so far. this is not the general case though
		 * (see SLURM MIMD: man srun, section MULTIPLE PROGRAM CONFIGURATION)
		 */
		PMIXP_ALLOC_KEY(kvp, PMIX_APPNUM);
		PMIX_VAL_SET(&kvp->value, int, 0);
		list_append(rankinfo, kvp);

		/* TODO: the same as for previous here */
		PMIXP_ALLOC_KEY(kvp, PMIX_APPLDR);
		PMIX_VAL_SET(&kvp->value, int, 0);
		list_append(rankinfo, kvp);

		/* TODO: fix when several apps will appear */
		PMIXP_ALLOC_KEY(kvp, PMIX_GLOBAL_RANK);
		PMIX_VAL_SET(&kvp->value, uint32_t, i);
		list_append(rankinfo, kvp);

		/* TODO: fix when several apps will appear */
		PMIXP_ALLOC_KEY(kvp, PMIX_APP_RANK);
		PMIX_VAL_SET(&kvp->value, uint32_t, i);
		list_append(rankinfo, kvp);

		localid = pmixp_info_taskid2localid(i);
		/* this rank is local, store local info about it! */
		if (0 <= localid) {
			PMIXP_ALLOC_KEY(kvp, PMIX_LOCAL_RANK);
			PMIX_VAL_SET(&kvp->value, uint16_t, localid);
			list_append(rankinfo, kvp);

			/* TODO: fix when several apps will appear */
			PMIXP_ALLOC_KEY(kvp, PMIX_NODE_RANK);
			PMIX_VAL_SET(&kvp->value, uint16_t, localid);
			list_append(rankinfo, kvp);
		}

		/* host name of the node this task is mapped to */
		nodeid = nsptr->task_map[i];
		nodename = hostlist_nth(nsptr->hl, nodeid);
		PMIXP_ALLOC_KEY(kvp, PMIX_HOSTNAME);
		PMIX_VAL_SET(&kvp->value, string, nodename);
		list_append(rankinfo, kvp);
		/* released with free(), not xfree() */
		free(nodename);

		/* merge rankinfo into one PMIX_PROC_DATA key */
		count = list_count(rankinfo);
		PMIXP_ALLOC_KEY(kvp, PMIX_PROC_DATA);
		kvp->value.type = PMIX_INFO_ARRAY;
		kvp->value.data.array.size = count;
		PMIX_INFO_CREATE(info, count);
		it = list_iterator_create(rankinfo);
		j = 0;
		while (NULL != (tkvp = list_next(it))) {
			/* Just copy all the fields here. We will free original kvp's
			 * using list_destroy without free'ing their fields so it is
			 * safe to do so.
			 */
			info[j] = *tkvp;
			j++;
		}
		/* NOTE(review): `it` is not destroyed explicitly - presumably
		 * list_destroy() also releases iterators; confirm
		 */
		list_destroy(rankinfo);

		/* the array is now referenced by kvp; drop the local reference */
		kvp->value.data.array.array = (pmix_info_t *)info;
		info = NULL;

		/* put the complex key to the list */
		list_append(lresp, kvp);
	}
}