Exemple #1
0
/*
 * Open the PSM endpoint (pspsm_ep / pspsm_epid) and allocate the send
 * bounce buffer (sendbuf) of pscom.env.readahead bytes.
 *
 * Does nothing when pspsm_ep is already set.
 *
 * Returns 0 on success (or if the endpoint is already open) and -1 on
 * error. On error the reason is recorded through pspsm_err() and sendbuf
 * is left untouched.
 */
static
int pspsm_open_endpoint(void)
{
	psm_error_t ret;

	if (!pspsm_ep){
		struct psm_ep_open_opts opts;
		void *buf;

		ret = psm_ep_open_opts_get_defaults(&opts);
		if (ret != PSM_OK) goto err;

		/* Allocate before opening the endpoint: an allocation failure
		   must never leave an open endpoint without a send buffer.
		   A NULL result is only an error for a non-zero request. */
		buf = malloc(pscom.env.readahead);
		if (!buf && pscom.env.readahead) goto err_nomem;

		ret = psm_ep_open(pspsm_uuid.as_uuid, &opts,
				  &pspsm_ep, &pspsm_epid);
		if (ret != PSM_OK) {
			free(buf);
			goto err;
		}

		sendbuf = buf;

		pspsm_dprint(2, "pspsm_open_endpoint: OK");
	}
	return 0;

 err_nomem:
	pspsm_err("out of memory");
	pspsm_dprint(1, "pspsm_open_endpoint: %s", pspsm_err_str);
	return -1;

 err:
	pspsm_err(psm_error_get_string(ret));
	pspsm_dprint(1, "pspsm_open_endpoint: %s", pspsm_err_str);
	return -1;
}
Exemple #2
0
/*
 * Connect con_info to the peer described by info_msg.
 *
 * The peer's protocol version is compared against PSPSM_PROTOCOL_VERSION
 * first; on mismatch no connect is attempted. Otherwise psm_ep_connect()
 * is called for the single peer epid and con_info->send_id is set to the
 * peer's id (this happens even if the connect call reports an error).
 *
 * Returns 0 on success, -1 on protocol mismatch or connect failure; the
 * reason is recorded through pspsm_err().
 */
static
int pspsm_con_connect(pspsm_con_info_t *con_info, pspsm_info_msg_t *info_msg)
{
	psm_error_t rc;
	psm_error_t peer_rc; /* per-peer status slot handed to the call; not inspected here */

	if (memcmp(info_msg->protocol_version, PSPSM_PROTOCOL_VERSION,
		   sizeof(info_msg->protocol_version)) != 0) {
		char msg[80];

		snprintf(msg, sizeof(msg), "protocol error : '%.8s' != '%.8s'",
			 info_msg->protocol_version, PSPSM_PROTOCOL_VERSION);
		pspsm_err(msg);
		pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
		return -1;
	}

	rc = psm_ep_connect(pspsm_ep, 1, &info_msg->epid, NULL, &peer_rc,
			    &con_info->epaddr, 0);
	con_info->send_id = info_msg->id;

	if (rc != PSM_OK) {
		pspsm_err(psm_error_get_string(rc));
		pspsm_dprint(1, "pspsm_con_connect: %s", pspsm_err_str);
		return -1;
	}

	pspsm_dprint(2, "pspsm_con_connect: OK");
	pspsm_dprint(2, "sending with %"PRIx64", receiving %"PRIx64,
		     con_info->send_id, con_info->recv_id);
	return 0;
}
Exemple #3
0
/*
 * Close the PSM endpoint.
 *
 * The real cleanup is currently compiled out (see the comment below), so
 * this function is a no-op that always returns 0. The disabled branch
 * closes pspsm_ep, releases sendbuf and returns 0 on success or -1 on
 * error (reason recorded through pspsm_err()).
 */
static
int pspsm_close_endpoint(void)
{
#if 1
	/* psm_ep_close() SegFaults. A sleep(1) before sometimes helps, disabling
	   the cleanup always helps.
	   (Seen with infinipath-libs-3.2-32129.1162_rhel6_qlc.x86_64) */
	return 0;
#else
	psm_error_t ret;

	if (pspsm_ep){
		ret = psm_ep_close(pspsm_ep, PSM_EP_CLOSE_GRACEFUL, 0);
		pspsm_ep = NULL;

		/* pspsm_ep is reset unconditionally, so the buffer has to be
		   released unconditionally too; otherwise the next open would
		   overwrite (and leak) it after a failed close. */
		free(sendbuf);
		sendbuf = NULL;

		if (ret != PSM_OK) goto err;

		pspsm_dprint(2, "pspsm_close_endpoint: OK");
	}
	return 0;

 err:
	pspsm_err(psm_error_get_string(ret));
	pspsm_dprint(1, "pspsm_close_endpoint: %s", pspsm_err_str);
	return -1;
#endif
}
Exemple #4
0
/*
 * Finalize the PSM matched queue (pspsm_mq), if one exists.
 *
 * The global handle is reset to NULL once psm_mq_finalize() has been
 * called, whatever its result, so that a stale handle is never finalized
 * twice and the `!pspsm_mq` guard in pspsm_init_mq() works again. This
 * mirrors how pspsm_close_endpoint() resets pspsm_ep.
 *
 * Returns 0 on success (or if there was no queue), -1 on error; the
 * reason is recorded through pspsm_err().
 */
int pspsm_finalize_mq(void)
{
	psm_error_t ret;

	if (pspsm_mq){
		ret = psm_mq_finalize(pspsm_mq);
		pspsm_mq = NULL;
		if (ret != PSM_OK) goto err;
		pspsm_dprint(2, "pspsm_finalize_mq: OK");
	}
	return 0;

 err:
	pspsm_err(psm_error_get_string(ret));
	pspsm_dprint(1, "pspsm_finalize_mq: %s", pspsm_err_str);
	return -1;
}
Exemple #5
0
/*
 * Drain completed requests from pspsm_mq.
 *
 * Repeatedly peeks for a completed request, fetches its status with
 * psm_mq_test() and hands the status to pscom_psm_process(). The loop
 * ends when psm_mq_ipeek() reports PSM_MQ_INCOMPLETE, or on the first
 * error, which is recorded through pspsm_err() and logged.
 *
 * Returns the accumulated return values of pscom_psm_process() in both
 * cases.
 */
static
int pscom_psm_peek()
{
	unsigned progress = 0;
	psm_mq_req_t done_req;
	psm_mq_status_t done_status;
	psm_error_t rc;

	for (;;) {
		rc = psm_mq_ipeek(pspsm_mq, &done_req, /* status */ NULL);
		if (rc == PSM_MQ_INCOMPLETE) {
			/* nothing more to process right now */
			return progress;
		}
		if (rc == PSM_OK) {
			rc = psm_mq_test(&done_req, &done_status);
		}
		if (rc != PSM_OK) {
			break;
		}
		progress += pscom_psm_process(&done_status);
	}

	/* only reached on error */
	pspsm_err(psm_error_get_string(rc));
	pspsm_dprint(1, "pscom_psm_peek: %s", pspsm_err_str);
	return progress;
}
Exemple #6
0
/*
 * Shut down the PSM plugin: close the endpoint, then finalize the matched
 * queue. The queue is not finalized if closing the endpoint fails. Any
 * failure is only logged.
 */
static
void pscom_psm_finalize(void)
{
	if (pspsm_close_endpoint() == -1 || pspsm_finalize_mq() == -1) {
		pspsm_dprint(1, "pspsm_psm_finalize not successful");
	}
}
Exemple #7
0
/*
 * Create the PSM matched queue (pspsm_mq) on pspsm_ep with
 * PSM_MQ_ORDERMASK_ALL and no extra options. Does nothing when pspsm_mq
 * is already set.
 *
 * Returns 0 on success (or if the queue already exists), -1 on error; the
 * reason is recorded through pspsm_err().
 */
static
int pspsm_init_mq(void)
{
	psm_error_t rc;

	if (pspsm_mq) {
		return 0;
	}

	rc = psm_mq_init(pspsm_ep, PSM_MQ_ORDERMASK_ALL, NULL, 0, &pspsm_mq);
	if (rc == PSM_OK) {
		pspsm_dprint(2, "pspsm_init_mq: OK");
		return 0;
	}

	pspsm_err(psm_error_get_string(rc));
	pspsm_dprint(1, "pspsm_init_mq: %s", pspsm_err_str);
	return -1;
}
Exemple #8
0
/*
 * Debug helper: log base pointer and length of every non-empty iov entry
 * until `len` bytes have been accounted for.
 *
 * iov: first entry of the vector to print.
 * len: total number of bytes expected across the entries.
 *
 * If an entry is at least as long as the remaining byte count, printing
 * stops after that entry. This keeps the unsigned counter from wrapping
 * around and the loop from running past the end of the vector.
 */
static
void pspsm_iov_print(const struct iovec *iov, size_t len)
{
	while (len > 0) {
		if (iov->iov_len) {
			pspsm_dprint(2, "SENDV %p %zu", iov->iov_base, iov->iov_len);
			if (iov->iov_len >= len) {
				break;
			}
			len -= iov->iov_len;
		}
		iov++;
	}
}
Exemple #9
0
/*
 * Close the PSM endpoint (pspsm_ep) gracefully and release the send
 * bounce buffer (sendbuf). Does nothing when no endpoint is open.
 *
 * pspsm_ep and sendbuf are both reset whether or not psm_ep_close()
 * succeeds.
 *
 * Returns 0 on success (or if nothing was open), -1 on error; the reason
 * is recorded through pspsm_err().
 */
static
int pspsm_close_endpoint(void)
{
	psm_error_t ret;

	if (pspsm_ep){
		ret = psm_ep_close(pspsm_ep, PSM_EP_CLOSE_GRACEFUL, 0);
		pspsm_ep = NULL;

		/* pspsm_ep is reset unconditionally, so the buffer has to be
		   released unconditionally too; otherwise the next open would
		   overwrite (and leak) it after a failed close. */
		free(sendbuf);
		sendbuf = NULL;

		if (ret != PSM_OK) goto err;

		pspsm_dprint(2, "pspsm_close_endpoint: OK");
	}
	return 0;

 err:
	pspsm_err(psm_error_get_string(ret));
	pspsm_dprint(1, "pspsm_close_endpoint: %s", pspsm_err_str);
	return -1;
}
Exemple #10
0
/*
 * Post a non-blocking send of `len` bytes from `buf` to the peer of
 * con_info.
 *
 * tag: message tag passed to psm_mq_isend().
 * req: request slot to fill; must hold PSM_MQ_REQINVALID on entry.
 * nr:  small number OR-ed into the con_info pointer to form the request
 *      context.
 *
 * Returns 0 when the request was posted, -EPIPE on error (reason recorded
 * through pspsm_err()).
 */
static inline
int _pspsm_send_buf(pspsm_con_info_t *con_info, char *buf, size_t len,
		    uint64_t tag, psm_mq_req_t *req, unsigned long nr)
{
	psm_error_t rc;
	void *ctx;

	assert(*req == PSM_MQ_REQINVALID);

	/* encode the slot number in the low bits of the connection pointer */
	ctx = (void *)((uintptr_t)con_info | nr);

	rc = psm_mq_isend(pspsm_mq, con_info->epaddr,
			  /* flags */ 0, tag, buf, len,
			  ctx, req);
	if (rc == PSM_OK) {
		return 0;
	}

	pspsm_err(psm_error_get_string(rc));
	pspsm_dprint(1, "_pspsm_send_buf: %s", pspsm_err_str);
	return -EPIPE;
}
Exemple #11
0
/*
 * Reset a connection info structure to its initial state.
 *
 * Each call hands out the next receive id from a function-local counter
 * that starts at 42. All request slots are marked invalid and the receive
 * buffer pointers are cleared.
 *
 * Always returns 0.
 */
static
int pspsm_con_init(pspsm_con_info_t *con_info)
{
	static uint64_t next_recv_id = 42;

	/* request slots: nothing outstanding yet */
	con_info->sreqs[0] = PSM_MQ_REQINVALID;
	con_info->sreqs[1] = PSM_MQ_REQINVALID;
	con_info->rreq = PSM_MQ_REQINVALID;

	con_info->rbuf = NULL;
	con_info->req = NULL;
	con_info->con_broken = 0;

	con_info->recv_id = next_recv_id;
	next_recv_id++;

	/* debug */
	con_info->magic = UINTMAX_C(0xdeadbeefcafebabe);

	pspsm_dprint(2, "pspsm_con_init: OK");
	return 0;
}
Exemple #12
0
/*
 * Send the two-entry iov of con_info (con_info->iov[0] and iov[1]).
 *
 * The tag is con_info->send_id OR-ed with `magic`.
 *
 * Messages of at most pscom.env.readahead bytes are copied into sendbuf
 * and sent with the blocking psm_mq_send(). Larger messages are posted as
 * one non-blocking request per non-empty iov entry, and poll_user_inc()
 * is called once per posted request.
 *
 * FIXME: returns 0 if the send is complete, -EAGAIN if it created one or
 * more requests for it, and -EPIPE in case of an error.
 *
 * NOTE(review): if posting the second entry fails, the request for the
 * first entry has already been posted and counted - confirm the caller
 * copes with that.
 */
static
int _pspsm_sendv(pspsm_con_info_t *con_info, uint64_t magic)
{
	uint64_t tag = con_info->send_id | magic;
	unsigned int i;
	psm_error_t ret;
	size_t len = con_info->iov[0].iov_len + con_info->iov[1].iov_len;

	if (len <= pscom.env.readahead){
		pscom_memcpy_from_iov(sendbuf, con_info->iov, len);
		/* we hope that doesn't block - it shouldn't, as the
		 * message is sufficiently small */
		ret = psm_mq_send(pspsm_mq, con_info->epaddr,
				  /* flags*/ 0, tag, sendbuf, len);
		if (ret != PSM_OK) goto err;
		return 0;
	}

	for (i=0; i<2; i++){
		if (con_info->iov[i].iov_len){
			if (_pspsm_send_buf(con_info, con_info->iov[i].iov_base,
					    con_info->iov[i].iov_len,
					    tag, &con_info->sreqs[i], i)){
				return -EPIPE;
			}
			/* inc for each outstanding send request */
			poll_user_inc();
		}
	}
	return -EAGAIN;

 err:
	pspsm_err(psm_error_get_string(ret));
	/* log under this function's own name, not _pspsm_send_buf */
	pspsm_dprint(1, "_pspsm_sendv: %s", pspsm_err_str);
	return -EPIPE;
}
Exemple #13
0
/*
 * Post a non-blocking receive for con_info into con_info->rbuf
 * (con_info->rbuflen bytes), matching on con_info->recv_id under the
 * file-level `mask`. The request context is the con_info pointer with
 * the value 2 OR-ed in.
 *
 * con_info->rreq must hold PSM_MQ_REQINVALID on entry.
 *
 * Returns -EAGAIN once the receive has been posted and -1 on error
 * (reason recorded through pspsm_err()).
 *
 * ToDo: rename me to something like "post a receive".
 */
static
int pspsm_recvlook(pspsm_con_info_t *con_info)
{
	psm_error_t rc;
	void *ctx = (void *)((uintptr_t)con_info | 2);
	uint64_t recv_tag = con_info->recv_id;

	assert(con_info->rreq == PSM_MQ_REQINVALID);

	rc = psm_mq_irecv(pspsm_mq, recv_tag, mask, 0 /*flags*/,
			  con_info->rbuf, con_info->rbuflen,
			  ctx, &con_info->rreq);
	if (rc != PSM_OK) {
		pspsm_err(psm_error_get_string(rc));
		pspsm_dprint(1, "pspsm_recvlook: %s", pspsm_err_str);
		return -1;
	}

	/* FIXME: Should probably not return an error code to indicate
	   success. */
	return -EAGAIN;
}
Exemple #14
0
/*
 * One-time initialisation of the PSM plugin.
 *
 * The work is done only on the first call; the outcome is kept in a
 * function-local state variable and every later call just returns it.
 *
 * Steps on the first call: check for the device via
 * pspsm_check_dev_ipath(), call psm_init(), build the job UUID, open the
 * endpoint and create the matched queue. Any failing step puts the state
 * into PSPSM_INIT_FAILED.
 *
 * Returns the init state (per the original comment: 0 = success,
 * -1 = error).
 */
static
int pspsm_init(void)
{
	static pspsm_init_state_t init_state = PSPSM_INIT_START;
	int major = PSM_VERNO_MAJOR;
	int minor = PSM_VERNO_MINOR;
	psm_error_t ret;

	if (init_state != PSPSM_INIT_START) {
		/* already attempted: report the cached outcome */
		return init_state; /* 0 = success, -1 = error */
	}

	/* Check for an available /dev/ipath */
	ret = pspsm_check_dev_ipath();
	if (ret != 0) {
		pspsm_dprint(2, "pspsm_init: No \"/dev/ipath\" found. Arch psm is disabled.");
		goto failed;
	}

	ret = psm_init(&major, &minor);
	if (ret != PSM_OK) {
		pspsm_err(psm_error_get_string(ret));
		pspsm_dprint(1, "pspsm_init: %s", pspsm_err_str);
		goto failed;
	}

	/*
	 * Every process that wants to communicate must use the same UUID.
	 *
	 * Whether sharing one UUID between groups of processes that never
	 * talk to each other has drawbacks is unclear.
	 *
	 * The UUID is a constant fill pattern, overlaid with:
	 *
	 * - PSP_PSM_UNIQ_ID if set and not zero, or
	 * - PMI_ID, if set and not zero - not entirely clean, but a
	 *   practical solution for MPI apps (as long as communication
	 *   between two sets of MPI processes not sharing a communicator
	 *   is not implemented).
	 */
	memset(pspsm_uuid.as_uuid, DEFAULT_UUID_PATTERN,
	       sizeof(pspsm_uuid.as_uuid));

	if (pscom.env.psm_uniq_id) {
		pspsm_dprint(2, "seeding PSM UUID with %u", pscom.env.psm_uniq_id);
		pspsm_uuid.as_uint = pscom.env.psm_uniq_id;
	}

	/*
	 * The endpoint is opened here, during init, in the hope that every
	 * MPI rank has indirectly called psm_ep_open() before any data is
	 * transmitted from or to this endpoint. This avoids a race
	 * condition in libpsm_infinipath. Downside: PSM contexts are
	 * consumed even when only local communication takes place; use
	 * PSP_PSM=0 in that case.
	 */
	if (pspsm_open_endpoint() || pspsm_init_mq()) {
		goto failed;
	}

	pspsm_dprint(2, "pspsm_init: OK");
	init_state = PSPSM_INIT_DONE;
	return init_state; /* 0 = success, -1 = error */

failed:
	init_state = PSPSM_INIT_FAILED;
	return init_state; /* 0 = success, -1 = error */
}