Example #1
0
/*
 * Look up which opened endpoint holds a connection for 'epid'.
 *
 * Every endpoint on the psmi_opened_endpoint list (chained through
 * user_ep_next) is queried with psmi_epid_lookup().  For the first endpoint
 * that knows the epid, '*epconn' is filled with the matching epaddr, the
 * endpoint itself and that endpoint's mq.
 *
 * Returns PSM_OK when a connection is found.  Failures are reported through
 * psmi_handle_error(): PSM_PARAM_ERR for a NULL 'epconn', PSM_EP_WAS_CLOSED
 * when no endpoint is open, PSM_EPID_UNKNOWN when no endpoint knows 'epid'.
 */
psm_error_t __psm_ep_epid_lookup(psm_epid_t epid, psm_epconn_t *epconn)
{
	psm_error_t err = PSM_OK;
	psm_epaddr_t epaddr;
	psm_ep_t ep;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	/* The result is written through epconn, so it has to be valid. */
	if (epconn == NULL) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid psm_ep_epid_lookup parameters");
		return err;
	}

	/* Need to have an opened endpoint before we can resolve epids */
	if (psmi_opened_endpoint == NULL) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		return err;
	}

	for (ep = psmi_opened_endpoint; ep != NULL; ep = ep->user_ep_next) {
		epaddr = psmi_epid_lookup(ep, epid);
		if (!epaddr)
			continue;	/* this endpoint does not know the epid */

		/* Found connection for epid. Return info about endpoint to caller. */
		psmi_assert_always(epaddr->ptlctl->ep == ep);
		epconn->addr = epaddr;
		epconn->ep = ep;
		epconn->mq = ep->mq;
		return err;
	}

	err = psmi_handle_error(NULL, PSM_EPID_UNKNOWN,
				"Endpoint connection status unknown");
	return err;
}
Example #2
0
/*
 * Verify that 'pkey' appears in the pkey table of the endpoint's port.
 *
 * The first 16 table entries of (ep->unit_id, ep->portnum) are read with
 * hfi_get_port_index2pkey().  Entries equal to 0x7fff or 0xffff are skipped.
 * On a match the pkey is stored in '*opkey' and PSM_OK is returned; a read
 * failure or a missing pkey is reported through psmi_handle_error() with
 * PSM_EP_DEVICE_FAILURE.
 */
static psm_error_t
psmi_ep_verify_pkey(psm_ep_t ep, uint16_t pkey, uint16_t *opkey)
{
	int slot;
	int matched = 0;

	for (slot = 0; slot < 16 && !matched; slot++) {
		int entry = hfi_get_port_index2pkey(ep->unit_id, ep->portnum,
						    slot);

		if (entry < 0)
			return psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
						 "Can't get a valid pkey value from pkey table\n");

		/* management pkey, not for app traffic. */
		if (entry == 0x7fff || entry == 0xffff)
			continue;

		matched = (pkey == (uint16_t) entry);
	}

	/* the requested pkey is not in the table */
	if (!matched)
		return psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
					 "Wrong pkey 0x%x, please use PSM_PKEY to specify a valid pkey\n",
					 pkey);

	/* hand the verified pkey back to the caller */
	*opkey = pkey;
	return PSM_OK;
}
Example #3
0
/*
 * Register 'num_handlers' active-message handlers on 'ep'.
 *
 * Each handler is placed in a free slot of ep->am_htable (a slot is free when
 * it holds _ignore_handler) and the chosen slot index is written to the
 * corresponding element of 'handlers_idx'.
 *
 * If there are not enough free slots, every slot claimed by this call is
 * reset to _ignore_handler and PSM_EP_NO_RESOURCES is reported through
 * psmi_handle_error().  Otherwise PSM_OK is returned.  A request for zero
 * (or fewer) handlers touches nothing and returns PSM_OK.
 */
psm_error_t
__psm_am_register_handlers(psm_ep_t ep,
			   const psm_am_handler_fn_t *handlers,
			   int num_handlers, int *handlers_idx)
{
	int i, j = 0;

	/*
	 * For now just assign any free one.  The j < num_handlers bound keeps
	 * the scan from touching handlers[]/handlers_idx[] beyond what the
	 * caller supplied, including when num_handlers is not positive.
	 */
	for (i = 0; i < PSMI_AM_NUM_HANDLERS && j < num_handlers; i++) {
		if (ep->am_htable[i] == _ignore_handler) {
			ep->am_htable[i] = handlers[j];
			handlers_idx[j] = i;
			j++;
		}
	}

	if (j < num_handlers) {
		/* Not enough free handlers, restore unused handlers */
		for (i = 0; i < j; i++)
			ep->am_htable[handlers_idx[i]] = _ignore_handler;

		return psmi_handle_error(ep, PSM_EP_NO_RESOURCES,
					 "Insufficient "
					 "available AM handlers: registered %d of %d requested handlers",
					 j, num_handlers);
	}

	return PSM_OK;
}
Example #4
0
/*
 * Tear down the tid cache.
 *
 * Collects the tidinfo of every cache entry (index 1..tid_num_max) that is
 * not marked invalidated, asks the driver to free them in a single call, and
 * then releases tid_array and the cache map's root array.  If the driver
 * call fails, PSM2_EP_DEVICE_FAILURE is reported through psmi_handle_error()
 * and the two arrays are left allocated.
 */
psm2_error_t
ips_tidcache_cleanup(struct ips_tid *tidc)
{
	/* NOTE(review): REFCNT()/INVALIDATE() appear to rely on a local named
	 * p_map -- keep the name unless the macro definitions say otherwise. */
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	int idx;
	int nfree = 0;

	for (idx = 1; idx <= tidc->tid_ctrl->tid_num_max; idx++) {
		/* nothing may still hold a reference at cleanup time */
		psmi_assert(REFCNT(idx) == 0);
		if (INVALIDATE(idx) != 0)
			continue;
		tidc->tid_array[nfree] = p_map->root[idx].payload.tidinfo;
		nfree++;
	}

	/* call driver to free the tids; a failure to unpin pages is fatal */
	if (nfree > 0 &&
	    hfi_free_tid(tidc->context->ctrl,
			 (uint64_t) (uintptr_t) tidc->tid_array, nfree) < 0) {
		return psmi_handle_error(tidc->context->ep,
					 PSM2_EP_DEVICE_FAILURE,
					 "Failed to tid free %d tids", nfree);
	}

	psmi_free(tidc->tid_array);
	psmi_free(tidc->tid_cachemap.root);

	return PSM2_OK;
}
Example #5
0
/*
 * Force removal of the first 'tidcnt' tids listed in tidc->tid_array, then
 * check for a pending invalidation event.
 *
 * The driver is asked to free the tids first.  Each tid is then marked
 * invalidated and taken off the idle queue and the cache map.  Finally, if
 * HFI1_EVENT_TID_MMU_NOTIFY is set in *tidc->invalidation_event, the pending
 * invalidations are processed through ips_tidcache_invalidation().
 *
 * Returns PSM2_OK on success.  A driver failure is reported through
 * psmi_handle_error() with PSM2_EP_DEVICE_FAILURE; an error from
 * ips_tidcache_invalidation() is returned as is.
 */
static psm2_error_t
ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt)
{
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	uint32_t idx;
	psm2_error_t err;

	/*
	 * call driver to free the tids.
	 */
	if (hfi_free_tid(tidc->context->ctrl,
		    (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) {
		/* If failed to unpin pages, it's fatal error.  Report the
		 * number of tids that was actually handed to the driver. */
		err = psmi_handle_error(tidc->context->ep,
			PSM2_EP_DEVICE_FAILURE,
			"Failed to tid free %u tids", tidcnt);
		return err;
	}

	while (tidcnt) {
		tidcnt--;
		idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) +
			IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);

		/*
		 * sanity check.
		 */
		psmi_assert(idx != 0);
		psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
		psmi_assert(INVALIDATE(idx) == 0);
		psmi_assert(REFCNT(idx) == 0);

		/*
		 * mark the tid invalidated.
		 */
		INVALIDATE(idx) = 1;

		/*
		 * remove the tid from the idle queue and the RB tree.
		 */
		IDLE_REMOVE(idx);
		ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
	}

	/*
	 * Because the freed tid is not from invalidation list,
	 * it is possible that kernel just invalidated the tid,
	 * then we need to check and process the invalidation
	 * before we can re-use this tid. The reverse order
	 * will wrongly invalidate this tid again.
	 */
	if ((*tidc->invalidation_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
		err = ips_tidcache_invalidation(tidc);
		if (err)
			return err;
	}

	return PSM2_OK;
}
Example #6
0
/*
 * Add ipsaddr to the epstate table and return the new connection index to
 * the caller in '*connidx_o'.
 *
 * The table grows by PTL_EPADDR_ALLOC_CHUNK entries whenever the number of
 * used entries would exceed its size.  The free slot is searched starting
 * at eps_tab_nextidx, wrapping around at the end of the table.
 *
 * Returns PSM2_OK on success and PSM2_NO_MEMORY if the table cannot be
 * grown (in which case the table and its counters are left as they were).
 * If the chosen slot is at or beyond IPS_EPSTATE_CONNIDX_MAX,
 * PSM2_TOO_MANY_ENDPOINTS is reported through psmi_handle_error() and
 * '*connidx_o' is not written.
 */
psm2_error_t
ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
		ips_epstate_idx *connidx_o)
{
	int i, j;
	ips_epstate_idx connidx;

	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
		struct ips_epstate_entry *newtab;
		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
		newtab = (struct ips_epstate_entry *)
		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
				eps->eps_tabsize,
				sizeof(struct ips_epstate_entry));
		if (newtab == NULL) {
			/*
			 * Undo the bookkeeping: the old table was not grown,
			 * so its recorded size and use count must not claim
			 * otherwise or later scans would run off its end.
			 */
			eps->eps_tabsize -= PTL_EPADDR_ALLOC_CHUNK;
			eps->eps_tabsizeused--;
			return PSM2_NO_MEMORY;
		}
		if (eps->eps_tab) {	/* NOT first alloc */
			for (i = 0;
			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++)
				newtab[i] = eps->eps_tab[i];	/* entry-wise struct copy */
			psmi_free(eps->eps_tab);
		}
		eps->eps_tab = newtab;
	}
	/* Find the next free hole.  We can afford to do this since connect is not
	 * in the critical path */
	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
		if (j == eps->eps_tabsize)
			j = 0;
		if (eps->eps_tab[j].ipsaddr == NULL) {
			eps->eps_tab_nextidx = j + 1;
			if (eps->eps_tab_nextidx == eps->eps_tabsize)
				eps->eps_tab_nextidx = 0;
			break;
		}
	}
	psmi_assert_always(i != eps->eps_tabsize);
	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid), connidx,
		  j);
	eps->eps_tab[j].ipsaddr = ipsaddr;
	/* NOTE(review): on this error path the entry stays recorded in the
	 * table and eps_tabsizeused stays incremented -- confirm that callers
	 * expect this. */
	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
		return psmi_handle_error(eps->context->ep,
					 PSM2_TOO_MANY_ENDPOINTS,
					 "Can't connect to more than %d non-local endpoints",
					 IPS_EPSTATE_CONNIDX_MAX);
	}
	*connidx_o = connidx;
	return PSM2_OK;
}
Example #7
0
/*
 * Report the source endpoint address recorded in an active-message token.
 *
 * 'token' is interpreted as a struct psmi_am_token and its epaddr_from field
 * is stored in '*epaddr_out'.  Returns PSM_OK on success; a NULL 'token' or
 * 'epaddr_out' is reported through psmi_handle_error() as PSM_PARAM_ERR.
 */
psm_error_t __psm_am_get_source(psm_am_token_t token, psm_epaddr_t *epaddr_out)
{
	if (token != NULL && epaddr_out != NULL) {
		struct psmi_am_token *am_tok = (struct psmi_am_token *)token;

		*epaddr_out = am_tok->epaddr_from;
		return PSM_OK;
	}

	return psmi_handle_error(NULL, PSM_PARAM_ERR,
				 "Invalid psm_am_get_source parameters");
}
Example #8
0
/*
 * Drop one reference on each of the 'tidcnt' tids listed in 'tid_array'.
 *
 * A tid whose reference count reaches zero is either put on the idle queue
 * (when it is not marked invalidated) or removed from the cache map and
 * queued in tidc->tid_array to be freed by the driver.  All queued tids are
 * freed with a single hfi_free_tid() call.
 *
 * Returns PSM2_OK on success; a driver failure is reported through
 * psmi_handle_error() with PSM2_EP_DEVICE_FAILURE.
 */
psm2_error_t
ips_tidcache_release(struct ips_tid *tidc,
		uint32_t *tid_array, uint32_t tidcnt)
{
	/* NOTE(review): the REFCNT()/INVALIDATE()/IDLE_INSERT() macros appear
	 * to rely on a local named p_map -- confirm before renaming it. */
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	uint32_t pos, idx;
	uint32_t nfree = 0;

	psmi_assert(tidcnt > 0);

	for (pos = 0; pos < tidcnt; pos++) {
		const uint32_t tidinfo = tid_array[pos];

		/*
		 * Driver only returns tidctrl=1 or tidctrl=2.
		 */
		idx = 2*IPS_TIDINFO_GET_TID(tidinfo) +
			IPS_TIDINFO_GET_TIDCTRL(tidinfo);
		psmi_assert(idx != 0);
		psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
		psmi_assert(REFCNT(idx) != 0);

		REFCNT(idx)--;
		if (REFCNT(idx) != 0)
			continue;	/* still referenced elsewhere */

		if (INVALIDATE(idx) == 0) {
			/* still valid: keep it cached on the idle queue */
			IDLE_INSERT(idx);
			continue;
		}

		/* invalidated and unreferenced: drop it and queue for free */
		ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
		tidc->tid_array[nfree] = tidinfo;
		nfree++;
	}

	/* call driver to free the tids; a failure to unpin pages is fatal */
	if (nfree > 0 &&
	    hfi_free_tid(tidc->context->ctrl,
			 (uint64_t) (uintptr_t) tidc->tid_array, nfree) < 0) {
		return psmi_handle_error(tidc->context->ep,
					 PSM2_EP_DEVICE_FAILURE,
					 "Failed to tid free %d tids", nfree);
	}

	return PSM2_OK;
}
Example #9
0
/*
 * Describe the currently opened endpoints.
 *
 * On entry '*num_of_epinfo' is the capacity of 'array_of_epinfo'.  The array
 * is filled with one entry per opened endpoint (ep handle, epid, jkey, uuid
 * and the uuid in string form), walking the psmi_opened_endpoint list until
 * the list or the capacity is exhausted.  On return '*num_of_epinfo' holds
 * the number of entries written.
 *
 * Returns PSM_OK on success.  A NULL pointer argument or a non-positive
 * capacity is reported through psmi_handle_error() as PSM_PARAM_ERR, and the
 * absence of any opened endpoint as PSM_EP_WAS_CLOSED.
 */
psm_error_t __psm_ep_query(int *num_of_epinfo, psm_epinfo_t *array_of_epinfo)
{
	psm_error_t err = PSM_OK;
	int i;
	psm_ep_t ep;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	/* Both pointers are dereferenced below, so reject NULL up front. */
	if (num_of_epinfo == NULL || array_of_epinfo == NULL ||
	    *num_of_epinfo <= 0) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid psm_ep_query parameters");
		return err;
	}

	if (psmi_opened_endpoint == NULL) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		return err;
	}

	ep = psmi_opened_endpoint;
	for (i = 0; i < *num_of_epinfo; i++) {
		if (ep == NULL)
			break;
		array_of_epinfo[i].ep = ep;
		array_of_epinfo[i].epid = ep->epid;
		array_of_epinfo[i].jkey = ep->jkey;
		memcpy(array_of_epinfo[i].uuid,
		       (void *)ep->uuid, sizeof(psm_uuid_t));
		psmi_uuid_unparse(ep->uuid, array_of_epinfo[i].uuid_str);
		ep = ep->user_ep_next;
	}
	/* tell the caller how many entries were actually filled in */
	*num_of_epinfo = i;

	return err;
}
Example #10
0
/*
 * Initialize the tid state 'tidc' for 'context'.
 *
 * Records the context and the tid-available callback ('cb' with its
 * 'cb_context'), zeroes the usage counters and attaches the tid control
 * structure: context->tid_ctrl is used when it is set, otherwise a private
 * one is allocated with psmi_calloc().
 *
 * Returns PSM_NO_MEMORY if that allocation fails; an expected-tid count that
 * is not a multiple of 8 is reported through psmi_handle_error() as
 * PSM_INTERNAL_ERR.
 */
psm_error_t
ips_tid_init(const psmi_context_t *context,
	     struct ips_tid *tidc, ips_tid_avail_cb_fn_t cb, void *cb_context)
{
	const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;

	/* Statistics table exposing tidc->tid_num_total as "tid update count". */
	struct psmi_stats_entry entries[] = {
		PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL,
				NULL, &tidc->tid_num_total),
	};

	tidc->context = context;
	/* these are in group unit, a group is 8 tids or 4 tidpairs */
	tidc->tid_num_total = 0;
	tidc->tid_num_inuse = 0;
	tidc->tid_avail_cb = cb;
	tidc->tid_avail_context = cb_context;

	/* Use the context's tid control block if there is one, else allocate
	 * a private, zero-initialized one. */
	tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl;
	if (!tidc->tid_ctrl) {
		tidc->tid_ctrl = (struct ips_tid_ctrl *)
		    psmi_calloc(context->ep, UNDEFINED, 1,
				sizeof(struct ips_tid_ctrl));
		if (tidc->tid_ctrl == NULL) {
			return PSM_NO_MEMORY;
		}
	}

	/*
	 * Only the master process (subctxt 0) can initialize.
	 */
	if (ctxt_info->subctxt == 0) {
		pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock,
					PTHREAD_PROCESS_SHARED);

		/* check if exp tids are multiple of 8 (a group) */
		/* NOTE(review): this early return does not release a tid_ctrl
		 * that was allocated just above -- confirm whether the caller
		 * cleans up. */
		if (context->ctrl->__hfi_tidexpcnt % 8)
			return psmi_handle_error(context->ep,
			      PSM_INTERNAL_ERR,
			      "Expected tids(%d) are not multi-groups(8)",
			      context->ctrl->__hfi_tidexpcnt);

		/* convert the tid count into groups of 8 */
		tidc->tid_ctrl->tid_num_max =
		    context->ctrl->__hfi_tidexpcnt >> 3;
		tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max;
	}
Example #11
0
/*
 * Copy the active-message parameters into a caller-supplied structure.
 *
 * The first 'sizeof_parameters_in' bytes of '*parameters' are zeroed, then
 * min(sizeof(psmi_am_parameters), sizeof_parameters_in) bytes of the global
 * psmi_am_parameters are copied in.  The number of bytes copied is stored in
 * '*sizeof_parameters_out' when that pointer is not NULL.
 *
 * 'ep' is not used.  Returns PSM_OK on success; a NULL 'parameters' is
 * reported through psmi_handle_error() as PSM_PARAM_ERR.
 */
psm_error_t
__psm_am_get_parameters(psm_ep_t ep, struct psm_am_parameters *parameters,
			size_t sizeof_parameters_in,
			size_t *sizeof_parameters_out)
{
	size_t s;

	if (parameters == NULL) {
		return psmi_handle_error(NULL, PSM_PARAM_ERR,
					 "Invalid psm_am_get_parameters parameters");
	}

	/* Zero first so any tail the copy does not reach reads as zero. */
	memset(parameters, 0, sizeof_parameters_in);
	s = min(sizeof(psmi_am_parameters), sizeof_parameters_in);
	memcpy(parameters, &psmi_am_parameters, s);

	/* The size report is optional: do not write through a NULL pointer. */
	if (sizeof_parameters_out != NULL)
		*sizeof_parameters_out = s;

	return PSM_OK;
}
Example #12
0
/*
 * Return 'tidcnt' tids listed in 'tid_array' to the driver.
 *
 * When the context carries a tid control block (tidc->context->tid_ctrl),
 * the driver call and the update of tid_num_avail are done under
 * tid_ctrl_lock.  After a successful free the in-use count is lowered and,
 * if the in-use count before this release equalled tid_num_max and a
 * callback is registered, tid_avail_cb is invoked.
 *
 * Returns PSM2_OK on success; a driver failure is reported through
 * psmi_handle_error() with PSM2_EP_DEVICE_FAILURE.
 */
psm2_error_t
ips_tid_release(struct ips_tid *tidc,
		uint32_t *tid_array, uint32_t tidcnt)
{
	struct ips_tid_ctrl *ctrl = tidc->tid_ctrl;
	int free_failed;

	psmi_assert(tidcnt > 0);
	if (tidc->context->tid_ctrl)
		pthread_spin_lock(&ctrl->tid_ctrl_lock);

	free_failed = (hfi_free_tid(tidc->context->ctrl,
				    (uint64_t) (uintptr_t) tid_array,
				    tidcnt) < 0);
	if (!free_failed)
		ctrl->tid_num_avail += tidcnt;

	if (tidc->context->tid_ctrl)
		pthread_spin_unlock(&ctrl->tid_ctrl_lock);

	/* If failed to unpin pages, it's fatal error */
	if (free_failed)
		return psmi_handle_error(tidc->context->ep,
					 PSM2_EP_DEVICE_FAILURE,
					 "Failed to tid free %d tids",
					 tidcnt);

	tidc->tid_num_inuse -= tidcnt;
	/* If an available callback is registered invoke it */
	if (tidc->tid_avail_cb) {
		if ((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max)
			tidc->tid_avail_cb(tidc, tidc->tid_avail_context);
	}

	return PSM2_OK;
}
Example #13
0
/*
 * Handle an envelope for which no receive has been posted.
 *
 * For modes at or above MQ_MSG_USER_FIRST with a registered
 * mq->unexpected_callback, the message is handed to that callback and
 * nothing else is done.  Otherwise a receive request is allocated, the
 * payload is stored in a system buffer according to 'mode' (tiny, short or
 * long), the request is appended to mq->unexpected_q and the rx_sys
 * statistics are updated.
 *
 * Returns MQ_RET_UNEXP_OK on every path visible here.
 */
int __recvpath
psmi_mq_handle_envelope_unexpected(
	psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr,
	uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen, 
	const void *payload, uint32_t paylen)
{
    psm_mq_req_t req;
    uint32_t msglen;

    /*
     * Keep a callback here in case we want to fit some other high-level
     * protocols over MQ (i.e. shmem).  These protocols would bypass the
     * normal message handling and go to higher-level message handlers.
     */
    if (mode >= MQ_MSG_USER_FIRST && mq->unexpected_callback) {
	mq->unexpected_callback(mq,mode,epaddr,tag,send_msglen,payload,paylen);
	return MQ_RET_UNEXP_OK;
    }
    /* NOTE(review): the assert is the only guard against a failed
     * allocation -- confirm psmi_assert stays enabled in release builds. */
    req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
    psmi_assert(req != NULL);

    /* The whole message is buffered, so every length starts out equal to
     * the sender's message length. */
    req->tag = tag;
    req->recv_msgoff = 0;
    req->recv_msglen = req->send_msglen = req->buf_len = msglen = send_msglen;

    _IPATH_VDBG(
		"from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64
		" send_msglen=%d\n", psmi_epaddr_get_name(epaddr->epid), 
		req, mode, tag, send_msglen);
#if 0
    if (mq->cur_sysbuf_bytes+msglen > mq->max_sysbuf_bytes) {
		_IPATH_VDBG("req=%p with len=%d exceeds limit of %llu sysbuf_bytes\n",
			req, msglen, (unsigned long long) mq->max_sysbuf_bytes);
		return MQ_RET_UNEXP_NO_RESOURCES;
    }
#endif
    /* NOTE(review): the results of psmi_mq_sysbuf_alloc() below are used
     * without a NULL check -- confirm the allocator cannot fail. */
    switch (mode) {
	case MQ_MSG_TINY:
	    /* tiny message: copy the payload now, request is complete */
	    if (msglen > 0) {
		req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
		mq_copy_tiny((uint32_t *)req->buf, (uint32_t *)payload, msglen);
	    }
	    else
		req->buf = NULL;
	    req->state = MQ_STATE_COMPLETE;
	    break;

	case MQ_MSG_SHORT:
	    /* short message: copy the payload now, request is complete */
	    req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
	    psmi_mq_mtucpy(req->buf, payload, msglen);
	    req->state = MQ_STATE_COMPLETE;
	    break;

	case MQ_MSG_LONG:
	    /* long message: queue the request on the master's egrlong list
	     * before handing it the first payload and any eager data */
	    req->egrid = egrid;
	    req->send_msgoff = 0;
	    req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
	    req->state = MQ_STATE_UNEXP;
	    req->type |= MQE_TYPE_EGRLONG;
	    STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq);
	    _IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n", 
			egrid.egr_msgno, msglen, paylen);
	    if (paylen > 0)
		psmi_mq_handle_data(req, epaddr,
			egrid.egr_data, 0, payload, paylen);
	    psmi_mq_handle_egrdata(mq, req, epaddr);
	    break;

	default:
	    /* presumably does not return (PSMI_EP_NORETURN) -- TODO confirm;
	     * otherwise control falls through to the append below */
	    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
			    "Internal error, unknown packet 0x%x", mode);
    }
    /* make the request visible to later receives and account for it */
    mq_sq_append(&mq->unexpected_q, req);
    mq->stats.rx_sys_bytes += msglen;
    mq->stats.rx_sys_num++;

    return MQ_RET_UNEXP_OK;
}
Example #14
0
/*
 * Open the device backing 'ep' and compute the endpoint's epid.
 *
 * When the IPS device is enabled: opens the hardware context with
 * psmi_context_open(), verifies opts->network_pkey against the port's pkey
 * table, reads the PSM_RCVTHREAD setting into the context and returns the
 * context's epid in '*epid'.
 *
 * Otherwise, when the AMSH (shared memory) device is enabled: derives a rank
 * from MPI_LOCALRANKID, then PSC_MPI_NODE_RANK, then the process id, and
 * packs it together with the first 16 bits of 'unique_job_key' into '*epid'.
 */
static
psm_error_t
psmi_ep_open_device(const psm_ep_t ep,
		    const struct psm_ep_open_opts *opts,
		    const psm_uuid_t unique_job_key,
		    struct psmi_context *context, psm_epid_t *epid)
{
	psm_error_t err = PSM_OK;

	/* Skip affinity.  No affinity if:
	 * 1. User explicitly sets no-affinity=YES in environment.
	 * 2. User doesn't set affinity in environment and PSM is opened with
	 *    option affinity skip.
	 *
	 * NOTE(review): nothing in the code below deals with affinity; this
	 * comment looks stale -- confirm and remove.
	 */
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
		uint32_t rcvthread_flags;
		union psmi_envvar_val env_rcvthread;
		static int norcvthread;	/* only for first rail */

		ep->out_sl = opts->outsl;

		if ((err =
		     psmi_context_open(ep, opts->unit, opts->port,
				       unique_job_key, opts->timeout,
				       context)) != PSM_OK)
			goto fail;

		/* NOTE(review): the port printed here is the literal 1, not
		 * a value read from the context -- confirm this is intended. */
		_HFI_DBG("[%d]use unit %d port %d\n", getpid(),
			 context->ctrl->__hfi_unit, 1);

		/* At this point, we have the unit id and port number, so
		 * check if pkey is not 0x0/0x7fff/0xffff, and match one
		 * of the pkey in table.
		 */
		if ((err =
		     psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey,
					 &ep->network_pkey)) != PSM_OK)
			goto fail;

		/* See if we want to activate support for receive thread.
		 * The default is PSMI_RCVTHREAD_FLAGS on the first call and
		 * 0 on every later call (norcvthread counts the calls). */
		psmi_getenv("PSM_RCVTHREAD",
			    "Recv thread flags (0 disables thread)",
			    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
			    (union psmi_envvar_val)(norcvthread++ ? 0 :
						    PSMI_RCVTHREAD_FLAGS),
			    &env_rcvthread);
		rcvthread_flags = env_rcvthread.e_uint;

		/* If enabled, use the pollurg capability to implement a receive
		 * interrupt thread that can handle urg packets */
		if (rcvthread_flags) {
			context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD;
#ifdef PSMI_PLOCK_IS_NOLOCK
			psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
					  "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
					  "with RCVTHREAD on");
#endif
		}
		context->rcvthread_flags = rcvthread_flags;

		*epid = context->epid;
	} else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
		int rank;

		/* In shm-only mode, we need to derive a valid epid
		 * based on our rank.  We try to get it from the
		 * environment if it's available, or resort to using
		 * our PID as the rank.
		 */
		union psmi_envvar_val env_rankid;

		/* presumably psmi_getenv() returns non-zero when the variable
		 * is not set, hence the fallback chain -- TODO confirm */
		if (psmi_getenv
		    ("MPI_LOCALRANKID", "Shared context rankid",
		     PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT,
		     (union psmi_envvar_val)-1, &env_rankid)) {
			if (psmi_getenv
			    ("PSC_MPI_NODE_RANK",
			     "Shared context rankid",
			     PSMI_ENVVAR_LEVEL_HIDDEN,
			     PSMI_ENVVAR_TYPE_INT,
			     (union psmi_envvar_val)-1, &env_rankid)) {
				rank = getpid();
			} else
				rank = env_rankid.e_int;
		} else
			rank = env_rankid.e_int;

		/*
		 * We use a LID of 0 for non-HFI communication.
		 * Since a jobkey is not available from IPS, pull the
		 * first 16 bits from the UUID.
		 */

		*epid = PSMI_EPID_PACK(((uint16_t *) unique_job_key)[0],
				       (rank >> 3), rank, 0,
				       PSMI_HFI_TYPE_DEFAULT, 0x7, rank);
	} else {
Example #15
0
/*
 * Connect endpoint 'ep' to a set of remote endpoints.
 *
 * ep                  local endpoint
 * num_of_epid         number of entries in the arrays below
 * array_of_epid       remote endpoint ids to connect to
 * array_of_epid_mask  optional (may be NULL); a zero entry skips that epid
 * array_of_errors     per-epid result, written for every selected epid
 * array_of_epaddr     per-epid resulting endpoint address
 * timeout             connection time-out; overridden by the
 *                     PSM2_CONNECT_TIMEOUT environment variable when set
 *
 * Duplicate epids in the input are connected once and the result is copied
 * to the duplicates.  Every enabled device in ep->devid_enabled is tried in
 * turn; an epid that connected successfully is not retried on later devices.
 *
 * Returns PSM2_OK on success.  Invalid parameters are reported as
 * PSM2_PARAM_ERR, a failed mask allocation as PSM2_NO_MEMORY.  Any other
 * failure is reported through psmi_handle_error() with a message that lists
 * the host names of the endpoints that failed with that error.
 *
 * NOTE(review): array_of_errors is written without a NULL check -- confirm
 * that the API contract requires it to be non-NULL.
 */
psm2_error_t
__psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid,
		 int const *array_of_epid_mask,	/* can be NULL */
		 psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr,
		 int64_t timeout)
{
	psm2_error_t err = PSM2_OK;
	ptl_ctl_t *ptlctl;
	ptl_t *ptl;
	int i, j, dup_idx;
	int num_toconnect = 0;
	int *epid_mask = NULL;
	int *epid_mask_isdupof = NULL;
	char *device;
	uint64_t t_start = get_cycles();
	uint64_t t_left;
	union psmi_envvar_val timeout_intval;

	PSM2_LOG_MSG("entering");
	PSMI_ERR_UNLESS_INITIALIZED(ep);

	PSMI_PLOCK();

	/*
	 * Normally we would lock here, but instead each implemented ptl component
	 * does its own locking.  This is mostly because the ptl components are
	 * ahead of the PSM interface in that they can disconnect their peers.
	 */
	if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL ||
	    num_of_epid < 1) {
		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
					"Invalid psm2_ep_connect parameters");
		goto fail;
	}

	/* We need two of these masks to detect duplicates */
	err = PSM2_NO_MEMORY;
	epid_mask =
	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
	if (epid_mask == NULL)
		goto fail;
	epid_mask_isdupof =
	    (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid);
	if (epid_mask_isdupof == NULL)
		goto fail;
	err = PSM2_OK;

	/* Eventually handle timeouts across all connects. */
	for (j = 0; j < num_of_epid; j++) {
		if (array_of_epid_mask != NULL && !array_of_epid_mask[j])
			epid_mask[j] = 0;
		else {
			epid_mask[j] = 1;
			array_of_errors[j] = PSM2_EPID_UNKNOWN;
			array_of_epaddr[j] = NULL;
			num_toconnect++;
		}
		epid_mask_isdupof[j] = -1;
	}

	psmi_getenv("PSM2_CONNECT_TIMEOUT",
		    "End-point connection timeout over-ride. 0 for no time-out.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	if (getenv("PSM2_CONNECT_TIMEOUT")) {
		timeout = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout > 0) {
		/* The timeout parameter provides the minimum timeout. A heuristic
		 * is used to scale up the timeout linearly with the number of
		 * endpoints, and we allow one second per 100 endpoints. */
		timeout = max(timeout, (num_toconnect * SEC_ULL) / 100);
	}

	if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
		timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
	_HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n",
		   num_toconnect, (double)timeout / 1e9);

	/* Look for duplicates in input array */
	for (i = 0; i < num_of_epid; i++) {
		for (j = i + 1; j < num_of_epid; j++) {
			if (array_of_epid[i] == array_of_epid[j] &&
			    epid_mask[i] && epid_mask[j]) {
				epid_mask[j] = 0;	/* don't connect more than once */
				epid_mask_isdupof[j] = i;
			}
		}
	}

	for (i = 0; i < PTL_MAX_INIT; i++) {
		if (ep->devid_enabled[i] == -1)
			continue;
		/* Set up the right connect ptrs */
		switch (ep->devid_enabled[i]) {
		case PTL_DEVID_IPS:
			ptlctl = &ep->ptl_ips;
			ptl = ep->ptl_ips.ptl;
			device = "ips";
			break;
		case PTL_DEVID_AMSH:
			ptlctl = &ep->ptl_amsh;
			ptl = ep->ptl_amsh.ptl;
			device = "amsh";
			break;
		case PTL_DEVID_SELF:
			ptlctl = &ep->ptl_self;
			ptl = ep->ptl_self.ptl;
			device = "self";
			break;
		default:
			device = "unknown";
			ptlctl = &ep->ptl_ips;	/*no-unused */
			ptl = ep->ptl_ips.ptl;	/*no-unused */
			device = "ips";	/*no-unused */
			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
					  "Unknown/unhandled PTL id %d\n",
					  ep->devid_enabled[i]);
			break;
		}
		t_left = psmi_cycles_left(t_start, timeout);

		_HFI_VDBG("Trying to connect with device %s\n", device);
		if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid,
					      epid_mask, array_of_errors,
					      array_of_epaddr,
					      cycles_to_nanosecs(t_left)))) {
			_HFI_PRDBG("Connect failure in device %s err=%d\n",
				   device, err);
			goto connect_fail;
		}

		/* Now process what's been connected */
		for (j = 0; j < num_of_epid; j++) {
			dup_idx = epid_mask_isdupof[j];
			if (!epid_mask[j] && dup_idx == -1)
				continue;

			if (dup_idx != -1) {	/* dup */
				array_of_epaddr[j] = array_of_epaddr[dup_idx];
				array_of_errors[j] = array_of_errors[dup_idx];
				epid_mask_isdupof[j] = -1;
			}

			if (array_of_errors[j] == PSM2_OK) {
				epid_mask[j] = 0;	/* don't try on next ptl */
				ep->connections++;
			}
		}
	}

	for (i = 0; i < num_of_epid; i++) {
		ptl_ctl_t *c = NULL;
		if (array_of_epid_mask != NULL && !array_of_epid_mask[i])
			continue;
		/* If we see unreachable here, that means some PTLs were not enabled */
		if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
			err = PSM2_EPID_UNREACHABLE;
			break;
		}

		psmi_assert_always(array_of_epaddr[i] != NULL);
		c = array_of_epaddr[i]->ptlctl;
		psmi_assert_always(c != NULL);
		_HFI_VDBG("%-20s DEVICE %s (%p)\n",
			  psmi_epaddr_get_name(array_of_epid[i]),
			  c == &ep->ptl_ips ? "hfi" :
			  (c == &ep->ptl_amsh ? "amsh" : "self"),
			  (void *)array_of_epaddr[i]->ptlctl->ptl);
	}

connect_fail:
	/* If the error is a timeout (at worse) and the client is OPA MPI,
	 * just return timeout to let OPA MPI handle the hostnames that
	 * timed out */
	if (err != PSM2_OK) {
		char errbuf[PSM2_ERRSTRING_MAXLEN];
		size_t len;
		int j = 0;	/* number of host names appended so far */

		if (err == PSM2_EPID_UNREACHABLE) {
			char *deverr = "of an incorrect setting";
			char *eperr = " ";
			char *devname = NULL;
			if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
				deverr =
				    "there is no shared memory PSM device (shm)";
				eperr = " shared memory ";
			} else
			    if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
				deverr =
				    "there is no OPA PSM device (hfi)";
				eperr = " OPA ";
			}

			len = snprintf(errbuf, sizeof(errbuf) - 1,
				       "Some%sendpoints could not be connected because %s "
				       "in the currently enabled PSM_DEVICES (",
				       eperr, deverr);
			for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1;
			     i++) {
				switch (ep->devid_enabled[i]) {
				case PTL_DEVID_IPS:
					devname = "hfi";
					break;
				case PTL_DEVID_AMSH:
					devname = "shm";
					break;
				case PTL_DEVID_SELF:
				default:
					devname = "self";
					break;
				}
				len +=
				    snprintf(errbuf + len,
					     sizeof(errbuf) - len - 1, "%s,",
					     devname);
			}
			if (len < sizeof(errbuf) - 1 && devname != NULL)
				/* parsed something, remove trailing comma */
				errbuf[len - 1] = ')';
		} else
			len = snprintf(errbuf, sizeof(errbuf) - 1,
				       "%s", err == PSM2_TIMEOUT ?
				       "Dectected connection timeout" :
				       psm2_error_get_string(err));

		/* first pass, look for all nodes with the error */
		for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
			if (array_of_epid_mask != NULL
			    && !array_of_epid_mask[i])
				continue;
			if (array_of_errors[i] == PSM2_OK)
				continue;
			if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
			    err != PSM2_EPID_UNREACHABLE)
				continue;
			if (err == array_of_errors[i]) {
				len +=
				    snprintf(errbuf + len,
					     sizeof(errbuf) - len - 1, "%c %s",
					     j == 0 ? ':' : ',',
					     psmi_epaddr_get_hostname
					     (array_of_epid[i]));
				j++;
			}
		}
		errbuf[sizeof(errbuf) - 1] = '\0';
		/*
		 * errbuf contains peer host names, so it must be passed as
		 * data, never as the format string itself: a '%' in a host
		 * name would otherwise be interpreted as a conversion.
		 */
		err = psmi_handle_error(ep, err, "%s", errbuf);
	}

fail:
	PSMI_PUNLOCK();

	if (epid_mask != NULL)
		psmi_free(epid_mask);
	if (epid_mask_isdupof != NULL)
		psmi_free(epid_mask_isdupof);

	PSM2_LOG_MSG("leaving");
	return err;
}
Example #16
0
/*
 * Work out which (unit, port) pairs to use as rails.
 *
 * Multi-rail is off, and '*num_rails' is set to 0, unless the environment
 * variable PSM_MULTIRAIL is set to a non-zero integer.
 *
 * With PSM_MULTIRAIL_MAP set (format "unit:port,unit:port,..."), the listed
 * pairs are stored in 'unit'/'port', at most HFI_MAX_RAILS of them, and each
 * one must have a lid and a gid or PSM_EP_DEVICE_FAILURE is reported through
 * psmi_handle_error().
 *
 * Without a map, the first port with a valid lid and gid is taken from each
 * unit (at most HFI_MAX_RAILS units) and the result is sorted with cmpfunc
 * before being stored.  '*num_rails' receives the number of rails found.
 */
static psm_error_t
psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
{
	uint32_t num_units;
	uint64_t gid_hi, gid_lo;
	int i, j, ret, count = 0;
	char *enable;
	char *map;
	psm_error_t err = PSM_OK;
	uint64_t gidh[HFI_MAX_RAILS][3];	/* { gid_hi, unit, port } */

	enable = getenv("PSM_MULTIRAIL");
	if (enable == NULL || atoi(enable) == 0) {
		*num_rails = 0;
		return err;
	}

	/*
	 * map is in format: unit:port,unit:port,...
	 */
	map = getenv("PSM_MULTIRAIL_MAP");
	if (map != NULL) {
		if (sscanf(map, "%d:%d", &i, &j) == 2) {
			char *sep;

			unit[count] = i;
			port[count] = j;
			count++;

			/* every further entry starts at a comma */
			for (sep = strchr(map, ','); sep != NULL;
			     sep = strchr(sep + 1, ',')) {
				if (sscanf(sep, ",%d:%d", &i, &j) != 2)
					break;
				unit[count] = i;
				port[count] = j;
				count++;
				if (count == HFI_MAX_RAILS)
					break;
			}
		}
		*num_rails = count;

		/*
		 * Check if any of the port is not usable.
		 */
		for (i = 0; i < count; i++) {
			ret = hfi_get_port_lid(unit[i], port[i]);
			if (ret == -1)
				return psmi_handle_error(NULL,
							 PSM_EP_DEVICE_FAILURE,
							 "Couldn't get lid for unit %d:%d",
							 unit[i], port[i]);

			ret = hfi_get_port_gid(unit[i], port[i], &gid_hi,
					       &gid_lo);
			if (ret == -1)
				return psmi_handle_error(NULL,
							 PSM_EP_DEVICE_FAILURE,
							 "Couldn't get gid for unit %d:%d",
							 unit[i], port[i]);
		}

		return err;
	}

	err = psm_ep_num_devunits(&num_units);
	if (err)
		return err;

	if (num_units > HFI_MAX_RAILS) {
		_HFI_INFO
		    ("Found %d units, max %d units are supported, use %d\n",
		     num_units, HFI_MAX_RAILS, HFI_MAX_RAILS);
		num_units = HFI_MAX_RAILS;
	}

	/*
	 * Get all the ports with a valid lid and gid, one per unit.
	 */
	for (i = 0; i < num_units; i++) {
		for (j = 1; j <= HFI_MAX_PORT; j++) {
			if (hfi_get_port_lid(i, j) == -1)
				continue;
			if (hfi_get_port_gid(i, j, &gid_hi, &gid_lo) == -1)
				continue;

			gidh[count][0] = gid_hi;
			gidh[count][1] = i;
			gidh[count][2] = j;
			count++;
			break;	/* first usable port of this unit is enough */
		}
	}

	/*
	 * Sort all the ports with gidh from small to big.
	 * This is for multiple fabrics, and we use fabric with the
	 * smallest gid to make the master connection.
	 */
	qsort(gidh, count, sizeof(gidh[0]), cmpfunc);

	for (i = 0; i < count; i++) {
		unit[i] = (uint32_t) gidh[i][1];
		port[i] = (uint16_t) (uint32_t) gidh[i][2];
	}
	*num_rails = count;
	return err;
}
Example #17
0
/*
 * Return the lids of all local ports whose GID prefix matches 'my_gid_hi'.
 *
 * The list is built once and cached in function-static storage; later calls
 * return the cached array.  On success '*lids' points at the cached array
 * and '*num_lids_o' holds the number of entries.  The array is owned by this
 * function and must not be freed by the caller.
 *
 * 'my_gid_lo' is only used in debug output.
 *
 * Returns PSM_OK on success.  Failures are reported through
 * psmi_handle_error(): PSM_NO_MEMORY if the array cannot be allocated and
 * PSM_EP_DEVICE_FAILURE if no port yields a matching lid/gid.  A failed scan
 * is not cached, so a later call scans again.
 */
static psm_error_t
psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o,
		uint64_t my_gid_hi, uint64_t my_gid_lo)
{
	static uint16_t *hfi_lids;
	static uint32_t nlids;
	uint32_t num_units;
	int i;
	psm_error_t err = PSM_OK;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	if (hfi_lids == NULL) {
		if ((err = psm_ep_num_devunits(&num_units)))
			goto fail;
		hfi_lids = (uint16_t *)
		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
				num_units * HFI_MAX_PORT, sizeof(uint16_t));
		if (hfi_lids == NULL) {
			err = psmi_handle_error(NULL, PSM_NO_MEMORY,
						"Couldn't allocate memory for dev_lids structure");
			goto fail;
		}

		for (i = 0; i < num_units; i++) {
			int j;
			for (j = 1; j <= HFI_MAX_PORT; j++) {
				int lid = hfi_get_port_lid(i, j);
				int ret;
				uint64_t gid_hi = 0, gid_lo = 0;

				if (lid == -1)
					continue;
				ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
				if (ret == -1)
					continue;
				else if (my_gid_hi != gid_hi) {
					_HFI_VDBG("LID %d, unit %d, port %d, "
						  "mismatched GID %llx:%llx and "
						  "%llx:%llx\n",
						  lid, i, j,
						  (unsigned long long)gid_hi,
						  (unsigned long long)gid_lo,
						  (unsigned long long)my_gid_hi,
						  (unsigned long long)
						  my_gid_lo);
					continue;
				}
				_HFI_VDBG("LID %d, unit %d, port %d, "
					  "matching GID %llx:%llx and "
					  "%llx:%llx\n", lid, i, j,
					  (unsigned long long)gid_hi,
					  (unsigned long long)gid_lo,
					  (unsigned long long)my_gid_hi,
					  (unsigned long long)my_gid_lo);

				hfi_lids[nlids++] = (uint16_t) lid;
			}
		}
		if (nlids == 0) {
			/*
			 * Do not keep the empty array around: a non-NULL
			 * hfi_lids would make the next call skip the scan
			 * and report success with zero lids.
			 */
			psmi_free(hfi_lids);
			hfi_lids = NULL;
			err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
						"Couldn't get lid&gid from any unit/port");
			goto fail;
		}
	}
	*lids = hfi_lids;
	*num_lids_o = nlids;

fail:
	return err;
}
Example #18
0
/*
 * Close endpoint 'ep' together with all the contexts chained to it.
 *
 * ep          master endpoint to close (ep->mctxt_master must be ep itself)
 * mode        close mode, passed on to each device's fini routine
 * timeout_in  close time-out; overridden by the PSM_CLOSE_TIMEOUT
 *             environment variable when set, raised to at least
 *             PSMI_MIN_EP_CLOSE_TIMEOUT and limited to
 *             PSMI_MAX_EP_CLOSE_TIMEOUT (which is also used when the value
 *             is zero or negative)
 *
 * Starting from ep->mctxt_prev and ending with the master itself, each
 * context is unlinked, its enabled AMSH/IPS devices are finalized, its
 * hardware context is closed and its memory is released.  The master is
 * removed from the global psmi_opened_endpoint list before it is freed.
 *
 * Returns the last status produced by the device fini routines.  An
 * endpoint that is not on the opened list is reported through
 * psmi_handle_error() as PSM_EP_WAS_CLOSED.
 */
psm_error_t __psm_ep_close(psm_ep_t ep, int mode, int64_t timeout_in)
{
	psm_error_t err = PSM_OK;
	uint64_t t_start = get_cycles();
	union psmi_envvar_val timeout_intval;
	psm_ep_t tmp, mep;
	int is_last;

	PSMI_ERR_UNLESS_INITIALIZED(ep);
	psmi_assert_always(ep->mctxt_master == ep);

	PSMI_PLOCK();

	if (psmi_opened_endpoint == NULL) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		/* do not leave with the lock still held */
		PSMI_PUNLOCK();
		return err;
	}

	tmp = psmi_opened_endpoint;
	while (tmp && tmp != ep) {
		tmp = tmp->user_ep_next;
	}
	if (!tmp) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		/* do not leave with the lock still held */
		PSMI_PUNLOCK();
		return err;
	}

	psmi_getenv("PSM_CLOSE_TIMEOUT",
		    "End-point close timeout over-ride.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	if (getenv("PSM_CLOSE_TIMEOUT")) {
		timeout_in = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout_in > 0) {
		/* The timeout parameter provides the minimum timeout. A heuristic
		 * is used to scale up the timeout linearly with the number of
		 * endpoints, and we allow one second per 100 endpoints. */
		timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
	}

	if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
		timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;

	/* Infinite and excessive close time-out are limited here to a max.
	 * The "rationale" is that there is no point waiting around forever for
	 * graceful termination. Normal (or forced) process termination should clean
	 * up the context state correctly even if termination is not graceful. */
	if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
		timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
	_HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
		   "%d connections\n",
		   ep, mode == PSM_EP_CLOSE_FORCE ? "YES" : "NO",
		   (double)timeout_in / 1e9, (int)ep->connections);

	/* XXX We currently cheat in the sense that we leave each PTL the allowed
	 * timeout.  There's no good way to do this until we change the PTL
	 * interface to allow asynchronous finalization
	 */
	mep = ep;
	tmp = ep->mctxt_prev;
	do {
		ep = tmp;
		tmp = ep->mctxt_prev;
		/* An entry that is its own predecessor is the last one left;
		 * note that now, before ep is unlinked and freed. */
		is_last = (tmp == ep);
		PSM_MCTXT_REMOVE(ep);
		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
			err =
			    psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
					       timeout_in);

		if ((err == PSM_OK || err == PSM_TIMEOUT) &&
		    psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
			err =
			    psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
					      timeout_in);

		/* If there's timeouts in the disconnect requests,
		 * still make sure that we still get to close the
		 * endpoint and mark it closed */
		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
			psmi_context_close(&ep->context);

		psmi_free(ep->epaddr);
		psmi_free(ep->context_mylabel);
		/*
		 * Before freeing the master ep itself,
		 * remove it from the global linklist.
		 * We do it here to let atexit handler in ptl_am directory
		 * to search the global linklist and free the shared memory file.
		 */
		if (ep == mep) {
			if (psmi_opened_endpoint == ep) {
				psmi_opened_endpoint = ep->user_ep_next;
			} else {
				/* Walk with a dedicated cursor: 'tmp' drives
				 * the enclosing loop and must not be changed
				 * here. */
				psm_ep_t prev_user = psmi_opened_endpoint;
				while (prev_user->user_ep_next != ep) {
					prev_user = prev_user->user_ep_next;
				}
				prev_user->user_ep_next = ep->user_ep_next;
			}
			psmi_opened_endpoint_count--;
		}
		psmi_free(ep);

	} while ((err == PSM_OK || err == PSM_TIMEOUT) && !is_last);

	PSMI_PUNLOCK();

	_HFI_PRDBG("Closed endpoint in %.3f secs\n",
		   (double)cycles_to_nanosecs(get_cycles() -
					      t_start) / SEC_ULL);
	return err;
}
Example #19
0
/*
 * Fetch the MMU-notifier invalidation list from the driver and bring the
 * tid cache in line with it.
 *
 * Every tid reported by hfi_get_invalidation() that is not already marked
 * invalidated gets marked.  Those with a zero reference count are also taken
 * off the idle queue and the cache map and are compacted to the front of
 * tidc->tid_array, then handed to hfi_free_tid() in one call.
 *
 * Returns PSM2_OK on success; a failure of either driver call is reported
 * through psmi_handle_error() with PSM2_EP_DEVICE_FAILURE.
 */
psm2_error_t
ips_tidcache_invalidation(struct ips_tid *tidc)
{
	/* NOTE(review): the INVALIDATE()/REFCNT()/LENGTH()/IDLE_REMOVE() macros
	 * appear to rely on a local named p_map -- confirm before renaming. */
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	uint32_t src, idx;
	uint32_t tidcnt = 0;
	uint32_t nfree = 0;

	/*
	 * get a list of invalidated tids from driver,
	 * driver will clear the event bit before return.
	 */
	if (hfi_get_invalidation(tidc->context->ctrl,
		   (uint64_t) (uintptr_t) tidc->tid_array, &tidcnt) < 0) {
		/* If failed to get invalidation info, it's fatal error */
		return psmi_handle_error(tidc->context->ep,
					 PSM2_EP_DEVICE_FAILURE,
					 "Failed to get invalidation info");
	}
	psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max);

	for (src = 0; src < tidcnt; src++) {
		/*
		 * Driver only returns tidctrl=1 or tidctrl=2.
		 */
		idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[src]) +
			IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[src]);
		psmi_assert(idx != 0);
		psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);

		/* the cached entry must agree with what the driver reports */
		psmi_assert(p_map->root[idx].payload.tidinfo == tidc->tid_array[src]);
		psmi_assert(LENGTH(idx) ==
				IPS_TIDINFO_GET_LENGTH(tidc->tid_array[src]));

		if (INVALIDATE(idx) != 0) {
			/* already invalidated: nothing to do but sanity check */
			psmi_assert(REFCNT(idx) == 0);
			continue;
		}

		/* mark the tid invalidated. */
		INVALIDATE(idx) = 1;

		/* still referenced: it is freed once the last user lets go */
		if (REFCNT(idx) != 0)
			continue;

		/*
		 * idle tid: remove it from the idle queue and the RB tree,
		 * and compact it into the list handed to the driver.
		 */
		IDLE_REMOVE(idx);
		ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);

		if (src != nfree)
			tidc->tid_array[nfree] = tidc->tid_array[src];
		nfree++;
	}

	/* call driver to free the tids; a failure to unpin pages is fatal */
	if (nfree > 0 &&
	    hfi_free_tid(tidc->context->ctrl,
			 (uint64_t) (uintptr_t) tidc->tid_array, nfree) < 0) {
		return psmi_handle_error(tidc->context->ep,
					 PSM2_EP_DEVICE_FAILURE,
					 "Failed to tid free %d tids", nfree);
	}

	return PSM2_OK;
}
Example #20
0
/*
 * Handle a regular (i.e. non-rendezvous) MPI envelope.
 *
 * The tag is looked up in mq->expected_q first.  When nothing matches, the
 * envelope is handed to psmi_mq_handle_envelope_unexpected() and that
 * function's result is returned unchanged.
 *
 * When a posted receive matches:
 *   - MQ_MSG_TINY / MQ_MSG_SHORT: the payload is copied into the receive
 *     buffer and the request is appended to mq->completed_q.
 *   - MQ_MSG_LONG: the request is marked matched, queued on
 *     epaddr->mctxt_master->egrlong, given any payload that came with the
 *     envelope, and then psmi_mq_handle_egrdata() is run on it.
 *   - any other mode is reported as an internal error.
 * Every matched envelope updates the rx_user_* statistics and yields
 * MQ_RET_MATCH_OK.
 */
int __recvpath
psmi_mq_handle_envelope(psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr,
		   uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen,
		   const void *payload, uint32_t paylen)
{
    psm_mq_req_t rreq;
    uint32_t rlen;

    psmi_assert(epaddr != NULL);

    rreq = mq_req_match(&(mq->expected_q), tag, 1);
    if (!rreq) {
	/* no posted receive matches this tag: take the unexpected path */
	return psmi_mq_handle_envelope_unexpected(mq, mode, epaddr, tag,
		    egrid, send_msglen, payload, paylen);
    }

    /* we have a match */
    psmi_assert(MQE_TYPE_IS_RECV(rreq->type));
    rreq->tag = tag;
    rlen = mq_set_msglen(rreq, rreq->buf_len, send_msglen);

    _IPATH_VDBG("from=%s match=YES (req=%p) mode=%x mqtag=%"
	    PRIx64" msglen=%d paylen=%d\n", psmi_epaddr_get_name(epaddr->epid),
	    rreq, mode, tag, rlen, paylen);

    switch (mode) {
	case MQ_MSG_TINY:
	case MQ_MSG_SHORT:
	    /* both modes finish after a single copy; only the copy differs */
	    PSM_VALGRIND_DEFINE_MQ_RECV(rreq->buf, rreq->buf_len, rlen);
	    if (mode == MQ_MSG_TINY) {
		mq_copy_tiny((uint32_t *)rreq->buf, (uint32_t *)payload, rlen);
	    }
	    else {
		/* message fits in 1 payload */
		psmi_mq_mtucpy(rreq->buf, payload, rlen);
	    }
	    rreq->state = MQ_STATE_COMPLETE;
	    mq_qq_append(&mq->completed_q, rreq);
	    break;

	case MQ_MSG_LONG:
	    rreq->egrid = egrid;
	    rreq->state = MQ_STATE_MATCHED;
	    rreq->type |= MQE_TYPE_EGRLONG;
	    rreq->send_msgoff = rreq->recv_msgoff = 0;
	    STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, rreq, nextq);
	    _IPATH_VDBG("exp MSG_LONG %d of length %d bytes pay=%d\n",
		    egrid.egr_msgno, rlen, paylen);
	    if (paylen > 0) {
		psmi_mq_handle_data(rreq, epaddr,
			egrid.egr_data, 0, payload, paylen);
	    }
	    psmi_mq_handle_egrdata(mq, rreq, epaddr);
	    break;

	default:
	    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
			"Internal error, unknown packet 0x%x", mode);
    }

    mq->stats.rx_user_bytes += rlen;
    mq->stats.rx_user_num++;

    return MQ_RET_MATCH_OK;
}
Example #21
0
/*
 * Stash an envelope that arrived out of order.
 *
 * A fresh receive request is allocated and sized from send_msglen.  TINY
 * and SHORT payloads are copied into a system buffer and the request is
 * marked complete; a LONG message gets a system buffer, is marked
 * unexpected, is queued on epaddr->mctxt_master->egrlong and is fed whatever
 * payload accompanied the envelope.  In every case the request is then
 * tagged with msg_seqnum and appended to epaddr->mctxt_master->outoforder_q,
 * and the rx_sys_* statistics are updated.
 *
 * Always returns MQ_RET_UNEXP_OK.
 */
int __recvpath
psmi_mq_handle_envelope_outoforder(psm_mq_t mq, uint16_t mode,
		   psm_epaddr_t epaddr, uint16_t msg_seqnum,
		   uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen,
		   const void *payload, uint32_t paylen)
{
    psm_mq_req_t oreq;
    uint32_t nbytes;

    oreq = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
    psmi_assert(oreq != NULL);

    oreq->tag = tag;
    oreq->recv_msgoff = 0;
    /* everything the sender announced will be buffered */
    oreq->recv_msglen = oreq->send_msglen = oreq->buf_len = nbytes = send_msglen;

    _IPATH_VDBG(
		"from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64
		" send_msglen=%d\n", psmi_epaddr_get_name(epaddr->epid),
		oreq, mode, tag, send_msglen);

    switch (mode) {
	case MQ_MSG_TINY:
	    /* a zero-length tiny message needs no buffer at all */
	    if (nbytes == 0) {
		oreq->buf = NULL;
	    }
	    else {
		oreq->buf = psmi_mq_sysbuf_alloc(mq, nbytes);
		mq_copy_tiny((uint32_t *)oreq->buf, (uint32_t *)payload, nbytes);
	    }
	    oreq->state = MQ_STATE_COMPLETE;
	    break;

	case MQ_MSG_SHORT:
	    oreq->buf = psmi_mq_sysbuf_alloc(mq, nbytes);
	    psmi_mq_mtucpy(oreq->buf, payload, nbytes);
	    oreq->state = MQ_STATE_COMPLETE;
	    break;

	case MQ_MSG_LONG:
	    oreq->egrid = egrid;
	    oreq->epaddr = epaddr;
	    oreq->send_msgoff = 0;
	    oreq->buf = psmi_mq_sysbuf_alloc(mq, nbytes);
	    oreq->state = MQ_STATE_UNEXP;
	    oreq->type |= MQE_TYPE_EGRLONG;
	    STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, oreq, nextq);
	    _IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n",
			egrid.egr_msgno, nbytes, paylen);
	    if (paylen > 0) {
		psmi_mq_handle_data(oreq, epaddr,
			egrid.egr_data, 0, payload, paylen);
	    }
	    psmi_mq_handle_egrdata(mq, oreq, epaddr);
	    break;

	default:
	    psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
			    "Internal error, unknown packet 0x%x", mode);
    }

    /* park the request until its sequence number comes up */
    oreq->msg_seqnum = msg_seqnum;
    mq_sq_append(&epaddr->mctxt_master->outoforder_q, oreq);
    epaddr->mctxt_master->outoforder_c++;

    mq->stats.rx_sys_bytes += nbytes;
    mq->stats.rx_sys_num++;

    return MQ_RET_UNEXP_OK;
}
Example #22
0
/*
 * Open one endpoint.
 *
 * Steps, in order:
 *   1. Build the effective option set: library defaults, then any non-default
 *      field of opts_i, then environment variables (each one is applied when
 *      psmi_getenv() returns 0 for it).
 *   2. Validate timeout, unit, port, affinity and service level.
 *   3. Allocate the endpoint (with room for the enabled PTLs) and its
 *      epaddr, and copy the options into the endpoint.
 *   4. Open the device, build the "<hostname>:<rank>." (or
 *      "<hostname>:<context>.<subcontext>.") label, and register it as the
 *      hostname for this endpoint's nid.
 *   5. Initialize the enabled PTLs (self, ips, amsh).
 *   6. Overwrite every devid_enabled[] entry that is not PTL_DEVID_IPS with
 *      -1, so that a later call through this function only sets up IPS.
 *
 * unique_job_key  job key copied into the endpoint and passed to the device
 * devid_enabled   array of PTL_MAX_INIT device ids; modified as in step 6
 * opts_i          caller options, may be NULL
 * mq              matched queue to attach; dereferenced, must not be NULL
 * epo, epido      receive the new endpoint and its epid on success
 *
 * Returns PSM_OK on success, PSM_EP_NO_DEVICE when no enabled PTL reports a
 * size, or the error produced by the step that failed.  On failure the
 * endpoint, its label and the epaddr are released and an open context fd is
 * closed.
 */
psm_error_t
__psm_ep_open_internal(psm_uuid_t const unique_job_key, int *devid_enabled,
		       struct psm_ep_open_opts const *opts_i, psm_mq_t mq,
		       psm_ep_t *epo, psm_epid_t *epido)
{
	psm_ep_t ep = NULL;
	uint32_t num_units;
	size_t len;
	psm_error_t err;
	psm_epaddr_t epaddr = NULL;
	char buf[128], *p, *e;
	union psmi_envvar_val envvar_val;
	size_t ptl_sizes;
	struct psm_ep_open_opts opts;
	ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
	int i;

	/* First get the set of default options, we overwrite with the user's
	 * desired values afterwards */
	if ((err = psm_ep_open_opts_get_defaults(&opts)))
		goto fail;

	if (opts_i != NULL) {
		if (opts_i->timeout != -1)
			opts.timeout = opts_i->timeout;
		if (opts_i->unit != -1)
			opts.unit = opts_i->unit;
		if (opts_i->affinity != -1)
			opts.affinity = opts_i->affinity;

		if (opts_i->sendbufs_num != -1)
			opts.sendbufs_num = opts_i->sendbufs_num;

		if (opts_i->network_pkey != HFI_DEFAULT_P_KEY)
			opts.network_pkey = opts_i->network_pkey;

		if (opts_i->port != 0)
			opts.port = opts_i->port;

		if (opts_i->outsl != -1)
			opts.outsl = opts_i->outsl;

		if (opts_i->service_id)
			opts.service_id = (uint64_t) opts_i->service_id;
		if (opts_i->path_res_type != PSM_PATH_RES_NONE)
			opts.path_res_type = opts_i->path_res_type;

		if (opts_i->senddesc_num)
			opts.senddesc_num = opts_i->senddesc_num;
		if (opts_i->imm_size)
			opts.imm_size = opts_i->imm_size;
	}

	/* Get Service ID from environment */
	if (!psmi_getenv("PSM_IB_SERVICE_ID",
			 "HFI Service ID for path resolution",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_ULONG_ULONG,
			 (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID,
			 &envvar_val)) {
		opts.service_id = (uint64_t) envvar_val.e_ulonglong;
	}

	/* Get Path resolution type from environment Possible choices are:
	 *
	 * NONE : Default same as previous instances. Utilizes static data.
	 * OPP  : Use OFED Plus Plus library to do path record queries.
	 * UMAD : Use raw libibumad interface to form and process path records.
	 */
	if (!psmi_getenv("PSM_PATH_REC",
			 "Mechanism to query HFI path record (default is no path query)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
			 (union psmi_envvar_val)"none", &envvar_val)) {
		if (!strcasecmp(envvar_val.e_str, "none"))
			opts.path_res_type = PSM_PATH_RES_NONE;
		else if (!strcasecmp(envvar_val.e_str, "opp"))
			opts.path_res_type = PSM_PATH_RES_OPP;
		else if (!strcasecmp(envvar_val.e_str, "umad"))
			opts.path_res_type = PSM_PATH_RES_UMAD;
		else {
			_HFI_ERROR("Unknown path resolution type %s. "
				   "Disabling use of path record query.\n",
				   envvar_val.e_str);
			opts.path_res_type = PSM_PATH_RES_NONE;
		}
	}

	/* If a specific unit is set in the environment, use that one. */
	if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
			 (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) {
		opts.unit = envvar_val.e_long;
	}

	/* Get user specified port number to use. */
	if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
			 (union psmi_envvar_val)HFI_PORT_NUM_ANY,
			 &envvar_val)) {
		opts.port = envvar_val.e_long;
	}

	/* Get service level from environment, path-query overrides it */
	if (!psmi_getenv
	    ("HFI_SL", "HFI outging ServiceLevel number (default 0)",
	     PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
	     (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) {
		opts.outsl = envvar_val.e_long;
	}

	/* Get network key from environment. MVAPICH and other vendor MPIs do not
	 * specify it on ep open and we may require it for vFabrics.
	 * path-query will override it.
	 */
	if (!psmi_getenv("PSM_PKEY",
			 "HFI PKey to use for endpoint",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_ULONG,
			 (union psmi_envvar_val)HFI_DEFAULT_P_KEY,
			 &envvar_val)) {
		opts.network_pkey = (uint64_t) envvar_val.e_ulong;
	}

	/* BACKWARDS COMPATIBILITY:  Open MPI likes to choose its own PKEY of
	   0x7FFF.  That's no longer a valid default, so override it if the
	   client was compiled against PSM v1 */
	if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
			opts.network_pkey == 0x7FFF) {
		opts.network_pkey = HFI_DEFAULT_P_KEY;
	}

	/* Get number of default send buffers from environment */
	if (!psmi_getenv("PSM_NUM_SEND_BUFFERS",
			 "Number of send buffers to allocate [1024]",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)1024, &envvar_val)) {
		opts.sendbufs_num = envvar_val.e_uint;
	}

	/* Get immediate data size - transfers less than immediate data size do
	 * not consume a send buffer and require just a send descriptor.
	 */
	if (!psmi_getenv("PSM_SEND_IMMEDIATE_SIZE",
			 "Immediate data send size not requiring a buffer [128]",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)128, &envvar_val)) {
		opts.imm_size = envvar_val.e_uint;
	}

	/* Get number of send descriptors - by default this is 4 times the number
	 * of send buffers - mainly used for short/inlined messages.
	 */
	if (!psmi_getenv("PSM_NUM_SEND_DESCRIPTORS",
			 "Number of send descriptors to allocate [4096]",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)4096, &envvar_val)) {
		opts.senddesc_num = envvar_val.e_uint;
	}

	/* The unit count is only needed (and only checked) when IPS is on */
	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		if ((err = psm_ep_num_devunits(&num_units)) != PSM_OK)
			goto fail;
	} else
		num_units = 0;

	/* do some error checking */
	if (opts.timeout < -1) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid timeout value %lld",
					(long long)opts.timeout);
		goto fail;
	} else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid Device Unit ID %d (%d units found)",
					opts.unit, num_units);
		goto fail;
	} else if (opts.port < 0 || opts.port > HFI_MAX_PORT) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid Device port number %d",
					opts.port);
		goto fail;
	} else if (opts.affinity < 0
		   || opts.affinity > PSM_EP_OPEN_AFFINITY_FORCE) {
		err =
		    psmi_handle_error(NULL, PSM_PARAM_ERR,
				      "Invalid Affinity option: %d",
				      opts.affinity);
		goto fail;
	} else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) {
		/* cast matches the signed %lld conversion */
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid SL number: %lld",
					(long long)opts.outsl);
		goto fail;
	}

	/* Set environment variable if PSM is not allowed to set affinity */
	if (opts.affinity == PSM_EP_OPEN_AFFINITY_SKIP)
		setenv("HFI_NO_CPUAFFINITY", "1", 1);

	/* Allocate end point structure storage */
	ptl_sizes =
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
	     psmi_ptl_self.sizeof_ptl() : 0) +
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
	     psmi_ptl_ips.sizeof_ptl() : 0) +
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
	     psmi_ptl_amsh.sizeof_ptl() : 0);
	if (ptl_sizes == 0)
		return PSM_EP_NO_DEVICE;

	ep = (psm_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64,
				      sizeof(struct psm_ep) + ptl_sizes);
	epaddr = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
					    1, sizeof(struct psm_epaddr));

	/*
	 * The endpoint block is not zeroed here, so seed the fields that the
	 * fail: path inspects before anything can jump there.  Otherwise a
	 * failed epaddr allocation (or an early device-open failure) would
	 * make the cleanup close() an arbitrary descriptor.
	 */
	if (ep != NULL) {
		ep->context.fd = -1;
		ep->context_mylabel = NULL;
	}

	if (ep == NULL || epaddr == NULL) {
		err = psmi_handle_error(NULL, PSM_NO_MEMORY,
					"Couldn't allocate memory for %s structure",
					ep == NULL ? "psm_ep" : "psm_epaddr");
		goto fail;
	}

	/* Copy PTL enabled status */
	for (i = 0; i < PTL_MAX_INIT; i++)
		ep->devid_enabled[i] = devid_enabled[i];

	/* Matched Queue initialization.  We do this early because we have to
	 * make sure ep->mq exists and is valid before calling ips_do_work.
	 */
	ep->mq = mq;

	/* Get ready for PTL initialization */
	memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm_uuid_t));
	ep->epaddr = epaddr;
	ep->memmode = mq->memmode;
	ep->hfi_num_sendbufs = opts.sendbufs_num;
	ep->service_id = opts.service_id;
	ep->path_res_type = opts.path_res_type;
	ep->hfi_num_descriptors = opts.senddesc_num;
	ep->hfi_imm_size = opts.imm_size;
	ep->errh = psmi_errhandler_global;	/* by default use the global one */
	ep->ptl_amsh.ep_poll = psmi_poll_noop;
	ep->ptl_ips.ep_poll = psmi_poll_noop;
	ep->connections = 0;

	/* See how many iterations we want to spin before yielding */
	psmi_getenv("PSM_YIELD_SPIN_COUNT",
		    "Spin poll iterations before yield",
		    PSMI_ENVVAR_LEVEL_HIDDEN,
		    PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD,
		    &envvar_val);
	ep->yield_spin_cnt = envvar_val.e_uint;

	/* Carve the per-PTL areas out of the space that trails the endpoint,
	 * in amsh, ips, self order. */
	ptl_sizes = 0;
	amsh_ptl = ips_ptl = self_ptl = NULL;
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
		amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
		ptl_sizes += psmi_ptl_amsh.sizeof_ptl();
	}
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
		ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
		ptl_sizes += psmi_ptl_ips.sizeof_ptl();
	}
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
		self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
		ptl_sizes += psmi_ptl_self.sizeof_ptl();
	}

	if ((err = psmi_ep_open_device(ep, &opts, unique_job_key,
				       &(ep->context), &ep->epid)))
		goto fail;

	psmi_assert_always(ep->epid != 0);
	ep->epaddr->epid = ep->epid;

	_HFI_VDBG("psmi_ep_open_device() passed\n");

	/* Set our new label as soon as we know what it is */
	strncpy(buf, psmi_gethostname(), sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	/* len is at most sizeof(buf) - 1, so the suffix always has >= 1 byte */
	len = strlen(buf);
	p = buf + len;

	/*
	 * If our rank is set, use it. If not, use context.subcontext notation.
	 *
	 * snprintf() NUL-terminates within the size it is given and returns the
	 * length the untruncated output would have had, so its return value
	 * must not be used as an index into buf: with a long hostname that
	 * index lands past the end of the array.
	 */
	if (((e = getenv("MPI_RANKID")) != NULL && *e) ||
	    ((e = getenv("PSC_MPI_RANK")) != NULL && *e))
		(void)snprintf(p, sizeof(buf) - len, ":%d.", atoi(e));
	else
		(void)snprintf(p, sizeof(buf) - len, ":%d.%d.",
			       (uint32_t) psm_epid_context(ep->epid),
			       (uint32_t) psmi_epid_subcontext(ep->epid));
	ep->context_mylabel = psmi_strdup(ep, buf);
	if (ep->context_mylabel == NULL) {
		err = PSM_NO_MEMORY;
		goto fail;
	}
	/* hfi_set_mylabel(ep->context_mylabel); */

	if ((err = psmi_epid_set_hostname(psm_epid_nid(ep->epid), buf, 0)))
		goto fail;

	_HFI_VDBG("start ptl device init...\n");
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
		if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self)))
			goto fail;
	}
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
		if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips)))
			goto fail;
	}
	/* If we're shm-only, this device is enabled above */
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
		if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh)))
			goto fail;
	} else {
		/* We may have pre-attached as part of getting our rank for enabling
		 * shared contexts.  */
	}

	_HFI_VDBG("finish ptl device init...\n");

	/*
	 * Keep only IPS since only IPS support multi-rail, other devices
	 * are only setup once. IPS device can come to this function again.
	 */
	for (i = 0; i < PTL_MAX_INIT; i++) {
		if (devid_enabled[i] != PTL_DEVID_IPS) {
			devid_enabled[i] = -1;
		}
	}

	*epido = ep->epid;
	*epo = ep;

	return PSM_OK;

fail:
	if (ep != NULL) {
		if (ep->context.fd != -1)
			close(ep->context.fd);
		/* the label belongs to the endpoint; release it with it */
		if (ep->context_mylabel != NULL)
			psmi_free(ep->context_mylabel);
		psmi_free(ep);
	}
	if (epaddr != NULL)
		psmi_free(epaddr);
	return err;
}