/*
 * Resolve an endpoint id to its connection information.
 *
 * Scans every opened endpoint for a connection to 'epid'; on success fills
 * 'epconn' with the matching epaddr, owning ep and its mq.  Returns
 * PSM_EP_WAS_CLOSED when no endpoint is open, PSM_EPID_UNKNOWN when no
 * endpoint has a connection to 'epid'.
 */
psm_error_t __psm_ep_epid_lookup(psm_epid_t epid, psm_epconn_t *epconn)
{
	psm_epaddr_t epaddr;
	psm_ep_t ep;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	/* Need to have an opened endpoint before we can resolve epids */
	if (psmi_opened_endpoint == NULL)
		return psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					 "PSM Endpoint is closed or does not exist");

	for (ep = psmi_opened_endpoint; ep != NULL; ep = ep->user_ep_next) {
		epaddr = psmi_epid_lookup(ep, epid);
		if (epaddr == NULL)
			continue;

		/* Found connection for epid. Return info about endpoint to caller. */
		psmi_assert_always(epaddr->ptlctl->ep == ep);
		epconn->addr = epaddr;
		epconn->ep = ep;
		epconn->mq = ep->mq;
		return PSM_OK;
	}

	return psmi_handle_error(NULL, PSM_EPID_UNKNOWN,
				 "Endpoint connection status unknown");
}
/*
 * Check that the requested pkey appears in the port's pkey table.
 *
 * Management pkeys (0x7fff / 0xffff) are skipped and never match
 * application traffic.  On success the accepted pkey is stored in *opkey.
 */
static psm_error_t
psmi_ep_verify_pkey(psm_ep_t ep, uint16_t pkey, uint16_t *opkey)
{
	int idx;
	int entry;

	for (idx = 0; idx < 16; idx++) {
		entry = hfi_get_port_index2pkey(ep->unit_id, ep->portnum, idx);
		if (entry < 0)
			return psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
						 "Can't get a valid pkey value from pkey table\n");
		if (entry == 0x7fff || entry == 0xffff)
			continue;	/* management pkey, not for app traffic. */
		if (pkey == (uint16_t) entry)
			break;		/* found the requested pkey */
	}

	/* Scanned the whole table without a match. */
	if (idx == 16)
		return psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
					 "Wrong pkey 0x%x, please use PSM_PKEY to specify a valid pkey\n",
					 pkey);

	/* return the final pkey */
	*opkey = pkey;
	return PSM_OK;
}
/*
 * Register 'num_handlers' active-message handlers on 'ep'.
 *
 * Free table slots (those still pointing at _ignore_handler) are handed out
 * in table order; the chosen index for handlers[k] is written to
 * handlers_idx[k].  If the table cannot hold them all, every slot claimed by
 * this call is restored and PSM_EP_NO_RESOURCES is returned.
 */
psm_error_t
__psm_am_register_handlers(psm_ep_t ep, const psm_am_handler_fn_t *handlers,
			   int num_handlers, int *handlers_idx)
{
	int slot;
	int nreg = 0;

	/* For now just assign any free slot, first-fit. */
	for (slot = 0; slot < PSMI_AM_NUM_HANDLERS; slot++) {
		if (ep->am_htable[slot] != _ignore_handler)
			continue;
		ep->am_htable[slot] = handlers[nreg];
		handlers_idx[nreg] = slot;
		if (++nreg == num_handlers)
			break;	/* all registered */
	}

	if (nreg < num_handlers) {
		/* Not enough free handlers, restore the ones we claimed. */
		for (slot = 0; slot < nreg; slot++)
			ep->am_htable[handlers_idx[slot]] = _ignore_handler;
		return psmi_handle_error(ep, PSM_EP_NO_RESOURCES,
					 "Insufficient available AM handlers: "
					 "registered %d of %d requested handlers",
					 nreg, num_handlers);
	}

	return PSM_OK;
}
/*
 * Call driver to free all cached tids.
 *
 * Tear-down path: walks the whole tid cache map, batches every entry that
 * has not already been invalidated by the kernel, frees that batch through
 * the driver, then releases the cache's backing arrays.  Caller must have
 * released all references first (asserted below).
 */
psm2_error_t ips_tidcache_cleanup(struct ips_tid *tidc)
{
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	psm2_error_t err;
	int i, j;

	j = 0;
	/* Map slots are 1-based: slot 0 is not a valid tid index. */
	for (i = 1; i <= tidc->tid_ctrl->tid_num_max; i++) {
		/* No outstanding users may remain at cleanup time. */
		psmi_assert(REFCNT(i) == 0);
		if (INVALIDATE(i) == 0) {
			/* Still pinned in the driver: queue it for freeing. */
			tidc->tid_array[j++] = p_map->root[i].payload.tidinfo;
		}
	}

	if (j > 0) {
		/*
		 * call driver to free the tids.
		 */
		if (hfi_free_tid(tidc->context->ctrl,
				 (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
			/* If failed to unpin pages, it's fatal error */
			err = psmi_handle_error(tidc->context->ep,
						PSM2_EP_DEVICE_FAILURE,
						"Failed to tid free %d tids", j);
			return err;
		}
	}

	/* Release the cache's scratch array and the RB-tree node storage. */
	psmi_free(tidc->tid_array);
	psmi_free(tidc->tid_cachemap.root);

	return PSM2_OK;
}
/*
 * Force to remove a tid, check invalidation event afterwards.
 *
 * The first 'tidcnt' entries of tidc->tid_array are freed through the
 * driver, then each corresponding cache slot is marked invalidated and
 * unlinked from the idle queue and the RB tree.  The order matters: the
 * driver free happens first, the local bookkeeping second.
 */
static psm2_error_t
ips_tidcache_remove(struct ips_tid *tidc, uint32_t tidcnt)
{
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	uint32_t idx;
	psm2_error_t err;

	/*
	 * call driver to free the tids.
	 */
	if (hfi_free_tid(tidc->context->ctrl,
			 (uint64_t) (uintptr_t) tidc->tid_array, tidcnt) < 0) {
		/* If failed to unpin pages, it's fatal error */
		/* NOTE(review): the message reports "1" rather than tidcnt --
		 * looks like a stale literal; confirm intent before changing. */
		err = psmi_handle_error(tidc->context->ep,
					PSM2_EP_DEVICE_FAILURE,
					"Failed to tid free %d tids", 1);
		return err;
	}

	while (tidcnt) {
		tidcnt--;
		/* Cache slot index: 2*tid + tidctrl (driver returns 1 or 2). */
		idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[tidcnt]) +
			IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[tidcnt]);

		/*
		 * sanity check.
		 */
		psmi_assert(idx != 0);
		psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
		psmi_assert(INVALIDATE(idx) == 0);
		psmi_assert(REFCNT(idx) == 0);

		/*
		 * mark the tid invalidated.
		 */
		INVALIDATE(idx) = 1;

		/*
		 * remove the tid from RB tree.
		 */
		IDLE_REMOVE(idx);
		ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);
	}

	/*
	 * Because the freed tid is not from invalidation list,
	 * it is possible that kernel just invalidated the tid,
	 * then we need to check and process the invalidation
	 * before we can re-use this tid. The reverse order
	 * will wrongly invalidate this tid again.
	 */
	if ((*tidc->invalidation_event) & HFI1_EVENT_TID_MMU_NOTIFY) {
		err = ips_tidcache_invalidation(tidc);
		if (err)
			return err;
	}

	return PSM2_OK;
}
/*
 * Add ipsaddr with epid to the epstate table, return new index to caller in
 * 'connidx'.
 *
 * The table is grown in PTL_EPADDR_ALLOC_CHUNK increments; free slots are
 * found with a circular first-fit scan starting at eps_tab_nextidx.
 */
psm2_error_t
ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
		ips_epstate_idx *connidx_o)
{
	int i, j;
	ips_epstate_idx connidx;

	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
		struct ips_epstate_entry *newtab;
		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
		newtab = (struct ips_epstate_entry *)
		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
				eps->eps_tabsize,
				sizeof(struct ips_epstate_entry));
		if (newtab == NULL)
			return PSM2_NO_MEMORY;
		else if (eps->eps_tab) {	/* NOT first alloc */
			/* Copy the old (smaller) table over, then free it. */
			for (i = 0;
			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK;
			     i++)
				newtab[i] = eps->eps_tab[i];	/* deep copy */
			psmi_free(eps->eps_tab);
		}
		eps->eps_tab = newtab;
	}
	/* Find the next free hole. We can afford to do this since connect is not
	 * in the critical path */
	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
		if (j == eps->eps_tabsize)
			j = 0;	/* wrap the circular scan */
		if (eps->eps_tab[j].ipsaddr == NULL) {
			/* Remember where to resume scanning next time. */
			eps->eps_tab_nextidx = j + 1;
			if (eps->eps_tab_nextidx == eps->eps_tabsize)
				eps->eps_tab_nextidx = 0;
			break;
		}
	}
	/* The size bump above guarantees at least one free slot. */
	psmi_assert_always(i != eps->eps_tabsize);
	/* Connection index is the slot relative to the rotating base,
	 * wrapped to the CONNIDX space (power-of-two mask). */
	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
		  connidx, j);
	eps->eps_tab[j].ipsaddr = ipsaddr;
	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
		/* NOTE(review): this error path leaves the slot assigned and
		 * eps_tabsizeused incremented -- confirm callers treat
		 * PSM2_TOO_MANY_ENDPOINTS as fatal for this ep. */
		return psmi_handle_error(eps->context->ep,
					 PSM2_TOO_MANY_ENDPOINTS,
					 "Can't connect to more than %d non-local endpoints",
					 IPS_EPSTATE_CONNIDX_MAX);
	}
	*connidx_o = connidx;
	return PSM2_OK;
}
/*
 * Return the sender's epaddr for an active-message token.
 *
 * 'token' is the opaque handle passed to an AM handler; it is really a
 * struct psmi_am_token.  Fails with PSM_PARAM_ERR on NULL arguments.
 */
psm_error_t __psm_am_get_source(psm_am_token_t token, psm_epaddr_t *epaddr_out)
{
	if (token == NULL || epaddr_out == NULL)
		return psmi_handle_error(NULL, PSM_PARAM_ERR,
					 "Invalid psm_am_get_source parameters");

	*epaddr_out = ((struct psmi_am_token *)token)->epaddr_from;
	return PSM_OK;
}
/*
 * Drop one reference on each tid in 'tid_array'.
 *
 * A tid whose refcount hits zero either goes back on the idle queue (still
 * cached) or, if the kernel already invalidated it, is unlinked from the
 * RB tree and batched for a driver free at the end.
 */
psm2_error_t
ips_tidcache_release(struct ips_tid *tidc,
		     uint32_t *tid_array, uint32_t tidcnt)
{
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	uint32_t i, j, idx;
	psm2_error_t err;

	psmi_assert(tidcnt > 0);

	j = 0;
	for (i = 0; i < tidcnt; i++) {
		/*
		 * Driver only returns tidctrl=1 or tidctrl=2.
		 */
		idx = 2*IPS_TIDINFO_GET_TID(tid_array[i]) +
			IPS_TIDINFO_GET_TIDCTRL(tid_array[i]);
		psmi_assert(idx != 0);
		psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);
		psmi_assert(REFCNT(idx) != 0);

		REFCNT(idx)--;
		if (REFCNT(idx) == 0) {
			if (INVALIDATE(idx) != 0) {
				/* Kernel already invalidated it: drop from the
				 * cache and queue for freeing below. */
				ips_cl_qmap_remove_item(p_map,
							&p_map->root[idx]);

				tidc->tid_array[j] = tid_array[i];
				j++;
			} else {
				/* Still valid: keep cached, mark idle. */
				IDLE_INSERT(idx);
			}
		}
	}

	if (j > 0) {
		/*
		 * call driver to free the tids.
		 */
		if (hfi_free_tid(tidc->context->ctrl,
				 (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
			/* If failed to unpin pages, it's fatal error */
			err = psmi_handle_error(tidc->context->ep,
						PSM2_EP_DEVICE_FAILURE,
						"Failed to tid free %d tids", j);
			return err;
		}
	}

	return PSM2_OK;
}
/*
 * Fill 'array_of_epinfo' with information about the opened endpoints.
 *
 * On entry *num_of_epinfo is the array capacity (must be > 0); on return it
 * holds the number of entries actually written (bounded by the number of
 * currently opened endpoints).
 */
psm_error_t __psm_ep_query(int *num_of_epinfo, psm_epinfo_t *array_of_epinfo)
{
	psm_error_t err = PSM_OK;
	int nfilled;
	psm_ep_t ep;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	if (*num_of_epinfo <= 0)
		return psmi_handle_error(NULL, PSM_PARAM_ERR,
					 "Invalid psm_ep_query parameters");

	if (psmi_opened_endpoint == NULL)
		return psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					 "PSM Endpoint is closed or does not exist");

	ep = psmi_opened_endpoint;
	for (nfilled = 0; nfilled < *num_of_epinfo && ep != NULL; nfilled++) {
		array_of_epinfo[nfilled].ep = ep;
		array_of_epinfo[nfilled].epid = ep->epid;
		array_of_epinfo[nfilled].jkey = ep->jkey;
		memcpy(array_of_epinfo[nfilled].uuid,
		       (void *)ep->uuid, sizeof(psm_uuid_t));
		psmi_uuid_unparse(ep->uuid, array_of_epinfo[nfilled].uuid_str);
		ep = ep->user_ep_next;
	}
	*num_of_epinfo = nfilled;

	return err;
}
psm_error_t ips_tid_init(const psmi_context_t *context, struct ips_tid *tidc, ips_tid_avail_cb_fn_t cb, void *cb_context) { const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info; struct psmi_stats_entry entries[] = { PSMI_STATS_DECL("tid update count", MPSPAWN_STATS_REDUCTION_ALL, NULL, &tidc->tid_num_total), }; tidc->context = context; /* these are in group unit, a group is 8 tids or 4 tidpairs */ tidc->tid_num_total = 0; tidc->tid_num_inuse = 0; tidc->tid_avail_cb = cb; tidc->tid_avail_context = cb_context; tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl; if (!tidc->tid_ctrl) { tidc->tid_ctrl = (struct ips_tid_ctrl *) psmi_calloc(context->ep, UNDEFINED, 1, sizeof(struct ips_tid_ctrl)); if (tidc->tid_ctrl == NULL) { return PSM_NO_MEMORY; } } /* * Only the master process can initialize. */ if (ctxt_info->subctxt == 0) { pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock, PTHREAD_PROCESS_SHARED); /* check if exp tids are multiple of 8 (a group) */ if (context->ctrl->__hfi_tidexpcnt % 8) return psmi_handle_error(context->ep, PSM_INTERNAL_ERR, "Expected tids(%d) are not multi-groups(8)", context->ctrl->__hfi_tidexpcnt); tidc->tid_ctrl->tid_num_max = context->ctrl->__hfi_tidexpcnt >> 3; tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max; }
/*
 * Report the active-message implementation limits.
 *
 * Zeroes the caller's struct, copies in as much of psmi_am_parameters as
 * fits in 'sizeof_parameters_in', and reports the copied size through
 * '*sizeof_parameters_out'.  The 'ep' argument is unused but kept for API
 * compatibility.
 *
 * Bug fix: 'sizeof_parameters_out' was dereferenced without a NULL check
 * even though 'parameters' was validated; both are now rejected with
 * PSM_PARAM_ERR.
 */
psm_error_t
__psm_am_get_parameters(psm_ep_t ep, struct psm_am_parameters *parameters,
			size_t sizeof_parameters_in,
			size_t *sizeof_parameters_out)
{
	size_t s;

	if (parameters == NULL || sizeof_parameters_out == NULL) {
		return psmi_handle_error(NULL, PSM_PARAM_ERR,
					 "Invalid psm_am_get_parameters parameters");
	}

	/* Zero the whole caller-provided area so fields beyond what we copy
	 * are well-defined, then copy only as much as both sides support. */
	memset(parameters, 0, sizeof_parameters_in);
	s = min(sizeof(psmi_am_parameters), sizeof_parameters_in);
	memcpy(parameters, &psmi_am_parameters, s);
	*sizeof_parameters_out = s;

	return PSM_OK;
}
/*
 * Return 'tidcnt' tids to the driver and update availability accounting.
 *
 * The spin lock is only taken when context->tid_ctrl is set (shared
 * sub-context configuration); tid_num_avail is protected by it, while
 * tid_num_inuse is per-process and updated outside the lock.
 */
psm2_error_t
ips_tid_release(struct ips_tid *tidc,
		uint32_t *tid_array, uint32_t tidcnt)
{
	struct ips_tid_ctrl *ctrl = tidc->tid_ctrl;
	psm2_error_t err = PSM2_OK;

	psmi_assert(tidcnt > 0);

	if (tidc->context->tid_ctrl)
		pthread_spin_lock(&ctrl->tid_ctrl_lock);

	if (hfi_free_tid(tidc->context->ctrl,
			 (uint64_t) (uintptr_t) tid_array, tidcnt) < 0) {
		/* Drop the lock before reporting; error handler may block. */
		if (tidc->context->tid_ctrl)
			pthread_spin_unlock(&ctrl->tid_ctrl_lock);
		/* If failed to unpin pages, it's fatal error */
		err = psmi_handle_error(tidc->context->ep,
					PSM2_EP_DEVICE_FAILURE,
					"Failed to tid free %d tids", tidcnt);
		goto fail;
	}

	ctrl->tid_num_avail += tidcnt;
	if (tidc->context->tid_ctrl)
		pthread_spin_unlock(&ctrl->tid_ctrl_lock);

	tidc->tid_num_inuse -= tidcnt;
	/* If an available callback is registered invoke it */
	/* (fires only when the cache was completely full before this
	 * release, i.e. old inuse == tid_num_max). */
	if (((tidc->tid_num_inuse + tidcnt) == ctrl->tid_num_max)
	    && tidc->tid_avail_cb)
		tidc->tid_avail_cb(tidc, tidc->tid_avail_context);

fail:
	return err;
}
/*
 * Handle an incoming envelope that did not match any posted receive.
 *
 * Allocates an unexpected-queue request, copies (or begins receiving) the
 * payload into a system buffer, and appends the request to mq->unexpected_q
 * for a later matching receive to claim.  Returns MQ_RET_UNEXP_OK.
 */
int __recvpath
psmi_mq_handle_envelope_unexpected(psm_mq_t mq, uint16_t mode,
				   psm_epaddr_t epaddr, uint64_t tag,
				   psmi_egrid_t egrid, uint32_t send_msglen,
				   const void *payload, uint32_t paylen)
{
	psm_mq_req_t req;
	uint32_t msglen;

	/*
	 * Keep a callback here in case we want to fit some other high-level
	 * protocols over MQ (i.e. shmem). These protocols would bypass the
	 * normal mesage handling and go to higher-level message handlers.
	 */
	if (mode >= MQ_MSG_USER_FIRST && mq->unexpected_callback) {
		mq->unexpected_callback(mq, mode, epaddr, tag, send_msglen,
					payload, paylen);
		return MQ_RET_UNEXP_OK;
	}

	req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
	psmi_assert(req != NULL);

	req->tag = tag;
	req->recv_msgoff = 0;
	/* For an unexpected message the receive length is the send length. */
	req->recv_msglen = req->send_msglen = req->buf_len = msglen =
		send_msglen;

	_IPATH_VDBG("from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64
		    " send_msglen=%d\n",
		    psmi_epaddr_get_name(epaddr->epid), req, mode, tag,
		    send_msglen);
#if 0
	/* Disabled system-buffer accounting limit. */
	if (mq->cur_sysbuf_bytes+msglen > mq->max_sysbuf_bytes) {
		_IPATH_VDBG("req=%p with len=%d exceeds limit of %llu sysbuf_bytes\n",
			    req, msglen,
			    (unsigned long long) mq->max_sysbuf_bytes);
		return MQ_RET_UNEXP_NO_RESOURCES;
	}
#endif

	switch (mode) {
	case MQ_MSG_TINY:
		/* Whole message fits in the envelope. */
		if (msglen > 0) {
			req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
			mq_copy_tiny((uint32_t *)req->buf,
				     (uint32_t *)payload, msglen);
		} else
			req->buf = NULL;
		req->state = MQ_STATE_COMPLETE;
		break;
	case MQ_MSG_SHORT:
		/* Message fits in one payload. */
		req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
		psmi_mq_mtucpy(req->buf, payload, msglen);
		req->state = MQ_STATE_COMPLETE;
		break;
	case MQ_MSG_LONG:
		/* Eager-long: data arrives over multiple eager packets. */
		req->egrid = egrid;
		req->send_msgoff = 0;
		req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
		req->state = MQ_STATE_UNEXP;
		req->type |= MQE_TYPE_EGRLONG;
		STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req, nextq);
		_IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n",
			    egrid.egr_msgno, msglen, paylen);
		if (paylen > 0)
			psmi_mq_handle_data(req, epaddr, egrid.egr_data, 0,
					    payload, paylen);
		psmi_mq_handle_egrdata(mq, req, epaddr);
		break;
	default:
		psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
				  "Internal error, unknown packet 0x%x", mode);
	}

	mq_sq_append(&mq->unexpected_q, req);
	mq->stats.rx_sys_bytes += msglen;
	mq->stats.rx_sys_num++;

	return MQ_RET_UNEXP_OK;
}
static psm_error_t psmi_ep_open_device(const psm_ep_t ep, const struct psm_ep_open_opts *opts, const psm_uuid_t unique_job_key, struct psmi_context *context, psm_epid_t *epid) { psm_error_t err = PSM_OK; /* Skip affinity. No affinity if: * 1. User explicitly sets no-affinity=YES in environment. * 2. User doesn't set affinity in environment and PSM is opened with * option affinity skip. */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { uint32_t rcvthread_flags; union psmi_envvar_val env_rcvthread; static int norcvthread; /* only for first rail */ ep->out_sl = opts->outsl; if ((err = psmi_context_open(ep, opts->unit, opts->port, unique_job_key, opts->timeout, context)) != PSM_OK) goto fail; _HFI_DBG("[%d]use unit %d port %d\n", getpid(), context->ctrl->__hfi_unit, 1); /* At this point, we have the unit id and port number, so * check if pkey is not 0x0/0x7fff/0xffff, and match one * of the pkey in table. */ if ((err = psmi_ep_verify_pkey(ep, (uint16_t) opts->network_pkey, &ep->network_pkey)) != PSM_OK) goto fail; /* See if we want to activate support for receive thread */ psmi_getenv("PSM_RCVTHREAD", "Recv thread flags (0 disables thread)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS, (union psmi_envvar_val)(norcvthread++ ? 0 : PSMI_RCVTHREAD_FLAGS), &env_rcvthread); rcvthread_flags = env_rcvthread.e_uint; /* If enabled, use the pollurg capability to implement a receive * interrupt thread that can handle urg packets */ if (rcvthread_flags) { context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD; #ifdef PSMI_PLOCK_IS_NOLOCK psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR, "#define PSMI_PLOCK_IS_NOLOCK not functional yet " "with RCVTHREAD on"); #endif } context->rcvthread_flags = rcvthread_flags; *epid = context->epid; } else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { int rank; /* In shm-only mode, we need to derive a valid epid * based on our rank. We try to get it from the * environment if its available, or resort to using * our PID as the rank. 
*/ union psmi_envvar_val env_rankid; if (psmi_getenv ("MPI_LOCALRANKID", "Shared context rankid", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_rankid)) { if (psmi_getenv ("PSC_MPI_NODE_RANK", "Shared context rankid", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, (union psmi_envvar_val)-1, &env_rankid)) { rank = getpid(); } else rank = env_rankid.e_int; } else rank = env_rankid.e_int; /* * We use a LID of 0 for non-HFI communication. * Since a jobkey is not available from IPS, pull the * first 16 bits from the UUID. */ *epid = PSMI_EPID_PACK(((uint16_t *) unique_job_key)[0], (rank >> 3), rank, 0, PSMI_HFI_TYPE_DEFAULT, 0x7, rank); } else {
psm2_error_t __psm2_ep_connect(psm2_ep_t ep, int num_of_epid, psm2_epid_t const *array_of_epid, int const *array_of_epid_mask, /* can be NULL */ psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, int64_t timeout) { psm2_error_t err = PSM2_OK; ptl_ctl_t *ptlctl; ptl_t *ptl; int i, j, dup_idx; int num_toconnect = 0; int *epid_mask = NULL; int *epid_mask_isdupof = NULL; char *device; uint64_t t_start = get_cycles(); uint64_t t_left; union psmi_envvar_val timeout_intval; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(ep); PSMI_PLOCK(); /* * Normally we would lock here, but instead each implemented ptl component * does its own locking. This is mostly because the ptl components are * ahead of the PSM interface in that they can disconnect their peers. */ if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL || num_of_epid < 1) { err = psmi_handle_error(ep, PSM2_PARAM_ERR, "Invalid psm2_ep_connect parameters"); goto fail; } /* We need two of these masks to detect duplicates */ err = PSM2_NO_MEMORY; epid_mask = (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); if (epid_mask == NULL) goto fail; epid_mask_isdupof = (int *)psmi_malloc(ep, UNDEFINED, sizeof(int) * num_of_epid); if (epid_mask_isdupof == NULL) goto fail; err = PSM2_OK; /* Eventually handle timeouts across all connects. */ for (j = 0; j < num_of_epid; j++) { if (array_of_epid_mask != NULL && !array_of_epid_mask[j]) epid_mask[j] = 0; else { epid_mask[j] = 1; array_of_errors[j] = PSM2_EPID_UNKNOWN; array_of_epaddr[j] = NULL; num_toconnect++; } epid_mask_isdupof[j] = -1; } psmi_getenv("PSM2_CONNECT_TIMEOUT", "End-point connection timeout over-ride. 0 for no time-out.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &timeout_intval); if (getenv("PSM2_CONNECT_TIMEOUT")) { timeout = timeout_intval.e_uint * SEC_ULL; } else if (timeout > 0) { /* The timeout parameter provides the minimum timeout. 
A heuristic * is used to scale up the timeout linearly with the number of * endpoints, and we allow one second per 100 endpoints. */ timeout = max(timeout, (num_toconnect * SEC_ULL) / 100); } if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT) timeout = PSMI_MIN_EP_CONNECT_TIMEOUT; _HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n", num_toconnect, (double)timeout / 1e9); /* Look for duplicates in input array */ for (i = 0; i < num_of_epid; i++) { for (j = i + 1; j < num_of_epid; j++) { if (array_of_epid[i] == array_of_epid[j] && epid_mask[i] && epid_mask[j]) { epid_mask[j] = 0; /* don't connect more than once */ epid_mask_isdupof[j] = i; } } } for (i = 0; i < PTL_MAX_INIT; i++) { if (ep->devid_enabled[i] == -1) continue; /* Set up the right connect ptrs */ switch (ep->devid_enabled[i]) { case PTL_DEVID_IPS: ptlctl = &ep->ptl_ips; ptl = ep->ptl_ips.ptl; device = "ips"; break; case PTL_DEVID_AMSH: ptlctl = &ep->ptl_amsh; ptl = ep->ptl_amsh.ptl; device = "amsh"; break; case PTL_DEVID_SELF: ptlctl = &ep->ptl_self; ptl = ep->ptl_self.ptl; device = "self"; break; default: device = "unknown"; ptlctl = &ep->ptl_ips; /*no-unused */ ptl = ep->ptl_ips.ptl; /*no-unused */ device = "ips"; /*no-unused */ psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unknown/unhandled PTL id %d\n", ep->devid_enabled[i]); break; } t_left = psmi_cycles_left(t_start, timeout); _HFI_VDBG("Trying to connect with device %s\n", device); if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid, epid_mask, array_of_errors, array_of_epaddr, cycles_to_nanosecs(t_left)))) { _HFI_PRDBG("Connect failure in device %s err=%d\n", device, err); goto connect_fail; } /* Now process what's been connected */ for (j = 0; j < num_of_epid; j++) { dup_idx = epid_mask_isdupof[j]; if (!epid_mask[j] && dup_idx == -1) continue; if (dup_idx != -1) { /* dup */ array_of_epaddr[j] = array_of_epaddr[dup_idx]; array_of_errors[j] = array_of_errors[dup_idx]; epid_mask_isdupof[j] = -1; } if 
(array_of_errors[j] == PSM2_OK) { epid_mask[j] = 0; /* don't try on next ptl */ ep->connections++; } } } for (i = 0; i < num_of_epid; i++) { ptl_ctl_t *c = NULL; if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) continue; /* If we see unreachable here, that means some PTLs were not enabled */ if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) { err = PSM2_EPID_UNREACHABLE; break; } psmi_assert_always(array_of_epaddr[i] != NULL); c = array_of_epaddr[i]->ptlctl; psmi_assert_always(c != NULL); _HFI_VDBG("%-20s DEVICE %s (%p)\n", psmi_epaddr_get_name(array_of_epid[i]), c == &ep->ptl_ips ? "hfi" : (c == &ep->ptl_amsh ? "amsh" : "self"), (void *)array_of_epaddr[i]->ptlctl->ptl); } connect_fail: /* If the error is a timeout (at worse) and the client is OPA MPI, * just return timeout to let OPA MPI handle the hostnames that * timed out */ if (err != PSM2_OK) { char errbuf[PSM2_ERRSTRING_MAXLEN]; size_t len; int j = 0; if (err == PSM2_EPID_UNREACHABLE) { char *deverr = "of an incorrect setting"; char *eperr = " "; char *devname = NULL; if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { deverr = "there is no shared memory PSM device (shm)"; eperr = " shared memory "; } else if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { deverr = "there is no OPA PSM device (hfi)"; eperr = " OPA "; } len = snprintf(errbuf, sizeof(errbuf) - 1, "Some%sendpoints could not be connected because %s " "in the currently enabled PSM_DEVICES (", eperr, deverr); for (i = 0; i < PTL_MAX_INIT && len < sizeof(errbuf) - 1; i++) { switch (ep->devid_enabled[i]) { case PTL_DEVID_IPS: devname = "hfi"; break; case PTL_DEVID_AMSH: devname = "shm"; break; case PTL_DEVID_SELF: default: devname = "self"; break; } len += snprintf(errbuf + len, sizeof(errbuf) - len - 1, "%s,", devname); } if (len < sizeof(errbuf) - 1 && devname != NULL) /* parsed something, remove trailing comma */ errbuf[len - 1] = ')'; } else len = snprintf(errbuf, sizeof(errbuf) - 1, "%s", err == PSM2_TIMEOUT ? 
"Dectected connection timeout" : psm2_error_get_string(err)); /* first pass, look for all nodes with the error */ for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) { if (array_of_epid_mask != NULL && !array_of_epid_mask[i]) continue; if (array_of_errors[i] == PSM2_OK) continue; if (array_of_errors[i] == PSM2_EPID_UNREACHABLE && err != PSM2_EPID_UNREACHABLE) continue; if (err == array_of_errors[i]) { len += snprintf(errbuf + len, sizeof(errbuf) - len - 1, "%c %s", j == 0 ? ':' : ',', psmi_epaddr_get_hostname (array_of_epid[i])); j++; } } errbuf[sizeof(errbuf) - 1] = '\0'; err = psmi_handle_error(ep, err, errbuf); } fail: PSMI_PUNLOCK(); if (epid_mask != NULL) psmi_free(epid_mask); if (epid_mask_isdupof != NULL) psmi_free(epid_mask_isdupof); PSM2_LOG_MSG("leaving"); return err; }
/*
 * Determine the set of (unit, port) rails to use for multi-rail operation.
 *
 * Disabled unless PSM_MULTIRAIL is set non-zero.  If PSM_MULTIRAIL_MAP is
 * set ("unit:port,unit:port,..."), that explicit list is used and each
 * entry is validated; otherwise one active port per unit is auto-selected
 * and the list is sorted by GID so every process picks the same master.
 */
static psm_error_t
psmi_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port)
{
	uint32_t num_units;
	uint64_t gid_hi, gid_lo;
	int i, j, ret, count = 0;
	char *env;
	psm_error_t err = PSM_OK;
	/* Per-rail {gid_hi, unit, port} tuples used for sorting below. */
	uint64_t gidh[HFI_MAX_RAILS][3];

	env = getenv("PSM_MULTIRAIL");
	if (!env || atoi(env) == 0) {
		*num_rails = 0;	/* multirail disabled */
		return err;
	}

	/*
	 * map is in format: unit:port,unit:port,...
	 */
	if ((env = getenv("PSM_MULTIRAIL_MAP"))) {
		if (sscanf(env, "%d:%d", &i, &j) == 2) {
			char *comma = strchr(env, ',');

			unit[count] = i;
			port[count] = j;
			count++;
			while (comma) {
				if (sscanf(comma, ",%d:%d", &i, &j) != 2) {
					break;	/* stop at first malformed entry */
				}
				unit[count] = i;
				port[count] = j;
				count++;
				if (count == HFI_MAX_RAILS)
					break;
				comma = strchr(comma + 1, ',');
			}
		}
		*num_rails = count;

		/*
		 * Check if any of the port is not usable.
		 */
		for (i = 0; i < count; i++) {
			ret = hfi_get_port_lid(unit[i], port[i]);
			if (ret == -1) {
				err = psmi_handle_error(NULL,
							PSM_EP_DEVICE_FAILURE,
							"Couldn't get lid for unit %d:%d",
							unit[i], port[i]);
				return err;
			}
			ret = hfi_get_port_gid(unit[i], port[i],
					       &gid_hi, &gid_lo);
			if (ret == -1) {
				err = psmi_handle_error(NULL,
							PSM_EP_DEVICE_FAILURE,
							"Couldn't get gid for unit %d:%d",
							unit[i], port[i]);
				return err;
			}
		}

		return err;
	}

	/* No explicit map: discover rails automatically. */
	if ((err = psm_ep_num_devunits(&num_units))) {
		return err;
	}
	if (num_units > HFI_MAX_RAILS) {
		_HFI_INFO
		    ("Found %d units, max %d units are supported, use %d\n",
		     num_units, HFI_MAX_RAILS, HFI_MAX_RAILS);
		num_units = HFI_MAX_RAILS;
	}

	/*
	 * Get all the ports with a valid lid and gid, one per unit.
	 */
	for (i = 0; i < num_units; i++) {
		for (j = 1; j <= HFI_MAX_PORT; j++) {
			ret = hfi_get_port_lid(i, j);
			if (ret == -1)
				continue;
			ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
			if (ret == -1)
				continue;

			gidh[count][0] = gid_hi;
			gidh[count][1] = i;
			gidh[count][2] = j;
			count++;
			break;	/* one port per unit */
		}
	}

	/*
	 * Sort all the ports with gidh from small to big.
	 * This is for multiple fabrics, and we use fabric with the
	 * smallest gid to make the master connection.
	 */
	qsort(gidh, count, sizeof(uint64_t) * 3, cmpfunc);

	for (i = 0; i < count; i++) {
		unit[i] = (uint32_t) gidh[i][1];
		port[i] = (uint16_t) (uint32_t) gidh[i][2];
	}
	*num_rails = count;
	return err;
}
/*
 * Return the list of local HFI LIDs whose port GID prefix matches the
 * caller's (my_gid_hi).
 *
 * The list is built once and cached in file-scope statics for the life of
 * the process.
 * NOTE(review): the static cache makes this non-reentrant; assumed to run
 * under the PSM global lock -- confirm at call sites.
 */
static psm_error_t
psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o,
		uint64_t my_gid_hi, uint64_t my_gid_lo)
{
	static uint16_t *hfi_lids;	/* cached result, never freed */
	static uint32_t nlids;
	uint32_t num_units;
	int i;
	psm_error_t err = PSM_OK;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	if (hfi_lids == NULL) {
		if ((err = psm_ep_num_devunits(&num_units)))
			goto fail;
		/* Worst case: every port of every unit contributes a LID. */
		hfi_lids = (uint16_t *)
		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
				num_units * HFI_MAX_PORT, sizeof(uint16_t));
		if (hfi_lids == NULL) {
			err = psmi_handle_error(NULL, PSM_NO_MEMORY,
						"Couldn't allocate memory for dev_lids structure");
			goto fail;
		}

		for (i = 0; i < num_units; i++) {
			int j;
			for (j = 1; j <= HFI_MAX_PORT; j++) {
				int lid = hfi_get_port_lid(i, j);
				int ret;
				uint64_t gid_hi = 0, gid_lo = 0;

				/* Skip ports without a LID or a GID. */
				if (lid == -1)
					continue;
				ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo);
				if (ret == -1)
					continue;
				else if (my_gid_hi != gid_hi) {
					/* Different GID prefix: not on our
					 * fabric (only gid_hi is compared). */
					_HFI_VDBG("LID %d, unit %d, port %d, "
						  "mismatched GID %llx:%llx and "
						  "%llx:%llx\n", lid, i, j,
						  (unsigned long long)gid_hi,
						  (unsigned long long)gid_lo,
						  (unsigned long long)my_gid_hi,
						  (unsigned long long)
						  my_gid_lo);
					continue;
				}
				_HFI_VDBG("LID %d, unit %d, port %d, "
					  "matching GID %llx:%llx and "
					  "%llx:%llx\n", lid, i, j,
					  (unsigned long long)gid_hi,
					  (unsigned long long)gid_lo,
					  (unsigned long long)my_gid_hi,
					  (unsigned long long)my_gid_lo);

				hfi_lids[nlids++] = (uint16_t) lid;
			}
		}
		if (nlids == 0) {
			err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE,
						"Couldn't get lid&gid from any unit/port");
			goto fail;
		}
	}
	*lids = hfi_lids;
	*num_lids_o = nlids;
fail:
	return err;
}
/*
 * Close a master endpoint and all of its multi-context siblings.
 *
 * 'mode' selects graceful vs. forced close (PSM_EP_CLOSE_FORCE);
 * 'timeout_in' (ns) is a minimum that is scaled with the connection count
 * and may be overridden by PSM_CLOSE_TIMEOUT (seconds).
 *
 * Bug fixes in this revision:
 *  - the two early-error returns no longer leak PSMI_PLOCK;
 *  - the max-timeout clamp used '<' instead of '>', forcing every
 *    caller-supplied timeout up to PSMI_MAX_EP_CLOSE_TIMEOUT;
 *  - removing the master ep from the opened-endpoint list reused 'tmp',
 *    clobbering the do/while iterator that is compared after psmi_free(ep);
 *    the list walk now uses its own local.
 */
psm_error_t __psm_ep_close(psm_ep_t ep, int mode, int64_t timeout_in)
{
	psm_error_t err = PSM_OK;
	uint64_t t_start = get_cycles();
	union psmi_envvar_val timeout_intval;
	psm_ep_t tmp, mep;

	PSMI_ERR_UNLESS_INITIALIZED(ep);
	psmi_assert_always(ep->mctxt_master == ep);

	PSMI_PLOCK();

	if (psmi_opened_endpoint == NULL) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		PSMI_PUNLOCK();	/* don't return holding the progress lock */
		return err;
	}

	/* Verify 'ep' is actually on the opened-endpoint list. */
	tmp = psmi_opened_endpoint;
	while (tmp && tmp != ep) {
		tmp = tmp->user_ep_next;
	}
	if (!tmp) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		PSMI_PUNLOCK();	/* don't return holding the progress lock */
		return err;
	}

	psmi_getenv("PSM_CLOSE_TIMEOUT",
		    "End-point close timeout over-ride.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	if (getenv("PSM_CLOSE_TIMEOUT")) {
		timeout_in = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout_in > 0) {
		/* The timeout parameter provides the minimum timeout. A heuristic
		 * is used to scale up the timeout linearly with the number of
		 * endpoints, and we allow one second per 100 endpoints. */
		timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
	}

	if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
		timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;

	/* Infinite and excessive close time-out are limited here to a max.
	 * The "rationale" is that there is no point waiting around forever for
	 * graceful termination. Normal (or forced) process termination should clean
	 * up the context state correctly even if termination is not graceful.
	 * (Fixed: only clamp non-positive or over-maximum values down to the
	 * maximum; the old '<' comparison overrode every caller timeout.) */
	if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
		timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
	_HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
		   "%d connections\n",
		   ep, mode == PSM_EP_CLOSE_FORCE ? "YES" : "NO",
		   (double)timeout_in / 1e9, (int)ep->connections);

	/* XXX We currently cheat in the sense that we leave each PTL the allowed
	 * timeout.  There's no good way to do this until we change the PTL
	 * interface to allow asynchronous finalization */

	/* Walk the multi-context ring backwards from the master, finalizing
	 * and freeing each sibling; the master (mep) is processed last. */
	mep = ep;
	tmp = ep->mctxt_prev;
	do {
		ep = tmp;
		tmp = ep->mctxt_prev;
		PSM_MCTXT_REMOVE(ep);
		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
			err = psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
						 timeout_in);

		if ((err == PSM_OK || err == PSM_TIMEOUT) &&
		    psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
			err = psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
						timeout_in);

		/* If there's timeouts in the disconnect requests,
		 * still make sure that we still get to close the
		 * endpoint and mark it closed */
		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
			psmi_context_close(&ep->context);

		psmi_free(ep->epaddr);
		psmi_free(ep->context_mylabel);

		/*
		 * Before freeing the master ep itself,
		 * remove it from the global linklist.
		 * We do it here to let atexit handler in ptl_am directory
		 * to search the global linklist and free the shared memory file.
		 */
		if (ep == mep) {
			if (psmi_opened_endpoint == ep) {
				psmi_opened_endpoint = ep->user_ep_next;
			} else {
				/* Use a dedicated local for the walk so the
				 * do/while iterator 'tmp' is not clobbered. */
				psm_ep_t prev = psmi_opened_endpoint;
				while (prev->user_ep_next != ep) {
					prev = prev->user_ep_next;
				}
				prev->user_ep_next = ep->user_ep_next;
			}
			psmi_opened_endpoint_count--;
		}

		psmi_free(ep);

	} while ((err == PSM_OK || err == PSM_TIMEOUT) && tmp != ep);

	PSMI_PUNLOCK();

	_HFI_PRDBG("Closed endpoint in %.3f secs\n",
		   (double)cycles_to_nanosecs(get_cycles() - t_start) /
		   SEC_ULL);
	return err;
}
/*
 * Get mmu notifier invalidation info and update PSM's caching.
 *
 * Fetches the list of tids the kernel has invalidated, marks each one
 * invalidated locally, and for tids with no outstanding references removes
 * them from the idle queue and RB tree and frees them back to the driver
 * in one batch.  Tids still referenced are freed later, when their last
 * reference is released (see ips_tidcache_release).
 */
psm2_error_t
ips_tidcache_invalidation(struct ips_tid *tidc)
{
	cl_qmap_t *p_map = &tidc->tid_cachemap;
	uint32_t i, j, idx, tidcnt;
	psm2_error_t err;

	/*
	 * get a list of invalidated tids from driver,
	 * driver will clear the event bit before return.
	 */
	tidcnt = 0;
	if (hfi_get_invalidation(tidc->context->ctrl,
				 (uint64_t) (uintptr_t) tidc->tid_array,
				 &tidcnt) < 0) {
		/* If failed to get invalidation info, it's fatal error */
		err = psmi_handle_error(tidc->context->ep,
					PSM2_EP_DEVICE_FAILURE,
					"Failed to get invalidation info");
		return err;
	}
	psmi_assert(tidcnt > 0 && tidcnt <= tidc->tid_ctrl->tid_num_max);

	j = 0;	/* count of tids ready to be freed immediately */
	for (i = 0; i < tidcnt; i++) {
		/*
		 * Driver only returns tidctrl=1 or tidctrl=2.
		 */
		idx = 2*IPS_TIDINFO_GET_TID(tidc->tid_array[i]) +
			IPS_TIDINFO_GET_TIDCTRL(tidc->tid_array[i]);
		psmi_assert(idx != 0);
		psmi_assert(idx <= tidc->tid_ctrl->tid_num_max);

		/*
		 * sanity check.
		 */
		psmi_assert(p_map->root[idx].payload.tidinfo ==
			    tidc->tid_array[i]);
		psmi_assert(LENGTH(idx) ==
			    IPS_TIDINFO_GET_LENGTH(tidc->tid_array[i]));

		/*
		 * if the tid is already invalidated, ignore it,
		 * but do sanity check.
		 */
		if (INVALIDATE(idx) != 0) {
			psmi_assert(REFCNT(idx) == 0);
			continue;
		}

		/*
		 * mark the tid invalidated.
		 */
		INVALIDATE(idx) = 1;

		/*
		 * if the tid is idle, remove the tid from RB tree
		 * and idle queue, put on free list.
		 */
		if (REFCNT(idx) == 0) {
			IDLE_REMOVE(idx);
			ips_cl_qmap_remove_item(p_map, &p_map->root[idx]);

			/* Compact the free-list in place within tid_array. */
			if (i != j)
				tidc->tid_array[j] = tidc->tid_array[i];
			j++;
		}
	}

	if (j > 0) {
		/*
		 * call driver to free the tids.
		 */
		if (hfi_free_tid(tidc->context->ctrl,
				 (uint64_t) (uintptr_t) tidc->tid_array, j) < 0) {
			/* If failed to unpin pages, it's fatal error */
			err = psmi_handle_error(tidc->context->ep,
						PSM2_EP_DEVICE_FAILURE,
						"Failed to tid free %d tids", j);
			return err;
		}
	}

	return PSM2_OK;
}
/*
 * This handles the regular (i.e. non-rendezvous MPI envelopes)
 *
 * If the tag matches a posted receive on the expected queue, the payload is
 * delivered straight into the user buffer; otherwise the message is routed
 * to psmi_mq_handle_envelope_unexpected().  Returns MQ_RET_MATCH_OK on a
 * match, else the unexpected handler's return value.
 */
int __recvpath
psmi_mq_handle_envelope(psm_mq_t mq, uint16_t mode, psm_epaddr_t epaddr,
			uint64_t tag, psmi_egrid_t egrid, uint32_t send_msglen,
			const void *payload, uint32_t paylen)
{
	psm_mq_req_t req;
	uint32_t msglen;
	int rc;

	psmi_assert(epaddr != NULL);

	req = mq_req_match(&(mq->expected_q), tag, 1);

	if (req) {	/* we have a match */
		psmi_assert(MQE_TYPE_IS_RECV(req->type));
		req->tag = tag;
		/* Truncate to the posted buffer if the sender sent more. */
		msglen = mq_set_msglen(req, req->buf_len, send_msglen);

		_IPATH_VDBG("from=%s match=YES (req=%p) mode=%x mqtag=%"
			    PRIx64" msglen=%d paylen=%d\n",
			    psmi_epaddr_get_name(epaddr->epid), req,
			    mode, tag, msglen, paylen);

		switch (mode) {
		case MQ_MSG_TINY:
			/* Data is embedded in the envelope itself. */
			PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
						    msglen);
			mq_copy_tiny((uint32_t *)req->buf,
				     (uint32_t *)payload, msglen);
			req->state = MQ_STATE_COMPLETE;
			mq_qq_append(&mq->completed_q, req);
			break;
		case MQ_MSG_SHORT:	/* message fits in 1 payload */
			PSM_VALGRIND_DEFINE_MQ_RECV(req->buf, req->buf_len,
						    msglen);
			psmi_mq_mtucpy(req->buf, payload, msglen);
			req->state = MQ_STATE_COMPLETE;
			mq_qq_append(&mq->completed_q, req);
			break;
		case MQ_MSG_LONG:
			/* Eager-long: remaining data arrives as eager
			 * packets handled by psmi_mq_handle_egrdata(). */
			req->egrid = egrid;
			req->state = MQ_STATE_MATCHED;
			req->type |= MQE_TYPE_EGRLONG;
			req->send_msgoff = req->recv_msgoff = 0;
			STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong,
					   req, nextq);
			_IPATH_VDBG("exp MSG_LONG %d of length %d bytes pay=%d\n",
				    egrid.egr_msgno, msglen, paylen);
			if (paylen > 0)
				psmi_mq_handle_data(req, epaddr,
						    egrid.egr_data, 0,
						    payload, paylen);
			psmi_mq_handle_egrdata(mq, req, epaddr);
			break;
		default:
			psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
					  "Internal error, unknown packet 0x%x",
					  mode);
		}

		mq->stats.rx_user_bytes += msglen;
		mq->stats.rx_user_num++;

		rc = MQ_RET_MATCH_OK;
		if (mode == MQ_MSG_LONG)
			return rc;
	} else
		/* No posted receive matched: stash as unexpected. */
		rc = psmi_mq_handle_envelope_unexpected(mq, mode, epaddr, tag,
							egrid, send_msglen,
							payload, paylen);

	return rc;
}
/*
 * Handle an envelope that arrived out of sequence order.
 *
 * No tag matching is attempted here: a fresh receive request is allocated,
 * the message is staged into a system buffer exactly as for an unexpected
 * message, and the request is appended to the master context's out-of-order
 * queue keyed by msg_seqnum — presumably re-played by the in-order path once
 * the missing earlier messages arrive (TODO confirm against caller).
 * Always returns MQ_RET_UNEXP_OK.
 */
int __recvpath
psmi_mq_handle_envelope_outoforder(psm_mq_t mq, uint16_t mode,
				   psm_epaddr_t epaddr, uint16_t msg_seqnum,
				   uint64_t tag, psmi_egrid_t egrid,
				   uint32_t send_msglen, const void *payload,
				   uint32_t paylen)
{
	psm_mq_req_t req;
	uint32_t msglen;

	req = psmi_mq_req_alloc(mq, MQE_TYPE_RECV);
	psmi_assert(req != NULL);

	req->tag = tag;
	req->recv_msgoff = 0;
	req->recv_msglen = req->send_msglen = req->buf_len = msglen =
	    send_msglen;

	_IPATH_VDBG("from=%s match=NO (req=%p) mode=%x mqtag=%" PRIx64
		    " send_msglen=%d\n",
		    psmi_epaddr_get_name(epaddr->epid), req, mode, tag,
		    send_msglen);
	switch (mode) {
	case MQ_MSG_TINY:
		/* zero-length tiny messages carry no data; skip the
		 * system-buffer allocation */
		if (msglen > 0) {
			req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
			mq_copy_tiny((uint32_t *)req->buf,
				     (uint32_t *)payload, msglen);
		} else
			req->buf = NULL;
		req->state = MQ_STATE_COMPLETE;
		break;

	case MQ_MSG_SHORT:
		req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
		psmi_mq_mtucpy(req->buf, payload, msglen);
		req->state = MQ_STATE_COMPLETE;
		break;

	case MQ_MSG_LONG:
		/* Multi-packet eager message: buffer the whole message in
		 * a system buffer, then consume whatever payload came with
		 * this envelope; the rest arrives as eager data. */
		req->egrid = egrid;
		req->epaddr = epaddr;
		req->send_msgoff = 0;
		req->buf = psmi_mq_sysbuf_alloc(mq, msglen);
		req->state = MQ_STATE_UNEXP;
		req->type |= MQE_TYPE_EGRLONG;
		STAILQ_INSERT_TAIL(&epaddr->mctxt_master->egrlong, req,
				   nextq);
		_IPATH_VDBG("unexp MSG_LONG %d of length %d bytes pay=%d\n",
			    egrid.egr_msgno, msglen, paylen);
		if (paylen > 0)
			psmi_mq_handle_data(req, epaddr, egrid.egr_data, 0,
					    payload, paylen);
		psmi_mq_handle_egrdata(mq, req, epaddr);
		break;

	default:
		/* PSMI_EP_NORETURN: this aborts, no fallthrough */
		psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
				  "Internal error, unknown packet 0x%x",
				  mode);
	}

	/* Park the request on the out-of-order queue until the preceding
	 * sequence numbers have been handled. */
	req->msg_seqnum = msg_seqnum;
	mq_sq_append(&epaddr->mctxt_master->outoforder_q, req);
	epaddr->mctxt_master->outoforder_c++;

	mq->stats.rx_sys_bytes += msglen;
	mq->stats.rx_sys_num++;

	return MQ_RET_UNEXP_OK;
}
psm_error_t __psm_ep_open_internal(psm_uuid_t const unique_job_key, int *devid_enabled, struct psm_ep_open_opts const *opts_i, psm_mq_t mq, psm_ep_t *epo, psm_epid_t *epido) { psm_ep_t ep = NULL; uint32_t num_units; size_t len; psm_error_t err; psm_epaddr_t epaddr = NULL; char buf[128], *p, *e; union psmi_envvar_val envvar_val; size_t ptl_sizes; struct psm_ep_open_opts opts; ptl_t *amsh_ptl, *ips_ptl, *self_ptl; int i; /* First get the set of default options, we overwrite with the user's * desired values afterwards */ if ((err = psm_ep_open_opts_get_defaults(&opts))) goto fail; if (opts_i != NULL) { if (opts_i->timeout != -1) opts.timeout = opts_i->timeout; if (opts_i->unit != -1) opts.unit = opts_i->unit; if (opts_i->affinity != -1) opts.affinity = opts_i->affinity; if (opts_i->sendbufs_num != -1) opts.sendbufs_num = opts_i->sendbufs_num; if (opts_i->network_pkey != HFI_DEFAULT_P_KEY) opts.network_pkey = opts_i->network_pkey; if (opts_i->port != 0) opts.port = opts_i->port; if (opts_i->outsl != -1) opts.outsl = opts_i->outsl; if (opts_i->service_id) opts.service_id = (uint64_t) opts_i->service_id; if (opts_i->path_res_type != PSM_PATH_RES_NONE) opts.path_res_type = opts_i->path_res_type; if (opts_i->senddesc_num) opts.senddesc_num = opts_i->senddesc_num; if (opts_i->imm_size) opts.imm_size = opts_i->imm_size; } /* Get Service ID from environment */ if (!psmi_getenv("PSM_IB_SERVICE_ID", "HFI Service ID for path resolution", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG_ULONG, (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID, &envvar_val)) { opts.service_id = (uint64_t) envvar_val.e_ulonglong; } /* Get Path resolution type from environment Possible choices are: * * NONE : Default same as previous instances. Utilizes static data. * OPP : Use OFED Plus Plus library to do path record queries. * UMAD : Use raw libibumad interface to form and process path records. 
*/ if (!psmi_getenv("PSM_PATH_REC", "Mechanism to query HFI path record (default is no path query)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"none", &envvar_val)) { if (!strcasecmp(envvar_val.e_str, "none")) opts.path_res_type = PSM_PATH_RES_NONE; else if (!strcasecmp(envvar_val.e_str, "opp")) opts.path_res_type = PSM_PATH_RES_OPP; else if (!strcasecmp(envvar_val.e_str, "umad")) opts.path_res_type = PSM_PATH_RES_UMAD; else { _HFI_ERROR("Unknown path resolution type %s. " "Disabling use of path record query.\n", envvar_val.e_str); opts.path_res_type = PSM_PATH_RES_NONE; } } /* If a specific unit is set in the environment, use that one. */ if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) { opts.unit = envvar_val.e_long; } /* Get user specified port number to use. */ if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)HFI_PORT_NUM_ANY, &envvar_val)) { opts.port = envvar_val.e_long; } /* Get service level from environment, path-query overrides it */ if (!psmi_getenv ("HFI_SL", "HFI outging ServiceLevel number (default 0)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) { opts.outsl = envvar_val.e_long; } /* Get network key from environment. MVAPICH and other vendor MPIs do not * specify it on ep open and we may require it for vFabrics. * path-query will override it. */ if (!psmi_getenv("PSM_PKEY", "HFI PKey to use for endpoint", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG, (union psmi_envvar_val)HFI_DEFAULT_P_KEY, &envvar_val)) { opts.network_pkey = (uint64_t) envvar_val.e_ulong; } /* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of 0x7FFF. 
That's no longer a valid default, so override it if the client was compiled against PSM v1 */ if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 && opts.network_pkey == 0x7FFF) { opts.network_pkey = HFI_DEFAULT_P_KEY; } /* Get number of default send buffers from environment */ if (!psmi_getenv("PSM_NUM_SEND_BUFFERS", "Number of send buffers to allocate [1024]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)1024, &envvar_val)) { opts.sendbufs_num = envvar_val.e_uint; } /* Get immediate data size - transfers less than immediate data size do * not consume a send buffer and require just a send descriptor. */ if (!psmi_getenv("PSM_SEND_IMMEDIATE_SIZE", "Immediate data send size not requiring a buffer [128]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)128, &envvar_val)) { opts.imm_size = envvar_val.e_uint; } /* Get numner of send descriptors - by default this is 4 times the number * of send buffers - mainly used for short/inlined messages. */ if (!psmi_getenv("PSM_NUM_SEND_DESCRIPTORS", "Number of send descriptors to allocate [4096]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)4096, &envvar_val)) { opts.senddesc_num = envvar_val.e_uint; } if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { if ((err = psm_ep_num_devunits(&num_units)) != PSM_OK) goto fail; } else num_units = 0; /* do some error checking */ if (opts.timeout < -1) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid timeout value %lld", (long long)opts.timeout); goto fail; } else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid Device Unit ID %d (%d units found)", opts.unit, num_units); goto fail; } else if (opts.port < 0 || opts.port > HFI_MAX_PORT) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid Device port number %d", opts.port); goto fail; } else if (opts.affinity < 0 || opts.affinity > PSM_EP_OPEN_AFFINITY_FORCE) { err = 
psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid Affinity option: %d", opts.affinity); goto fail; } else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid SL number: %lld", (unsigned long long)opts.outsl); goto fail; } /* Set environment variable if PSM is not allowed to set affinity */ if (opts.affinity == PSM_EP_OPEN_AFFINITY_SKIP) setenv("HFI_NO_CPUAFFINITY", "1", 1); /* Allocate end point structure storage */ ptl_sizes = (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ? psmi_ptl_self.sizeof_ptl() : 0) + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ? psmi_ptl_ips.sizeof_ptl() : 0) + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ? psmi_ptl_amsh.sizeof_ptl() : 0); if (ptl_sizes == 0) return PSM_EP_NO_DEVICE; ep = (psm_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64, sizeof(struct psm_ep) + ptl_sizes); epaddr = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, 1, sizeof(struct psm_epaddr)); if (ep == NULL || epaddr == NULL) { err = psmi_handle_error(NULL, PSM_NO_MEMORY, "Couldn't allocate memory for %s structure", ep == NULL ? "psm_ep" : "psm_epaddr"); goto fail; } /* Copy PTL enabled status */ for (i = 0; i < PTL_MAX_INIT; i++) ep->devid_enabled[i] = devid_enabled[i]; /* Matched Queue initialization. We do this early because we have to * make sure ep->mq exists and is valid before calling ips_do_work. 
*/ ep->mq = mq; /* Get ready for PTL initialization */ memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm_uuid_t)); ep->epaddr = epaddr; ep->memmode = mq->memmode; ep->hfi_num_sendbufs = opts.sendbufs_num; ep->service_id = opts.service_id; ep->path_res_type = opts.path_res_type; ep->hfi_num_descriptors = opts.senddesc_num; ep->hfi_imm_size = opts.imm_size; ep->errh = psmi_errhandler_global; /* by default use the global one */ ep->ptl_amsh.ep_poll = psmi_poll_noop; ep->ptl_ips.ep_poll = psmi_poll_noop; ep->connections = 0; /* See how many iterations we want to spin before yielding */ psmi_getenv("PSM_YIELD_SPIN_COUNT", "Spin poll iterations before yield", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, &envvar_val); ep->yield_spin_cnt = envvar_val.e_uint; ptl_sizes = 0; amsh_ptl = ips_ptl = self_ptl = NULL; if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_ips.sizeof_ptl(); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_self.sizeof_ptl(); } if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, &(ep->context), &ep->epid))) goto fail; psmi_assert_always(ep->epid != 0); ep->epaddr->epid = ep->epid; _HFI_VDBG("psmi_ep_open_device() passed\n"); /* Set our new label as soon as we know what it is */ strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); buf[sizeof(buf) - 1] = '\0'; p = buf + strlen(buf); /* If our rank is set, use it. 
If not, use context.subcontext notation */ if (((e = getenv("MPI_RANKID")) != NULL && *e) || ((e = getenv("PSC_MPI_RANK")) != NULL && *e)) len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e)); else len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.", (uint32_t) psm_epid_context(ep->epid), (uint32_t) psmi_epid_subcontext(ep->epid)); *(p + len) = '\0'; ep->context_mylabel = psmi_strdup(ep, buf); if (ep->context_mylabel == NULL) { err = PSM_NO_MEMORY; goto fail; } /* hfi_set_mylabel(ep->context_mylabel); */ if ((err = psmi_epid_set_hostname(psm_epid_nid(ep->epid), buf, 0))) goto fail; _HFI_VDBG("start ptl device init...\n"); if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) goto fail; } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) goto fail; } /* If we're shm-only, this device is enabled above */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) goto fail; } else { /* We may have pre-attached as part of getting our rank for enabling * shared contexts. */ } _HFI_VDBG("finish ptl device init...\n"); /* * Keep only IPS since only IPS support multi-rail, other devices * are only setup once. IPS device can come to this function again. */ for (i = 0; i < PTL_MAX_INIT; i++) { if (devid_enabled[i] != PTL_DEVID_IPS) { devid_enabled[i] = -1; } } *epido = ep->epid; *epo = ep; return PSM_OK; fail: if (ep != NULL) { if (ep->context.fd != -1) close(ep->context.fd); psmi_free(ep); } if (epaddr != NULL) psmi_free(epaddr); return err; }