psm2_error_t
ips_proto_am_init(struct ips_proto *proto, int num_send_slots,
		  uint32_t imm_size, struct ips_proto_am *proto_am)
{
	psm2_error_t err = PSM2_OK;
	int send_buf_size = proto->ep->context.ctrl->__hfi_piosize;
	int num_rep_slots = calc_optimal_num_reply_slots(num_send_slots);
	int num_req_slots = num_send_slots - num_rep_slots;

	proto_am->proto = proto;

	/* In a node pair, the number of reply send buffers on at least one of
	 * the nodes must be at least double the number (optimal: double + 1)
	 * of send descriptors on the other node. While this constraint applies
	 * only to the reply send buffers, allowing the caller to tune only the
	 * number of request send buffers would be awkward, as they have no
	 * knowledge of the subdivision of the memory into separate mempools
	 * for requests and replies. It's an internal concern at this point. */
	if ((err = ips_scbctrl_init(&proto->ep->context, num_req_slots,
				    num_req_slots, imm_size, send_buf_size,
				    NULL, NULL, &proto_am->scbc_request)))
		goto fail;

	if ((err = ips_scbctrl_init(&proto->ep->context, num_rep_slots,
				    num_rep_slots, imm_size, send_buf_size,
				    NULL, NULL, &proto_am->scbc_reply)))
		goto fail;

	if (ips_am_msg_pool == NULL) {
		union psmi_envvar_val max_msgs;

		ips_am_outoforder_q.head.next = NULL;
		ips_am_outoforder_q.tail = &ips_am_outoforder_q.head;

		psmi_getenv("PSM2_AM_MAX_OOO_MSGS",
			    "Maximum number of OOO Active Messages to queue before dropping.",
			    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
			    (union psmi_envvar_val)1024, &max_msgs);

		ips_am_msg_pool = psmi_mpool_create(
				sizeof(struct ips_am_message),
				32, max_msgs.e_uint, 0, UNDEFINED, NULL, NULL);
	}
fail:
	return err;
}
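/*
 * Hedged sketch -- calc_optimal_num_reply_slots() is not shown in this file.
 * The comment above requires that one node's reply slots cover "double + 1"
 * of the peer's request descriptors. Assuming both nodes split the same
 * num_send_slots total into r reply and q = n - r request slots, the
 * constraint r >= 2*q + 1 holds once r >= (2*n + 1) / 3. The function name
 * below is hypothetical, to avoid implying this is the library's actual
 * formula; it merely shows one split that satisfies the stated constraint.
 */
static int calc_num_reply_slots_sketch(int num_send_slots)
{
	/* smallest r with 3*r >= 2*n + 1, i.e. ceil((2*n + 1) / 3) */
	return (2 * num_send_slots + 3) / 3;
}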
psm2_error_t
__psm2_ep_connect(psm2_ep_t ep, int num_of_epid,
		  psm2_epid_t const *array_of_epid,
		  int const *array_of_epid_mask, /* can be NULL */
		  psm2_error_t *array_of_errors,
		  psm2_epaddr_t *array_of_epaddr, int64_t timeout)
{
	psm2_error_t err = PSM2_OK;
	ptl_ctl_t *ptlctl;
	ptl_t *ptl;
	int i, j, dup_idx;
	int num_toconnect = 0;
	int *epid_mask = NULL;
	int *epid_mask_isdupof = NULL;
	char *device;
	uint64_t t_start = get_cycles();
	uint64_t t_left;
	union psmi_envvar_val timeout_intval;

	PSM2_LOG_MSG("entering");
	PSMI_ERR_UNLESS_INITIALIZED(ep);

	PSMI_PLOCK();

	/*
	 * On top of the global lock taken above, each implemented ptl
	 * component does its own locking. This is mostly because the ptl
	 * components are ahead of the PSM interface in that they can
	 * disconnect their peers.
	 */
	if (ep == NULL || array_of_epaddr == NULL || array_of_epid == NULL ||
	    num_of_epid < 1) {
		err = psmi_handle_error(ep, PSM2_PARAM_ERR,
					"Invalid psm2_ep_connect parameters");
		goto fail;
	}

	/* We need two of these masks to detect duplicates */
	err = PSM2_NO_MEMORY;
	epid_mask = (int *)psmi_malloc(ep, UNDEFINED,
				       sizeof(int) * num_of_epid);
	if (epid_mask == NULL)
		goto fail;
	epid_mask_isdupof = (int *)psmi_malloc(ep, UNDEFINED,
					       sizeof(int) * num_of_epid);
	if (epid_mask_isdupof == NULL)
		goto fail;
	err = PSM2_OK;

	/* Eventually handle timeouts across all connects. */
	for (j = 0; j < num_of_epid; j++) {
		if (array_of_epid_mask != NULL && !array_of_epid_mask[j])
			epid_mask[j] = 0;
		else {
			epid_mask[j] = 1;
			array_of_errors[j] = PSM2_EPID_UNKNOWN;
			array_of_epaddr[j] = NULL;
			num_toconnect++;
		}
		epid_mask_isdupof[j] = -1;
	}

	psmi_getenv("PSM2_CONNECT_TIMEOUT",
		    "End-point connection timeout over-ride. 0 for no time-out.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	if (getenv("PSM2_CONNECT_TIMEOUT")) {
		timeout = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout > 0) {
		/* The timeout parameter provides the minimum timeout. A
		 * heuristic is used to scale up the timeout linearly with the
		 * number of endpoints, and we allow one second per 100
		 * endpoints. */
		timeout = max(timeout, (num_toconnect * SEC_ULL) / 100);
	}

	if (timeout > 0 && timeout < PSMI_MIN_EP_CONNECT_TIMEOUT)
		timeout = PSMI_MIN_EP_CONNECT_TIMEOUT;
	_HFI_PRDBG("Connect to %d endpoints with time-out of %.2f secs\n",
		   num_toconnect, (double)timeout / 1e9);

	/* Look for duplicates in input array */
	for (i = 0; i < num_of_epid; i++) {
		for (j = i + 1; j < num_of_epid; j++) {
			if (array_of_epid[i] == array_of_epid[j] &&
			    epid_mask[i] && epid_mask[j]) {
				epid_mask[j] = 0; /* don't connect more than once */
				epid_mask_isdupof[j] = i;
			}
		}
	}

	for (i = 0; i < PTL_MAX_INIT; i++) {
		if (ep->devid_enabled[i] == -1)
			continue;

		/* Set up the right connect ptrs */
		switch (ep->devid_enabled[i]) {
		case PTL_DEVID_IPS:
			ptlctl = &ep->ptl_ips;
			ptl = ep->ptl_ips.ptl;
			device = "ips";
			break;
		case PTL_DEVID_AMSH:
			ptlctl = &ep->ptl_amsh;
			ptl = ep->ptl_amsh.ptl;
			device = "amsh";
			break;
		case PTL_DEVID_SELF:
			ptlctl = &ep->ptl_self;
			ptl = ep->ptl_self.ptl;
			device = "self";
			break;
		default:
			ptlctl = &ep->ptl_ips;	/*no-unused */
			ptl = ep->ptl_ips.ptl;	/*no-unused */
			device = "ips";		/*no-unused */
			psmi_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR,
					  "Unknown/unhandled PTL id %d\n",
					  ep->devid_enabled[i]);
			break;
		}
		t_left = psmi_cycles_left(t_start, timeout);

		_HFI_VDBG("Trying to connect with device %s\n", device);
		if ((err = ptlctl->ep_connect(ptl, num_of_epid, array_of_epid,
					      epid_mask, array_of_errors,
					      array_of_epaddr,
					      cycles_to_nanosecs(t_left)))) {
			_HFI_PRDBG("Connect failure in device %s err=%d\n",
				   device, err);
			goto connect_fail;
		}

		/* Now process what's been connected */
		for (j = 0; j < num_of_epid; j++) {
			dup_idx = epid_mask_isdupof[j];
			if (!epid_mask[j] && dup_idx == -1)
				continue;

			if (dup_idx != -1) {	/* dup */
				array_of_epaddr[j] = array_of_epaddr[dup_idx];
				array_of_errors[j] = array_of_errors[dup_idx];
				epid_mask_isdupof[j] = -1;
			}

			if (array_of_errors[j] == PSM2_OK) {
				epid_mask[j] = 0; /* don't try on next ptl */
				ep->connections++;
			}
		}
	}

	for (i = 0; i < num_of_epid; i++) {
		ptl_ctl_t *c = NULL;
		if (array_of_epid_mask != NULL && !array_of_epid_mask[i])
			continue;
		/* If we see unreachable here, that means some PTLs were not
		 * enabled */
		if (array_of_errors[i] == PSM2_EPID_UNREACHABLE) {
			err = PSM2_EPID_UNREACHABLE;
			break;
		}

		psmi_assert_always(array_of_epaddr[i] != NULL);
		c = array_of_epaddr[i]->ptlctl;
		psmi_assert_always(c != NULL);
		_HFI_VDBG("%-20s DEVICE %s (%p)\n",
			  psmi_epaddr_get_name(array_of_epid[i]),
			  c == &ep->ptl_ips ? "hfi" :
			  (c == &ep->ptl_amsh ? "amsh" : "self"),
			  (void *)array_of_epaddr[i]->ptlctl->ptl);
	}

connect_fail:
	/* If the error is a timeout (at worst) and the client is OPA MPI,
	 * just return timeout to let OPA MPI handle the hostnames that
	 * timed out */
	if (err != PSM2_OK) {
		char errbuf[PSM2_ERRSTRING_MAXLEN];
		size_t len;
		int j = 0;

		if (err == PSM2_EPID_UNREACHABLE) {
			char *deverr = "of an incorrect setting";
			char *eperr = " ";
			char *devname = NULL;
			if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
				deverr = "there is no shared memory PSM device (shm)";
				eperr = " shared memory ";
			} else if (!psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
				deverr = "there is no OPA PSM device (hfi)";
				eperr = " OPA ";
			}

			len = snprintf(errbuf, sizeof(errbuf) - 1,
				       "Some%sendpoints could not be connected because %s "
				       "in the currently enabled PSM_DEVICES (",
				       eperr, deverr);
			for (i = 0; i < PTL_MAX_INIT &&
				    len < sizeof(errbuf) - 1; i++) {
				switch (ep->devid_enabled[i]) {
				case PTL_DEVID_IPS:
					devname = "hfi";
					break;
				case PTL_DEVID_AMSH:
					devname = "shm";
					break;
				case PTL_DEVID_SELF:
				default:
					devname = "self";
					break;
				}
				len += snprintf(errbuf + len,
						sizeof(errbuf) - len - 1,
						"%s,", devname);
			}
			if (len < sizeof(errbuf) - 1 && devname != NULL)
				/* parsed something, remove trailing comma */
				errbuf[len - 1] = ')';
		} else
			len = snprintf(errbuf, sizeof(errbuf) - 1, "%s",
				       err == PSM2_TIMEOUT ?
				       "Detected connection timeout" :
				       psm2_error_get_string(err));

		/* first pass, look for all nodes with the error */
		for (i = 0; i < num_of_epid && len < sizeof(errbuf) - 1; i++) {
			if (array_of_epid_mask != NULL &&
			    !array_of_epid_mask[i])
				continue;
			if (array_of_errors[i] == PSM2_OK)
				continue;
			if (array_of_errors[i] == PSM2_EPID_UNREACHABLE &&
			    err != PSM2_EPID_UNREACHABLE)
				continue;
			if (err == array_of_errors[i]) {
				len += snprintf(errbuf + len,
						sizeof(errbuf) - len - 1,
						"%c %s", j == 0 ? ':' : ',',
						psmi_epaddr_get_hostname
						(array_of_epid[i]));
				j++;
			}
		}
		errbuf[sizeof(errbuf) - 1] = '\0';
		err = psmi_handle_error(ep, err, errbuf);
	}

fail:
	PSMI_PUNLOCK();

	if (epid_mask != NULL)
		psmi_free(epid_mask);
	if (epid_mask_isdupof != NULL)
		psmi_free(epid_mask_isdupof);

	PSM2_LOG_MSG("leaving");
	return err;
}
psm_error_t __psm_ep_close(psm_ep_t ep, int mode, int64_t timeout_in)
{
	psm_error_t err = PSM_OK;
	uint64_t t_start = get_cycles();
	union psmi_envvar_val timeout_intval;
	psm_ep_t tmp, mep;

	PSMI_ERR_UNLESS_INITIALIZED(ep);
	psmi_assert_always(ep->mctxt_master == ep);

	PSMI_PLOCK();

	if (psmi_opened_endpoint == NULL) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		PSMI_PUNLOCK();	/* release the lock taken above */
		return err;
	}

	tmp = psmi_opened_endpoint;
	while (tmp && tmp != ep) {
		tmp = tmp->user_ep_next;
	}
	if (!tmp) {
		err = psmi_handle_error(NULL, PSM_EP_WAS_CLOSED,
					"PSM Endpoint is closed or does not exist");
		PSMI_PUNLOCK();	/* release the lock taken above */
		return err;
	}

	psmi_getenv("PSM_CLOSE_TIMEOUT",
		    "End-point close timeout over-ride.",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)0, &timeout_intval);

	if (getenv("PSM_CLOSE_TIMEOUT")) {
		timeout_in = timeout_intval.e_uint * SEC_ULL;
	} else if (timeout_in > 0) {
		/* The timeout parameter provides the minimum timeout. A
		 * heuristic is used to scale up the timeout linearly with the
		 * number of endpoints, and we allow one second per 100
		 * endpoints. */
		timeout_in = max(timeout_in, (ep->connections * SEC_ULL) / 100);
	}

	if (timeout_in > 0 && timeout_in < PSMI_MIN_EP_CLOSE_TIMEOUT)
		timeout_in = PSMI_MIN_EP_CLOSE_TIMEOUT;

	/* Infinite and excessive close time-outs are limited here to a max.
	 * The "rationale" is that there is no point waiting around forever for
	 * graceful termination. Normal (or forced) process termination should
	 * clean up the context state correctly even if termination is not
	 * graceful. */
	if (timeout_in <= 0 || timeout_in > PSMI_MAX_EP_CLOSE_TIMEOUT)
		timeout_in = PSMI_MAX_EP_CLOSE_TIMEOUT;
	_HFI_PRDBG("Closing endpoint %p with force=%s and to=%.2f seconds and "
		   "%d connections\n", ep,
		   mode == PSM_EP_CLOSE_FORCE ? "YES" : "NO",
		   (double)timeout_in / 1e9, (int)ep->connections);

	/* XXX We currently cheat in the sense that we leave each PTL the
	 * allowed timeout. There's no good way to do this until we change the
	 * PTL interface to allow asynchronous finalization */

	/* Walk the multi-context ring backwards from the master, closing and
	 * freeing each rail endpoint; the master is handled last. */
	mep = ep;
	tmp = ep->mctxt_prev;
	do {
		ep = tmp;
		tmp = ep->mctxt_prev;
		PSM_MCTXT_REMOVE(ep);
		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH))
			err = psmi_ptl_amsh.fini(ep->ptl_amsh.ptl, mode,
						 timeout_in);

		if ((err == PSM_OK || err == PSM_TIMEOUT) &&
		    psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
			err = psmi_ptl_ips.fini(ep->ptl_ips.ptl, mode,
						timeout_in);

		/* If there are timeouts in the disconnect requests, still
		 * make sure that we get to close the endpoint and mark it
		 * closed */
		if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS))
			psmi_context_close(&ep->context);

		psmi_free(ep->epaddr);
		psmi_free(ep->context_mylabel);

		/*
		 * Before freeing the master ep itself, remove it from the
		 * global linklist. We do it here to let the atexit handler in
		 * the ptl_am directory search the global linklist and free
		 * the shared memory file.
		 */
		if (ep == mep) {
			if (psmi_opened_endpoint == ep) {
				psmi_opened_endpoint = ep->user_ep_next;
			} else {
				/* use a separate cursor here; reusing tmp
				 * would clobber the ring cursor tested in the
				 * loop condition below */
				psm_ep_t prev = psmi_opened_endpoint;
				while (prev->user_ep_next != ep) {
					prev = prev->user_ep_next;
				}
				prev->user_ep_next = ep->user_ep_next;
			}
			psmi_opened_endpoint_count--;
		}

		psmi_free(ep);

	} while ((err == PSM_OK || err == PSM_TIMEOUT) && tmp != ep);

	PSMI_PUNLOCK();

	_HFI_PRDBG("Closed endpoint in %.3f secs\n",
		   (double)cycles_to_nanosecs(get_cycles() - t_start) /
		   SEC_ULL);
	return err;
}
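/*
 * Hedged sketch (assumed; the real macros live elsewhere in the tree): the
 * close loop above walks ep->mctxt_prev around a circular doubly linked ring
 * of rail endpoints anchored at the master, and __psm_ep_open below links
 * slaves with PSM_MCTXT_APPEND(). Plausible shapes for the two operations,
 * under hypothetical names so as not to restate the library's definitions:
 */
#define MCTXT_APPEND_SKETCH(head, node) do {			\
	(node)->mctxt_prev = (head)->mctxt_prev;		\
	(node)->mctxt_next = (head);				\
	(head)->mctxt_prev->mctxt_next = (node);		\
	(head)->mctxt_prev = (node);				\
	(node)->mctxt_master = (head);				\
} while (0)

#define MCTXT_REMOVE_SKETCH(node) do {				\
	(node)->mctxt_prev->mctxt_next = (node)->mctxt_next;	\
	(node)->mctxt_next->mctxt_prev = (node)->mctxt_prev;	\
	(node)->mctxt_next = (node)->mctxt_prev = (node);	\
} while (0)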
psm_error_t
__psm_ep_open(psm_uuid_t const unique_job_key,
	      struct psm_ep_open_opts const *opts_i, psm_ep_t *epo,
	      psm_epid_t *epido)
{
	psm_error_t err;
	psm_mq_t mq;
	psm_epid_t epid;
	psm_ep_t ep, tmp;
	uint32_t units[HFI_MAX_RAILS];
	uint16_t ports[HFI_MAX_RAILS];
	int i, num_rails = 0;
	char *uname = "HFI_UNIT";
	char *pname = "HFI_PORT";
	char uvalue[4], pvalue[4];
	int devid_enabled[PTL_MAX_INIT];
	union psmi_envvar_val devs;

	PSMI_ERR_UNLESS_INITIALIZED(NULL);

	/* Currently only one endpoint is supported. */
	if (psmi_opened_endpoint_count > 0)
		return PSM_TOO_MANY_ENDPOINTS;

	PSMI_PLOCK();

	/* Matched Queue initialization. We do this early because we have to
	 * make sure ep->mq exists and is valid before calling ips_do_work. */
	err = psmi_mq_malloc(&mq);
	if (err != PSM_OK)
		goto fail;

	/* See which ptl devices we want to use for this ep to be opened */
	psmi_getenv("PSM_DEVICES",
		    "Ordered list of PSM-level devices",
		    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
		    (union psmi_envvar_val)PSMI_DEVICES_DEFAULT, &devs);

	if ((err = psmi_parse_devices(devid_enabled, devs.e_str)))
		goto fail;

	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		err = psmi_ep_multirail(&num_rails, units, ports);
		if (err != PSM_OK)
			goto fail;

		/* If multi-rail is used, set the first ep unit/port */
		if (num_rails > 0) {
			snprintf(uvalue, 4, "%1d", units[0]);
			snprintf(pvalue, 4, "%1d", ports[0]);
			setenv(uname, uvalue, 1);
			setenv(pname, pvalue, 1);
		}
	}

	err = __psm_ep_open_internal(unique_job_key,
				     devid_enabled, opts_i, mq, &ep, &epid);
	if (err != PSM_OK)
		goto fail;

	if (psmi_opened_endpoint == NULL) {
		psmi_opened_endpoint = ep;
	} else {
		tmp = psmi_opened_endpoint;
		while (tmp->user_ep_next)
			tmp = tmp->user_ep_next;
		tmp->user_ep_next = ep;
	}
	psmi_opened_endpoint_count++;

	ep->mctxt_prev = ep->mctxt_next = ep;
	ep->mctxt_master = ep;
	mq->ep = ep;

	/* Active Message initialization */
	err = psmi_am_init_internal(ep);
	if (err != PSM_OK)
		goto fail;

	*epo = ep;
	*epido = epid;

	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		for (i = 1; i < num_rails; i++) {
			snprintf(uvalue, 4, "%1d", units[i]);
			snprintf(pvalue, 4, "%1d", ports[i]);
			setenv(uname, uvalue, 1);
			setenv(pname, pvalue, 1);

			/* Create slave EP */
			err = __psm_ep_open_internal(unique_job_key,
						     devid_enabled, opts_i, mq,
						     &tmp, &epid);
			if (err)
				goto fail;

			/* Point back to shared resources on the master EP */
			tmp->am_htable = ep->am_htable;

			/* Link slave EP after master EP. */
			PSM_MCTXT_APPEND(ep, tmp);
		}
	}

	/* Once we've initialized all devices, we can update the MQ with its
	 * default values */
	if (err == PSM_OK)
		err = psmi_mq_initialize_defaults(mq);

	_HFI_VDBG("psm_ep_open() OK....\n");

fail:
	PSMI_PUNLOCK();
	return err;
}
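/*
 * Hedged usage sketch (not part of this file): opening the single supported
 * endpoint through the public psm_ep_open() entry point, starting from the
 * library defaults. psm_ep_open_opts_get_defaults() matches the call made in
 * __psm_ep_open_internal below; open_default_ep and the 5-second timeout are
 * illustrative assumptions.
 */
#include <psm.h>

static psm_error_t
open_default_ep(psm_uuid_t const job_key, psm_ep_t *ep, psm_epid_t *epid)
{
	struct psm_ep_open_opts opts;
	psm_error_t err;

	err = psm_ep_open_opts_get_defaults(&opts);
	if (err != PSM_OK)
		return err;
	opts.timeout = 5 * 1000000000LL;	/* 5s open budget, in ns */
	return psm_ep_open(job_key, &opts, ep, epid);
}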
psm_error_t
__psm_ep_open_internal(psm_uuid_t const unique_job_key, int *devid_enabled,
		       struct psm_ep_open_opts const *opts_i, psm_mq_t mq,
		       psm_ep_t *epo, psm_epid_t *epido)
{
	psm_ep_t ep = NULL;
	uint32_t num_units;
	size_t len;
	psm_error_t err;
	psm_epaddr_t epaddr = NULL;
	char buf[128], *p, *e;
	union psmi_envvar_val envvar_val;
	size_t ptl_sizes;
	struct psm_ep_open_opts opts;
	ptl_t *amsh_ptl, *ips_ptl, *self_ptl;
	int i;

	/* First get the set of default options, we overwrite with the user's
	 * desired values afterwards */
	if ((err = psm_ep_open_opts_get_defaults(&opts)))
		goto fail;
	if (opts_i != NULL) {
		if (opts_i->timeout != -1)
			opts.timeout = opts_i->timeout;
		if (opts_i->unit != -1)
			opts.unit = opts_i->unit;
		if (opts_i->affinity != -1)
			opts.affinity = opts_i->affinity;
		if (opts_i->sendbufs_num != -1)
			opts.sendbufs_num = opts_i->sendbufs_num;
		if (opts_i->network_pkey != HFI_DEFAULT_P_KEY)
			opts.network_pkey = opts_i->network_pkey;
		if (opts_i->port != 0)
			opts.port = opts_i->port;
		if (opts_i->outsl != -1)
			opts.outsl = opts_i->outsl;
		if (opts_i->service_id)
			opts.service_id = (uint64_t) opts_i->service_id;
		if (opts_i->path_res_type != PSM_PATH_RES_NONE)
			opts.path_res_type = opts_i->path_res_type;
		if (opts_i->senddesc_num)
			opts.senddesc_num = opts_i->senddesc_num;
		if (opts_i->imm_size)
			opts.imm_size = opts_i->imm_size;
	}

	/* Get Service ID from environment */
	if (!psmi_getenv("PSM_IB_SERVICE_ID",
			 "HFI Service ID for path resolution",
			 PSMI_ENVVAR_LEVEL_USER,
			 PSMI_ENVVAR_TYPE_ULONG_ULONG,
			 (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID,
			 &envvar_val)) {
		opts.service_id = (uint64_t) envvar_val.e_ulonglong;
	}

	/* Get Path resolution type from environment. Possible choices are:
	 *
	 * NONE : Default, same as previous instances. Utilizes static data.
	 * OPP  : Use OFED Plus Plus library to do path record queries.
	 * UMAD : Use raw libibumad interface to form and process path records.
	 */
	if (!psmi_getenv("PSM_PATH_REC",
			 "Mechanism to query HFI path record (default is no path query)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
			 (union psmi_envvar_val)"none", &envvar_val)) {
		if (!strcasecmp(envvar_val.e_str, "none"))
			opts.path_res_type = PSM_PATH_RES_NONE;
		else if (!strcasecmp(envvar_val.e_str, "opp"))
			opts.path_res_type = PSM_PATH_RES_OPP;
		else if (!strcasecmp(envvar_val.e_str, "umad"))
			opts.path_res_type = PSM_PATH_RES_UMAD;
		else {
			_HFI_ERROR("Unknown path resolution type %s. "
				   "Disabling use of path record query.\n",
				   envvar_val.e_str);
			opts.path_res_type = PSM_PATH_RES_NONE;
		}
	}

	/* If a specific unit is set in the environment, use that one. */
	if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
			 (union psmi_envvar_val)HFI_UNIT_ID_ANY,
			 &envvar_val)) {
		opts.unit = envvar_val.e_long;
	}

	/* Get user specified port number to use. */
	if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
			 (union psmi_envvar_val)HFI_PORT_NUM_ANY,
			 &envvar_val)) {
		opts.port = envvar_val.e_long;
	}

	/* Get service level from environment, path-query overrides it */
	if (!psmi_getenv("HFI_SL",
			 "HFI outgoing ServiceLevel number (default 0)",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG,
			 (union psmi_envvar_val)PSMI_SL_DEFAULT,
			 &envvar_val)) {
		opts.outsl = envvar_val.e_long;
	}

	/* Get network key from environment. MVAPICH and other vendor MPIs do
	 * not specify it on ep open and we may require it for vFabrics.
	 * path-query will override it. */
	if (!psmi_getenv("PSM_PKEY",
			 "HFI PKey to use for endpoint",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG,
			 (union psmi_envvar_val)HFI_DEFAULT_P_KEY,
			 &envvar_val)) {
		opts.network_pkey = (uint64_t) envvar_val.e_ulong;
	}

	/* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of
	   0x7FFF. That's no longer a valid default, so override it if the
	   client was compiled against PSM v1 */
	if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 &&
	    opts.network_pkey == 0x7FFF) {
		opts.network_pkey = HFI_DEFAULT_P_KEY;
	}

	/* Get number of default send buffers from environment */
	if (!psmi_getenv("PSM_NUM_SEND_BUFFERS",
			 "Number of send buffers to allocate [1024]",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)1024, &envvar_val)) {
		opts.sendbufs_num = envvar_val.e_uint;
	}

	/* Get immediate data size - transfers less than immediate data size
	 * do not consume a send buffer and require just a send descriptor. */
	if (!psmi_getenv("PSM_SEND_IMMEDIATE_SIZE",
			 "Immediate data send size not requiring a buffer [128]",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)128, &envvar_val)) {
		opts.imm_size = envvar_val.e_uint;
	}

	/* Get number of send descriptors - by default this is 4 times the
	 * number of send buffers - mainly used for short/inlined messages. */
	if (!psmi_getenv("PSM_NUM_SEND_DESCRIPTORS",
			 "Number of send descriptors to allocate [4096]",
			 PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
			 (union psmi_envvar_val)4096, &envvar_val)) {
		opts.senddesc_num = envvar_val.e_uint;
	}

	if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) {
		if ((err = psm_ep_num_devunits(&num_units)) != PSM_OK)
			goto fail;
	} else
		num_units = 0;

	/* do some error checking */
	if (opts.timeout < -1) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid timeout value %lld",
					(long long)opts.timeout);
		goto fail;
	} else if (num_units &&
		   (opts.unit < -1 || opts.unit >= (int)num_units)) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid Device Unit ID %d (%d units found)",
					opts.unit, num_units);
		goto fail;
	} else if (opts.port < 0 || opts.port > HFI_MAX_PORT) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid Device port number %d",
					opts.port);
		goto fail;
	} else if (opts.affinity < 0 ||
		   opts.affinity > PSM_EP_OPEN_AFFINITY_FORCE) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid Affinity option: %d",
					opts.affinity);
		goto fail;
	} else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) {
		err = psmi_handle_error(NULL, PSM_PARAM_ERR,
					"Invalid SL number: %lld",
					(unsigned long long)opts.outsl);
		goto fail;
	}

	/* Set environment variable if PSM is not allowed to set affinity */
	if (opts.affinity == PSM_EP_OPEN_AFFINITY_SKIP)
		setenv("HFI_NO_CPUAFFINITY", "1", 1);

	/* Allocate end point structure storage */
	ptl_sizes =
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ?
	     psmi_ptl_self.sizeof_ptl() : 0) +
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ?
	     psmi_ptl_ips.sizeof_ptl() : 0) +
	    (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ?
	     psmi_ptl_amsh.sizeof_ptl() : 0);
	if (ptl_sizes == 0)
		return PSM_EP_NO_DEVICE;

	ep = (psm_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64,
				      sizeof(struct psm_ep) + ptl_sizes);
	epaddr = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT,
					    1, sizeof(struct psm_epaddr));
	if (ep == NULL || epaddr == NULL) {
		err = psmi_handle_error(NULL, PSM_NO_MEMORY,
					"Couldn't allocate memory for %s structure",
					ep == NULL ? "psm_ep" : "psm_epaddr");
		goto fail;
	}

	/* Copy PTL enabled status */
	for (i = 0; i < PTL_MAX_INIT; i++)
		ep->devid_enabled[i] = devid_enabled[i];

	/* Matched Queue initialization. We do this early because we have to
	 * make sure ep->mq exists and is valid before calling ips_do_work. */
	ep->mq = mq;

	/* Get ready for PTL initialization */
	memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm_uuid_t));
	ep->epaddr = epaddr;
	ep->memmode = mq->memmode;
	ep->hfi_num_sendbufs = opts.sendbufs_num;
	ep->service_id = opts.service_id;
	ep->path_res_type = opts.path_res_type;
	ep->hfi_num_descriptors = opts.senddesc_num;
	ep->hfi_imm_size = opts.imm_size;
	ep->errh = psmi_errhandler_global;	/* by default use the global one */
	ep->ptl_amsh.ep_poll = psmi_poll_noop;
	ep->ptl_ips.ep_poll = psmi_poll_noop;
	ep->connections = 0;

	/* See how many iterations we want to spin before yielding */
	psmi_getenv("PSM_YIELD_SPIN_COUNT",
		    "Spin poll iterations before yield",
		    PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
		    (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD,
		    &envvar_val);
	ep->yield_spin_cnt = envvar_val.e_uint;

	/* Carve the enabled ptls out of the storage appended to the ep */
	ptl_sizes = 0;
	amsh_ptl = ips_ptl = self_ptl = NULL;
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
		amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
		ptl_sizes += psmi_ptl_amsh.sizeof_ptl();
	}
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
		ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
		ptl_sizes += psmi_ptl_ips.sizeof_ptl();
	}
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
		self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes);
		ptl_sizes += psmi_ptl_self.sizeof_ptl();
	}

	if ((err = psmi_ep_open_device(ep, &opts, unique_job_key,
				       &(ep->context), &ep->epid)))
		goto fail;

	psmi_assert_always(ep->epid != 0);
	ep->epaddr->epid = ep->epid;

	_HFI_VDBG("psmi_ep_open_device() passed\n");

	/* Set our new label as soon as we know what it is */
	strncpy(buf, psmi_gethostname(), sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	p = buf + strlen(buf);

	/* If our rank is set, use it. If not, use context.subcontext notation */
	if (((e = getenv("MPI_RANKID")) != NULL && *e) ||
	    ((e = getenv("PSC_MPI_RANK")) != NULL && *e))
		len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e));
	else
		len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.",
			       (uint32_t) psm_epid_context(ep->epid),
			       (uint32_t) psmi_epid_subcontext(ep->epid));
	*(p + len) = '\0';
	ep->context_mylabel = psmi_strdup(ep, buf);
	if (ep->context_mylabel == NULL) {
		err = PSM_NO_MEMORY;
		goto fail;
	}
	/* hfi_set_mylabel(ep->context_mylabel); */

	if ((err = psmi_epid_set_hostname(psm_epid_nid(ep->epid), buf, 0)))
		goto fail;

	_HFI_VDBG("start ptl device init...\n");
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) {
		if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self)))
			goto fail;
	}
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
		if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips)))
			goto fail;
	}
	/* If we're shm-only, this device is enabled above */
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
		if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh)))
			goto fail;
	} else {
		/* We may have pre-attached as part of getting our rank for
		 * enabling shared contexts. */
	}
	_HFI_VDBG("finish ptl device init...\n");

	/*
	 * Keep only IPS enabled, since only IPS supports multi-rail; other
	 * devices are set up once. Only the IPS device can re-enter this
	 * function for the slave rails.
	 */
	for (i = 0; i < PTL_MAX_INIT; i++) {
		if (devid_enabled[i] != PTL_DEVID_IPS) {
			devid_enabled[i] = -1;
		}
	}

	*epido = ep->epid;
	*epo = ep;

	return PSM_OK;

fail:
	if (ep != NULL) {
		if (ep->context.fd != -1)
			close(ep->context.fd);
		psmi_free(ep);
	}
	if (epaddr != NULL)
		psmi_free(epaddr);
	return err;
}
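/*
 * Hedged illustration (not part of this file): in __psm_ep_open_internal the
 * option sources layer as defaults < opts_i < environment. The
 * `if (!psmi_getenv(...))` pattern above assigns only when psmi_getenv
 * returns zero, i.e. when the user actually exported the variable, so an
 * exported variable wins over the corresponding opts_i field.
 * tune_before_open and its values are illustrative assumptions.
 */
#include <stdlib.h>	/* setenv */

static void tune_before_open(struct psm_ep_open_opts *opts)
{
	opts->sendbufs_num = 2048;		   /* overrides the default... */
	setenv("PSM_NUM_SEND_BUFFERS", "4096", 1); /* ...but this wins */
}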
static psm_error_t
psmi_ep_open_device(const psm_ep_t ep,
		    const struct psm_ep_open_opts *opts,
		    const psm_uuid_t unique_job_key,
		    struct psmi_context *context, psm_epid_t *epid)
{
	psm_error_t err = PSM_OK;

	/* Skip affinity. No affinity if:
	 * 1. User explicitly sets no-affinity=YES in environment.
	 * 2. User doesn't set affinity in environment and PSM is opened with
	 *    option affinity skip. */
	if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) {
		uint32_t rcvthread_flags;
		union psmi_envvar_val env_rcvthread;
		static int norcvthread;	/* only for first rail */

		ep->out_sl = opts->outsl;

		if ((err = psmi_context_open(ep, opts->unit, opts->port,
					     unique_job_key, opts->timeout,
					     context)) != PSM_OK)
			goto fail;

		_HFI_DBG("[%d]use unit %d port %d\n", getpid(),
			 context->ctrl->__hfi_unit, 1);

		/* At this point, we have the unit id and port number, so
		 * check that the pkey is not 0x0/0x7fff/0xffff and matches
		 * one of the pkeys in the table. */
		if ((err = psmi_ep_verify_pkey(ep,
					       (uint16_t) opts->network_pkey,
					       &ep->network_pkey)) != PSM_OK)
			goto fail;

		/* See if we want to activate support for receive thread */
		psmi_getenv("PSM_RCVTHREAD",
			    "Recv thread flags (0 disables thread)",
			    PSMI_ENVVAR_LEVEL_USER,
			    PSMI_ENVVAR_TYPE_UINT_FLAGS,
			    (union psmi_envvar_val)(norcvthread++ ? 0 :
						    PSMI_RCVTHREAD_FLAGS),
			    &env_rcvthread);
		rcvthread_flags = env_rcvthread.e_uint;

		/* If enabled, use the pollurg capability to implement a
		 * receive interrupt thread that can handle urg packets */
		if (rcvthread_flags) {
			context->runtime_flags |= PSMI_RUNTIME_RCVTHREAD;
#ifdef PSMI_PLOCK_IS_NOLOCK
			psmi_handle_error(PSMI_EP_NORETURN, PSM_INTERNAL_ERR,
					  "#define PSMI_PLOCK_IS_NOLOCK not functional yet "
					  "with RCVTHREAD on");
#endif
		}
		context->rcvthread_flags = rcvthread_flags;

		*epid = context->epid;
	} else if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) {
		int rank;

		/* In shm-only mode, we need to derive a valid epid based on
		 * our rank. We try to get it from the environment if it is
		 * available, or resort to using our PID as the rank. */
		union psmi_envvar_val env_rankid;

		if (psmi_getenv("MPI_LOCALRANKID", "Shared context rankid",
				PSMI_ENVVAR_LEVEL_HIDDEN,
				PSMI_ENVVAR_TYPE_INT,
				(union psmi_envvar_val)-1, &env_rankid)) {
			if (psmi_getenv("PSC_MPI_NODE_RANK",
					"Shared context rankid",
					PSMI_ENVVAR_LEVEL_HIDDEN,
					PSMI_ENVVAR_TYPE_INT,
					(union psmi_envvar_val)-1,
					&env_rankid)) {
				rank = getpid();
			} else
				rank = env_rankid.e_int;
		} else
			rank = env_rankid.e_int;

		/*
		 * We use a LID of 0 for non-HFI communication.
		 * Since a jobkey is not available from IPS, pull the
		 * first 16 bits from the UUID.
		 */
		*epid = PSMI_EPID_PACK(((uint16_t *) unique_job_key)[0],
				       (rank >> 3), rank, 0,
				       PSMI_HFI_TYPE_DEFAULT, 0x7, rank);
	} else {