/*
 * Add ipsaddr to the epstate table and return the new connection index to
 * the caller in 'connidx_o'.
 *
 * The table is grown by PTL_EPADDR_ALLOC_CHUNK entries when it is full.  The
 * first free slot, searching circularly from eps_tab_nextidx, is used.
 *
 * Returns PSM2_OK on success, PSM2_NO_MEMORY if the table could not be grown,
 * or the result of psmi_handle_error(PSM2_TOO_MANY_ENDPOINTS) when the free
 * slot lies at or beyond IPS_EPSTATE_CONNIDX_MAX.  On every failure return
 * the table bookkeeping is left as it was before the call (apart from the
 * eps_tab_nextidx search hint) and '*connidx_o' is not written.
 */
psm2_error_t
ips_epstate_add(struct ips_epstate *eps, struct ips_epaddr *ipsaddr,
		ips_epstate_idx *connidx_o)
{
	int i, j;
	ips_epstate_idx connidx;

	if (++eps->eps_tabsizeused > eps->eps_tabsize) {	/* realloc */
		struct ips_epstate_entry *newtab;

		eps->eps_tabsize += PTL_EPADDR_ALLOC_CHUNK;
		newtab = (struct ips_epstate_entry *)
		    psmi_calloc(eps->context->ep, PER_PEER_ENDPOINT,
				eps->eps_tabsize,
				sizeof(struct ips_epstate_entry));
		if (newtab == NULL) {
			/*
			 * Undo the bookkeeping: the old table (if any) is
			 * still the live one, so the recorded size must keep
			 * matching it or later calls would index past it.
			 */
			eps->eps_tabsize -= PTL_EPADDR_ALLOC_CHUNK;
			eps->eps_tabsizeused--;
			return PSM2_NO_MEMORY;
		}
		if (eps->eps_tab) {	/* NOT first alloc */
			for (i = 0;
			     i < eps->eps_tabsize - PTL_EPADDR_ALLOC_CHUNK; i++)
				newtab[i] = eps->eps_tab[i];	/* deep copy */
			psmi_free(eps->eps_tab);
		}
		eps->eps_tab = newtab;
	}

	/* Find the next free hole.  We can afford to do this since connect is
	 * not in the critical path */
	for (i = 0, j = eps->eps_tab_nextidx; i < eps->eps_tabsize; i++, j++) {
		if (j == eps->eps_tabsize)
			j = 0;
		if (eps->eps_tab[j].ipsaddr == NULL) {
			eps->eps_tab_nextidx = j + 1;
			if (eps->eps_tab_nextidx == eps->eps_tabsize)
				eps->eps_tab_nextidx = 0;
			break;
		}
	}
	/* tabsizeused <= tabsize is maintained above, so a hole must exist */
	psmi_assert_always(i != eps->eps_tabsize);

	connidx = (j - eps->eps_base_idx) & (IPS_EPSTATE_CONNIDX_MAX-1);
	_HFI_VDBG("node %s gets connidx=%d (table idx %d)\n",
		  psmi_epaddr_get_name(((psm2_epaddr_t) ipsaddr)->epid),
		  connidx, j);

	if (j >= IPS_EPSTATE_CONNIDX_MAX) {
		/*
		 * Reject before claiming the slot so that a failed add does
		 * not leave 'ipsaddr' reachable through the table.
		 */
		eps->eps_tabsizeused--;
		return psmi_handle_error(eps->context->ep,
					 PSM2_TOO_MANY_ENDPOINTS,
					 "Can't connect to more than %d non-local endpoints",
					 IPS_EPSTATE_CONNIDX_MAX);
	}

	eps->eps_tab[j].ipsaddr = ipsaddr;
	*connidx_o = connidx;
	return PSM2_OK;
}
/*
 * Initialize the TID bookkeeping in 'tidc' for the given context.
 *
 * context    - context whose ctrl/ctxt_info, ep and tid_ctrl fields are read.
 * tidc       - structure to initialize.
 * cb         - stored in tidc->tid_avail_cb.
 * cb_context - stored in tidc->tid_avail_context.
 *
 * Returns PSM_NO_MEMORY if a private control structure cannot be allocated,
 * or the result of psmi_handle_error() when the expected-TID count is not a
 * multiple of 8.
 *
 * NOTE(review): this definition is incomplete as it appears here -- no final
 * return statement or closing brace follows the last 'if' block, and the
 * 'entries' array is declared but not used in the visible part.  Confirm
 * against the complete file.
 */
psm_error_t
ips_tid_init(const psmi_context_t *context, struct ips_tid *tidc,
	     ips_tid_avail_cb_fn_t cb, void *cb_context)
{
	const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;

	/* Statistics descriptor exposing tidc->tid_num_total. */
	struct psmi_stats_entry entries[] = {
		PSMI_STATS_DECL("tid update count",
				MPSPAWN_STATS_REDUCTION_ALL,
				NULL, &tidc->tid_num_total),
	};

	tidc->context = context;
	/* these are in group unit, a group is 8 tids or 4 tidpairs */
	tidc->tid_num_total = 0;
	tidc->tid_num_inuse = 0;
	tidc->tid_avail_cb = cb;
	tidc->tid_avail_context = cb_context;

	/* Use the control block supplied by the context if there is one,
	 * otherwise allocate a zeroed private one. */
	tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl;
	if (!tidc->tid_ctrl) {
		tidc->tid_ctrl = (struct ips_tid_ctrl *)
		    psmi_calloc(context->ep, UNDEFINED, 1,
				sizeof(struct ips_tid_ctrl));
		if (tidc->tid_ctrl == NULL) {
			return PSM_NO_MEMORY;
		}
	}

	/*
	 * Only the master process can initialize.
	 */
	if (ctxt_info->subctxt == 0) {
		pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock,
				  PTHREAD_PROCESS_SHARED);

		/* check if exp tids are multiple of 8 (a group) */
		if (context->ctrl->__hfi_tidexpcnt % 8)
			return psmi_handle_error(context->ep,
						 PSM_INTERNAL_ERR,
						 "Expected tids(%d) are not multi-groups(8)",
						 context->ctrl->__hfi_tidexpcnt);

		/* Limits are kept in groups: tid count divided by 8. */
		tidc->tid_ctrl->tid_num_max =
		    context->ctrl->__hfi_tidexpcnt >> 3;
		tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max;
	}
/*
 * Initialize the TID state embedded in 'protoexp' (protoexp->tidc).
 *
 * context    - context whose user_info, ctrl (base_info, ctxt_info,
 *              __hfi_tidexpcnt), runtime_flags, ep and tid_ctrl are read.
 * protoexp   - owner of the ips_tid being initialized.
 * cb         - stored in tidc->tid_avail_cb.
 * cb_context - stored in tidc->tid_avail_context.
 *
 * Returns PSM2_NO_MEMORY if any allocation fails, otherwise the result of
 * psmi_stats_register_type().
 *
 * On an allocation failure every buffer allocated by this call is released
 * again and tidc->tid_array is reset to NULL.  NOTE(review): in that case
 * tidc->tid_cachemap may already have been handed the released root array
 * and must be treated as uninitialized by the caller.
 */
psm2_error_t
ips_tid_init(const psmi_context_t *context, struct ips_protoexp *protoexp,
	     ips_tid_avail_cb_fn_t cb, void *cb_context)
{
	const struct hfi1_user_info_dep *user_info = &context->user_info;
	const struct hfi1_base_info *base_info = &context->ctrl->base_info;
	const struct hfi1_ctxt_info *ctxt_info = &context->ctrl->ctxt_info;
	struct ips_tid *tidc = &protoexp->tidc;
	/* Cache-map node storage; kept at function scope for error cleanup. */
	cl_map_item_t *root = NULL;

	struct psmi_stats_entry entries[] = {
		PSMI_STATS_DECL("tid update count",
				MPSPAWN_STATS_REDUCTION_ALL,
				NULL, &tidc->tid_num_total),
	};

	tidc->context = context;
	tidc->protoexp = protoexp;
	tidc->tid_num_total = 0;
	tidc->tid_num_inuse = 0;
	tidc->tid_avail_cb = cb;
	tidc->tid_avail_context = cb_context;
	tidc->tid_array = NULL;
	tidc->invalidation_event = (uint64_t *)
	    (ptrdiff_t) base_info->events_bufbase;

	/*
	 * PSM uses tid registration caching only if driver has enabled it.
	 */
	if (!(tidc->context->runtime_flags & HFI1_CAP_TID_UNMAP)) {
		int i;
		/* NOTE: the cache macros below (NTID, NIDLE, IHEAD, INEXT,
		 * IPREV, INVALIDATE) are defined elsewhere; the local names
		 * are kept unchanged in case they refer to them. */
		cl_qmap_t *p_map;
		cl_map_item_t *nil_item;

		tidc->tid_array = (uint32_t *)
		    psmi_calloc(context->ep, UNDEFINED,
				context->ctrl->__hfi_tidexpcnt,
				sizeof(uint32_t));
		if (tidc->tid_array == NULL)
			return PSM2_NO_MEMORY;

		/*
		 * first is root node, last is terminator node.
		 */
		p_map = &tidc->tid_cachemap;
		root = (cl_map_item_t *)
		    psmi_calloc(context->ep, UNDEFINED,
				context->ctrl->__hfi_tidexpcnt + 2,
				sizeof(cl_map_item_t));
		if (root == NULL)
			goto fail_nomem;

		nil_item = &root[context->ctrl->__hfi_tidexpcnt + 1];

		ips_tidcache_map_init(p_map, root, nil_item);

		/* Empty cache: no tids, no idle entries, idle list points
		 * back at its own head. */
		NTID = 0;
		NIDLE = 0;
		IPREV(IHEAD) = INEXT(IHEAD) = IHEAD;
		for (i = 1; i <= context->ctrl->__hfi_tidexpcnt; i++) {
			INVALIDATE(i) = 1;
		}

		/*
		 * if not shared context, all tids are used by the same
		 * process. Otherwise, subcontext process can only cache
		 * its own portion. Driver makes the same tid number
		 * assignment to subcontext processes.
		 */
		tidc->tid_cachesize = context->ctrl->__hfi_tidexpcnt;
		if (user_info->subctxt_cnt > 0) {
			uint16_t remainder = tidc->tid_cachesize %
			    user_info->subctxt_cnt;
			tidc->tid_cachesize /= user_info->subctxt_cnt;
			/* The first 'remainder' subcontexts get one extra. */
			if (ctxt_info->subctxt < remainder)
				tidc->tid_cachesize++;
		}
	}

	/*
	 * Setup shared control structure.
	 */
	tidc->tid_ctrl = (struct ips_tid_ctrl *)context->tid_ctrl;
	if (!tidc->tid_ctrl) {
		tidc->tid_ctrl = (struct ips_tid_ctrl *)
		    psmi_calloc(context->ep, UNDEFINED, 1,
				sizeof(struct ips_tid_ctrl));
		if (tidc->tid_ctrl == NULL)
			goto fail_nomem;
	}

	/*
	 * Only the master process can initialize.
	 */
	if (ctxt_info->subctxt == 0) {
		pthread_spin_init(&tidc->tid_ctrl->tid_ctrl_lock,
				  PTHREAD_PROCESS_SHARED);

		tidc->tid_ctrl->tid_num_max = context->ctrl->__hfi_tidexpcnt;
		tidc->tid_ctrl->tid_num_avail = tidc->tid_ctrl->tid_num_max;
	}

	return psmi_stats_register_type(PSMI_STATS_NO_HEADING,
					PSMI_STATSTYPE_TIDS,
					entries,
					PSMI_STATS_HOWMANY(entries), tidc);

fail_nomem:
	/* Release whatever the caching setup above managed to allocate. */
	if (root != NULL)
		psmi_free(root);
	if (tidc->tid_array != NULL) {
		psmi_free(tidc->tid_array);
		tidc->tid_array = NULL;
	}
	return PSM2_NO_MEMORY;
}
psm_error_t __psm_ep_open_internal(psm_uuid_t const unique_job_key, int *devid_enabled, struct psm_ep_open_opts const *opts_i, psm_mq_t mq, psm_ep_t *epo, psm_epid_t *epido) { psm_ep_t ep = NULL; uint32_t num_units; size_t len; psm_error_t err; psm_epaddr_t epaddr = NULL; char buf[128], *p, *e; union psmi_envvar_val envvar_val; size_t ptl_sizes; struct psm_ep_open_opts opts; ptl_t *amsh_ptl, *ips_ptl, *self_ptl; int i; /* First get the set of default options, we overwrite with the user's * desired values afterwards */ if ((err = psm_ep_open_opts_get_defaults(&opts))) goto fail; if (opts_i != NULL) { if (opts_i->timeout != -1) opts.timeout = opts_i->timeout; if (opts_i->unit != -1) opts.unit = opts_i->unit; if (opts_i->affinity != -1) opts.affinity = opts_i->affinity; if (opts_i->sendbufs_num != -1) opts.sendbufs_num = opts_i->sendbufs_num; if (opts_i->network_pkey != HFI_DEFAULT_P_KEY) opts.network_pkey = opts_i->network_pkey; if (opts_i->port != 0) opts.port = opts_i->port; if (opts_i->outsl != -1) opts.outsl = opts_i->outsl; if (opts_i->service_id) opts.service_id = (uint64_t) opts_i->service_id; if (opts_i->path_res_type != PSM_PATH_RES_NONE) opts.path_res_type = opts_i->path_res_type; if (opts_i->senddesc_num) opts.senddesc_num = opts_i->senddesc_num; if (opts_i->imm_size) opts.imm_size = opts_i->imm_size; } /* Get Service ID from environment */ if (!psmi_getenv("PSM_IB_SERVICE_ID", "HFI Service ID for path resolution", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG_ULONG, (union psmi_envvar_val)HFI_DEFAULT_SERVICE_ID, &envvar_val)) { opts.service_id = (uint64_t) envvar_val.e_ulonglong; } /* Get Path resolution type from environment Possible choices are: * * NONE : Default same as previous instances. Utilizes static data. * OPP : Use OFED Plus Plus library to do path record queries. * UMAD : Use raw libibumad interface to form and process path records. 
*/ if (!psmi_getenv("PSM_PATH_REC", "Mechanism to query HFI path record (default is no path query)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"none", &envvar_val)) { if (!strcasecmp(envvar_val.e_str, "none")) opts.path_res_type = PSM_PATH_RES_NONE; else if (!strcasecmp(envvar_val.e_str, "opp")) opts.path_res_type = PSM_PATH_RES_OPP; else if (!strcasecmp(envvar_val.e_str, "umad")) opts.path_res_type = PSM_PATH_RES_UMAD; else { _HFI_ERROR("Unknown path resolution type %s. " "Disabling use of path record query.\n", envvar_val.e_str); opts.path_res_type = PSM_PATH_RES_NONE; } } /* If a specific unit is set in the environment, use that one. */ if (!psmi_getenv("HFI_UNIT", "Device Unit number (-1 autodetects)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)HFI_UNIT_ID_ANY, &envvar_val)) { opts.unit = envvar_val.e_long; } /* Get user specified port number to use. */ if (!psmi_getenv("HFI_PORT", "IB Port number (0 autodetects)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)HFI_PORT_NUM_ANY, &envvar_val)) { opts.port = envvar_val.e_long; } /* Get service level from environment, path-query overrides it */ if (!psmi_getenv ("HFI_SL", "HFI outging ServiceLevel number (default 0)", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_LONG, (union psmi_envvar_val)PSMI_SL_DEFAULT, &envvar_val)) { opts.outsl = envvar_val.e_long; } /* Get network key from environment. MVAPICH and other vendor MPIs do not * specify it on ep open and we may require it for vFabrics. * path-query will override it. */ if (!psmi_getenv("PSM_PKEY", "HFI PKey to use for endpoint", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_ULONG, (union psmi_envvar_val)HFI_DEFAULT_P_KEY, &envvar_val)) { opts.network_pkey = (uint64_t) envvar_val.e_ulong; } /* BACKWARDS COMPATIBILITY: Open MPI likes to choose its own PKEY of 0x7FFF. 
That's no longer a valid default, so override it if the client was compiled against PSM v1 */ if (PSMI_VERNO_GET_MAJOR(psmi_verno_client()) < 2 && opts.network_pkey == 0x7FFF) { opts.network_pkey = HFI_DEFAULT_P_KEY; } /* Get number of default send buffers from environment */ if (!psmi_getenv("PSM_NUM_SEND_BUFFERS", "Number of send buffers to allocate [1024]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)1024, &envvar_val)) { opts.sendbufs_num = envvar_val.e_uint; } /* Get immediate data size - transfers less than immediate data size do * not consume a send buffer and require just a send descriptor. */ if (!psmi_getenv("PSM_SEND_IMMEDIATE_SIZE", "Immediate data send size not requiring a buffer [128]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)128, &envvar_val)) { opts.imm_size = envvar_val.e_uint; } /* Get numner of send descriptors - by default this is 4 times the number * of send buffers - mainly used for short/inlined messages. */ if (!psmi_getenv("PSM_NUM_SEND_DESCRIPTORS", "Number of send descriptors to allocate [4096]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)4096, &envvar_val)) { opts.senddesc_num = envvar_val.e_uint; } if (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { if ((err = psm_ep_num_devunits(&num_units)) != PSM_OK) goto fail; } else num_units = 0; /* do some error checking */ if (opts.timeout < -1) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid timeout value %lld", (long long)opts.timeout); goto fail; } else if (num_units && (opts.unit < -1 || opts.unit >= (int)num_units)) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid Device Unit ID %d (%d units found)", opts.unit, num_units); goto fail; } else if (opts.port < 0 || opts.port > HFI_MAX_PORT) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid Device port number %d", opts.port); goto fail; } else if (opts.affinity < 0 || opts.affinity > PSM_EP_OPEN_AFFINITY_FORCE) { err = 
psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid Affinity option: %d", opts.affinity); goto fail; } else if (opts.outsl < PSMI_SL_MIN || opts.outsl > PSMI_SL_MAX) { err = psmi_handle_error(NULL, PSM_PARAM_ERR, "Invalid SL number: %lld", (unsigned long long)opts.outsl); goto fail; } /* Set environment variable if PSM is not allowed to set affinity */ if (opts.affinity == PSM_EP_OPEN_AFFINITY_SKIP) setenv("HFI_NO_CPUAFFINITY", "1", 1); /* Allocate end point structure storage */ ptl_sizes = (psmi_device_is_enabled(devid_enabled, PTL_DEVID_SELF) ? psmi_ptl_self.sizeof_ptl() : 0) + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_IPS) ? psmi_ptl_ips.sizeof_ptl() : 0) + (psmi_device_is_enabled(devid_enabled, PTL_DEVID_AMSH) ? psmi_ptl_amsh.sizeof_ptl() : 0); if (ptl_sizes == 0) return PSM_EP_NO_DEVICE; ep = (psm_ep_t) psmi_memalign(PSMI_EP_NONE, UNDEFINED, 64, sizeof(struct psm_ep) + ptl_sizes); epaddr = (psm_epaddr_t) psmi_calloc(PSMI_EP_NONE, PER_PEER_ENDPOINT, 1, sizeof(struct psm_epaddr)); if (ep == NULL || epaddr == NULL) { err = psmi_handle_error(NULL, PSM_NO_MEMORY, "Couldn't allocate memory for %s structure", ep == NULL ? "psm_ep" : "psm_epaddr"); goto fail; } /* Copy PTL enabled status */ for (i = 0; i < PTL_MAX_INIT; i++) ep->devid_enabled[i] = devid_enabled[i]; /* Matched Queue initialization. We do this early because we have to * make sure ep->mq exists and is valid before calling ips_do_work. 
*/ ep->mq = mq; /* Get ready for PTL initialization */ memcpy(&ep->uuid, (void *)unique_job_key, sizeof(psm_uuid_t)); ep->epaddr = epaddr; ep->memmode = mq->memmode; ep->hfi_num_sendbufs = opts.sendbufs_num; ep->service_id = opts.service_id; ep->path_res_type = opts.path_res_type; ep->hfi_num_descriptors = opts.senddesc_num; ep->hfi_imm_size = opts.imm_size; ep->errh = psmi_errhandler_global; /* by default use the global one */ ep->ptl_amsh.ep_poll = psmi_poll_noop; ep->ptl_ips.ep_poll = psmi_poll_noop; ep->connections = 0; /* See how many iterations we want to spin before yielding */ psmi_getenv("PSM_YIELD_SPIN_COUNT", "Spin poll iterations before yield", PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)PSMI_BLOCKUNTIL_POLLS_BEFORE_YIELD, &envvar_val); ep->yield_spin_cnt = envvar_val.e_uint; ptl_sizes = 0; amsh_ptl = ips_ptl = self_ptl = NULL; if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { amsh_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_amsh.sizeof_ptl(); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { ips_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_ips.sizeof_ptl(); } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { self_ptl = (ptl_t *) (ep->ptl_base_data + ptl_sizes); ptl_sizes += psmi_ptl_self.sizeof_ptl(); } if ((err = psmi_ep_open_device(ep, &opts, unique_job_key, &(ep->context), &ep->epid))) goto fail; psmi_assert_always(ep->epid != 0); ep->epaddr->epid = ep->epid; _HFI_VDBG("psmi_ep_open_device() passed\n"); /* Set our new label as soon as we know what it is */ strncpy(buf, psmi_gethostname(), sizeof(buf) - 1); buf[sizeof(buf) - 1] = '\0'; p = buf + strlen(buf); /* If our rank is set, use it. 
If not, use context.subcontext notation */ if (((e = getenv("MPI_RANKID")) != NULL && *e) || ((e = getenv("PSC_MPI_RANK")) != NULL && *e)) len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.", atoi(e)); else len = snprintf(p, sizeof(buf) - strlen(buf), ":%d.%d.", (uint32_t) psm_epid_context(ep->epid), (uint32_t) psmi_epid_subcontext(ep->epid)); *(p + len) = '\0'; ep->context_mylabel = psmi_strdup(ep, buf); if (ep->context_mylabel == NULL) { err = PSM_NO_MEMORY; goto fail; } /* hfi_set_mylabel(ep->context_mylabel); */ if ((err = psmi_epid_set_hostname(psm_epid_nid(ep->epid), buf, 0))) goto fail; _HFI_VDBG("start ptl device init...\n"); if (psmi_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { if ((err = psmi_ptl_self.init(ep, self_ptl, &ep->ptl_self))) goto fail; } if (psmi_ep_device_is_enabled(ep, PTL_DEVID_IPS)) { if ((err = psmi_ptl_ips.init(ep, ips_ptl, &ep->ptl_ips))) goto fail; } /* If we're shm-only, this device is enabled above */ if (psmi_ep_device_is_enabled(ep, PTL_DEVID_AMSH)) { if ((err = psmi_ptl_amsh.init(ep, amsh_ptl, &ep->ptl_amsh))) goto fail; } else { /* We may have pre-attached as part of getting our rank for enabling * shared contexts. */ } _HFI_VDBG("finish ptl device init...\n"); /* * Keep only IPS since only IPS support multi-rail, other devices * are only setup once. IPS device can come to this function again. */ for (i = 0; i < PTL_MAX_INIT; i++) { if (devid_enabled[i] != PTL_DEVID_IPS) { devid_enabled[i] = -1; } } *epido = ep->epid; *epo = ep; return PSM_OK; fail: if (ep != NULL) { if (ep->context.fd != -1) close(ep->context.fd); psmi_free(ep); } if (epaddr != NULL) psmi_free(epaddr); return err; }
static psm_error_t psmi_ep_devlids(uint16_t **lids, uint32_t *num_lids_o, uint64_t my_gid_hi, uint64_t my_gid_lo) { static uint16_t *hfi_lids; static uint32_t nlids; uint32_t num_units; int i; psm_error_t err = PSM_OK; PSMI_ERR_UNLESS_INITIALIZED(NULL); if (hfi_lids == NULL) { if ((err = psm_ep_num_devunits(&num_units))) goto fail; hfi_lids = (uint16_t *) psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_units * HFI_MAX_PORT, sizeof(uint16_t)); if (hfi_lids == NULL) { err = psmi_handle_error(NULL, PSM_NO_MEMORY, "Couldn't allocate memory for dev_lids structure"); goto fail; } for (i = 0; i < num_units; i++) { int j; for (j = 1; j <= HFI_MAX_PORT; j++) { int lid = hfi_get_port_lid(i, j); int ret; uint64_t gid_hi = 0, gid_lo = 0; if (lid == -1) continue; ret = hfi_get_port_gid(i, j, &gid_hi, &gid_lo); if (ret == -1) continue; else if (my_gid_hi != gid_hi) { _HFI_VDBG("LID %d, unit %d, port %d, " "mismatched GID %llx:%llx and " "%llx:%llx\n", lid, i, j, (unsigned long long)gid_hi, (unsigned long long)gid_lo, (unsigned long long)my_gid_hi, (unsigned long long) my_gid_lo); continue; } _HFI_VDBG("LID %d, unit %d, port %d, " "matching GID %llx:%llx and " "%llx:%llx\n", lid, i, j, (unsigned long long)gid_hi, (unsigned long long)gid_lo, (unsigned long long)my_gid_hi, (unsigned long long)my_gid_lo); hfi_lids[nlids++] = (uint16_t) lid; } } if (nlids == 0) { err = psmi_handle_error(NULL, PSM_EP_DEVICE_FAILURE, "Couldn't get lid&gid from any unit/port"); goto fail; } } *lids = hfi_lids; *num_lids_o = nlids; fail: return err; }