/**
 * Fill an IB SA path record describing the route from the local port to the
 * remote endpoint address stored in @a ep.
 *
 * @param ep    CM endpoint holding the destination address (subnet prefix,
 *              guid, lid).
 * @param path  Path record to fill (output).
 *
 * @return UCS_OK (cannot fail).
 */
static ucs_status_t uct_cm_ep_fill_path_rec(uct_cm_ep_t *ep,
                                            struct ibv_sa_path_rec *path)
{
    uct_cm_iface_t *iface = ucs_derived_of(ep->super.super.iface,
                                           uct_cm_iface_t);

    path->dgid.global.subnet_prefix = ep->dest_addr.subnet_prefix;
    path->dgid.global.interface_id  = ep->dest_addr.guid;
    path->sgid                      = iface->super.gid;
    path->dlid                      = htons(ep->dest_addr.lid);
    path->slid                      = htons(uct_ib_iface_port_attr(&iface->super)->lid);
    path->raw_traffic               = 0; /* IB traffic */
    path->flow_label                = 0;
    path->hop_limit                 = 0;
    path->traffic_class             = 0;
    path->reversible                = htonl(1); /* IBCM currently only supports
                                                   reversible paths */
    path->numb_path                 = 0;
    /* The path record carries the pkey in network byte order; converting a
     * host value should use htons (was ntohs - identical on all supported
     * platforms, but semantically wrong direction). */
    path->pkey                      = htons(iface->super.pkey_value);
    path->sl                        = iface->super.sl;
    path->mtu_selector              = 2; /* EQ */
    path->mtu                       = uct_ib_iface_port_attr(&iface->super)->active_mtu;
    path->rate_selector             = 2; /* EQ */
    path->rate                      = IBV_RATE_MAX;
    path->packet_life_time_selector = 2; /* EQ */
    path->packet_life_time          = 0;
    path->preference                = 0; /* Use first path */
    return UCS_OK;
}
/* Fill in the transport capabilities shared by all RC verbs interfaces.
 * The limits are derived from the inline-send threshold, the configured
 * segment size and the port's maximal message size. */
void uct_rc_verbs_iface_common_query(uct_rc_verbs_iface_common_t *verbs_iface,
                                     uct_rc_iface_t *iface,
                                     uct_iface_attr_t *iface_attr)
{
    size_t seg_size = iface->super.config.seg_size;
    size_t max_msg  = uct_ib_iface_port_attr(&iface->super)->max_msg_sz;
    size_t max_iov  = uct_ib_iface_get_max_iov(&iface->super);

    /* PUT: short limited by inline data, bcopy by the segment size,
     * zcopy by the port's maximal message size */
    iface_attr->cap.put.max_short = verbs_iface->config.max_inline;
    iface_attr->cap.put.max_bcopy = seg_size;
    iface_attr->cap.put.min_zcopy = 0;
    iface_attr->cap.put.max_zcopy = max_msg;
    iface_attr->cap.put.max_iov   = max_iov;

    /* GET: a zcopy read must exceed the inline-response limit */
    iface_attr->cap.get.max_bcopy = seg_size;
    iface_attr->cap.get.min_zcopy = iface->super.config.max_inl_resp + 1;
    iface_attr->cap.get.max_zcopy = max_msg;
    iface_attr->cap.get.max_iov   = max_iov;

    /* AM: every active message carries an RC header, subtracted from each
     * limit; one IOV entry is reserved for that header */
    iface_attr->cap.am.max_short  = verbs_iface->config.max_inline - sizeof(uct_rc_hdr_t);
    iface_attr->cap.am.max_bcopy  = seg_size - sizeof(uct_rc_hdr_t);
    iface_attr->cap.am.min_zcopy  = 0;
    iface_attr->cap.am.max_zcopy  = seg_size - sizeof(uct_rc_hdr_t);
    /* The first IOV is reserved for the header */
    iface_attr->cap.am.max_iov    = max_iov - 1;
    /* TODO: may need to change for dc/rc */
    iface_attr->cap.am.max_hdr    = verbs_iface->config.short_desc_size - sizeof(uct_rc_hdr_t);

    iface_attr->cap.flags        |= UCT_IFACE_FLAG_ERRHANDLE_PEER_FAILURE;

    /* Software overhead */
    iface_attr->overhead = 75e-9;
}
/* Query the attributes of an RC verbs interface: start from the generic RC
 * answer, then override the size limits and advertise whichever atomic
 * operations the device supports. */
static ucs_status_t uct_rc_verbs_iface_query(uct_iface_h tl_iface,
                                             uct_iface_attr_t *iface_attr)
{
    uct_rc_verbs_iface_t *iface = ucs_derived_of(tl_iface, uct_rc_verbs_iface_t);
    struct ibv_exp_device_attr *dev_attr =
        &uct_ib_iface_device(&iface->super.super)->dev_attr;
    size_t seg_size = iface->super.super.config.seg_size;
    size_t max_msg  = uct_ib_iface_port_attr(&iface->super.super)->max_msg_sz;

    uct_rc_iface_query(&iface->super, iface_attr);

    /* PUT limits */
    iface_attr->cap.put.max_short = iface->config.max_inline;
    iface_attr->cap.put.max_bcopy = seg_size;
    iface_attr->cap.put.max_zcopy = max_msg;

    /* GET limits */
    iface_attr->cap.get.max_bcopy = seg_size;
    iface_attr->cap.get.max_zcopy = max_msg;

    /* AM limits - each active message carries an RC header */
    iface_attr->cap.am.max_short = iface->config.max_inline - sizeof(uct_rc_hdr_t);
    iface_attr->cap.am.max_bcopy = seg_size - sizeof(uct_rc_hdr_t);
    iface_attr->cap.am.max_zcopy = seg_size - sizeof(uct_rc_hdr_t);
    iface_attr->cap.am.max_hdr   = iface->config.short_desc_size - sizeof(uct_rc_hdr_t);

    /*
     * Atomics.
     * Advertise them only when the device supports at least one kind.
     */
    if (IBV_EXP_HAVE_ATOMIC_HCA(dev_attr) ||
        IBV_EXP_HAVE_ATOMIC_GLOB(dev_attr) ||
        IBV_EXP_HAVE_ATOMIC_HCA_REPLY_BE(dev_attr))
    {
        iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_ADD64 |
                                 UCT_IFACE_FLAG_ATOMIC_FADD64 |
                                 UCT_IFACE_FLAG_ATOMIC_CSWAP64;

        if (uct_rc_verbs_is_ext_atomic_supported(dev_attr, sizeof(uint32_t))) {
            iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_ADD32 |
                                     UCT_IFACE_FLAG_ATOMIC_FADD32 |
                                     UCT_IFACE_FLAG_ATOMIC_SWAP32 |
                                     UCT_IFACE_FLAG_ATOMIC_CSWAP32;
        }

        if (uct_rc_verbs_is_ext_atomic_supported(dev_attr, sizeof(uint64_t))) {
            iface_attr->cap.flags |= UCT_IFACE_FLAG_ATOMIC_SWAP64;
        }
    }

    return UCS_OK;
}
/* Serialize this interface's device address (address type, gid and port lid)
 * into the caller-provided buffer. */
ucs_status_t uct_ib_iface_get_device_address(uct_iface_h tl_iface,
                                             uct_device_addr_t *dev_addr)
{
    uct_ib_iface_t *ib_iface = ucs_derived_of(tl_iface, uct_ib_iface_t);

    uct_ib_address_pack(uct_ib_iface_device(ib_iface), ib_iface->addr_type,
                        &ib_iface->gid,
                        uct_ib_iface_port_attr(ib_iface)->lid,
                        (void*)dev_addr);
    return UCS_OK;
}
/* Fill a sockaddr-style IB address (lid, gid components) for this interface.
 * id and qp_num are left as 0 here. */
ucs_status_t uct_ib_iface_get_address(uct_iface_h tl_iface, struct sockaddr *addr)
{
    uct_ib_iface_t    *iface = ucs_derived_of(tl_iface, uct_ib_iface_t);
    uct_sockaddr_ib_t *sa_ib = (uct_sockaddr_ib_t*)addr;

    /* TODO LMC */
    sa_ib->sib_family    = UCT_AF_INFINIBAND;
    sa_ib->lid           = uct_ib_iface_port_attr(iface)->lid;
    sa_ib->id            = 0;
    sa_ib->guid          = iface->gid.global.interface_id;
    sa_ib->subnet_prefix = iface->gid.global.subnet_prefix;
    sa_ib->qp_num        = 0;

    return UCS_OK;
}
/**
 * Find the requested pkey in the port's pkey table and store its index and
 * value on the interface.
 *
 * Only full-membership pkeys (membership bit set) are accepted; the
 * comparison uses the lower 15 bits of the table entry.
 *
 * @param iface   IB interface; on success iface->pkey_index / pkey_value are
 *                set.
 * @param config  Configuration holding the requested pkey value.
 *
 * @return UCS_OK on success, UCS_ERR_INVALID_PARAM if the requested value is
 *         out of range or not found with full membership.
 */
static ucs_status_t uct_ib_iface_init_pkey(uct_ib_iface_t *iface,
                                           const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev  = uct_ib_iface_device(iface);
    uint16_t pkey_tbl_len = uct_ib_iface_port_attr(iface)->pkey_tbl_len;
    uint16_t pkey_index, port_pkey, pkey;

    if (config->pkey_value > UCT_IB_PKEY_PARTITION_MASK) {
        ucs_error("Requested pkey 0x%x is invalid, should be in the range 0..0x%x",
                  config->pkey_value, UCT_IB_PKEY_PARTITION_MASK);
        return UCS_ERR_INVALID_PARAM;
    }

    /* get the user's pkey value and find its index in the port's pkey table */
    for (pkey_index = 0; pkey_index < pkey_tbl_len; ++pkey_index) {
        /* get the pkey values from the port's pkeys table */
        if (ibv_query_pkey(dev->ibv_context, iface->config.port_num, pkey_index,
                           &port_pkey))
        {
            ucs_error("ibv_query_pkey("UCT_IB_IFACE_FMT", index=%d) failed: %m",
                      UCT_IB_IFACE_ARG(iface), pkey_index);
            /* BUG FIX: on query failure port_pkey may be uninitialized or
             * stale from a previous iteration - skip this index instead of
             * using it */
            continue;
        }

        pkey = ntohs(port_pkey); /* table entries are in network byte order */

        if (!(pkey & UCT_IB_PKEY_MEMBERSHIP_MASK)) {
            ucs_debug("skipping send-only pkey[%d]=0x%x", pkey_index, pkey);
            continue;
        }

        /* take only the lower 15 bits for the comparison */
        if ((pkey & UCT_IB_PKEY_PARTITION_MASK) == config->pkey_value) {
            iface->pkey_index = pkey_index;
            iface->pkey_value = pkey;
            ucs_debug("using pkey[%d] 0x%x on "UCT_IB_IFACE_FMT,
                      iface->pkey_index, iface->pkey_value,
                      UCT_IB_IFACE_ARG(iface));
            return UCS_OK;
        }
    }

    ucs_error("The requested pkey: 0x%x, cannot be used. "
              "It wasn't found or the configured pkey doesn't have full membership.",
              config->pkey_value);
    return UCS_ERR_INVALID_PARAM;
}
/*
 * Query common IB interface attributes: latency and bandwidth derived from
 * the port's active width/speed/MTU, plus a per-device priority looked up in
 * uct_ib_device_info_table.
 *
 * @param iface          IB interface to query.
 * @param xport_hdr_len  Transport-specific header length, added to the
 *                       per-packet overhead in the bandwidth estimate.
 * @param iface_attr     Filled with the results (zeroed first).
 *
 * @return UCS_OK, or UCS_ERR_IO_ERROR if the port reports an invalid active
 *         width or speed.
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    /* Maps ucs_ilog2(active_width) to the lane count (1x/4x/8x/12x) */
    static const unsigned ib_port_widths[] = {
        [0] = 1, [1] = 4, [2] = 8, [3] = 12
    };
    uint8_t active_width, active_speed, active_mtu;
    double encoding, signal_rate, wire_speed;
    size_t mtu, width, extra_pkt_len;
    int i = 0;

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* Get active width: must be exactly one of the low four bits set */
    if (!ucs_is_pow2(active_width) ||
        (active_width < 1) || (ucs_ilog2(active_width) > 3))
    {
        ucs_error("Invalid active_width on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_width);
        return UCS_ERR_IO_ERROR;
    }

    memset(iface_attr, 0, sizeof(*iface_attr));

    iface_attr->device_addr_len = iface->addr_size;

    /* Latency and per-lane signalling rate by link generation; encoding is
     * 8b/10b up to QDR and 64b/66b from FDR10 onward */
    switch (active_speed) {
    case 1: /* SDR */
        iface_attr->latency = 5000e-9;
        signal_rate         = 2.5e9;
        encoding            = 8.0/10.0;
        break;
    case 2: /* DDR */
        iface_attr->latency = 2500e-9;
        signal_rate         = 5.0e9;
        encoding            = 8.0/10.0;
        break;
    case 4: /* QDR */
        iface_attr->latency = 1300e-9;
        signal_rate         = 10.0e9;
        encoding            = 8.0/10.0;
        break;
    case 8: /* FDR10 */
        iface_attr->latency = 700e-9;
        signal_rate         = 10.3125e9;
        encoding            = 64.0/66.0;
        break;
    case 16: /* FDR */
        iface_attr->latency = 700e-9;
        signal_rate         = 14.0625e9;
        encoding            = 64.0/66.0;
        break;
    case 32: /* EDR */
        iface_attr->latency = 600e-9;
        signal_rate         = 25.78125e9;
        encoding            = 64.0/66.0;
        break;
    default:
        ucs_error("Invalid active_speed on %s:%d: %d",
                  UCT_IB_IFACE_ARG(iface), active_speed);
        return UCS_ERR_IO_ERROR;
    }

    /* Wire speed calculation: Width * SignalRate * Encoding */
    width      = ib_port_widths[ucs_ilog2(active_width)];
    wire_speed = (width * signal_rate * encoding) / 8.0;

    /* Calculate packet overhead: payload is capped by both the port MTU and
     * the configured segment size */
    mtu = ucs_min(uct_ib_mtu_value(active_mtu), iface->config.seg_size);

    extra_pkt_len = UCT_IB_BTH_LEN + xport_hdr_len + UCT_IB_ICRC_LEN +
                    UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;
    if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(iface))) {
        extra_pkt_len += UCT_IB_GRH_LEN + UCT_IB_ROCE_LEN;
    } else {
        /* TODO check if UCT_IB_DELIM_LEN is present in RoCE as well */
        extra_pkt_len += UCT_IB_LRH_LEN;
    }

    /* Effective bandwidth: wire speed scaled by the payload/packet ratio */
    iface_attr->bandwidth = (wire_speed * mtu) / (mtu + extra_pkt_len);

    /* Set priority of current device; table is terminated by a zero
     * vendor_part_id entry */
    iface_attr->priority = 0;
    while (uct_ib_device_info_table[i].vendor_part_id != 0) {
        if (uct_ib_device_info_table[i].vendor_part_id == dev->dev_attr.vendor_part_id) {
            iface_attr->priority = uct_ib_device_info_table[i].priority;
            break;
        }
        i++;
    }

    return UCS_OK;
}
/**
 * Initialize an IB interface: resolve the port, compute receive-buffer
 * offsets, set up pkey/gid/lmc state, and create the completion channel and
 * send/receive CQs. On any failure, resources acquired so far are released
 * via the goto-cleanup chain at the bottom.
 *
 * @param rx_headroom Headroom requested by the user.
 * @param rx_priv_len Length of transport private data to reserve (0 if unused)
 * @param rx_hdr_len  Length of transport network header.
 * @param tx_cq_len   Length of the send completion queue.
 * @param mss         Maximal segment size (transport limit).
 */
UCS_CLASS_INIT_FUNC(uct_ib_iface_t, uct_ib_iface_ops_t *ops, uct_md_h md,
                    uct_worker_h worker, const uct_iface_params_t *params,
                    unsigned rx_priv_len, unsigned rx_hdr_len,
                    unsigned tx_cq_len, size_t mss,
                    const uct_ib_iface_config_t *config)
{
    uct_ib_device_t *dev = &ucs_derived_of(md, uct_ib_md_t)->dev;
    ucs_status_t status;
    uint8_t port_num;

    UCS_CLASS_CALL_SUPER_INIT(uct_base_iface_t, &ops->super, md, worker,
                              &config->super UCS_STATS_ARG(dev->stats));

    status = uct_ib_device_find_port(dev, params->dev_name, &port_num);
    if (status != UCS_OK) {
        goto err;
    }

    self->ops = ops;

    /* Receive buffer layout: descriptor, then the larger of (AM descriptor +
     * user headroom) and (transport private data + network header); the
     * payload starts after that. */
    self->config.rx_payload_offset  = sizeof(uct_ib_iface_recv_desc_t) +
                                      ucs_max(sizeof(uct_am_recv_desc_t) +
                                              params->rx_headroom,
                                              rx_priv_len + rx_hdr_len);
    self->config.rx_hdr_offset      = self->config.rx_payload_offset - rx_hdr_len;
    self->config.rx_headroom_offset = self->config.rx_payload_offset -
                                      params->rx_headroom;
    self->config.seg_size           = ucs_min(mss, config->super.max_bcopy);
    self->config.tx_max_poll        = config->tx.max_poll;
    self->config.rx_max_poll        = config->rx.max_poll;
    /* Cap the rx batch so one batch never consumes more than 1/4 of the queue */
    self->config.rx_max_batch       = ucs_min(config->rx.max_batch,
                                              config->rx.queue_len / 4);
    self->config.port_num           = port_num;
    self->config.sl                 = config->sl;
    self->config.gid_index          = config->gid_index;

    status = uct_ib_iface_init_pkey(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    status = uct_ib_device_query_gid(dev, self->config.port_num,
                                     self->config.gid_index, &self->gid);
    if (status != UCS_OK) {
        goto err;
    }

    /* Allocates self->path_bits; freed on the error path below */
    status = uct_ib_iface_init_lmc(self, config);
    if (status != UCS_OK) {
        goto err;
    }

    self->comp_channel = ibv_create_comp_channel(dev->ibv_context);
    if (self->comp_channel == NULL) {
        ucs_error("ibv_create_comp_channel() failed: %m");
        status = UCS_ERR_IO_ERROR;
        goto err_free_path_bits;
    }

    /* Non-blocking so CQ event polling never stalls the progress loop */
    status = ucs_sys_fcntl_modfl(self->comp_channel->fd, O_NONBLOCK, 0);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, tx_cq_len, 0, &self->send_cq);
    if (status != UCS_OK) {
        goto err_destroy_comp_channel;
    }

    status = uct_ib_iface_create_cq(self, config->rx.queue_len, config->rx.inl,
                                    &self->recv_cq);
    if (status != UCS_OK) {
        goto err_destroy_send_cq;
    }

    /* Address scope and size: auto-detect from the link layer / subnet
     * prefix unless the configuration forces a type */
    if (config->addr_type == UCT_IB_IFACE_ADDRESS_TYPE_AUTO) {
        if (IBV_PORT_IS_LINK_LAYER_ETHERNET(uct_ib_iface_port_attr(self))) {
            self->addr_type = UCT_IB_ADDRESS_TYPE_ETH;
        } else {
            self->addr_type = uct_ib_address_scope(self->gid.global.subnet_prefix);
        }
    } else {
        ucs_assert(config->addr_type < UCT_IB_ADDRESS_TYPE_LAST);
        self->addr_type = config->addr_type;
    }
    self->addr_size = uct_ib_address_size(self->addr_type);

    ucs_debug("created uct_ib_iface_t headroom_ofs %d payload_ofs %d hdr_ofs %d data_sz %d",
              self->config.rx_headroom_offset, self->config.rx_payload_offset,
              self->config.rx_hdr_offset, self->config.seg_size);

    return UCS_OK;

err_destroy_send_cq:
    ibv_destroy_cq(self->send_cq);
err_destroy_comp_channel:
    ibv_destroy_comp_channel(self->comp_channel);
err_free_path_bits:
    ucs_free(self->path_bits);
err:
    return status;
}
/**
 * Parse the configured list of LID path-bit values/ranges into
 * iface->path_bits, keeping only values below 2^lmc of the port.
 *
 * Ranges may be given in either direction (first < last or first > last);
 * values are filled in the given order.
 *
 * @param iface   IB interface; on success path_bits/path_bits_count are set.
 *                path_bits is allocated here and owned by the caller's
 *                cleanup path.
 * @param config  Configuration holding the lid_path_bits ranges.
 *
 * @return UCS_OK, UCS_ERR_INVALID_PARAM if the list is empty, or
 *         UCS_ERR_NO_MEMORY on allocation failure.
 */
static ucs_status_t uct_ib_iface_init_lmc(uct_ib_iface_t *iface,
                                          const uct_ib_iface_config_t *config)
{
    unsigned i, j, num_path_bits;
    unsigned first, last;
    uint8_t lmc;
    int step;

    if (config->lid_path_bits.count == 0) {
        ucs_error("List of path bits must not be empty");
        return UCS_ERR_INVALID_PARAM;
    }

    /* count the number of lid_path_bits */
    /* NOTE(review): abs() on the difference assumes the range endpoints fit
     * in int after subtraction - TODO confirm the range field types */
    num_path_bits = 0;
    for (i = 0; i < config->lid_path_bits.count; i++) {
        num_path_bits += 1 + abs(config->lid_path_bits.ranges[i].first -
                                 config->lid_path_bits.ranges[i].last);
    }

    iface->path_bits = ucs_calloc(1, num_path_bits * sizeof(*iface->path_bits),
                                  "ib_path_bits");
    if (iface->path_bits == NULL) {
        return UCS_ERR_NO_MEMORY;
    }

    lmc = uct_ib_iface_port_attr(iface)->lmc;

    /* go over the list of values (ranges) for the lid_path_bits and set them */
    iface->path_bits_count = 0;
    for (i = 0; i < config->lid_path_bits.count; ++i) {
        first = config->lid_path_bits.ranges[i].first;
        last  = config->lid_path_bits.ranges[i].last;

        /* range of values or one value; iterate in the given direction */
        if (first < last) {
            step = 1;
        } else {
            step = -1;
        }

        /* fill the value/s */
        for (j = first; j != (last + step); j += step) {
            if (j >= UCS_BIT(lmc)) {
                ucs_debug("Not using value %d for path_bits - must be < 2^lmc (lmc=%d)",
                          j, lmc);
                if (step == 1) {
                    /* ascending: all further values are too large as well */
                    break;
                } else {
                    /* descending: smaller, valid values may still follow */
                    continue;
                }
            }

            /* BUG FIX: the assert must be strict '<' - it guards the write
             * at index path_bits_count, so '<=' would let a write one past
             * the end of the array go unnoticed */
            ucs_assert(iface->path_bits_count < num_path_bits);
            iface->path_bits[iface->path_bits_count] = j;
            iface->path_bits_count++;
        }
    }

    return UCS_OK;
}
/*
 * Query common IB interface attributes: latency and bandwidth derived from
 * the port's active width/speed/MTU.
 *
 * NOTE(review): another definition of uct_ib_iface_query appears earlier in
 * this file with a slightly different body (RoCE overhead branch, device
 * priority table) - these look like chunks from two revisions; confirm only
 * one is actually compiled.
 *
 * @param iface          IB interface to query.
 * @param xport_hdr_len  Transport-specific header length, added to the
 *                       per-packet overhead in the bandwidth estimate.
 * @param iface_attr     Filled with the results (zeroed first).
 *
 * @return UCS_OK, UCS_ERR_UNSUPPORTED if the port link layer is not IB, or
 *         UCS_ERR_IO_ERROR on invalid active width/speed.
 */
ucs_status_t uct_ib_iface_query(uct_ib_iface_t *iface, size_t xport_hdr_len,
                                uct_iface_attr_t *iface_attr)
{
    uct_ib_device_t *dev = uct_ib_iface_device(iface);
    /* Maps ucs_ilog2(active_width) to the lane count (1x/4x/8x/12x) */
    static const unsigned ib_port_widths[] = {
        [0] = 1, [1] = 4, [2] = 8, [3] = 12
    };
    uint8_t active_width, active_speed, active_mtu;
    double encoding, signal_rate, wire_speed;
    size_t mtu, width, extra_pkt_len;

    if (!uct_ib_device_is_port_ib(dev, iface->port_num)) {
        return UCS_ERR_UNSUPPORTED;
    }

    active_width = uct_ib_iface_port_attr(iface)->active_width;
    active_speed = uct_ib_iface_port_attr(iface)->active_speed;
    active_mtu   = uct_ib_iface_port_attr(iface)->active_mtu;

    /* Get active width: must be exactly one of the low four bits set */
    if (!ucs_is_pow2(active_width) ||
        (active_width < 1) || (ucs_ilog2(active_width) > 3))
    {
        ucs_error("Invalid active_width on %s:%d: %d",
                  uct_ib_device_name(dev), iface->port_num, active_width);
        return UCS_ERR_IO_ERROR;
    }

    memset(iface_attr, 0, sizeof(*iface_attr));

    iface_attr->device_addr_len = iface->addr_size;

    /* Latency and per-lane signalling rate by link generation; encoding is
     * 8b/10b up to QDR and 64b/66b from FDR10 onward */
    switch (active_speed) {
    case 1: /* SDR */
        iface_attr->latency = 5000e-9;
        signal_rate         = 2.5e9;
        encoding            = 8.0/10.0;
        break;
    case 2: /* DDR */
        iface_attr->latency = 2500e-9;
        signal_rate         = 5.0e9;
        encoding            = 8.0/10.0;
        break;
    case 4: /* QDR */
        iface_attr->latency = 1300e-9;
        signal_rate         = 10.0e9;
        encoding            = 8.0/10.0;
        break;
    case 8: /* FDR10 */
        iface_attr->latency = 700e-9;
        signal_rate         = 10.3125e9;
        encoding            = 64.0/66.0;
        break;
    case 16: /* FDR */
        iface_attr->latency = 700e-9;
        signal_rate         = 14.0625e9;
        encoding            = 64.0/66.0;
        break;
    case 32: /* EDR */
        iface_attr->latency = 600e-9;
        signal_rate         = 25.78125e9;
        encoding            = 64.0/66.0;
        break;
    default:
        ucs_error("Invalid active_speed on %s:%d: %d",
                  uct_ib_device_name(dev), iface->port_num, active_speed);
        return UCS_ERR_IO_ERROR;
    }

    /* Wire speed calculation: Width * SignalRate * Encoding */
    width      = ib_port_widths[ucs_ilog2(active_width)];
    wire_speed = (width * signal_rate * encoding) / 8.0;

    /* Calculate packet overhead: payload is capped by both the port MTU and
     * the configured segment size */
    mtu = ucs_min(uct_ib_mtu_value(active_mtu), iface->config.seg_size);
    extra_pkt_len = UCT_IB_LRH_LEN + UCT_IB_BTH_LEN + xport_hdr_len +
                    UCT_IB_ICRC_LEN + UCT_IB_VCRC_LEN + UCT_IB_DELIM_LEN;

    /* Effective bandwidth: wire speed scaled by the payload/packet ratio */
    iface_attr->bandwidth = (wire_speed * mtu) / (mtu + extra_pkt_len);

    return UCS_OK;
}