static int fi_ibv_get_device_attrs(struct ibv_context *ctx, struct fi_info *info)
{
	struct ibv_device_attr device_attr;
	struct ibv_port_attr port_attr;
	int ret = 0;

	ret = ibv_query_device(ctx, &device_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_device", errno);
		return -errno;
	}

	info->domain_attr->cq_cnt = device_attr.max_cq;
	info->domain_attr->ep_cnt = device_attr.max_qp;
	info->domain_attr->tx_ctx_cnt = MIN(info->domain_attr->tx_ctx_cnt,
					    device_attr.max_qp);
	info->domain_attr->rx_ctx_cnt = MIN(info->domain_attr->rx_ctx_cnt,
					    device_attr.max_qp);
	info->domain_attr->max_ep_tx_ctx = device_attr.max_qp;
	info->domain_attr->max_ep_rx_ctx = device_attr.max_qp;

	ret = fi_ibv_get_qp_cap(ctx, &device_attr, info);
	if (ret)
		return ret;

	ret = ibv_query_port(ctx, 1, &port_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_port", errno);
		return -errno;
	}

	info->ep_attr->max_msg_size = port_attr.max_msg_sz;
	info->ep_attr->max_order_raw_size = port_attr.max_msg_sz;
	info->ep_attr->max_order_waw_size = port_attr.max_msg_sz;

	return 0;
}

static int fi_ibv_get_device_attrs(struct ibv_context *ctx, struct fi_info *info)
{
	struct ibv_device_attr device_attr;
	struct ibv_port_attr port_attr;
	int ret = 0;

	ret = ibv_query_device(ctx, &device_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_device", errno);
		return -errno;
	}

	info->domain_attr->cq_cnt = device_attr.max_cq;
	info->domain_attr->ep_cnt = device_attr.max_qp;
	info->domain_attr->tx_ctx_cnt = MIN(info->domain_attr->tx_ctx_cnt,
					    device_attr.max_qp);
	info->domain_attr->rx_ctx_cnt = MIN(info->domain_attr->rx_ctx_cnt,
					    device_attr.max_qp);
	info->domain_attr->max_ep_tx_ctx = MIN(info->domain_attr->tx_ctx_cnt,
					       device_attr.max_qp);
	info->domain_attr->max_ep_rx_ctx = MIN(info->domain_attr->rx_ctx_cnt,
					       device_attr.max_qp);
	info->domain_attr->max_ep_srx_ctx = device_attr.max_qp;
	info->domain_attr->mr_cnt = device_attr.max_mr;

	if (info->ep_attr->type == FI_EP_RDM)
		info->domain_attr->cntr_cnt = device_attr.max_qp * 4;

	info->tx_attr->size = device_attr.max_qp_wr;
	info->tx_attr->iov_limit = device_attr.max_sge;
	info->tx_attr->rma_iov_limit = device_attr.max_sge;

	info->rx_attr->size = device_attr.max_srq_wr ?
			      MIN(device_attr.max_qp_wr, device_attr.max_srq_wr) :
			      device_attr.max_qp_wr;
	info->rx_attr->iov_limit = MIN(device_attr.max_sge,
				       device_attr.max_srq_sge);

	ret = fi_ibv_get_qp_cap(ctx, info);
	if (ret)
		return ret;

	ret = ibv_query_port(ctx, 1, &port_attr);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_port", errno);
		return -errno;
	}

	info->ep_attr->max_msg_size = port_attr.max_msg_sz;
	info->ep_attr->max_order_raw_size = port_attr.max_msg_sz;
	info->ep_attr->max_order_waw_size = port_attr.max_msg_sz;

	return 0;
}

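/*
 * Illustrative only: the attributes filled in above are what an application
 * eventually sees through the public fi_getinfo() call. A minimal sketch
 * (API version chosen arbitrarily, error handling omitted):
 *
 *	struct fi_info *hints = fi_allocinfo(), *info;
 *
 *	hints->fabric_attr->prov_name = strdup("verbs");
 *	if (!fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, hints, &info)) {
 *		printf("tx size %zu, rx size %zu, max msg %zu\n",
 *		       info->tx_attr->size, info->rx_attr->size,
 *		       info->ep_attr->max_msg_size);
 *		fi_freeinfo(info);
 *	}
 *	fi_freeinfo(hints);
 */
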
static ssize_t
fi_ibv_rdm_process_event_rejected(struct fi_ibv_rdm_ep *ep,
				  struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;
	const int *pdata = event->param.conn.private_data;

	if ((pdata && *pdata == 0xdeadbeef) ||
	    /*
	     * TODO: this is a workaround for the case where private_data does
	     * not arrive from the rdma_reject call on iWARP devices
	     */
	    (conn->cm_role == FI_VERBS_CM_PASSIVE &&
	     event->status == -ECONNREFUSED)) {
		errno = 0;
		rdma_destroy_qp(event->id);
		if (errno) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_qp failed\n",
					 errno);
			ret = -errno;
		}

		if (rdma_destroy_id(event->id)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id failed\n",
					 errno);
			if (ret == FI_SUCCESS)
				ret = -errno;
		}

		VERBS_INFO(FI_LOG_AV,
			   "Rejected from conn %p, addr %s:%u, cm_role %d, status %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port), conn->cm_role,
			   event->status);
	} else {
		VERBS_INFO(FI_LOG_AV,
			   "Unexpected REJECT from conn %p, addr %s:%u, cm_role %d, "
			   "msg len %d, msg %x, status %d, err %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port), conn->cm_role,
			   event->param.conn.private_data_len,
			   event->param.conn.private_data ?
			   *(int *)event->param.conn.private_data : 0,
			   event->status, errno);
		conn->state = FI_VERBS_CONN_REJECTED;
	}

	return ret;
}

static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx,
				    struct fi_info *info)
{
	struct ibv_pd *pd;
	struct ibv_cq *cq;
	struct ibv_qp *qp;
	struct ibv_qp_init_attr init_attr;
	int ret = 0;

	pd = ibv_alloc_pd(ctx);
	if (!pd) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_alloc_pd", errno);
		return -errno;
	}

	cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
	if (!cq) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_cq", errno);
		ret = -errno;
		goto err1;
	}

	memset(&init_attr, 0, sizeof init_attr);
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.cap.max_send_wr = verbs_default_tx_size;
	init_attr.cap.max_recv_wr = verbs_default_rx_size;
	init_attr.cap.max_send_sge = verbs_default_tx_iov_limit;
	init_attr.cap.max_recv_sge = verbs_default_rx_iov_limit;
	init_attr.cap.max_inline_data = verbs_default_inline_size;
	init_attr.qp_type = IBV_QPT_RC;

	qp = ibv_create_qp(pd, &init_attr);
	if (!qp) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_qp", errno);
		ret = -errno;
		goto err2;
	}

	info->tx_attr->inject_size = init_attr.cap.max_inline_data;

	ibv_destroy_qp(qp);
err2:
	ibv_destroy_cq(cq);
err1:
	ibv_dealloc_pd(pd);
	return ret;
}

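/*
 * Note (editorial): ibv_create_qp() updates init_attr.cap with the
 * capabilities the device actually grants, which is why the probe above
 * creates and immediately destroys a scratch PD/CQ/QP. Reading
 * init_attr.cap.max_inline_data back after the call yields the usable
 * inject_size rather than the requested default.
 */
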
static ssize_t
fi_ibv_rdm_process_route_resolved(struct rdma_cm_event *event,
				  struct fi_ibv_rdm_ep *ep)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;
	struct rdma_conn_param cm_params;

	fi_ibv_rdm_pack_cm_params(&cm_params, conn, ep);

	VERBS_INFO(FI_LOG_AV, "ROUTE RESOLVED, conn %p, addr %s:%u\n",
		   conn, inet_ntoa(conn->addr.sin_addr),
		   ntohs(conn->addr.sin_port));

	if (rdma_connect(event->id, &cm_params)) {
		VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_connect failed\n", errno);
		ret = -errno;
		free((void *)cm_params.private_data);
		assert(0);
	}

	return ret;
}

int fi_ibv_create_ep(const char *node, const char *service,
		     uint64_t flags, const struct fi_info *hints,
		     struct rdma_addrinfo **rai, struct rdma_cm_id **id)
{
	struct rdma_addrinfo *_rai = NULL;
	int ret;

	ret = fi_ibv_get_rdma_rai(node, service, flags, hints, &_rai);
	if (ret)
		return ret;

	ret = rdma_create_ep(id, _rai, NULL, NULL);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_ep", errno);
		ret = -errno;
		goto err1;
	}

	if (rai)
		*rai = _rai;
	else
		rdma_freeaddrinfo(_rai);

	return ret;
err1:
	rdma_freeaddrinfo(_rai);
	return ret;
}

int fi_ibv_accept_xrc(struct fi_ibv_xrc_ep *ep, int reciprocal,
		      void *param, size_t paramlen)
{
	struct sockaddr *addr;
	struct fi_ibv_connreq *connreq;
	struct rdma_conn_param conn_param = { 0 };
	struct fi_ibv_xrc_cm_data *cm_data = param;
	int ret;

	addr = rdma_get_local_addr(ep->tgt_id);
	if (addr)
		ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_CORE, "src_addr", addr);

	addr = rdma_get_peer_addr(ep->tgt_id);
	if (addr)
		ofi_straddr_dbg(&fi_ibv_prov, FI_LOG_CORE, "dest_addr", addr);

	connreq = container_of(ep->base_ep.info->handle,
			       struct fi_ibv_connreq, handle);
	ret = fi_ibv_ep_create_tgt_qp(ep, connreq->xrc.conn_data);
	if (ret)
		return ret;

	fi_ibv_set_xrc_cm_data(cm_data, connreq->xrc.is_reciprocal,
			       connreq->xrc.conn_tag, connreq->xrc.port,
			       ep->srqn);
	conn_param.private_data = cm_data;
	conn_param.private_data_len = paramlen;
	conn_param.responder_resources = RDMA_MAX_RESP_RES;
	conn_param.initiator_depth = RDMA_MAX_INIT_DEPTH;
	conn_param.flow_control = 1;
	conn_param.rnr_retry_count = 7;
	if (ep->base_ep.srq_ep)
		conn_param.srq = 1;

	/* Shared INI/TGT QP connections use a temporarily reserved QP number
	 * to avoid the appearance of being a stale/duplicate IB CM message */
	if (!ep->tgt_id->qp)
		conn_param.qp_num = ep->conn_setup->rsvd_tgt_qpn->qp_num;

	if (connreq->xrc.is_reciprocal)
		fi_ibv_eq_clear_xrc_conn_tag(ep);
	else
		ep->conn_setup->conn_tag = connreq->xrc.conn_tag;

	assert(ep->conn_state == FI_IBV_XRC_UNCONNECTED ||
	       ep->conn_state == FI_IBV_XRC_ORIG_CONNECTED);
	fi_ibv_next_xrc_conn_state(ep);

	ret = rdma_accept(ep->tgt_id, &conn_param);
	if (ret) {
		ret = -errno;
		VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "XRC TGT, rdma_accept", errno);
		fi_ibv_prev_xrc_conn_state(ep);
	}
	free(connreq);

	return ret;
}

int fi_ibv_get_rdma_rai(const char *node, const char *service, uint64_t flags,
			const struct fi_info *hints, struct rdma_addrinfo **rai)
{
	struct rdma_addrinfo rai_hints, *_rai;
	struct rdma_addrinfo **rai_current;
	int ret = fi_ibv_fi_to_rai(hints, flags, &rai_hints);

	if (ret)
		goto out;

	if (!node && !rai_hints.ai_dst_addr) {
		if ((!rai_hints.ai_src_addr && !service) ||
		    (!rai_hints.ai_src_addr && FI_IBV_EP_TYPE_IS_RDM(hints)))
			node = local_node;
		rai_hints.ai_flags |= RAI_PASSIVE;
	}

	ret = rdma_getaddrinfo((char *) node, (char *) service,
			       &rai_hints, &_rai);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_getaddrinfo", errno);
		if (errno)
			ret = -errno;
		goto out;
	}

	/*
	 * If caller requested rai, remove ib_rai entries added by IBACM to
	 * prevent wrong ib_connect_hdr from being sent in connect request.
	 */
	if (rai && hints && (hints->addr_format != FI_SOCKADDR_IB)) {
		for (rai_current = &_rai; *rai_current;) {
			struct rdma_addrinfo *rai_next;

			if ((*rai_current)->ai_family == AF_IB) {
				rai_next = (*rai_current)->ai_next;
				(*rai_current)->ai_next = NULL;
				rdma_freeaddrinfo(*rai_current);
				*rai_current = rai_next;
				continue;
			}
			rai_current = &(*rai_current)->ai_next;
		}
	}

	if (rai)
		*rai = _rai;

out:
	if (rai_hints.ai_src_addr)
		free(rai_hints.ai_src_addr);
	if (rai_hints.ai_dst_addr)
		free(rai_hints.ai_dst_addr);
	return ret;
}

static ssize_t
fi_ibv_rdm_process_addr_resolved(struct rdma_cm_id *id,
				 struct fi_ibv_rdm_ep *ep)
{
	ssize_t ret = FI_SUCCESS;
	struct ibv_qp_init_attr qp_attr;
	struct fi_ibv_rdm_tagged_conn *conn = id->context;

	VERBS_INFO(FI_LOG_AV, "ADDR_RESOLVED conn %p, addr %s:%u\n",
		   conn, inet_ntoa(conn->addr.sin_addr),
		   ntohs(conn->addr.sin_port));

	assert(id->verbs == ep->domain->verbs);

	do {
		fi_ibv_rdm_tagged_init_qp_attributes(&qp_attr, ep);
		if (rdma_create_qp(id, ep->domain->pd, &qp_attr)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_create_qp failed\n",
					 errno);
			return -errno;
		}

		if (conn->cm_role == FI_VERBS_CM_PASSIVE)
			break;

		conn->qp[0] = id->qp;
		assert(conn->id[0] == id);
		if (conn->cm_role == FI_VERBS_CM_SELF)
			break;

		ret = fi_ibv_rdm_prepare_conn_memory(ep, conn);
		if (ret != FI_SUCCESS)
			goto err;

		ret = fi_ibv_rdm_repost_receives(conn, ep, ep->rq_wr_depth);
		if (ret < 0) {
			VERBS_INFO(FI_LOG_AV, "repost receives failed\n");
			goto err;
		} else {
			ret = FI_SUCCESS;
		}
	} while (0);

	if (rdma_resolve_route(id, FI_IBV_RDM_CM_RESOLVEADDR_TIMEOUT)) {
		VERBS_INFO(FI_LOG_AV, "rdma_resolve_route failed\n");
		ret = -FI_EHOSTUNREACH;
		goto err;
	}

	return ret;
err:
	rdma_destroy_qp(id);
	return ret;
}

static ssize_t
fi_ibv_rdm_process_event_rejected(struct fi_ibv_rdm_ep *ep,
				  struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;

	if (NULL != event->param.conn.private_data &&
	    *((int *)event->param.conn.private_data) == 0xdeadbeef) {
		assert(conn->cm_role == FI_VERBS_CM_PASSIVE);

		errno = 0;
		rdma_destroy_qp(event->id);
		if (errno) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_qp failed\n",
					 errno);
			ret = -errno;
		}

		if (rdma_destroy_id(event->id)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id failed\n",
					 errno);
			if (ret == FI_SUCCESS)
				ret = -errno;
		}

		VERBS_INFO(FI_LOG_AV,
			   "Rejected from conn %p, addr %s:%u, cm_role %d, status %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port), conn->cm_role,
			   event->status);
	} else {
		VERBS_INFO(FI_LOG_AV,
			   "Unexpected REJECT from conn %p, addr %s:%u, cm_role %d, "
			   "msg len %d, msg %x, status %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port), conn->cm_role,
			   event->param.conn.private_data_len,
			   event->param.conn.private_data ?
			   *(int *)event->param.conn.private_data : 0,
			   event->status);
		conn->state = FI_VERBS_CONN_REJECTED;
	}

	return ret;
}

static int fi_ibv_rdm_cm_init(struct fi_ibv_rdm_cm *cm,
			      const struct rdma_addrinfo *rai)
{
	struct sockaddr_in *src_addr = (struct sockaddr_in *)rai->ai_src_addr;
	char my_ipoib_addr_str[INET6_ADDRSTRLEN];

	cm->ec = rdma_create_event_channel();
	if (!cm->ec) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			   "Failed to create listener event channel: %s\n",
			   strerror(errno));
		return -FI_EOTHER;
	}

	if (fi_fd_nonblock(cm->ec->fd) != 0) {
		VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "fcntl", errno);
		return -FI_EOTHER;
	}

	if (rdma_create_id(cm->ec, &cm->listener, NULL, RDMA_PS_TCP)) {
		VERBS_INFO(FI_LOG_EP_CTRL, "Failed to create cm listener: %s\n",
			   strerror(errno));
		return -FI_EOTHER;
	}

	if (fi_ibv_rdm_find_ipoib_addr(src_addr, cm)) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			   "Failed to find correct IPoIB address\n");
		return -FI_ENODEV;
	}

	cm->my_addr.sin_port = src_addr->sin_port;

	inet_ntop(cm->my_addr.sin_family, &cm->my_addr.sin_addr.s_addr,
		  my_ipoib_addr_str, sizeof(my_ipoib_addr_str));

	VERBS_INFO(FI_LOG_EP_CTRL, "My IPoIB: %s\n", my_ipoib_addr_str);

	if (rdma_bind_addr(cm->listener, (struct sockaddr *)&cm->my_addr)) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			   "Failed to bind cm listener to my IPoIB addr %s: %s\n",
			   my_ipoib_addr_str, strerror(errno));
		return -FI_EOTHER;
	}

	if (!cm->my_addr.sin_port)
		cm->my_addr.sin_port = rdma_get_src_port(cm->listener);

	assert(cm->my_addr.sin_family == AF_INET);

	VERBS_INFO(FI_LOG_EP_CTRL, "My ep_addr: %s:%u\n",
		   inet_ntoa(cm->my_addr.sin_addr),
		   ntohs(cm->my_addr.sin_port));

	return FI_SUCCESS;
}

int fi_ibv_init_info(void)
{
	struct ibv_context **ctx_list;
	struct fi_info *fi = NULL, *tail = NULL;
	int ret = 0, i, num_devices;

	if (verbs_info)
		return 0;

	pthread_mutex_lock(&verbs_info_lock);
	if (verbs_info)
		goto unlock;

	if (!fi_ibv_have_device()) {
		VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n");
		ret = -FI_ENODATA;
		goto unlock;
	}

	ctx_list = rdma_get_devices(&num_devices);
	if (!num_devices) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", errno);
		ret = -errno;
		goto unlock;
	}

	for (i = 0; i < num_devices; i++) {
		ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_msg_domain);
		if (!ret) {
			if (!verbs_info)
				verbs_info = fi;
			else
				tail->next = fi;
			tail = fi;

			ret = fi_ibv_alloc_info(ctx_list[i], &fi,
						&verbs_rdm_domain);
			if (!ret) {
				tail->next = fi;
				tail = fi;
			}
		}
	}

	ret = verbs_info ? 0 : ret;
	rdma_free_devices(ctx_list);
unlock:
	pthread_mutex_unlock(&verbs_info_lock);
	return ret;
}

static ssize_t fi_ibv_rdm_dereg_and_free(struct ibv_mr **mr, char **buff)
{
	ssize_t ret = FI_SUCCESS;

	if (ibv_dereg_mr(*mr)) {
		VERBS_INFO_ERRNO(FI_LOG_AV, "ibv_dereg_mr failed\n", errno);
		ret = -errno;
	}
	*mr = NULL;

	free(*buff);
	*buff = NULL;

	return ret;
}

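/*
 * Illustrative only: a registration/teardown pair for one of the connection
 * buffers this helper is used with in fi_ibv_rdm_conn_cleanup(). Error
 * handling is omitted and buf_len is an assumed placeholder:
 *
 *	conn->sbuf_mem_reg = memalign(FI_IBV_RDM_MEM_ALIGNMENT, buf_len);
 *	conn->s_mr = ibv_reg_mr(ep->domain->pd, conn->sbuf_mem_reg, buf_len,
 *				IBV_ACCESS_LOCAL_WRITE);
 *	...
 *	if (conn->s_mr)
 *		fi_ibv_rdm_dereg_and_free(&conn->s_mr, &conn->sbuf_mem_reg);
 */
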
int fi_ibv_create_ep(const char *node, const char *service,
		     uint64_t flags, const struct fi_info *hints,
		     struct rdma_addrinfo **rai, struct rdma_cm_id **id)
{
	struct rdma_addrinfo *_rai;
	struct sockaddr *local_addr;
	int ret;

	ret = fi_ibv_get_rdma_rai(node, service, flags, hints, &_rai);
	if (ret)
		return ret;

	ret = rdma_create_ep(id, _rai, NULL, NULL);
	if (ret) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_create_ep", errno);
		ret = -errno;
		goto err1;
	}

	if (rai && !_rai->ai_src_addr) {
		local_addr = rdma_get_local_addr(*id);
		_rai->ai_src_len = fi_ibv_sockaddr_len(local_addr);
		if (!(_rai->ai_src_addr = malloc(_rai->ai_src_len))) {
			ret = -FI_ENOMEM;
			goto err2;
		}
		memcpy(_rai->ai_src_addr, local_addr, _rai->ai_src_len);
	}

	if (rai)
		*rai = _rai;
	else
		rdma_freeaddrinfo(_rai);

	return ret;
err2:
	rdma_destroy_ep(*id);
err1:
	rdma_freeaddrinfo(_rai);
	return ret;
}

static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info,
			     const struct verbs_ep_domain *ep_dom)
{
	struct fi_info *fi;
	union ibv_gid gid;
	size_t name_len;
	int ret;
	int param;

	if (!(fi = fi_allocinfo()))
		return -FI_ENOMEM;

	fi->caps = ep_dom->caps;
	fi->handle = NULL;
	if (ep_dom->type == FI_EP_RDM) {
		fi->mode = VERBS_RDM_MODE;
		*(fi->tx_attr) = verbs_rdm_tx_attr;
	} else {
		fi->mode = VERBS_MODE;
		*(fi->tx_attr) = verbs_tx_attr;
	}

	*(fi->rx_attr) = (ep_dom->type == FI_EP_RDM) ?
			 verbs_rdm_rx_attr : verbs_rx_attr;
	*(fi->ep_attr) = verbs_ep_attr;
	*(fi->domain_attr) = verbs_domain_attr;
	*(fi->fabric_attr) = verbs_fabric_attr;

	fi->ep_attr->type = ep_dom->type;
	fi->tx_attr->caps = ep_dom->caps;
	fi->rx_attr->caps = ep_dom->caps;

	ret = fi_ibv_get_device_attrs(ctx, fi);
	if (ret)
		goto err;

	if (ep_dom->type == FI_EP_RDM) {
		fi->tx_attr->inject_size = FI_IBV_RDM_DFLT_BUFFERED_SSIZE;
		fi->tx_attr->iov_limit = 1;
		fi->tx_attr->rma_iov_limit = 1;
		if (!fi_param_get_int(&fi_ibv_prov, "rdm_buffer_size", &param)) {
			if (param > sizeof(struct fi_ibv_rdm_tagged_rndv_header)) {
				fi->tx_attr->inject_size = param;
			} else {
				FI_INFO(&fi_ibv_prov, FI_LOG_CORE,
					"rdm_buffer_size too small, "
					"should be greater than %zu\n",
					sizeof(struct fi_ibv_rdm_tagged_rndv_header));
				ret = -FI_EINVAL;
				goto err;
			}
		}
	}

	switch (ctx->device->transport_type) {
	case IBV_TRANSPORT_IB:
		if (ibv_query_gid(ctx, 1, 0, &gid)) {
			VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_gid", errno);
			ret = -errno;
			goto err;
		}

		name_len = strlen(VERBS_IB_PREFIX) + INET6_ADDRSTRLEN;
		if (!(fi->fabric_attr->name = calloc(1, name_len + 1))) {
			ret = -FI_ENOMEM;
			goto err;
		}

		snprintf(fi->fabric_attr->name, name_len, VERBS_IB_PREFIX "%lx",
			 gid.global.subnet_prefix);

		fi->ep_attr->protocol = (ep_dom == &verbs_msg_domain) ?
					FI_PROTO_RDMA_CM_IB_RC : FI_PROTO_IB_RDM;
		break;
	case IBV_TRANSPORT_IWARP:
		fi->fabric_attr->name = strdup(VERBS_IWARP_FABRIC);
		if (!fi->fabric_attr->name) {
			ret = -FI_ENOMEM;
			goto err;
		}

		if (ep_dom == &verbs_msg_domain) {
			fi->ep_attr->protocol = FI_PROTO_IWARP;
			fi->tx_attr->op_flags = VERBS_TX_OP_FLAGS_IWARP;
		} else {
			fi->ep_attr->protocol = FI_PROTO_IWARP_RDM;
			fi->tx_attr->op_flags = VERBS_TX_OP_FLAGS_IWARP_RDM;
		}
		break;
	default:
		FI_INFO(&fi_ibv_prov, FI_LOG_CORE, "Unknown transport type\n");
		ret = -FI_ENODATA;
		goto err;
	}

	name_len = strlen(ctx->device->name) + strlen(ep_dom->suffix);
	fi->domain_attr->name = malloc(name_len + 1);
	if (!fi->domain_attr->name) {
		ret = -FI_ENOMEM;
		goto err;
	}

	snprintf(fi->domain_attr->name, name_len + 1, "%s%s",
		 ctx->device->name, ep_dom->suffix);
	fi->domain_attr->name[name_len] = '\0';

	*info = fi;
	return 0;
err:
	fi_freeinfo(fi);
	return ret;
}

int fi_ibv_init_info(void)
{
	struct ibv_context **ctx_list;
	struct fi_info *fi = NULL, *tail = NULL;
	int ret = 0, i, num_devices, fork_unsafe = 0;

	if (verbs_info)
		return 0;

	pthread_mutex_lock(&verbs_info_lock);
	if (verbs_info)
		goto unlock;

	if (!fi_ibv_have_device()) {
		VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n");
		ret = -FI_ENODATA;
		goto unlock;
	}

	fi_param_get_bool(NULL, "fork_unsafe", &fork_unsafe);

	if (!fork_unsafe) {
		FI_INFO(&fi_ibv_prov, FI_LOG_CORE, "Enabling IB fork support\n");
		ret = ibv_fork_init();
		if (ret) {
			FI_WARN(&fi_ibv_prov, FI_LOG_CORE,
				"Enabling IB fork support failed: %s (%d)\n",
				strerror(ret), ret);
			goto unlock;
		}
	} else {
		FI_INFO(&fi_ibv_prov, FI_LOG_CORE,
			"Not enabling IB fork support\n");
	}

	ctx_list = rdma_get_devices(&num_devices);
	if (!num_devices) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", errno);
		ret = -errno;
		goto unlock;
	}

	for (i = 0; i < num_devices; i++) {
		ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_msg_domain);
		if (!ret) {
			if (!verbs_info)
				verbs_info = fi;
			else
				tail->next = fi;
			tail = fi;

			ret = fi_ibv_alloc_info(ctx_list[i], &fi,
						&verbs_rdm_domain);
			if (!ret) {
				tail->next = fi;
				tail = fi;
			}
		}
	}

	ret = verbs_info ? 0 : ret;
	rdma_free_devices(ctx_list);
unlock:
	pthread_mutex_unlock(&verbs_info_lock);
	return ret;
}

static ssize_t
fi_ibv_rdm_process_connect_request(struct rdma_cm_event *event,
				   struct fi_ibv_rdm_ep *ep)
{
	struct ibv_qp_init_attr qp_attr;
	struct rdma_conn_param cm_params;
	struct fi_ibv_rdm_tagged_conn *conn = NULL;
	struct rdma_cm_id *id = event->id;
	ssize_t ret = FI_SUCCESS;
	char *p = (char *) event->param.conn.private_data;

	if (ep->is_closing) {
		int rej_message = 0xdeadbeef;

		if (rdma_reject(id, &rej_message, sizeof(int))) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_reject\n", errno);
			ret = -errno;
			if (rdma_destroy_id(id)) {
				VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id\n",
						 errno);
				ret = (ret == FI_SUCCESS) ? -errno : ret;
			}
		}
		assert(ret == FI_SUCCESS);
		return ret;
	}

	HASH_FIND(hh, fi_ibv_rdm_tagged_conn_hash, p,
		  FI_IBV_RDM_DFLT_ADDRLEN, conn);

	if (!conn) {
		conn = memalign(FI_IBV_RDM_MEM_ALIGNMENT, sizeof(*conn));
		if (!conn)
			return -FI_ENOMEM;

		memset(conn, 0, sizeof(struct fi_ibv_rdm_tagged_conn));

		conn->state = FI_VERBS_CONN_ALLOCATED;
		dlist_init(&conn->postponed_requests_head);
		fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn, ep);
		fi_ibv_rdm_conn_init_cm_role(conn, ep);

		FI_INFO(&fi_ibv_prov, FI_LOG_AV,
			"CONN REQUEST, NOT found in hash, new conn %p %d, addr %s:%u, HASH ADD\n",
			conn, conn->cm_role, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port));

		HASH_ADD(hh, fi_ibv_rdm_tagged_conn_hash, addr,
			 FI_IBV_RDM_DFLT_ADDRLEN, conn);
	} else {
		if (conn->cm_role != FI_VERBS_CM_ACTIVE) {
			/*
			 * Do it before rdma_create_qp since that call would
			 * modify event->param.conn.private_data buffer
			 */
			fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn, ep);
		}

		FI_INFO(&fi_ibv_prov, FI_LOG_AV,
			"CONN REQUEST, FOUND in hash, conn %p %d, addr %s:%u\n",
			conn, conn->cm_role, inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port));
	}

	if (conn->cm_role == FI_VERBS_CM_ACTIVE) {
		int rej_message = 0xdeadbeef;

		if (rdma_reject(id, &rej_message, sizeof(rej_message))) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_reject\n", errno);
			ret = -errno;
			if (rdma_destroy_id(id)) {
				VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id\n",
						 errno);
				ret = (ret == FI_SUCCESS) ? -errno : ret;
			}
		}

		if (conn->state == FI_VERBS_CONN_ALLOCATED) {
			ret = fi_ibv_rdm_start_connection(ep, conn);
			if (ret != FI_SUCCESS)
				goto err;
		}
	} else {
		const size_t idx =
			(conn->cm_role == FI_VERBS_CM_PASSIVE) ? 0 : 1;

		assert(conn->state == FI_VERBS_CONN_ALLOCATED ||
		       conn->state == FI_VERBS_CONN_STARTED);

		conn->state = FI_VERBS_CONN_STARTED;

		assert(conn->id[idx] == NULL);
		conn->id[idx] = id;

		ret = fi_ibv_rdm_prepare_conn_memory(ep, conn);
		if (ret != FI_SUCCESS)
			goto err;

		fi_ibv_rdm_tagged_init_qp_attributes(&qp_attr, ep);
		if (rdma_create_qp(id, ep->domain->pd, &qp_attr)) {
			ret = -errno;
			goto err;
		}
		conn->qp[idx] = id->qp;

		ret = fi_ibv_rdm_repost_receives(conn, ep, ep->rq_wr_depth);
		if (ret < 0) {
			VERBS_INFO(FI_LOG_AV, "repost receives failed\n");
			goto err;
		} else {
			ret = FI_SUCCESS;
		}

		id->context = conn;

		fi_ibv_rdm_pack_cm_params(&cm_params, conn, ep);

		if (rdma_accept(id, &cm_params)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_accept\n", errno);
			ret = -errno;
			goto err;
		}

		if (cm_params.private_data)
			free((void *) cm_params.private_data);
	}

	return ret;
err:
	/* ret err code is already set here, just cleanup resources */
	fi_ibv_rdm_conn_cleanup(conn);
	return ret;
}

static inline int fi_ibv_get_qp_cap(struct ibv_context *ctx,
				    struct ibv_device_attr *device_attr,
				    struct fi_info *info)
{
	struct ibv_pd *pd;
	struct ibv_cq *cq;
	struct ibv_qp *qp;
	struct ibv_qp_init_attr init_attr;
	int ret = 0;

	pd = ibv_alloc_pd(ctx);
	if (!pd) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_alloc_pd", errno);
		return -errno;
	}

	cq = ibv_create_cq(ctx, 1, NULL, NULL, 0);
	if (!cq) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_cq", errno);
		ret = -errno;
		goto err1;
	}

	/* TODO: serialize access to string buffers */
	fi_read_file(FI_CONF_DIR, "def_tx_ctx_size",
		     def_tx_ctx_size, sizeof def_tx_ctx_size);
	fi_read_file(FI_CONF_DIR, "def_rx_ctx_size",
		     def_rx_ctx_size, sizeof def_rx_ctx_size);
	fi_read_file(FI_CONF_DIR, "def_tx_iov_limit",
		     def_tx_iov_limit, sizeof def_tx_iov_limit);
	fi_read_file(FI_CONF_DIR, "def_rx_iov_limit",
		     def_rx_iov_limit, sizeof def_rx_iov_limit);
	fi_read_file(FI_CONF_DIR, "def_inject_size",
		     def_inject_size, sizeof def_inject_size);

	memset(&init_attr, 0, sizeof init_attr);
	init_attr.send_cq = cq;
	init_attr.recv_cq = cq;
	init_attr.cap.max_send_wr = MIN(atoi(def_tx_ctx_size),
					device_attr->max_qp_wr);
	init_attr.cap.max_recv_wr = MIN(atoi(def_rx_ctx_size),
					device_attr->max_qp_wr);
	init_attr.cap.max_send_sge = MIN(atoi(def_tx_iov_limit),
					 device_attr->max_sge);
	init_attr.cap.max_recv_sge = MIN(atoi(def_rx_iov_limit),
					 device_attr->max_sge);
	init_attr.cap.max_inline_data = atoi(def_inject_size);
	init_attr.qp_type = IBV_QPT_RC;

	qp = ibv_create_qp(pd, &init_attr);
	if (!qp) {
		VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_create_qp", errno);
		ret = -errno;
		goto err2;
	}

	info->tx_attr->inject_size = init_attr.cap.max_inline_data;
	info->tx_attr->iov_limit = init_attr.cap.max_send_sge;
	info->tx_attr->size = init_attr.cap.max_send_wr;

	info->rx_attr->iov_limit = init_attr.cap.max_recv_sge;
	/*
	 * On some HW, ibv_create_qp can report a max_recv_wr value larger
	 * than the device really supports, so align it with the device
	 * capability.
	 */
	info->rx_attr->size = MIN(init_attr.cap.max_recv_wr,
				  device_attr->max_qp_wr);

	ibv_destroy_qp(qp);
err2:
	ibv_destroy_cq(cq);
err1:
	ibv_dealloc_pd(pd);
	return ret;
}

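/*
 * Note (editorial): the def_* values above are plain-text defaults read with
 * fi_read_file() from FI_CONF_DIR and then clamped with MIN() against what
 * ibv_query_device() reported, so an out-of-range default cannot push the
 * probe QP past the device limits. For example, a hypothetical
 * "def_tx_iov_limit" file containing "64" would still be reduced to
 * device_attr->max_sge on a device that supports fewer SGEs.
 */
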
ssize_t fi_ibv_rdm_conn_cleanup(struct fi_ibv_rdm_tagged_conn *conn)
{
	ssize_t ret = FI_SUCCESS;
	ssize_t err = FI_SUCCESS;

	VERBS_DBG(FI_LOG_AV, "conn %p, exp = %lld unexp = %lld\n", conn,
		  conn->exp_counter, conn->unexp_counter);

	errno = 0;
	if (conn->id[0]) {
		rdma_destroy_qp(conn->id[0]);
		if (errno) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_qp\n", errno);
			ret = -errno;
		}
		if (rdma_destroy_id(conn->id[0])) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id\n", errno);
			if (ret == FI_SUCCESS)
				ret = -errno;
		}
	}

	if (conn->id[1]) {
		assert(conn->cm_role == FI_VERBS_CM_SELF);
		rdma_destroy_qp(conn->id[1]);
		if (errno) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_qp\n", errno);
			if (ret == FI_SUCCESS)
				ret = -errno;
		}
		if (rdma_destroy_id(conn->id[1])) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_destroy_id\n", errno);
			if (ret == FI_SUCCESS)
				ret = -errno;
		}
	}

	if (conn->s_mr) {
		err = fi_ibv_rdm_dereg_and_free(&conn->s_mr,
						&conn->sbuf_mem_reg);
		if ((err != FI_SUCCESS) && (ret == FI_SUCCESS))
			ret = err;
	}

	if (conn->r_mr) {
		err = fi_ibv_rdm_dereg_and_free(&conn->r_mr,
						&conn->rbuf_mem_reg);
		if ((err != FI_SUCCESS) && (ret == FI_SUCCESS))
			ret = err;
	}

	if (conn->ack_mr) {
		if (ibv_dereg_mr(conn->ack_mr)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "ibv_dereg_mr failed\n",
					 errno);
			if (ret == FI_SUCCESS)
				ret = -errno;
		}
	}

	if (conn->rma_mr) {
		err = fi_ibv_rdm_dereg_and_free(&conn->rma_mr,
						&conn->rmabuf_mem_reg);
		if ((err != FI_SUCCESS) && (ret == FI_SUCCESS))
			ret = err;
	}

	free(conn);
	return ret;
}

static int
fi_ibv_mr_reg(struct fid *fid, const void *buf, size_t len,
	      uint64_t access, uint64_t offset, uint64_t requested_key,
	      uint64_t flags, struct fid_mr **mr, void *context)
{
	struct fi_ibv_mem_desc *md;
	int fi_ibv_access = 0;
	struct fid_domain *domain;

	if (flags)
		return -FI_EBADFLAGS;

	if (fid->fclass != FI_CLASS_DOMAIN)
		return -FI_EINVAL;

	domain = container_of(fid, struct fid_domain, fid);

	md = calloc(1, sizeof *md);
	if (!md)
		return -FI_ENOMEM;

	md->domain = container_of(domain, struct fi_ibv_domain, domain_fid);
	md->mr_fid.fid.fclass = FI_CLASS_MR;
	md->mr_fid.fid.context = context;
	md->mr_fid.fid.ops = &fi_ibv_mr_ops;

	/* Enable local write access by default for FI_EP_RDM, which hides
	 * local registration requirements. This avoids buffering or double
	 * registration. */
	if (!(md->domain->info->caps & FI_LOCAL_MR))
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;

	/* Local read access to an MR is enabled by default in verbs */
	if (access & FI_RECV)
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;

	/* iWARP spec requires Remote Write access for an MR that is used
	 * as a data sink for a Remote Read */
	if (access & FI_READ) {
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;
		if (md->domain->verbs->device->transport_type ==
		    IBV_TRANSPORT_IWARP)
			fi_ibv_access |= IBV_ACCESS_REMOTE_WRITE;
	}

	if (access & FI_WRITE)
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE;

	if (access & FI_REMOTE_READ)
		fi_ibv_access |= IBV_ACCESS_REMOTE_READ;

	/* Verbs requires Local Write access too for Remote Write access */
	if (access & FI_REMOTE_WRITE)
		fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE |
				 IBV_ACCESS_REMOTE_WRITE |
				 IBV_ACCESS_REMOTE_ATOMIC;

	md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, fi_ibv_access);
	if (!md->mr)
		goto err;

	md->mr_fid.mem_desc = (void *) (uintptr_t) md->mr->lkey;
	md->mr_fid.key = md->mr->rkey;
	*mr = &md->mr_fid;

	if (md->domain->eq && (md->domain->eq_flags & FI_REG_MR)) {
		struct fi_eq_entry entry = {
			.fid = &md->mr_fid.fid,
			.context = context
		};
		fi_ibv_eq_write_event(md->domain->eq, FI_MR_COMPLETE,
				      &entry, sizeof(entry));
	}

	return 0;
err:
	free(md);
	return -errno;
}

static int
fi_ibv_mr_regv(struct fid *fid, const struct iovec *iov, size_t count,
	       uint64_t access, uint64_t offset, uint64_t requested_key,
	       uint64_t flags, struct fid_mr **mr, void *context)
{
	if (count > VERBS_MR_IOV_LIMIT) {
		VERBS_WARN(FI_LOG_FABRIC, "iov count > %d not supported\n",
			   VERBS_MR_IOV_LIMIT);
		return -FI_EINVAL;
	}
	return fi_ibv_mr_reg(fid, iov->iov_base, iov->iov_len, access, offset,
			     requested_key, flags, mr, context);
}

static int
fi_ibv_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
		  uint64_t flags, struct fid_mr **mr)
{
	return fi_ibv_mr_regv(fid, attr->mr_iov, attr->iov_count, attr->access,
			      0, attr->requested_key, flags, mr, attr->context);
}

static int
fi_ibv_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags)
{
	struct fi_ibv_domain *domain;
	struct fi_ibv_eq *eq;

	domain = container_of(fid, struct fi_ibv_domain, domain_fid.fid);

	switch (bfid->fclass) {
	case FI_CLASS_EQ:
		eq = container_of(bfid, struct fi_ibv_eq, eq_fid);
		domain->eq = eq;
		domain->eq_flags = flags;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static int
fi_ibv_domain_close(fid_t fid)
{
	struct fi_ibv_domain *domain;
	int ret;

	domain = container_of(fid, struct fi_ibv_domain, domain_fid.fid);

	if (domain->rdm) {
		rdma_destroy_ep(domain->rdm_cm->listener);
		free(domain->rdm_cm);
	}

	if (domain->pd) {
		ret = ibv_dealloc_pd(domain->pd);
		if (ret)
			return -ret;
		domain->pd = NULL;
	}

	fi_freeinfo(domain->info);
	free(domain);
	return 0;
}

static int
fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char *name)
{
	struct ibv_context **dev_list;
	int i, ret = -FI_ENODEV;

	if (!name)
		return -FI_EINVAL;

	dev_list = rdma_get_devices(NULL);
	if (!dev_list)
		return -errno;

	for (i = 0; dev_list[i] && ret; i++) {
		if (domain->rdm) {
			ret = strncmp(name,
				      ibv_get_device_name(dev_list[i]->device),
				      strlen(name) -
				      strlen(verbs_rdm_domain.suffix));
		} else {
			ret = strcmp(name,
				     ibv_get_device_name(dev_list[i]->device));
		}
		if (!ret)
			domain->verbs = dev_list[i];
	}
	rdma_free_devices(dev_list);
	return ret;
}

static struct fi_ops fi_ibv_fid_ops = {
	.size = sizeof(struct fi_ops),
	.close = fi_ibv_domain_close,
	.bind = fi_ibv_domain_bind,
	.control = fi_no_control,
	.ops_open = fi_no_ops_open,
};

static struct fi_ops_mr fi_ibv_domain_mr_ops = {
	.size = sizeof(struct fi_ops_mr),
	.reg = fi_ibv_mr_reg,
	.regv = fi_ibv_mr_regv,
	.regattr = fi_ibv_mr_regattr,
};

static struct fi_ops_domain fi_ibv_domain_ops = {
	.size = sizeof(struct fi_ops_domain),
	.av_open = fi_no_av_open,
	.cq_open = fi_ibv_cq_open,
	.endpoint = fi_ibv_open_ep,
	.scalable_ep = fi_no_scalable_ep,
	.cntr_open = fi_no_cntr_open,
	.poll_open = fi_no_poll_open,
	.stx_ctx = fi_no_stx_context,
	.srx_ctx = fi_ibv_srq_context,
};

static struct fi_ops_domain fi_ibv_rdm_domain_ops = {
	.size = sizeof(struct fi_ops_domain),
	.av_open = fi_ibv_rdm_av_open,
	.cq_open = fi_ibv_rdm_cq_open,
	.endpoint = fi_ibv_rdm_open_ep,
	.scalable_ep = fi_no_scalable_ep,
	.cntr_open = fi_rbv_rdm_cntr_open,
	.poll_open = fi_no_poll_open,
	.stx_ctx = fi_no_stx_context,
	.srx_ctx = fi_no_srx_context,
};

static int
fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info,
	      struct fid_domain **domain, void *context)
{
	struct fi_ibv_domain *_domain;
	struct fi_ibv_fabric *fab;
	struct fi_info *fi;
	int ret;

	fi = fi_ibv_get_verbs_info(info->domain_attr->name);
	if (!fi)
		return -FI_EINVAL;

	fab = container_of(fabric, struct fi_ibv_fabric,
			   util_fabric.fabric_fid);

	ret = ofi_check_domain_attr(&fi_ibv_prov, fabric->api_version,
				    fi->domain_attr, info->domain_attr);
	if (ret)
		return ret;

	_domain = calloc(1, sizeof *_domain);
	if (!_domain)
		return -FI_ENOMEM;

	_domain->info = fi_dupinfo(info);
	if (!_domain->info) {
		ret = -FI_ENOMEM;
		goto err1;
	}

	_domain->rdm = FI_IBV_EP_TYPE_IS_RDM(info);
	if (_domain->rdm) {
		_domain->rdm_cm = calloc(1, sizeof(*_domain->rdm_cm));
		if (!_domain->rdm_cm) {
			ret = -FI_ENOMEM;
			goto err2;
		}
	}

	ret = fi_ibv_open_device_by_name(_domain, info->domain_attr->name);
	if (ret)
		goto err2;

	_domain->pd = ibv_alloc_pd(_domain->verbs);
	if (!_domain->pd) {
		ret = -errno;
		goto err2;
	}

	_domain->domain_fid.fid.fclass = FI_CLASS_DOMAIN;
	_domain->domain_fid.fid.context = context;
	_domain->domain_fid.fid.ops = &fi_ibv_fid_ops;
	_domain->domain_fid.mr = &fi_ibv_domain_mr_ops;

	if (_domain->rdm) {
		_domain->domain_fid.ops = &fi_ibv_rdm_domain_ops;

		_domain->rdm_cm->ec = rdma_create_event_channel();
		if (!_domain->rdm_cm->ec) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				   "Failed to create listener event channel: %s\n",
				   strerror(errno));
			ret = -FI_EOTHER;
			goto err2;
		}

		if (fi_fd_nonblock(_domain->rdm_cm->ec->fd) != 0) {
			VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "fcntl", errno);
			ret = -FI_EOTHER;
			goto err3;
		}

		if (rdma_create_id(_domain->rdm_cm->ec,
				   &_domain->rdm_cm->listener, NULL,
				   RDMA_PS_TCP)) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				   "Failed to create cm listener: %s\n",
				   strerror(errno));
			ret = -FI_EOTHER;
			goto err3;
		}
		_domain->rdm_cm->is_bound = 0;
	} else {
		_domain->domain_fid.ops = &fi_ibv_domain_ops;
	}
	_domain->fab = fab;

	*domain = &_domain->domain_fid;
	return 0;
err3:
	if (_domain->rdm)
		rdma_destroy_event_channel(_domain->rdm_cm->ec);
err2:
	if (_domain->rdm)
		free(_domain->rdm_cm);
	fi_freeinfo(_domain->info);
err1:
	free(_domain);
	return ret;
}

static int
fi_ibv_trywait(struct fid_fabric *fabric, struct fid **fids, int count)
{
	struct fi_ibv_cq *cq;
	int ret, i;

	for (i = 0; i < count; i++) {
		switch (fids[i]->fclass) {
		case FI_CLASS_CQ:
			cq = container_of(fids[i], struct fi_ibv_cq,
					  cq_fid.fid);
			ret = cq->trywait(fids[i]);
			if (ret)
				return ret;
			break;
		case FI_CLASS_EQ:
			/* We are always ready to wait on an EQ since
			 * rdmacm EQ is based on an fd */
			continue;
		case FI_CLASS_CNTR:
		case FI_CLASS_WAIT:
			return -FI_ENOSYS;
		default:
			return -FI_EINVAL;
		}
	}
	return FI_SUCCESS;
}

static int
fi_ibv_fabric_close(fid_t fid)
{
	struct fi_ibv_fabric *fab;
	int ret;

	fab = container_of(fid, struct fi_ibv_fabric,
			   util_fabric.fabric_fid.fid);

	ret = ofi_fabric_close(&fab->util_fabric);
	if (ret)
		return ret;

	free(fab);
	return 0;
}

static struct fi_ops fi_ibv_fi_ops = {
	.size = sizeof(struct fi_ops),
	.close = fi_ibv_fabric_close,
	.bind = fi_no_bind,
	.control = fi_no_control,
	.ops_open = fi_no_ops_open,
};

static struct fi_ops_fabric fi_ibv_ops_fabric = {
	.size = sizeof(struct fi_ops_fabric),
	.domain = fi_ibv_domain,
	.passive_ep = fi_ibv_passive_ep,
	.eq_open = fi_ibv_eq_open,
	.wait_open = fi_no_wait_open,
	.trywait = fi_ibv_trywait
};

int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric,
		  void *context)
{
	struct fi_ibv_fabric *fab;
	struct fi_info *info;
	int ret;

	ret = fi_ibv_init_info();
	if (ret)
		return ret;

	fab = calloc(1, sizeof(*fab));
	if (!fab)
		return -FI_ENOMEM;

	for (info = verbs_info; info; info = info->next) {
		ret = ofi_fabric_init(&fi_ibv_prov, info->fabric_attr, attr,
				      &fab->util_fabric, context);
		if (ret != -FI_ENODATA)
			break;
	}
	if (ret) {
		free(fab);
		return ret;
	}

	*fabric = &fab->util_fabric.fabric_fid;
	(*fabric)->fid.ops = &fi_ibv_fi_ops;
	(*fabric)->ops = &fi_ibv_ops_fabric;

	return 0;
}

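/*
 * Illustrative only: the usual application-side sequence that ends up in
 * fi_ibv_fabric() and fi_ibv_domain() above, using the public libfabric API
 * (error handling omitted, API version chosen arbitrarily). fi_fabric() maps
 * to fi_ibv_fabric() and fi_domain() to fi_ibv_domain():
 *
 *	struct fi_info *hints = fi_allocinfo(), *info;
 *	struct fid_fabric *fabric;
 *	struct fid_domain *domain;
 *
 *	hints->fabric_attr->prov_name = strdup("verbs");
 *	fi_getinfo(FI_VERSION(1, 5), NULL, NULL, 0, hints, &info);
 *	fi_fabric(info->fabric_attr, &fabric, NULL);
 *	fi_domain(fabric, info, &domain, NULL);
 *	...
 *	fi_close(&domain->fid);
 *	fi_close(&fabric->fid);
 *	fi_freeinfo(info);
 *	fi_freeinfo(hints);
 */
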
ssize_t fi_ibv_rdm_cm_progress(struct fi_ibv_rdm_ep *ep)
{
	struct rdma_cm_event *event = NULL;
	void *data = NULL;
	ssize_t ret = FI_SUCCESS;

	if (rdma_get_cm_event(ep->cm.ec, &event)) {
		if (errno == EAGAIN) {
			errno = 0;
			usleep(FI_IBV_RDM_CM_THREAD_TIMEOUT);
			return FI_SUCCESS;
		} else {
			VERBS_INFO_ERRNO(FI_LOG_AV,
					 "rdma_get_cm_event failed\n", errno);
			ret = -errno;
		}
	}

	while (ret == FI_SUCCESS && event) {
		struct rdma_cm_event event_copy;

		pthread_mutex_lock(&ep->cm_lock);

		memcpy(&event_copy, event, sizeof(*event));

		if (event->param.conn.private_data_len) {
			data = malloc(event->param.conn.private_data_len);
			if (!data) {
				pthread_mutex_unlock(&ep->cm_lock);
				ret = -FI_ENOMEM;
				break;
			}
			memcpy(data, event->param.conn.private_data,
			       event->param.conn.private_data_len);
			event_copy.param.conn.private_data = data;
			event_copy.param.conn.private_data_len =
				event->param.conn.private_data_len;
		}

		if (rdma_ack_cm_event(event)) {
			VERBS_INFO_ERRNO(FI_LOG_AV,
					 "rdma_ack_cm_event failed\n", errno);
			ret = -errno;
		}

		if (ret == FI_SUCCESS)
			ret = fi_ibv_rdm_process_event(&event_copy, ep);

		free(data);
		data = NULL;
		event = NULL;

		pthread_mutex_unlock(&ep->cm_lock);

		if (ret != FI_SUCCESS)
			break;

		if (rdma_get_cm_event(ep->cm.ec, &event)) {
			if (errno == EAGAIN) {
				errno = 0;
				usleep(FI_IBV_RDM_CM_THREAD_TIMEOUT);
				break;
			} else {
				VERBS_INFO_ERRNO(FI_LOG_AV,
						 "rdma_get_cm_event failed\n",
						 errno);
				ret = -errno;
			}
		}
	}

	return ret;
}

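/*
 * Illustrative only: fi_ibv_rdm_cm_progress() is meant to be driven
 * repeatedly, for example from a dedicated CM thread. The loop below is a
 * sketch under assumed names ("stop_cm_thread" is hypothetical, not a real
 * field or global of this provider):
 *
 *	static void *cm_progress_thread(void *arg)
 *	{
 *		struct fi_ibv_rdm_ep *ep = arg;
 *
 *		while (!stop_cm_thread) {
 *			if (fi_ibv_rdm_cm_progress(ep) != FI_SUCCESS)
 *				break;
 *		}
 *		return NULL;
 *	}
 */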