/* Handle RDMA_CM_EVENT_ADDR_RESOLVED for a connection: create its QP,
 * prepare connection memory and prepost receives (active side only),
 * then kick off route resolution.
 * Returns FI_SUCCESS or a negative fabric error code; on failure after
 * QP creation, the QP is destroyed before returning. */
static ssize_t
fi_ibv_rdm_process_addr_resolved(struct rdma_cm_id *id,
				 struct fi_ibv_rdm_ep *ep)
{
	ssize_t ret = FI_SUCCESS;
	struct ibv_qp_init_attr qp_attr;
	struct fi_ibv_rdm_tagged_conn *conn = id->context;

	VERBS_INFO(FI_LOG_AV, "ADDR_RESOLVED conn %p, addr %s:%u\n",
		   conn, inet_ntoa(conn->addr.sin_addr),
		   ntohs(conn->addr.sin_port));

	assert(id->verbs == ep->domain->verbs);

	/* do { } while (0) used as a structured "skip the rest" construct:
	 * passive/self roles break out early but still resolve the route. */
	do {
		fi_ibv_rdm_tagged_init_qp_attributes(&qp_attr, ep);
		if (rdma_create_qp(id, ep->domain->pd, &qp_attr)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_create_qp failed\n",
					 errno);
			return -errno;
		}

		/* Passive side: remaining setup happens when the connect
		 * request arrives; only the QP is needed here. */
		if (conn->cm_role == FI_VERBS_CM_PASSIVE) {
			break;
		}

		conn->qp[0] = id->qp;
		assert(conn->id[0] == id);
		/* Loopback (self) connection: skip per-connection memory and
		 * receive preposting. */
		if (conn->cm_role == FI_VERBS_CM_SELF) {
			break;
		}

		ret = fi_ibv_rdm_prepare_conn_memory(ep, conn);
		if (ret != FI_SUCCESS) {
			goto err;
		}

		/* Prepost the full receive-queue depth for this connection. */
		ret = fi_ibv_rdm_repost_receives(conn, ep, ep->rq_wr_depth);
		if (ret < 0) {
			VERBS_INFO(FI_LOG_AV, "repost receives failed\n");
			goto err;
		} else {
			/* repost returns a positive count on success;
			 * normalize to FI_SUCCESS. */
			ret = FI_SUCCESS;
		}
	} while (0);

	if (rdma_resolve_route(id, FI_IBV_RDM_CM_RESOLVEADDR_TIMEOUT)) {
		VERBS_INFO(FI_LOG_AV, "rdma_resolve_route failed\n");
		ret = -FI_EHOSTUNREACH;
		goto err;
	}

	return ret;
err:
	/* Tear down the QP created above; the cm_id is owned by the caller. */
	rdma_destroy_qp(id);
	return ret;
}
/* Build an fi_info describing an incoming connection request (CM event).
 * The returned info carries a heap-allocated fi_ibv_connreq in info->handle
 * that owns event->id; ownership of the info passes to the caller.
 * Returns NULL on any allocation/lookup failure. */
static struct fi_info *
fi_ibv_eq_cm_getinfo(struct fi_ibv_fabric *fab, struct rdma_cm_event *event,
		     struct fi_info *pep_info)
{
	struct fi_info *info, *fi;
	struct fi_ibv_connreq *connreq;
	const char *devname = ibv_get_device_name(event->id->verbs->device);

	/* The request may arrive on a different device than the fabric's
	 * default; look up the matching cached info in that case. */
	if (strcmp(devname, fab->info->domain_attr->name)) {
		fi = fi_ibv_get_verbs_info(fab->all_infos, devname);
		if (!fi)
			return NULL;
	} else {
		fi = fab->info;
	}

	info = fi_dupinfo(fi);
	if (!info)
		return NULL;

	info->fabric_attr->fabric = &fab->util_fabric.fabric_fid;
	if (!(info->fabric_attr->prov_name = strdup(VERBS_PROV_NAME)))
		goto err;

	/* Filter/adjust the duplicated info against the passive EP's info. */
	ofi_alter_info(info, pep_info, fab->util_fabric.fabric_fid.api_version);

	info->src_addrlen = fi_ibv_sockaddr_len(rdma_get_local_addr(event->id));
	if (!(info->src_addr = malloc(info->src_addrlen)))
		goto err;
	memcpy(info->src_addr, rdma_get_local_addr(event->id),
	       info->src_addrlen);

	info->dest_addrlen = fi_ibv_sockaddr_len(rdma_get_peer_addr(event->id));
	if (!(info->dest_addr = malloc(info->dest_addrlen)))
		goto err;
	memcpy(info->dest_addr, rdma_get_peer_addr(event->id),
	       info->dest_addrlen);

	/* NOTE(review): logging assumes IPv4 (sockaddr_in) addresses. */
	VERBS_INFO(FI_LOG_CORE, "src_addr: %s:%d\n",
		   inet_ntoa(((struct sockaddr_in *)info->src_addr)->sin_addr),
		   ntohs(((struct sockaddr_in *)info->src_addr)->sin_port));
	VERBS_INFO(FI_LOG_CORE, "dst_addr: %s:%d\n",
		   inet_ntoa(((struct sockaddr_in *)info->dest_addr)->sin_addr),
		   ntohs(((struct sockaddr_in *)info->dest_addr)->sin_port));

	connreq = calloc(1, sizeof *connreq);
	if (!connreq)
		goto err;

	connreq->handle.fclass = FI_CLASS_CONNREQ;
	connreq->id = event->id;
	info->handle = &connreq->handle;
	return info;
err:
	/* fi_freeinfo releases all partially-attached allocations above. */
	fi_freeinfo(info);
	return NULL;
}
/* Validate user-supplied hints against a provider info structure.
 * Checks are ordered: caps, mode bits, then optional fabric/domain/ep/rx/tx
 * attributes. Returns 0 on success or -FI_ENODATA (or the sub-check's error)
 * when the hints cannot be satisfied. */
static int fi_ibv_check_hints(uint32_t version, const struct fi_info *hints,
			      const struct fi_info *info)
{
	int ret;
	uint64_t prov_mode;

	/* Every requested capability must be supported. */
	if (hints->caps & ~(info->caps)) {
		VERBS_INFO(FI_LOG_CORE, "Unsupported capabilities\n");
		FI_INFO_CHECK(&fi_ibv_prov, info, hints, caps, FI_TYPE_CAPS);
		return -FI_ENODATA;
	}

	/* The app must accept all mode bits the provider requires
	 * (version-dependent for MR modes). */
	prov_mode = ofi_mr_get_prov_mode(version, hints, info);
	if ((hints->mode & prov_mode) != prov_mode) {
		VERBS_INFO(FI_LOG_CORE, "needed mode not set\n");
		FI_INFO_MODE(&fi_ibv_prov, prov_mode, hints->mode);
		return -FI_ENODATA;
	}

	if (hints->fabric_attr) {
		ret = ofi_check_fabric_attr(&fi_ibv_prov, info->fabric_attr,
					    hints->fabric_attr);
		if (ret)
			return ret;
	}

	if (hints->domain_attr) {
		ret = ofi_check_domain_attr(&fi_ibv_prov, version,
					    info->domain_attr,
					    hints->domain_attr);
		if (ret)
			return ret;
	}

	if (hints->ep_attr) {
		ret = fi_ibv_check_ep_attr(hints->ep_attr, info);
		if (ret)
			return ret;
	}

	if (hints->rx_attr) {
		ret = fi_ibv_check_rx_attr(hints->rx_attr, hints, info);
		if (ret)
			return ret;
	}

	if (hints->tx_attr) {
		ret = fi_ibv_check_tx_attr(hints->tx_attr, hints, info);
		if (ret)
			return ret;
	}

	return 0;
}
/* Open a message endpoint on a verbs domain.
 * Validates that info matches the domain's device and cached verbs info,
 * allocates the endpoint, then attaches/creates the rdma_cm_id based on
 * info->handle (none => new id; FI_CLASS_CONNREQ => adopt the request's id).
 * NOTE(review): this definition continues beyond the visible chunk; the
 * FI_CLASS_PEP branch and the remainder are not shown here. */
int fi_ibv_open_ep(struct fid_domain *domain, struct fi_info *info,
		   struct fid_ep **ep, void *context)
{
	struct fi_ibv_domain *dom;
	struct fi_ibv_msg_ep *_ep;
	struct fi_ibv_connreq *connreq;
	struct fi_ibv_pep *pep;
	struct fi_info *fi;
	int ret;

	dom = container_of(domain, struct fi_ibv_domain, domain_fid);
	/* The requested domain name must match the device this domain opened. */
	if (strcmp(dom->verbs->device->name, info->domain_attr->name)) {
		VERBS_INFO(FI_LOG_DOMAIN, "Invalid info->domain_attr->name\n");
		return -FI_EINVAL;
	}

	fi = fi_ibv_get_verbs_info(info->domain_attr->name);
	if (!fi) {
		VERBS_INFO(FI_LOG_DOMAIN,
			   "Unable to find matching verbs_info\n");
		return -FI_EINVAL;
	}

	/* Validate optional attribute sets against the cached provider info. */
	if (info->ep_attr) {
		ret = fi_ibv_check_ep_attr(info->ep_attr, fi);
		if (ret)
			return ret;
	}
	if (info->tx_attr) {
		ret = fi_ibv_check_tx_attr(info->tx_attr, info, fi);
		if (ret)
			return ret;
	}
	if (info->rx_attr) {
		ret = fi_ibv_check_rx_attr(info->rx_attr, info, fi);
		if (ret)
			return ret;
	}

	_ep = fi_ibv_alloc_msg_ep(info);
	if (!_ep)
		return -FI_ENOMEM;

	if (!info->handle) {
		/* No parent object: create a fresh cm_id for this endpoint. */
		ret = fi_ibv_create_ep(NULL, NULL, 0, info, NULL, &_ep->id);
		if (ret)
			goto err;
	} else if (info->handle->fclass == FI_CLASS_CONNREQ) {
		/* Adopt the cm_id carried by the connection request. */
		connreq = container_of(info->handle, struct fi_ibv_connreq,
				       handle);
		_ep->id = connreq->id;
	} else if (info->handle->fclass == FI_CLASS_PEP) {
/* Handle RDMA_CM_EVENT_REJECTED.
 * A reject carrying the magic 0xdeadbeef payload (or, on iWarp where
 * private_data may be absent, a passive-side -ECONNREFUSED) is the expected
 * "duplicate connection" teardown: destroy the QP and cm_id. Anything else
 * is an unexpected reject and marks the connection rejected. */
static ssize_t
fi_ibv_rdm_process_event_rejected(struct fi_ibv_rdm_ep *ep,
				  struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;
	const int *pdata = event->param.conn.private_data;

	if ((pdata && *pdata == 0xdeadbeef) ||
	    /*
	     * TODO: this is a workaround of the case when private_data is not
	     * arriving from rdma_reject call on iWarp devices
	     */
	    (conn->cm_role == FI_VERBS_CM_PASSIVE &&
	     event->status == -ECONNREFUSED)) {
		/* rdma_destroy_qp returns void; errno signals failure. */
		errno = 0;
		rdma_destroy_qp(event->id);
		if (errno) {
			VERBS_INFO_ERRNO(FI_LOG_AV,
					 "rdma_destroy_qp failed\n", errno);
			ret = -errno;
		}
		if (rdma_destroy_id(event->id)) {
			VERBS_INFO_ERRNO(FI_LOG_AV,
					 "rdma_destroy_id failed\n", errno);
			/* Keep the first failure code. */
			if (ret == FI_SUCCESS)
				ret = -errno;
		}
		VERBS_INFO(FI_LOG_AV,
			   "Rejected from conn %p, addr %s:%u, cm_role %d, status %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port),
			   conn->cm_role, event->status);
	} else {
		VERBS_INFO(FI_LOG_AV,
			   "Unexpected REJECT from conn %p, addr %s:%u, cm_role %d, "
			   "msg len %d, msg %x, status %d, err %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port), conn->cm_role,
			   event->param.conn.private_data_len,
			   event->param.conn.private_data ?
			   *(int *)event->param.conn.private_data : 0,
			   event->status, errno);
		conn->state = FI_VERBS_CONN_REJECTED;
	}
	return ret;
}
/* Handle RDMA_CM_EVENT_ROUTE_RESOLVED: pack the connection parameters
 * (including provider private data) and issue rdma_connect.
 * Returns FI_SUCCESS or -errno on connect failure. */
static ssize_t
fi_ibv_rdm_process_route_resolved(struct rdma_cm_event *event,
				  struct fi_ibv_rdm_ep *ep)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;
	struct rdma_conn_param cm_params;

	/* Allocates cm_params.private_data; freed on failure below.
	 * NOTE(review): on success, ownership of private_data presumably
	 * passes to the established-event path — confirm against
	 * fi_ibv_rdm_pack_cm_params. */
	fi_ibv_rdm_pack_cm_params(&cm_params, conn, ep);

	VERBS_INFO(FI_LOG_AV, "ROUTE RESOLVED, conn %p, addr %s:%u\n",
		   conn, inet_ntoa(conn->addr.sin_addr),
		   ntohs(conn->addr.sin_port));

	if (rdma_connect(event->id, &cm_params)) {
		VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_connect failed\n", errno);
		ret = -errno;
		free((void *)cm_params.private_data);
		/* Treated as fatal in debug builds. */
		assert(0);
	}

	return ret;
}
static ssize_t fi_ibv_rdm_process_event_established(struct rdma_cm_event *event, struct fi_ibv_rdm_ep *ep) { struct fi_ibv_rdm_tagged_conn *conn = (struct fi_ibv_rdm_tagged_conn *)event->id->context; if (conn->state != FI_VERBS_CONN_STARTED && conn->cm_role != FI_VERBS_CM_SELF) { VERBS_INFO(FI_LOG_AV, "state = %d, conn %p", conn->state, conn); assert(0 && "Wrong state"); return -FI_ECONNABORTED; } if (conn->cm_role == FI_VERBS_CM_ACTIVE || conn->cm_role == FI_VERBS_CM_SELF) { fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn, ep); } FI_INFO(&fi_ibv_prov, FI_LOG_AV, "CONN ESTABLISHED, conn %p, addr %s:%u\n", conn, inet_ntoa(conn->addr.sin_addr), ntohs(conn->addr.sin_port)); /* Do not count self twice */ if (conn->state != FI_VERBS_CONN_ESTABLISHED) { ep->num_active_conns++; conn->state = FI_VERBS_CONN_ESTABLISHED; } return FI_SUCCESS; }
/* Translate an fi_info plus getinfo flags into an rdma_addrinfo suitable
 * for rdma_getaddrinfo. The rai starts zeroed; address buffers are
 * heap-copied and owned by the caller (freed via rdma_freeaddrinfo or
 * manually). Returns 0 or -FI_ENOMEM. */
int fi_ibv_fi_to_rai(const struct fi_info *fi, uint64_t flags,
		     struct rdma_addrinfo *rai)
{
	memset(rai, 0, sizeof *rai);
	if (flags & FI_SOURCE)
		rai->ai_flags = RAI_PASSIVE;
	if (flags & FI_NUMERICHOST)
		rai->ai_flags |= RAI_NUMERICHOST;

	/* The provider only uses reliable-connected QPs over the TCP
	 * port space. */
	rai->ai_qp_type = IBV_QPT_RC;
	rai->ai_port_space = RDMA_PS_TCP;

	if (!fi)
		return 0;

	/* Map the libfabric address format onto an address family. */
	switch(fi->addr_format) {
	case FI_SOCKADDR_IN:
		rai->ai_family = AF_INET;
		rai->ai_flags |= RAI_FAMILY;
		break;
	case FI_SOCKADDR_IN6:
		rai->ai_family = AF_INET6;
		rai->ai_flags |= RAI_FAMILY;
		break;
	case FI_SOCKADDR_IB:
		rai->ai_family = AF_IB;
		rai->ai_flags |= RAI_FAMILY;
		break;
	case FI_SOCKADDR:
		/* Generic sockaddr: infer the family from whichever address
		 * (source preferred) is present. */
		if (fi->src_addrlen) {
			rai->ai_family =
				((struct sockaddr *)fi->src_addr)->sa_family;
			rai->ai_flags |= RAI_FAMILY;
		} else if (fi->dest_addrlen) {
			rai->ai_family =
				((struct sockaddr *)fi->dest_addr)->sa_family;
			rai->ai_flags |= RAI_FAMILY;
		}
		break;
	case FI_FORMAT_UNSPEC:
		break;
	default:
		/* Unknown formats are logged but not fatal; the family is
		 * simply left unset. */
		VERBS_INFO(FI_LOG_FABRIC, "Unknown fi->addr_format\n");
	}

	if (fi->src_addrlen) {
		if (!(rai->ai_src_addr = malloc(fi->src_addrlen)))
			return -FI_ENOMEM;
		memcpy(rai->ai_src_addr, fi->src_addr, fi->src_addrlen);
		rai->ai_src_len = fi->src_addrlen;
	}
	if (fi->dest_addrlen) {
		if (!(rai->ai_dst_addr = malloc(fi->dest_addrlen)))
			return -FI_ENOMEM;
		memcpy(rai->ai_dst_addr, fi->dest_addr, fi->dest_addrlen);
		rai->ai_dst_len = fi->dest_addrlen;
	}

	return 0;
}
/* Bind an RDM endpoint to the shared CM listener.
 * Copies the endpoint's source address (if given), binds + starts listening
 * on first use of the CM, and fills in the listener's ephemeral port when
 * the endpoint did not request one. IPv4 (AF_INET) only. */
int fi_ibv_rdm_cm_bind_ep(struct fi_ibv_rdm_cm *cm, struct fi_ibv_rdm_ep *ep)
{
	char my_ipoib_addr_str[INET6_ADDRSTRLEN];

	assert(cm->ec && cm->listener);

	if (ep->info->src_addr) {
		memcpy(&ep->my_addr, ep->info->src_addr, sizeof(ep->my_addr));
		/* NOTE(review): buffer is INET6_ADDRSTRLEN but the size passed
		 * is INET_ADDRSTRLEN — fine for the AF_INET-only case asserted
		 * below, but would truncate IPv6 output. */
		inet_ntop(ep->my_addr.sin_family,
			  &ep->my_addr.sin_addr.s_addr,
			  my_ipoib_addr_str, INET_ADDRSTRLEN);
	} else {
		strcpy(my_ipoib_addr_str, "undefined");
	}

	VERBS_INFO(FI_LOG_EP_CTRL, "My IPoIB: %s\n", my_ipoib_addr_str);

	/* The listener is shared; only the first endpoint binds/listens. */
	if (!cm->is_bound) {
		errno = 0;
		if (rdma_bind_addr(cm->listener,
				   (struct sockaddr *)&ep->my_addr)) {
			VERBS_INFO(FI_LOG_EP_CTRL,
				   "Failed to bind cm listener to my IPoIB addr %s: %s\n",
				   my_ipoib_addr_str, strerror(errno));
			return -FI_EOTHER;
		}
		if (rdma_listen(cm->listener, 1024)) {
			VERBS_INFO(FI_LOG_EP_CTRL, "rdma_listen failed: %s\n",
				   strerror(errno));
			return -FI_EOTHER;
		}
		cm->is_bound = 1;
	}

	/* Ephemeral port requested: adopt whatever the listener was given. */
	if (!ep->my_addr.sin_port) {
		ep->my_addr.sin_port = rdma_get_src_port(cm->listener);
	}

	assert(ep->my_addr.sin_family == AF_INET);

	VERBS_INFO(FI_LOG_EP_CTRL, "My ep_addr: %s:%u\n",
		   inet_ntoa(ep->my_addr.sin_addr),
		   ntohs(ep->my_addr.sin_port));

	return FI_SUCCESS;
}
/* Handle RDMA_CM_EVENT_REJECTED (tagged-conn variant).
 * A reject carrying the 0xdeadbeef magic is the expected duplicate-connection
 * resolution: the passive side tears down its QP and cm_id. Any other reject
 * is unexpected and marks the connection rejected. */
static ssize_t
fi_ibv_rdm_process_event_rejected(struct fi_ibv_rdm_ep *ep,
				  struct rdma_cm_event *event)
{
	struct fi_ibv_rdm_tagged_conn *conn = event->id->context;
	ssize_t ret = FI_SUCCESS;

	if (NULL != event->param.conn.private_data &&
	    *((int *)event->param.conn.private_data) == 0xdeadbeef ) {
		/* Only the passive side of a duplicate connection is
		 * rejected with the magic payload. */
		assert(conn->cm_role == FI_VERBS_CM_PASSIVE);
		/* rdma_destroy_qp returns void; errno signals failure. */
		errno = 0;
		rdma_destroy_qp(event->id);
		if (errno) {
			VERBS_INFO_ERRNO(FI_LOG_AV,
					 "rdma_destroy_qp failed\n", errno);
			ret = -errno;
		}
		if (rdma_destroy_id(event->id)) {
			VERBS_INFO_ERRNO(FI_LOG_AV,
					 "rdma_destroy_id failed\n", errno);
			/* Keep the first failure code. */
			if (ret == FI_SUCCESS)
				ret = -errno;
		}
		VERBS_INFO(FI_LOG_AV,
			   "Rejected from conn %p, addr %s:%u, cm_role %d, status %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port),
			   conn->cm_role, event->status);
	} else {
		VERBS_INFO(FI_LOG_AV,
			   "Unexpected REJECT from conn %p, addr %s:%u, cm_role %d, msg len %d, msg %x, status %d\n",
			   conn, inet_ntoa(conn->addr.sin_addr),
			   ntohs(conn->addr.sin_port), conn->cm_role,
			   event->param.conn.private_data_len,
			   event->param.conn.private_data ?
			   *(int *)event->param.conn.private_data : 0,
			   event->status);
		conn->state = FI_VERBS_CONN_REJECTED;
	}
	return ret;
}
/* Validate a requested fi_tx_attr against the provider's supported tx
 * attributes. hints->mode is used as a fallback when attr->mode is 0.
 * Returns 0 if every requested value fits, else -FI_ENODATA. */
int fi_ibv_check_tx_attr(const struct fi_tx_attr *attr,
			 const struct fi_info *hints,
			 const struct fi_info *info)
{
	if (attr->caps & ~(info->tx_attr->caps)) {
		VERBS_INFO(FI_LOG_CORE, "Given tx_attr->caps not supported\n");
		FI_INFO_CHECK(&fi_ibv_prov, (info->tx_attr), attr, caps,
			      FI_TYPE_CAPS);
		return -FI_ENODATA;
	}

	/* The app must accept all provider-required mode bits; an unset
	 * attr->mode defers to the top-level hints->mode. */
	if (((attr->mode ? attr->mode : hints->mode) &
	     info->tx_attr->mode) != info->tx_attr->mode) {
		size_t user_mode = (attr->mode ? attr->mode : hints->mode);
		VERBS_INFO(FI_LOG_CORE, "Given tx_attr->mode not supported\n");
		FI_INFO_MODE(&fi_ibv_prov, info->tx_attr->mode, user_mode);
		return -FI_ENODATA;
	}

	if (attr->op_flags & ~(info->tx_attr->op_flags)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->op_flags not supported\n");
		return -FI_ENODATA;
	}

	if (attr->msg_order & ~(info->tx_attr->msg_order)) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->msg_order not supported\n");
		return -FI_ENODATA;
	}

	if (attr->size > info->tx_attr->size) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->size is greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, (info->tx_attr), attr, size);
		return -FI_ENODATA;
	}

	if (attr->iov_limit > info->tx_attr->iov_limit) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->iov_limit greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, (info->tx_attr), attr,
				  iov_limit);
		return -FI_ENODATA;
	}

	if (attr->rma_iov_limit > info->tx_attr->rma_iov_limit) {
		VERBS_INFO(FI_LOG_CORE,
			   "Given tx_attr->rma_iov_limit greater than supported\n");
		FI_INFO_CHECK_VAL(&fi_ibv_prov, (info->tx_attr), attr,
				  rma_iov_limit);
		return -FI_ENODATA;
	}

	return 0;
}
/* Debug helper: dump an XRC endpoint's connection state (cm_ids, SRQ
 * numbers, addresses, QP numbers) at FI_LOG_INFO level. No-op when
 * INFO-level fabric logging is disabled. */
void fi_ibv_log_ep_conn(struct fi_ibv_xrc_ep *ep, char *desc)
{
	struct sockaddr *addr;
	char buf[OFI_ADDRSTRLEN];
	size_t len = sizeof(buf);

	/* Skip all formatting work when the log level filters it out. */
	if (!fi_log_enabled(&fi_ibv_prov, FI_LOG_INFO, FI_LOG_FABRIC))
		return;

	VERBS_INFO(FI_LOG_FABRIC, "EP %p, %s\n", ep, desc);
	VERBS_INFO(FI_LOG_FABRIC,
		   "EP %p, CM ID %p, TGT CM ID %p, SRQN %d Peer SRQN %d\n",
		   ep, ep->base_ep.id, ep->tgt_id, ep->srqn, ep->peer_srqn);

	assert(ep->base_ep.id);

	addr = rdma_get_local_addr(ep->base_ep.id);
	if (addr) {
		ofi_straddr(buf, &len, ep->base_ep.info->addr_format, addr);
		VERBS_INFO(FI_LOG_FABRIC, "EP %p src_addr: %s\n", ep, buf);
	}

	addr = rdma_get_peer_addr(ep->base_ep.id);
	if (addr) {
		/* ofi_straddr updated len above; reset for the second call. */
		len = sizeof(buf);
		ofi_straddr(buf, &len, ep->base_ep.info->addr_format, addr);
		VERBS_INFO(FI_LOG_FABRIC, "EP %p dst_addr: %s\n", ep, buf);
	}

	if (ep->base_ep.ibv_qp) {
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, INI QP Num %d\n",
			   ep, ep->base_ep.ibv_qp->qp_num);
		/* NOTE(review): assumes ini_conn is valid whenever ibv_qp is
		 * set — confirm against the XRC connection setup path. */
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, Remote TGT QP Num %d\n",
			   ep, ep->ini_conn->tgt_qpn);
	}
	if (ep->tgt_ibv_qp)
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, TGT QP Num %d\n",
			   ep, ep->tgt_ibv_qp->qp_num);
	if (ep->conn_setup && ep->conn_setup->rsvd_ini_qpn)
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, Reserved INI QPN %d\n",
			   ep, ep->conn_setup->rsvd_ini_qpn->qp_num);
	if (ep->conn_setup && ep->conn_setup->rsvd_tgt_qpn)
		VERBS_INFO(FI_LOG_FABRIC, "EP %p, Reserved TGT QPN %d\n",
			   ep, ep->conn_setup->rsvd_tgt_qpn->qp_num);
}
int fi_ibv_init_info(void) { struct ibv_context **ctx_list; struct fi_info *fi = NULL, *tail = NULL; int ret = 0, i, num_devices; if (verbs_info) return 0; pthread_mutex_lock(&verbs_info_lock); if (verbs_info) goto unlock; if (!fi_ibv_have_device()) { VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n"); ret = -FI_ENODATA; goto unlock; } ctx_list = rdma_get_devices(&num_devices); if (!num_devices) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", errno); ret = -errno; goto unlock; } for (i = 0; i < num_devices; i++) { ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_msg_domain); if (!ret) { if (!verbs_info) verbs_info = fi; else tail->next = fi; tail = fi; ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_rdm_domain); if (!ret) { tail->next = fi; tail = fi; } } } ret = verbs_info ? 0 : ret; rdma_free_devices(ctx_list); unlock: pthread_mutex_unlock(&verbs_info_lock); return ret; }
/* Initialize the RDM connection manager: create a non-blocking event
 * channel, a listening cm_id, resolve the local IPoIB address, and bind
 * the listener. IPv4 (AF_INET) only. Returns FI_SUCCESS or a negative
 * fabric error; partially-created resources are left for the caller's
 * teardown path. */
static int fi_ibv_rdm_cm_init(struct fi_ibv_rdm_cm* cm,
			      const struct rdma_addrinfo* rai)
{
	struct sockaddr_in* src_addr = (struct sockaddr_in*)rai->ai_src_addr;

	cm->ec = rdma_create_event_channel();
	if (!cm->ec) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			   "Failed to create listener event channel: %s\n",
			   strerror(errno));
		return -FI_EOTHER;
	}

	/* CM events are polled; the channel fd must not block. */
	if (fi_fd_nonblock(cm->ec->fd) != 0) {
		VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "fcntl", errno);
		return -FI_EOTHER;
	}

	if (rdma_create_id(cm->ec, &cm->listener, NULL, RDMA_PS_TCP)) {
		VERBS_INFO(FI_LOG_EP_CTRL, "Failed to create cm listener: %s\n",
			   strerror(errno));
		return -FI_EOTHER;
	}

	/* Fills cm->my_addr with the matching IPoIB interface address. */
	if (fi_ibv_rdm_find_ipoib_addr(src_addr, cm)) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			   "Failed to find correct IPoIB address\n");
		return -FI_ENODEV;
	}

	/* Keep the caller-requested port (may be 0 for ephemeral). */
	cm->my_addr.sin_port = src_addr->sin_port;

	char my_ipoib_addr_str[INET6_ADDRSTRLEN];
	/* NOTE(review): size arg is INET_ADDRSTRLEN though the buffer is
	 * larger — fine for the AF_INET-only case asserted below. */
	inet_ntop(cm->my_addr.sin_family, &cm->my_addr.sin_addr.s_addr,
		  my_ipoib_addr_str, INET_ADDRSTRLEN);

	VERBS_INFO(FI_LOG_EP_CTRL, "My IPoIB: %s\n", my_ipoib_addr_str);

	if (rdma_bind_addr(cm->listener, (struct sockaddr *)&cm->my_addr)) {
		VERBS_INFO(FI_LOG_EP_CTRL,
			   "Failed to bind cm listener to my IPoIB addr %s: %s\n",
			   my_ipoib_addr_str, strerror(errno));
		return -FI_EOTHER;
	}

	/* Ephemeral port: adopt the one the kernel assigned. */
	if (!cm->my_addr.sin_port) {
		cm->my_addr.sin_port = rdma_get_src_port(cm->listener);
	}

	assert(cm->my_addr.sin_family == AF_INET);

	VERBS_INFO(FI_LOG_EP_CTRL, "My ep_addr: %s:%u\n",
		   inet_ntoa(cm->my_addr.sin_addr),
		   ntohs(cm->my_addr.sin_port));

	return FI_SUCCESS;
}
/* Dispatch one rdmacm event to its handler.
 * Error events deliberately cascade: each error case sets its code only if
 * none was set yet, then falls through to the default, which logs the
 * unexpected event and guarantees a non-success return. */
static ssize_t
fi_ibv_rdm_process_event(struct rdma_cm_event *event,
			 struct fi_ibv_rdm_ep *ep)
{
	ssize_t ret = FI_SUCCESS;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		ret = fi_ibv_rdm_process_addr_resolved(event->id, ep);
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ret = fi_ibv_rdm_process_route_resolved(event, ep);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		ret = fi_ibv_rdm_process_event_established(event, ep);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
		ret = fi_ibv_rdm_process_event_disconnected(ep, event);
		break;
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = fi_ibv_rdm_process_connect_request(event, ep);
		break;
	case RDMA_CM_EVENT_REJECTED:
		ret = fi_ibv_rdm_process_event_rejected(ep, event);
		break;
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		/* Benign teardown notification; nothing to do. */
		ret = FI_SUCCESS;
		break;
	/* All cases below fall to default case to print error message*/
	case RDMA_CM_EVENT_ADDR_ERROR:
		ret = -FI_EADDRNOTAVAIL;
		/* fallthrough */
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ret = (ret == FI_SUCCESS) ? -FI_EHOSTUNREACH : ret;
		/* fallthrough */
	case RDMA_CM_EVENT_CONNECT_ERROR:
		ret = (ret == FI_SUCCESS) ? -FI_ECONNREFUSED : ret;
		/* fallthrough */
	case RDMA_CM_EVENT_UNREACHABLE:
		ret = (ret == FI_SUCCESS) ? -FI_EADDRNOTAVAIL : ret;
		/* fallthrough */
	default:
		VERBS_INFO(FI_LOG_AV, "got unexpected rdmacm event, %s\n",
			   rdma_event_str(event->event));
		ret = (ret == FI_SUCCESS) ? -FI_ECONNABORTED : ret;
		break;
	}

	return ret;
}
static inline ssize_t fi_ibv_rdm_batch_repost_receives(struct fi_ibv_rdm_tagged_conn *conn, struct fi_ibv_rdm_ep *ep, int num_to_post) { const size_t idx = (conn->cm_role == FI_VERBS_CM_SELF) ? 1 : 0; struct ibv_recv_wr *bad_wr = NULL; struct ibv_recv_wr wr[num_to_post]; struct ibv_sge sge[num_to_post]; int last = num_to_post - 1; int i; /* IBV_WR_SEND opcode specific */ assert((num_to_post % ep->n_buffs) == 0); assert(ep->topcode == IBV_WR_SEND || ep->topcode == IBV_WR_RDMA_WRITE_WITH_IMM); if (ep->topcode == IBV_WR_SEND) { for (i = 0; i < num_to_post; i++) { sge[i].addr = (uint64_t)(void *) fi_ibv_rdm_get_rbuf(conn, ep, i % ep->n_buffs); sge[i].length = FI_IBV_RDM_DFLT_BUFFER_SIZE; sge[i].lkey = conn->r_mr->lkey; } } for (i = 0; i < num_to_post; i++) { wr[i].wr_id = (uintptr_t) conn; wr[i].next = &wr[i + 1]; wr[i].sg_list = &sge[i]; wr[i].num_sge = 1; } wr[last].next = NULL; if (ibv_post_recv(conn->qp[idx], wr, &bad_wr) == 0) { conn->recv_preposted += num_to_post; return num_to_post; } VERBS_INFO(FI_LOG_EP_DATA, "Failed to post recv\n"); return -FI_ENOMEM; }
static ssize_t fi_ibv_rdm_process_event_disconnected(struct fi_ibv_rdm_ep *ep, struct rdma_cm_event *event) { struct fi_ibv_rdm_tagged_conn *conn = event->id->context; ep->num_active_conns--; if (conn->state == FI_VERBS_CONN_ESTABLISHED) { conn->state = FI_VERBS_CONN_REMOTE_DISCONNECT; } else { assert(conn->state == FI_VERBS_CONN_LOCAL_DISCONNECT); conn->state = FI_VERBS_CONN_CLOSED; } VERBS_INFO(FI_LOG_AV, "Disconnected from conn %p, addr %s:%u\n", conn, inet_ntoa(conn->addr.sin_addr), ntohs(conn->addr.sin_port)); if (conn->state == FI_VERBS_CONN_CLOSED) { return fi_ibv_rdm_conn_cleanup(conn); } return FI_SUCCESS; }
/* Builds a list of interfaces that correspond to active verbs devices */ static int fi_ibv_getifaddrs(struct dlist_entry *verbs_devs) { struct ifaddrs *ifaddr, *ifa; char name[INET6_ADDRSTRLEN]; struct rdma_addrinfo *rai; struct rdma_cm_id *id; const char *ret_ptr; int ret, num_verbs_ifs = 0; char *iface = NULL; size_t iface_len = 0; int exact_match = 0; ret = getifaddrs(&ifaddr); if (ret) { VERBS_WARN(FI_LOG_FABRIC, "Unable to get interface addresses\n"); return ret; } /* select best iface name based on user's input */ if (fi_param_get_str(&fi_ibv_prov, "iface", &iface) == FI_SUCCESS) { iface_len = strlen(iface); if (iface_len > IFNAMSIZ) { VERBS_INFO(FI_LOG_EP_CTRL, "Too long iface name: %s, max: %d\n", iface, IFNAMSIZ); return -FI_EINVAL; } for (ifa = ifaddr; ifa && !exact_match; ifa = ifa->ifa_next) exact_match = !strcmp(ifa->ifa_name, iface); } for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { if (!ifa->ifa_addr || !(ifa->ifa_flags & IFF_UP) || !strcmp(ifa->ifa_name, "lo")) continue; if(iface) { if(exact_match) { if(strcmp(ifa->ifa_name, iface)) continue; } else { if(strncmp(ifa->ifa_name, iface, iface_len)) continue; } } switch (ifa->ifa_addr->sa_family) { case AF_INET: ret_ptr = inet_ntop(AF_INET, &ofi_sin_addr(ifa->ifa_addr), name, INET6_ADDRSTRLEN); break; case AF_INET6: ret_ptr = inet_ntop(AF_INET6, &ofi_sin6_addr(ifa->ifa_addr), name, INET6_ADDRSTRLEN); break; default: continue; } if (!ret_ptr) { VERBS_WARN(FI_LOG_FABRIC, "inet_ntop failed: %s(%d)\n", strerror(errno), errno); goto err1; } ret = fi_ibv_create_ep(name, NULL, FI_NUMERICHOST | FI_SOURCE, NULL, &rai, &id); if (ret) continue; ret = fi_ibv_add_rai(verbs_devs, id, rai); if (ret) goto err2; VERBS_DBG(FI_LOG_FABRIC, "Found active interface for verbs device: " "%s with address: %s\n", ibv_get_device_name(id->verbs->device), name); rdma_destroy_ep(id); num_verbs_ifs++; } freeifaddrs(ifaddr); return num_verbs_ifs ? 
0 : -FI_ENODATA; err2: rdma_destroy_ep(id); err1: fi_ibv_verbs_devs_free(verbs_devs); freeifaddrs(ifaddr); return ret; }
int fi_ibv_eq_open(struct fid_fabric *fabric, struct fi_eq_attr *attr, struct fid_eq **eq, void *context) { struct fi_ibv_eq *_eq; struct epoll_event event; int ret; _eq = calloc(1, sizeof *_eq); if (!_eq) return -ENOMEM; _eq->fab = container_of(fabric, struct fi_ibv_fabric, util_fabric.fabric_fid); fastlock_init(&_eq->lock); ret = dlistfd_head_init(&_eq->list_head); if (ret) { VERBS_INFO(FI_LOG_EQ, "Unable to initialize dlistfd\n"); goto err1; } _eq->epfd = epoll_create1(0); if (_eq->epfd < 0) { ret = -errno; goto err2; } memset(&event, 0, sizeof(event)); event.events = EPOLLIN; if (epoll_ctl(_eq->epfd, EPOLL_CTL_ADD, _eq->list_head.signal.fd[FI_READ_FD], &event)) { ret = -errno; goto err3; } switch (attr->wait_obj) { case FI_WAIT_NONE: case FI_WAIT_UNSPEC: case FI_WAIT_FD: _eq->channel = rdma_create_event_channel(); if (!_eq->channel) { ret = -errno; goto err3; } ret = fi_fd_nonblock(_eq->channel->fd); if (ret) goto err4; if (epoll_ctl(_eq->epfd, EPOLL_CTL_ADD, _eq->channel->fd, &event)) { ret = -errno; goto err4; } break; default: ret = -FI_ENOSYS; goto err1; } _eq->flags = attr->flags; _eq->eq_fid.fid.fclass = FI_CLASS_EQ; _eq->eq_fid.fid.context = context; _eq->eq_fid.fid.ops = &fi_ibv_eq_fi_ops; _eq->eq_fid.ops = &fi_ibv_eq_ops; *eq = &_eq->eq_fid; return 0; err4: if (_eq->channel) rdma_destroy_event_channel(_eq->channel); err3: close(_eq->epfd); err2: dlistfd_head_free(&_eq->list_head); err1: fastlock_destroy(&_eq->lock); free(_eq); return ret; }
static int fi_ibv_mr_reg(struct fid *fid, const void *buf, size_t len, uint64_t access, uint64_t offset, uint64_t requested_key, uint64_t flags, struct fid_mr **mr, void *context) { struct fi_ibv_mem_desc *md; int fi_ibv_access = 0; struct fid_domain *domain; if (flags) return -FI_EBADFLAGS; if (fid->fclass != FI_CLASS_DOMAIN) { return -FI_EINVAL; } domain = container_of(fid, struct fid_domain, fid); md = calloc(1, sizeof *md); if (!md) return -FI_ENOMEM; md->domain = container_of(domain, struct fi_ibv_domain, domain_fid); md->mr_fid.fid.fclass = FI_CLASS_MR; md->mr_fid.fid.context = context; md->mr_fid.fid.ops = &fi_ibv_mr_ops; /* Enable local write access by default for FI_EP_RDM which hides local * registration requirements. This allows to avoid buffering or double * registration */ if (!(md->domain->info->caps & FI_LOCAL_MR)) fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE; /* Local read access to an MR is enabled by default in verbs */ if (access & FI_RECV) fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE; /* iWARP spec requires Remote Write access for an MR that is used * as a data sink for a Remote Read */ if (access & FI_READ) { fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE; if (md->domain->verbs->device->transport_type == IBV_TRANSPORT_IWARP) fi_ibv_access |= IBV_ACCESS_REMOTE_WRITE; } if (access & FI_WRITE) fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE; if (access & FI_REMOTE_READ) fi_ibv_access |= IBV_ACCESS_REMOTE_READ; /* Verbs requires Local Write access too for Remote Write access */ if (access & FI_REMOTE_WRITE) fi_ibv_access |= IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC; md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len, fi_ibv_access); if (!md->mr) goto err; md->mr_fid.mem_desc = (void *) (uintptr_t) md->mr->lkey; md->mr_fid.key = md->mr->rkey; *mr = &md->mr_fid; if(md->domain->eq && (md->domain->eq_flags & FI_REG_MR)) { struct fi_eq_entry entry = { .fid = &md->mr_fid.fid, .context = context }; fi_ibv_eq_write_event(md->domain->eq, 
FI_MR_COMPLETE, &entry, sizeof(entry)); } return 0; err: free(md); return -errno; } static int fi_ibv_mr_regv(struct fid *fid, const struct iovec * iov, size_t count, uint64_t access, uint64_t offset, uint64_t requested_key, uint64_t flags, struct fid_mr **mr, void *context) { if (count > VERBS_MR_IOV_LIMIT) { VERBS_WARN(FI_LOG_FABRIC, "iov count > %d not supported\n", VERBS_MR_IOV_LIMIT); return -FI_EINVAL; } return fi_ibv_mr_reg(fid, iov->iov_base, iov->iov_len, access, offset, requested_key, flags, mr, context); } static int fi_ibv_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr, uint64_t flags, struct fid_mr **mr) { return fi_ibv_mr_regv(fid, attr->mr_iov, attr->iov_count, attr->access, 0, attr->requested_key, flags, mr, attr->context); } static int fi_ibv_domain_bind(struct fid *fid, struct fid *bfid, uint64_t flags) { struct fi_ibv_domain *domain; struct fi_ibv_eq *eq; domain = container_of(fid, struct fi_ibv_domain, domain_fid.fid); switch (bfid->fclass) { case FI_CLASS_EQ: eq = container_of(bfid, struct fi_ibv_eq, eq_fid); domain->eq = eq; domain->eq_flags = flags; break; default: return -EINVAL; } return 0; } static int fi_ibv_domain_close(fid_t fid) { struct fi_ibv_domain *domain; int ret; domain = container_of(fid, struct fi_ibv_domain, domain_fid.fid); if (domain->rdm) { rdma_destroy_ep(domain->rdm_cm->listener); free(domain->rdm_cm); } if (domain->pd) { ret = ibv_dealloc_pd(domain->pd); if (ret) return -ret; domain->pd = NULL; } fi_freeinfo(domain->info); free(domain); return 0; } static int fi_ibv_open_device_by_name(struct fi_ibv_domain *domain, const char *name) { struct ibv_context **dev_list; int i, ret = -FI_ENODEV; if (!name) return -FI_EINVAL; dev_list = rdma_get_devices(NULL); if (!dev_list) return -errno; for (i = 0; dev_list[i] && ret; i++) { if (domain->rdm) { ret = strncmp(name, ibv_get_device_name(dev_list[i]->device), strlen(name) - strlen(verbs_rdm_domain.suffix)); } else { ret = strcmp(name, 
ibv_get_device_name(dev_list[i]->device)); } if (!ret) domain->verbs = dev_list[i]; } rdma_free_devices(dev_list); return ret; } static struct fi_ops fi_ibv_fid_ops = { .size = sizeof(struct fi_ops), .close = fi_ibv_domain_close, .bind = fi_ibv_domain_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; static struct fi_ops_mr fi_ibv_domain_mr_ops = { .size = sizeof(struct fi_ops_mr), .reg = fi_ibv_mr_reg, .regv = fi_ibv_mr_regv, .regattr = fi_ibv_mr_regattr, }; static struct fi_ops_domain fi_ibv_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = fi_no_av_open, .cq_open = fi_ibv_cq_open, .endpoint = fi_ibv_open_ep, .scalable_ep = fi_no_scalable_ep, .cntr_open = fi_no_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, .srx_ctx = fi_ibv_srq_context, }; static struct fi_ops_domain fi_ibv_rdm_domain_ops = { .size = sizeof(struct fi_ops_domain), .av_open = fi_ibv_rdm_av_open, .cq_open = fi_ibv_rdm_cq_open, .endpoint = fi_ibv_rdm_open_ep, .scalable_ep = fi_no_scalable_ep, .cntr_open = fi_rbv_rdm_cntr_open, .poll_open = fi_no_poll_open, .stx_ctx = fi_no_stx_context, .srx_ctx = fi_no_srx_context, }; static int fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info, struct fid_domain **domain, void *context) { struct fi_ibv_domain *_domain; struct fi_ibv_fabric *fab; struct fi_info *fi; int ret; fi = fi_ibv_get_verbs_info(info->domain_attr->name); if (!fi) return -FI_EINVAL; fab = container_of(fabric, struct fi_ibv_fabric, util_fabric.fabric_fid); ret = ofi_check_domain_attr(&fi_ibv_prov, fabric->api_version, fi->domain_attr, info->domain_attr); if (ret) return ret; _domain = calloc(1, sizeof *_domain); if (!_domain) return -FI_ENOMEM; _domain->info = fi_dupinfo(info); if (!_domain->info) goto err1; _domain->rdm = FI_IBV_EP_TYPE_IS_RDM(info); if (_domain->rdm) { _domain->rdm_cm = calloc(1, sizeof(*_domain->rdm_cm)); if (!_domain->rdm_cm) { ret = -FI_ENOMEM; goto err2; } } ret = fi_ibv_open_device_by_name(_domain, 
info->domain_attr->name); if (ret) goto err2; _domain->pd = ibv_alloc_pd(_domain->verbs); if (!_domain->pd) { ret = -errno; goto err2; } _domain->domain_fid.fid.fclass = FI_CLASS_DOMAIN; _domain->domain_fid.fid.context = context; _domain->domain_fid.fid.ops = &fi_ibv_fid_ops; _domain->domain_fid.mr = &fi_ibv_domain_mr_ops; if (_domain->rdm) { _domain->domain_fid.ops = &fi_ibv_rdm_domain_ops; _domain->rdm_cm->ec = rdma_create_event_channel(); if (!_domain->rdm_cm->ec) { VERBS_INFO(FI_LOG_EP_CTRL, "Failed to create listener event channel: %s\n", strerror(errno)); ret = -FI_EOTHER; goto err2; } if (fi_fd_nonblock(_domain->rdm_cm->ec->fd) != 0) { VERBS_INFO_ERRNO(FI_LOG_EP_CTRL, "fcntl", errno); ret = -FI_EOTHER; goto err3; } if (rdma_create_id(_domain->rdm_cm->ec, &_domain->rdm_cm->listener, NULL, RDMA_PS_TCP)) { VERBS_INFO(FI_LOG_EP_CTRL, "Failed to create cm listener: %s\n", strerror(errno)); ret = -FI_EOTHER; goto err3; } _domain->rdm_cm->is_bound = 0; } else { _domain->domain_fid.ops = &fi_ibv_domain_ops; } _domain->fab = fab; *domain = &_domain->domain_fid; return 0; err3: if (_domain->rdm) rdma_destroy_event_channel(_domain->rdm_cm->ec); err2: if (_domain->rdm) free(_domain->rdm_cm); fi_freeinfo(_domain->info); err1: free(_domain); return ret; } static int fi_ibv_trywait(struct fid_fabric *fabric, struct fid **fids, int count) { struct fi_ibv_cq *cq; int ret, i; for (i = 0; i < count; i++) { switch (fids[i]->fclass) { case FI_CLASS_CQ: cq = container_of(fids[i], struct fi_ibv_cq, cq_fid.fid); ret = cq->trywait(fids[i]); if (ret) return ret; break; case FI_CLASS_EQ: /* We are always ready to wait on an EQ since * rdmacm EQ is based on an fd */ continue; case FI_CLASS_CNTR: case FI_CLASS_WAIT: return -FI_ENOSYS; default: return -FI_EINVAL; } } return FI_SUCCESS; } static int fi_ibv_fabric_close(fid_t fid) { struct fi_ibv_fabric *fab; int ret; fab = container_of(fid, struct fi_ibv_fabric, util_fabric.fabric_fid.fid); ret = ofi_fabric_close(&fab->util_fabric); if 
(ret) return ret; free(fab); return 0; } static struct fi_ops fi_ibv_fi_ops = { .size = sizeof(struct fi_ops), .close = fi_ibv_fabric_close, .bind = fi_no_bind, .control = fi_no_control, .ops_open = fi_no_ops_open, }; static struct fi_ops_fabric fi_ibv_ops_fabric = { .size = sizeof(struct fi_ops_fabric), .domain = fi_ibv_domain, .passive_ep = fi_ibv_passive_ep, .eq_open = fi_ibv_eq_open, .wait_open = fi_no_wait_open, .trywait = fi_ibv_trywait }; int fi_ibv_fabric(struct fi_fabric_attr *attr, struct fid_fabric **fabric, void *context) { struct fi_ibv_fabric *fab; struct fi_info *info; int ret; ret = fi_ibv_init_info(); if (ret) return ret; fab = calloc(1, sizeof(*fab)); if (!fab) return -FI_ENOMEM; for (info = verbs_info; info; info = info->next) { ret = ofi_fabric_init(&fi_ibv_prov, info->fabric_attr, attr, &fab->util_fabric, context); if (ret != -FI_ENODATA) break; } if (ret) { free(fab); return ret; } *fabric = &fab->util_fabric.fabric_fid; (*fabric)->fid.ops = &fi_ibv_fi_ops; (*fabric)->ops = &fi_ibv_ops_fabric; return 0; }
int fi_ibv_check_ep_attr(const struct fi_ep_attr *attr, const struct fi_info *info) { if ((attr->type != FI_EP_UNSPEC) && (attr->type != info->ep_attr->type)) { VERBS_INFO(FI_LOG_CORE, "Unsupported endpoint type\n"); return -FI_ENODATA; } switch (attr->protocol) { case FI_PROTO_UNSPEC: case FI_PROTO_RDMA_CM_IB_RC: case FI_PROTO_IWARP: case FI_PROTO_IB_UD: case FI_PROTO_IB_RDM: case FI_PROTO_IWARP_RDM: break; default: VERBS_INFO(FI_LOG_CORE, "Unsupported protocol\n"); return -FI_ENODATA; } if (attr->protocol_version > 1) { VERBS_INFO(FI_LOG_CORE, "Unsupported protocol version\n"); return -FI_ENODATA; } if (attr->max_msg_size > info->ep_attr->max_msg_size) { VERBS_INFO(FI_LOG_CORE, "Max message size too large\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr, max_msg_size); return -FI_ENODATA; } if (attr->max_order_raw_size > info->ep_attr->max_order_raw_size) { VERBS_INFO( FI_LOG_CORE, "max_order_raw_size exceeds supported size\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr, max_order_raw_size); return -FI_ENODATA; } if (attr->max_order_war_size) { VERBS_INFO(FI_LOG_CORE, "max_order_war_size exceeds supported size\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr, max_order_war_size); return -FI_ENODATA; } if (attr->max_order_waw_size > info->ep_attr->max_order_waw_size) { VERBS_INFO(FI_LOG_CORE, "max_order_waw_size exceeds supported size\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr, max_order_waw_size); return -FI_ENODATA; } if (attr->tx_ctx_cnt > info->domain_attr->max_ep_tx_ctx) { VERBS_INFO(FI_LOG_CORE, "tx_ctx_cnt exceeds supported size\n"); VERBS_INFO(FI_LOG_CORE, "Supported: %zd\nRequested: %zd\n", info->domain_attr->max_ep_tx_ctx, attr->tx_ctx_cnt); return -FI_ENODATA; } if ((attr->rx_ctx_cnt > info->domain_attr->max_ep_rx_ctx) && (attr->rx_ctx_cnt != FI_SHARED_CONTEXT)) { VERBS_INFO(FI_LOG_CORE, "rx_ctx_cnt exceeds supported size\n"); VERBS_INFO(FI_LOG_CORE, "Supported: %zd\nRequested: %zd\n", 
info->domain_attr->max_ep_rx_ctx, attr->rx_ctx_cnt); return -FI_ENODATA; } if (attr->auth_key_size && (attr->auth_key_size != info->ep_attr->auth_key_size)) { VERBS_INFO(FI_LOG_CORE, "Unsupported authentification size."); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->ep_attr, attr, auth_key_size); return -FI_ENODATA; } return 0; }
int fi_ibv_query_atomic(struct fid_domain *domain_fid, enum fi_datatype datatype, enum fi_op op, struct fi_atomic_attr *attr, uint64_t flags) { struct fi_ibv_domain *domain = container_of(domain_fid, struct fi_ibv_domain, domain_fid); char *log_str_fetch = "fi_fetch_atomic with FI_SUM op"; char *log_str_comp = "fi_compare_atomic"; char *log_str; if (flags & FI_TAGGED) return -FI_ENOSYS; if ((flags & FI_FETCH_ATOMIC) && (flags & FI_COMPARE_ATOMIC)) return -FI_EBADFLAGS; if (!flags) { switch (op) { case FI_ATOMIC_WRITE: break; default: return -FI_ENOSYS; } } else { if (flags & FI_FETCH_ATOMIC) { switch (op) { case FI_ATOMIC_READ: goto check_datatype; case FI_SUM: log_str = log_str_fetch; break; default: return -FI_ENOSYS; } } else if (flags & FI_COMPARE_ATOMIC) { if (op != FI_CSWAP) return -FI_ENOSYS; log_str = log_str_comp; } else { return -FI_EBADFLAGS; } if (domain->info->tx_attr->op_flags & FI_INJECT) { VERBS_INFO(FI_LOG_EP_DATA, "FI_INJECT not supported for %s\n", log_str); return -FI_EINVAL; } } check_datatype: switch (datatype) { case FI_INT64: case FI_UINT64: #if __BITS_PER_LONG == 64 case FI_DOUBLE: case FI_FLOAT: #endif break; default: return -FI_EINVAL; } attr->size = fi_datatype_size(datatype); if (attr->size == 0) return -FI_EINVAL; attr->count = 1; return 0; }
int fi_ibv_check_rx_attr(const struct fi_rx_attr *attr, const struct fi_info *hints, const struct fi_info *info) { uint64_t compare_mode, check_mode; int rm_enabled; if (attr->caps & ~(info->rx_attr->caps)) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->caps not supported\n"); return -FI_ENODATA; } compare_mode = attr->mode ? attr->mode : hints->mode; check_mode = (hints->caps & FI_RMA) ? info->rx_attr->mode : (info->rx_attr->mode & ~FI_RX_CQ_DATA); if ((compare_mode & check_mode) != check_mode) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->mode not supported\n"); FI_INFO_MODE(&fi_ibv_prov, check_mode, compare_mode); return -FI_ENODATA; } if (attr->op_flags & ~(info->rx_attr->op_flags)) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->op_flags not supported\n"); return -FI_ENODATA; } if (attr->msg_order & ~(info->rx_attr->msg_order)) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->msg_order not supported\n"); return -FI_ENODATA; } if (attr->size > info->rx_attr->size) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->size is greater than supported\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->rx_attr, attr, size); return -FI_ENODATA; } rm_enabled =(info->domain_attr && info->domain_attr->resource_mgmt == FI_RM_ENABLED); if (!rm_enabled && (attr->total_buffered_recv > info->rx_attr->total_buffered_recv)) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->total_buffered_recv " "exceeds supported size\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->rx_attr, attr, total_buffered_recv); return -FI_ENODATA; } if (attr->iov_limit > info->rx_attr->iov_limit) { VERBS_INFO(FI_LOG_CORE, "Given rx_attr->iov_limit greater than supported\n"); FI_INFO_CHECK_VAL(&fi_ibv_prov, info->rx_attr, attr, iov_limit); return -FI_ENODATA; } return 0; }
static int fi_ibv_rdm_find_sysaddrs(struct fi_ibv_rdm_sysaddr *iface_addr, struct fi_ibv_rdm_sysaddr *lo_addr) { struct ifaddrs *ifaddr, *ifa; char iface[IFNAMSIZ]; char *iface_tmp = "ib"; size_t iface_len = 2; int ret; if (!iface_addr || !lo_addr) { return -FI_EINVAL; } iface_addr->is_found = 0; lo_addr->is_found = 0; if (fi_param_get_str(&fi_ibv_prov, "iface", &iface_tmp) == FI_SUCCESS) { iface_len = strlen(iface_tmp); if (iface_len > IFNAMSIZ) { VERBS_INFO(FI_LOG_EP_CTRL, "Too long iface name: %s, max: %d\n", iface_tmp, IFNAMSIZ); return -FI_EINVAL; } } strncpy(iface, iface_tmp, iface_len); ret = getifaddrs(&ifaddr); if (ret) { FI_WARN(&fi_ibv_prov, FI_LOG_FABRIC, "Unable to get interface addresses\n"); return ret; } for (ifa = ifaddr; ifa; ifa = ifa->ifa_next) { if (!iface_addr->is_found && (ifa->ifa_addr->sa_family == AF_INET) && !strncmp(ifa->ifa_name, iface, iface_len)) { memcpy(&iface_addr->addr, ifa->ifa_addr, sizeof(iface_addr->addr)); iface_addr->is_found = 1; FI_INFO(&fi_ibv_prov, FI_LOG_FABRIC, "iface addr %s:%u\n", inet_ntoa(iface_addr->addr.sin_addr), ntohs(iface_addr->addr.sin_port)); } if (!lo_addr->is_found && (ifa->ifa_addr->sa_family == AF_INET) && !strncmp(ifa->ifa_name, "lo", strlen(ifa->ifa_name))) { memcpy(&lo_addr->addr, ifa->ifa_addr, sizeof(lo_addr->addr)); lo_addr->is_found = 1; FI_INFO(&fi_ibv_prov, FI_LOG_FABRIC, "lo addr %s:%u\n", inet_ntoa(lo_addr->addr.sin_addr), ntohs(lo_addr->addr.sin_port)); } if (iface_addr->is_found && lo_addr->is_found) { break; } } freeifaddrs(ifaddr); return 0; }
/*
 * Handle an incoming RDMA_CM_EVENT_CONNECT_REQUEST for an RDM endpoint.
 *
 * The peer's address travels in the request's private data and is used to
 * look up (or allocate) the matching connection object.  Depending on the
 * connection's CM role the request is either rejected (we are the active
 * side and will connect outward ourselves), or accepted after the QP,
 * buffers, and receives for the appropriate id/qp slot are set up.
 *
 * Returns FI_SUCCESS or a negative error value; on setup failure the
 * connection's resources are torn down via fi_ibv_rdm_conn_cleanup().
 */
static ssize_t
fi_ibv_rdm_process_connect_request(struct rdma_cm_event *event,
				   struct fi_ibv_rdm_ep *ep)
{
	struct ibv_qp_init_attr qp_attr;
	struct rdma_conn_param cm_params;
	struct fi_ibv_rdm_tagged_conn *conn = NULL;
	struct rdma_cm_id *id = event->id;
	ssize_t ret = FI_SUCCESS;
	/* Peer address packed into the request's private data. */
	char *p = (char *) event->param.conn.private_data;

	/* Endpoint is shutting down: refuse new connections outright. */
	if (ep->is_closing) {
		int rej_message = 0xdeadbeef;
		if (rdma_reject(id, &rej_message, sizeof(int))) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_reject\n", errno);
			ret = -errno;
			if (rdma_destroy_id(id)) {
				VERBS_INFO_ERRNO(FI_LOG_AV,
						 "rdma_destroy_id\n", errno);
				ret = (ret == FI_SUCCESS) ? -errno : ret;
			}
		}
		assert(ret == FI_SUCCESS);
		return ret;
	}

	/* Look up an existing connection for this peer address. */
	HASH_FIND(hh, fi_ibv_rdm_tagged_conn_hash, p,
		  FI_IBV_RDM_DFLT_ADDRLEN, conn);

	if (!conn) {
		/* First contact from this peer: allocate and register a
		 * new aligned, zeroed connection object. */
		conn = memalign(FI_IBV_RDM_MEM_ALIGNMENT, sizeof(*conn));
		if (!conn)
			return -FI_ENOMEM;

		memset(conn, 0, sizeof(struct fi_ibv_rdm_tagged_conn));
		conn->state = FI_VERBS_CONN_ALLOCATED;
		dlist_init(&conn->postponed_requests_head);
		fi_ibv_rdm_unpack_cm_params(&event->param.conn, conn, ep);
		fi_ibv_rdm_conn_init_cm_role(conn, ep);

		FI_INFO(&fi_ibv_prov, FI_LOG_AV,
			"CONN REQUEST, NOT found in hash, new conn %p %d, addr %s:%u, HASH ADD\n",
			conn, conn->cm_role,
			inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port));

		HASH_ADD(hh, fi_ibv_rdm_tagged_conn_hash, addr,
			 FI_IBV_RDM_DFLT_ADDRLEN, conn);
	} else {
		if (conn->cm_role != FI_VERBS_CM_ACTIVE) {
			/*
			 * Do it before rdma_create_qp since that call would
			 * modify event->param.conn.private_data buffer
			 */
			fi_ibv_rdm_unpack_cm_params(&event->param.conn,
						    conn, ep);
		}

		FI_INFO(&fi_ibv_prov, FI_LOG_AV,
			"CONN REQUEST, FOUND in hash, conn %p %d, addr %s:%u\n",
			conn, conn->cm_role,
			inet_ntoa(conn->addr.sin_addr),
			ntohs(conn->addr.sin_port));
	}

	if (conn->cm_role == FI_VERBS_CM_ACTIVE) {
		/* We are the active side: reject the peer's request and
		 * initiate (or continue) our own outgoing connection. */
		int rej_message = 0xdeadbeef;
		if (rdma_reject(id, &rej_message, sizeof(rej_message))) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_reject\n", errno);
			ret = -errno;
			if (rdma_destroy_id(id)) {
				VERBS_INFO_ERRNO(FI_LOG_AV,
						 "rdma_destroy_id\n", errno);
				ret = (ret == FI_SUCCESS) ? -errno : ret;
			}
		}
		if (conn->state == FI_VERBS_CONN_ALLOCATED) {
			ret = fi_ibv_rdm_start_connection(ep, conn);
			if (ret != FI_SUCCESS)
				goto err;
		}
	} else {
		assert(conn->state == FI_VERBS_CONN_ALLOCATED ||
		       conn->state == FI_VERBS_CONN_STARTED);
		/* Passive side uses slot 0; the FI_VERBS_CM_SELF loopback
		 * case keeps slot 0 for its own id and accepts on slot 1. */
		const size_t idx = (conn->cm_role == FI_VERBS_CM_PASSIVE) ? 0 : 1;

		conn->state = FI_VERBS_CONN_STARTED;

		assert (conn->id[idx] == NULL);
		conn->id[idx] = id;

		ret = fi_ibv_rdm_prepare_conn_memory(ep, conn);
		if (ret != FI_SUCCESS)
			goto err;

		fi_ibv_rdm_tagged_init_qp_attributes(&qp_attr, ep);
		if (rdma_create_qp(id, ep->domain->pd, &qp_attr)) {
			ret = -errno;
			goto err;
		}
		conn->qp[idx] = id->qp;
		/* Pre-post the full receive depth before accepting. */
		ret = fi_ibv_rdm_repost_receives(conn, ep, ep->rq_wr_depth);
		if (ret < 0) {
			VERBS_INFO(FI_LOG_AV, "repost receives failed\n");
			goto err;
		} else {
			ret = FI_SUCCESS;
		}
		id->context = conn;

		fi_ibv_rdm_pack_cm_params(&cm_params, conn, ep);

		if (rdma_accept(id, &cm_params)) {
			VERBS_INFO_ERRNO(FI_LOG_AV, "rdma_accept\n", errno);
			ret = -errno;
			goto err;
		}
		if (cm_params.private_data) {
			free((void *) cm_params.private_data);
		}
	}
	return ret;
err:
	/* ret err code is already set here, just cleanup resources */
	fi_ibv_rdm_conn_cleanup(conn);
	return ret;
}
int fi_ibv_init_info(void) { struct ibv_context **ctx_list; struct fi_info *fi = NULL, *tail = NULL; int ret = 0, i, num_devices, fork_unsafe = 0; if (verbs_info) return 0; pthread_mutex_lock(&verbs_info_lock); if (verbs_info) goto unlock; if (!fi_ibv_have_device()) { VERBS_INFO(FI_LOG_FABRIC, "No RDMA devices found\n"); ret = -FI_ENODATA; goto unlock; } fi_param_get_bool(NULL, "fork_unsafe", &fork_unsafe); if (!fork_unsafe) { FI_INFO(&fi_ibv_prov, FI_LOG_CORE, "Enabling IB fork support\n"); ret = ibv_fork_init(); if (ret) { FI_WARN(&fi_ibv_prov, FI_LOG_CORE, "Enabling IB fork support failed: %s (%d)\n", strerror(ret), ret); goto unlock; } } else { FI_INFO(&fi_ibv_prov, FI_LOG_CORE, "Not enabling IB fork support\n"); } ctx_list = rdma_get_devices(&num_devices); if (!num_devices) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "rdma_get_devices", errno); ret = -errno; goto unlock; } for (i = 0; i < num_devices; i++) { ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_msg_domain); if (!ret) { if (!verbs_info) verbs_info = fi; else tail->next = fi; tail = fi; ret = fi_ibv_alloc_info(ctx_list[i], &fi, &verbs_rdm_domain); if (!ret) { tail->next = fi; tail = fi; } } } ret = verbs_info ? 0 : ret; rdma_free_devices(ctx_list); unlock: pthread_mutex_unlock(&verbs_info_lock); return ret; }
static int fi_ibv_alloc_info(struct ibv_context *ctx, struct fi_info **info, const struct verbs_ep_domain *ep_dom) { struct fi_info *fi; union ibv_gid gid; size_t name_len; int ret; int param; if (!(fi = fi_allocinfo())) return -FI_ENOMEM; fi->caps = ep_dom->caps; fi->handle = NULL; if (ep_dom->type == FI_EP_RDM) { fi->mode = VERBS_RDM_MODE; *(fi->tx_attr) = verbs_rdm_tx_attr; } else { *(fi->tx_attr) = verbs_tx_attr; } *(fi->rx_attr) = (ep_dom->type == FI_EP_RDM) ? verbs_rdm_rx_attr : verbs_rx_attr; *(fi->ep_attr) = verbs_ep_attr; *(fi->domain_attr) = verbs_domain_attr; if (ep_dom->type == FI_EP_RDM) fi->domain_attr->mr_mode &= ~FI_MR_LOCAL; *(fi->fabric_attr) = verbs_fabric_attr; fi->ep_attr->type = ep_dom->type; fi->tx_attr->caps = ep_dom->caps; fi->rx_attr->caps = ep_dom->caps; ret = fi_ibv_get_device_attrs(ctx, fi); if (ret) goto err; if (ep_dom->type == FI_EP_RDM) { fi->tx_attr->inject_size = FI_IBV_RDM_DFLT_BUFFERED_SSIZE; fi->tx_attr->iov_limit = 1; fi->tx_attr->rma_iov_limit = 1; if (!fi_param_get_int(&fi_ibv_prov, "rdm_buffer_size", ¶m)) { if (param > sizeof (struct fi_ibv_rdm_rndv_header)) { fi->tx_attr->inject_size = param; } else { VERBS_INFO(FI_LOG_CORE, "rdm_buffer_size too small, " "should be greater then %d\n", sizeof (struct fi_ibv_rdm_rndv_header)); ret = -FI_EINVAL; goto err; } } } switch (ctx->device->transport_type) { case IBV_TRANSPORT_IB: if(ibv_query_gid(ctx, 1, 0, &gid)) { VERBS_INFO_ERRNO(FI_LOG_FABRIC, "ibv_query_gid", errno); ret = -errno; goto err; } name_len = strlen(VERBS_IB_PREFIX) + INET6_ADDRSTRLEN; if (!(fi->fabric_attr->name = calloc(1, name_len + 1))) { ret = -FI_ENOMEM; goto err; } snprintf(fi->fabric_attr->name, name_len, VERBS_IB_PREFIX "%lx", gid.global.subnet_prefix); fi->ep_attr->protocol = (ep_dom == &verbs_msg_domain) ? 
FI_PROTO_RDMA_CM_IB_RC : FI_PROTO_IB_RDM; break; case IBV_TRANSPORT_IWARP: fi->fabric_attr->name = strdup(VERBS_IWARP_FABRIC); if (!fi->fabric_attr->name) { ret = -FI_ENOMEM; goto err; } if (ep_dom == &verbs_msg_domain) { fi->ep_attr->protocol = FI_PROTO_IWARP; fi->tx_attr->op_flags = VERBS_TX_OP_FLAGS_IWARP; } else { fi->ep_attr->protocol = FI_PROTO_IWARP_RDM; fi->tx_attr->op_flags = VERBS_TX_OP_FLAGS_IWARP_RDM; } break; default: VERBS_INFO(FI_LOG_CORE, "Unknown transport type\n"); ret = -FI_ENODATA; goto err; } name_len = strlen(ctx->device->name) + strlen(ep_dom->suffix); fi->domain_attr->name = malloc(name_len + 1); if (!fi->domain_attr->name) { ret = -FI_ENOMEM; goto err; } snprintf(fi->domain_attr->name, name_len + 1, "%s%s", ctx->device->name, ep_dom->suffix); fi->domain_attr->name[name_len] = '\0'; *info = fi; return 0; err: fi_freeinfo(fi); return ret; }