static void mlx5_free_context(struct verbs_device *device,
                              struct ibv_context *ibctx)
{
    struct mlx5_context *context = to_mctx(ibctx);
    int page_size = to_mdev(ibctx->device)->page_size;
    int i;
    struct mlx5_wc_uar *wc_uar;

    if (context->hca_core_clock)
        munmap(context->hca_core_clock - context->core_clock.offset,
               to_mdev(&device->device)->page_size);

    if (context->cc.buf)
        munmap(context->cc.buf, 4096 * context->num_ports);

    free(context->bfs);
    for (i = 0; i < MLX5_MAX_UAR_PAGES; ++i) {
        if (context->uar[i].regs)
            munmap(context->uar[i].regs, page_size);
    }

    if (context->max_ctx_res_domain) {
        mlx5_spin_lock(&context->send_db_lock);
        while (!list_empty(&context->wc_uar_list)) {
            wc_uar = list_entry(context->wc_uar_list.next,
                                struct mlx5_wc_uar, list);
            list_del(&wc_uar->list);
            free(wc_uar);
        }
        mlx5_spin_unlock(&context->send_db_lock);
    }

    close_debug_file(context);
}
int mlx4_destroy_xrc_srq(struct ibv_srq *srq)
{
    struct mlx4_context *mctx = to_mctx(srq->context);
    struct mlx4_srq *msrq = to_msrq(srq);
    struct mlx4_cq *mcq;
    int ret;

    mcq = to_mcq(msrq->verbs_srq.cq);
    mlx4_cq_clean(mcq, 0, msrq);
    pthread_spin_lock(&mcq->lock);
    mlx4_clear_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num);
    pthread_spin_unlock(&mcq->lock);

    ret = ibv_cmd_destroy_srq(srq);
    if (ret) {
        pthread_spin_lock(&mcq->lock);
        mlx4_store_xsrq(&mctx->xsrq_table, msrq->verbs_srq.srq_num, msrq);
        pthread_spin_unlock(&mcq->lock);
        return ret;
    }

    mlx4_free_db(mctx, MLX4_DB_TYPE_RQ, msrq->db);
    mlx4_free_buf(&msrq->buf);
    free(msrq->wrid);
    free(msrq);

    return 0;
}
int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited)
{
    struct mlx4_cq *cq = to_mcq(ibvcq);
    uint32_t doorbell[2];
    uint32_t sn;
    uint32_t ci;
    uint32_t cmd;

    sn  = cq->arm_sn & 3;
    ci  = cq->cons_index & 0xffffff;
    cmd = solicited ? MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT;

    *cq->arm_db = htonl(sn << 28 | cmd | ci);

    /*
     * Make sure that the doorbell record in host memory is
     * written before ringing the doorbell via PCI MMIO.
     */
    wmb();

    doorbell[0] = htonl(sn << 28 | cmd | cq->cqn);
    doorbell[1] = htonl(ci);

    mlx4_write64(doorbell, to_mctx(ibvcq->context), MLX4_CQ_DOORBELL);

    return 0;
}
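/*
 * Hedged, standalone illustration (not part of the driver) of how the arm
 * doorbell word above is packed: the 2-bit sequence number lands in bits
 * 31:28, the request-notify command in the middle bits, and the 24-bit
 * consumer index in bits 23:0, all converted to big endian for the HCA.
 * The command constant below is an assumed stand-in, not copied from the
 * mlx4 headers.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define EX_DB_REQ_NOT_SOL (1U << 24)    /* assumed stand-in for MLX4_CQ_DB_REQ_NOT_SOL */

int main(void)
{
    uint32_t sn = 2, ci = 0x000123, cmd = EX_DB_REQ_NOT_SOL;
    uint32_t db = htonl(sn << 28 | cmd | ci);

    /* prints the host-order value so the bit layout is visible */
    printf("doorbell word: 0x%08x\n", ntohl(db));
    return 0;
}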
int mlx5_alloc_srq_buf(struct ibv_context *context, struct mlx5_srq *srq)
{
    struct mlx5_wqe_srq_next_seg *next;
    int size;
    int buf_size;
    int i;
    struct mlx5_context *ctx;

    ctx = to_mctx(context);

    if (srq->max_gs < 0) {
        errno = EINVAL;
        return -1;
    }

    srq->wrid = malloc(srq->max * sizeof *srq->wrid);
    if (!srq->wrid)
        return -1;

    size = sizeof(struct mlx5_wqe_srq_next_seg) +
           srq->max_gs * sizeof(struct mlx5_wqe_data_seg);
    size = max(32, size);
    size = mlx5_round_up_power_of_two(size);
    /* the rounded WQE size is bounded by the max RQ descriptor size */
    if (size > ctx->max_rq_desc_sz) {
        errno = EINVAL;
        free(srq->wrid);    /* don't leak wrid on the error path */
        return -1;
    }
    srq->max_gs = (size - sizeof(struct mlx5_wqe_srq_next_seg)) /
                  sizeof(struct mlx5_wqe_data_seg);
    srq->wqe_shift = mlx5_ilog2(size);

    buf_size = srq->max * size;
    if (mlx5_alloc_buf(&srq->buf, buf_size,
                       to_mdev(context->device)->page_size)) {
        free(srq->wrid);
        return -1;
    }

    memset(srq->buf.buf, 0, buf_size);

    /*
     * Now initialize the SRQ buffer so that all of the WQEs are
     * linked into the list of free WQEs.
     */
    for (i = 0; i < srq->max; ++i) {
        next = get_wqe(srq, i);
        next->next_wqe_index = htobe16((i + 1) & (srq->max - 1));
    }

    srq->head = 0;
    srq->tail = srq->max - 1;

    return 0;
}
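/*
 * Hedged sketch of the two size helpers the allocator above relies on.
 * These mirror the obvious definitions (round up to a power of two, then
 * take its base-2 log); they are assumptions for illustration, not copies
 * of the mlx5 implementations.
 */
static int sketch_round_up_power_of_two(long long sz)
{
    long long ret = 1;

    while (ret < sz)
        ret <<= 1;
    return (int) ret;
}

static int sketch_ilog2(int n)
{
    int t = 0;

    /* n is expected to already be a power of two here */
    while ((1 << t) < n)
        ++t;
    return t;
}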
static void mlx4_free_context(struct ibv_context *ibctx)
{
    struct mlx4_context *context = to_mctx(ibctx);

    munmap(context->uar, to_mdev(ibctx->device)->page_size);
    if (context->bf_page)
        munmap(context->bf_page, to_mdev(ibctx->device)->page_size);
    free(context);
}
static void mthca_free_context(struct ibv_context *ibctx)
{
    struct mthca_context *context = to_mctx(ibctx);

    mthca_free_pd(context->pd);
    munmap(context->uar, to_mdev(ibctx->device)->page_size);
    mthca_free_db_tab(context->db_tab);
    free(context);
}
static void mlx4_uninit_context(struct verbs_device *v_device,
                                struct ibv_context *ibv_ctx)
{
    struct mlx4_context *context = to_mctx(ibv_ctx);

    munmap(context->uar, to_mdev(&v_device->device)->page_size);
    if (context->bfs.page)
        munmap(context->bfs.page, to_mdev(&v_device->device)->page_size);
    if (context->hca_core_clock)
        munmap(context->hca_core_clock - context->core_clk.offset,
               context->core_clk.offset + sizeof(context->core_clk.mask));
}
static void mlx5_free_context(struct verbs_device *device,
                              struct ibv_context *ibctx)
{
    struct mlx5_context *context = to_mctx(ibctx);
    int page_size = to_mdev(ibctx->device)->page_size;
    int i;

    free(context->bfs);
    for (i = 0; i < MLX5_MAX_UAR_PAGES; ++i) {
        if (context->uar[i])
            munmap(context->uar[i], page_size);
    }
    close_debug_file(context);
}
int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
                      enum ibv_qp_type type, struct mlx4_qp *qp)
{
    qp->rq.max_gs = cap->max_recv_sge;

    qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t));
    if (!qp->sq.wrid)
        return -1;

    if (qp->rq.wqe_cnt) {
        qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t));
        if (!qp->rq.wrid) {
            free(qp->sq.wrid);
            return -1;
        }
    }

    for (qp->rq.wqe_shift = 4;
         1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
         qp->rq.wqe_shift++)
        ; /* nothing */

    qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
                   (qp->sq.wqe_cnt << qp->sq.wqe_shift);
    if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
        qp->rq.offset = 0;
        qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
    } else {
        qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
        qp->sq.offset = 0;
    }

    if (mlx4_alloc_buf(to_mctx(pd->context), &qp->buf,
                       align(qp->buf_size,
                             to_mdev(pd->context->device)->page_size),
                       to_mdev(pd->context->device)->page_size)) {
        free(qp->sq.wrid);
        free(qp->rq.wrid);
        return -1;
    }

    memset(qp->buf.buf, 0, qp->buf_size);

    return 0;
}
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
                       enum ibv_qp_type type)
{
    int wqe_size;
    struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);

    wqe_size = min((1 << qp->sq.wqe_shift), MLX4_MAX_WQE_SIZE) -
               sizeof(struct mlx4_wqe_ctrl_seg);
    switch (type) {
    case IBV_QPT_UD:
        wqe_size -= sizeof(struct mlx4_wqe_datagram_seg);
        break;

    case IBV_QPT_UC:
    case IBV_QPT_RC:
    case IBV_QPT_XRC:
        wqe_size -= sizeof(struct mlx4_wqe_raddr_seg);
        break;

    default:
        break;
    }

    qp->sq.max_gs     = wqe_size / sizeof(struct mlx4_wqe_data_seg);
    cap->max_send_sge = min(ctx->max_sge, qp->sq.max_gs);
    qp->sq.max_post   = min(ctx->max_qp_wr,
                            qp->sq.wqe_cnt - qp->sq_spare_wqes);
    cap->max_send_wr  = qp->sq.max_post;

    /*
     * Inline data segments can't cross a 64 byte boundary.  So
     * subtract off one segment header for each 64-byte chunk,
     * taking into account the fact that wqe_size will be 32 mod
     * 64 for non-UD QPs.
     */
    qp->max_inline_data = wqe_size -
        sizeof(struct mlx4_wqe_inline_seg) *
        (align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
    cap->max_inline_data = qp->max_inline_data;
}
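/*
 * Hedged worked example (standalone, assumed constants) of the
 * max_inline_data formula above: with a 64-byte inline-align boundary and
 * a 4-byte inline segment header, wqe_size = 96 spans
 * align(96, 64) / 64 = 2 chunks, so one header is reserved per chunk and
 * max_inline_data = 96 - 2 * 4 = 88.
 */
#include <stdio.h>

#define EX_INLINE_ALIGN  64    /* assumed value of MLX4_INLINE_ALIGN */
#define EX_INLINE_SEG_SZ 4     /* assumed sizeof(struct mlx4_wqe_inline_seg) */
#define ex_align(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    int wqe_size = 96;
    int max_inline = wqe_size - EX_INLINE_SEG_SZ *
        (ex_align(wqe_size, EX_INLINE_ALIGN) / EX_INLINE_ALIGN);

    printf("max_inline_data = %d\n", max_inline);    /* prints 88 */
    return 0;
}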
static int mlx5_map_internal_clock(struct mlx5_device *mdev,
                                   struct ibv_context *ibv_ctx)
{
    struct mlx5_context *context = to_mctx(ibv_ctx);
    void *hca_clock_page;
    off_t offset = 0;

    set_command(MLX5_EXP_MMAP_GET_CORE_CLOCK_CMD, &offset);
    hca_clock_page = mmap(NULL, mdev->page_size, PROT_READ, MAP_SHARED,
                          ibv_ctx->cmd_fd, offset * mdev->page_size);
    if (hca_clock_page == MAP_FAILED) {
        fprintf(stderr, PFX
                "Warning: Timestamp available,\n"
                "but failed to mmap() hca core clock page.\n");
        return -1;
    }

    context->hca_core_clock = hca_clock_page + context->core_clock.offset;

    return 0;
}
struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context,
                                    struct ibv_srq_init_attr_ex *attr_ex)
{
    struct mlx4_create_xsrq cmd;
    struct mlx4_create_xsrq_resp resp;
    struct mlx4_srq *srq;
    int ret;

    /* Sanity check SRQ size before proceeding */
    if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64)
        return NULL;

    srq = calloc(1, sizeof *srq);
    if (!srq)
        return NULL;

    if (pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE))
        goto err;

    srq->max     = align_queue_size(attr_ex->attr.max_wr + 1);
    srq->max_gs  = attr_ex->attr.max_sge;
    srq->counter = 0;
    srq->ext_srq = 1;

    if (mlx4_alloc_srq_buf(attr_ex->pd, &attr_ex->attr, srq))
        goto err;

    srq->db = mlx4_alloc_db(to_mctx(context), MLX4_DB_TYPE_RQ);
    if (!srq->db)
        goto err_free;

    *srq->db = 0;

    cmd.buf_addr = (uintptr_t) srq->buf.buf;
    cmd.db_addr  = (uintptr_t) srq->db;

    ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq,
                                sizeof(srq->verbs_srq), attr_ex,
                                &cmd.ibv_cmd, sizeof cmd,
                                &resp.ibv_resp, sizeof resp);
    if (ret)
        goto err_db;

    ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table,
                          srq->verbs_srq.srq_num, srq);
    if (ret)
        goto err_destroy;

    return &srq->verbs_srq.srq;

err_destroy:
    ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
err_db:
    mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db);
err_free:
    free(srq->wrid);
    mlx4_free_buf(&srq->buf);
err:
    free(srq);
    return NULL;
}
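/*
 * Hedged usage sketch: how an application typically reaches the XRC path
 * above through the standard verbs API. Assumes ctx, pd, cq and xrcd were
 * already created; error handling is elided. The field names follow the
 * public ibv_srq_init_attr_ex definition; sizes are arbitrary examples.
 */
#include <infiniband/verbs.h>

static struct ibv_srq *create_xrc_srq_example(struct ibv_context *ctx,
                                              struct ibv_pd *pd,
                                              struct ibv_cq *cq,
                                              struct ibv_xrcd *xrcd)
{
    struct ibv_srq_init_attr_ex attr = {
        .attr      = { .max_wr = 1024, .max_sge = 1 },
        .comp_mask = IBV_SRQ_INIT_ATTR_TYPE | IBV_SRQ_INIT_ATTR_PD |
                     IBV_SRQ_INIT_ATTR_XRCD | IBV_SRQ_INIT_ATTR_CQ,
        .srq_type  = IBV_SRQT_XRC,
        .pd        = pd,
        .xrcd      = xrcd,
        .cq        = cq,
    };

    /* dispatches into the provider's create_srq_ex, e.g. the mlx4 one above */
    return ibv_create_srq_ex(ctx, &attr);
}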
static int mlx4_poll_one(struct mlx4_cq *cq,
                         struct mlx4_qp **cur_qp,
                         struct ibv_wc *wc)
{
    struct mlx4_wq *wq;
    struct mlx4_cqe *cqe;
    struct mlx4_srq *srq = NULL;
    uint32_t qpn;
    uint32_t srqn;
    uint32_t g_mlpath_rqpn;
    uint16_t wqe_index;
    int is_error;
    int is_send;

    cqe = next_cqe_sw(cq);
    if (!cqe)
        return CQ_EMPTY;

    ++cq->cons_index;

    VALGRIND_MAKE_MEM_DEFINED(cqe, sizeof *cqe);

    /*
     * Make sure we read CQ entry contents after we've checked the
     * ownership bit.
     */
    rmb();

    qpn = ntohl(cqe->my_qpn);

    is_send  = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
    is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
               MLX4_CQE_OPCODE_ERROR;

    if (qpn & MLX4_XRC_QPN_BIT && !is_send) {
        srqn = ntohl(cqe->g_mlpath_rqpn) & 0xffffff;
        /*
         * We do not have to take the XRC SRQ table lock here,
         * because CQs will be locked while XRC SRQs are removed
         * from the table.
         */
        srq = mlx4_find_xrc_srq(to_mctx(cq->ibv_cq.context), srqn);
        if (!srq)
            return CQ_POLL_ERR;
    } else if (!*cur_qp || (qpn & 0xffffff) != (*cur_qp)->ibv_qp.qp_num) {
        /*
         * We do not have to take the QP table lock here,
         * because CQs will be locked while QPs are removed
         * from the table.
         */
        *cur_qp = mlx4_find_qp(to_mctx(cq->ibv_cq.context),
                               qpn & 0xffffff);
        if (!*cur_qp)
            return CQ_POLL_ERR;
    }

    wc->qp_num = qpn & 0xffffff;

    if (is_send) {
        wq = &(*cur_qp)->sq;
        wqe_index = ntohs(cqe->wqe_index);
        wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
        wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
        ++wq->tail;
    } else if (srq) {
        wqe_index = ntohs(cqe->wqe_index);
        wc->wr_id = srq->wrid[wqe_index];
        mlx4_free_srq_wqe(srq, wqe_index);
    } else if ((*cur_qp)->ibv_qp.srq) {
        srq = to_msrq((*cur_qp)->ibv_qp.srq);
        wqe_index = ntohs(cqe->wqe_index);
        wc->wr_id = srq->wrid[wqe_index];
        mlx4_free_srq_wqe(srq, wqe_index);
    } else {
        wq = &(*cur_qp)->rq;
        wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
        ++wq->tail;
    }

    if (is_error) {
        mlx4_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
        return CQ_OK;
    }

    wc->status = IBV_WC_SUCCESS;

    if (is_send) {
        wc->wc_flags = 0;
        switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
        case MLX4_OPCODE_RDMA_WRITE_IMM:
            wc->wc_flags |= IBV_WC_WITH_IMM;
            /* fall through */
        case MLX4_OPCODE_RDMA_WRITE:
            wc->opcode = IBV_WC_RDMA_WRITE;
            break;
        case MLX4_OPCODE_SEND_IMM:
            wc->wc_flags |= IBV_WC_WITH_IMM;
            /* fall through */
        case MLX4_OPCODE_SEND:
            wc->opcode = IBV_WC_SEND;
            break;
        case MLX4_OPCODE_RDMA_READ:
            wc->opcode   = IBV_WC_RDMA_READ;
            wc->byte_len = ntohl(cqe->byte_cnt);
            break;
        case MLX4_OPCODE_ATOMIC_CS:
            wc->opcode   = IBV_WC_COMP_SWAP;
            wc->byte_len = 8;
            break;
        case MLX4_OPCODE_ATOMIC_FA:
            wc->opcode   = IBV_WC_FETCH_ADD;
            wc->byte_len = 8;
            break;
        case MLX4_OPCODE_BIND_MW:
            wc->opcode = IBV_WC_BIND_MW;
            break;
        default:
            /* assume it's a send completion */
            wc->opcode = IBV_WC_SEND;
            break;
        }
    } else {
        wc->byte_len = ntohl(cqe->byte_cnt);

        switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
        case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
            wc->opcode   = IBV_WC_RECV_RDMA_WITH_IMM;
            wc->wc_flags = IBV_WC_WITH_IMM;
            wc->imm_data = cqe->immed_rss_invalid;
            break;
        case MLX4_RECV_OPCODE_SEND:
            wc->opcode   = IBV_WC_RECV;
            wc->wc_flags = 0;
            break;
        case MLX4_RECV_OPCODE_SEND_IMM:
            wc->opcode   = IBV_WC_RECV;
            wc->wc_flags = IBV_WC_WITH_IMM;
            wc->imm_data = cqe->immed_rss_invalid;
            break;
        }

        wc->slid           = ntohs(cqe->rlid);
        wc->sl             = cqe->sl >> 4;
        g_mlpath_rqpn      = ntohl(cqe->g_mlpath_rqpn);
        wc->src_qp         = g_mlpath_rqpn & 0xffffff;
        wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
        wc->wc_flags      |= g_mlpath_rqpn & 0x80000000 ? IBV_WC_GRH : 0;
        wc->pkey_index     = ntohl(cqe->immed_rss_invalid) & 0x7f;
    }

    return CQ_OK;
}
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
                   struct ibv_send_wr **bad_wr)
{
    struct mlx4_context *ctx;
    struct mlx4_qp *qp = to_mqp(ibqp);
    void *wqe;
    struct mlx4_wqe_ctrl_seg *ctrl;
    int ind;
    int nreq;
    int inl = 0;
    int ret = 0;
    int size;
    int i;

    pthread_spin_lock(&qp->sq.lock);

    /* XXX check that state is OK to post send */

    ind = qp->sq.head;

    for (nreq = 0; wr; ++nreq, wr = wr->next) {
        if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
            ret = -1;
            *bad_wr = wr;
            goto out;
        }

        if (wr->num_sge > qp->sq.max_gs) {
            ret = -1;
            *bad_wr = wr;
            goto out;
        }

        if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
            ret = -1;
            *bad_wr = wr;
            goto out;
        }

        ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
        qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

        ctrl->xrcrb_flags =
            (wr->send_flags & IBV_SEND_SIGNALED ?
             htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
            (wr->send_flags & IBV_SEND_SOLICITED ?
             htonl(MLX4_WQE_CTRL_SOLICIT) : 0) |
            qp->sq_signal_bits;

        if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
            wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
            ctrl->imm = wr->imm_data;
        else
            ctrl->imm = 0;

        wqe += sizeof *ctrl;
        size = sizeof *ctrl / 16;

        switch (ibqp->qp_type) {
        case IBV_QPT_XRC:
            ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
            /* fall thru */
        case IBV_QPT_RC:
        case IBV_QPT_UC:
            switch (wr->opcode) {
            case IBV_WR_ATOMIC_CMP_AND_SWP:
            case IBV_WR_ATOMIC_FETCH_AND_ADD:
                set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
                              wr->wr.atomic.rkey);
                wqe += sizeof (struct mlx4_wqe_raddr_seg);

                set_atomic_seg(wqe, wr);
                wqe += sizeof (struct mlx4_wqe_atomic_seg);
                size += (sizeof (struct mlx4_wqe_raddr_seg) +
                         sizeof (struct mlx4_wqe_atomic_seg)) / 16;
                break;

            case IBV_WR_RDMA_READ:
                inl = 1;
                /* fall through */
            case IBV_WR_RDMA_WRITE:
            case IBV_WR_RDMA_WRITE_WITH_IMM:
                set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
                              wr->wr.rdma.rkey);
                wqe += sizeof (struct mlx4_wqe_raddr_seg);
                size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
                break;

            default:
                /* No extra segments required for sends */
                break;
            }
            break;

        case IBV_QPT_UD:
            set_datagram_seg(wqe, wr);
            wqe += sizeof (struct mlx4_wqe_datagram_seg);
            size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
            if (to_mah(wr->wr.ud.ah)->tagged) {
                ctrl->ins_vlan = 1 << 6;
                ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
            }
            break;

        default:
            break;
        }

        if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
            struct mlx4_wqe_inline_seg *seg;
            void *addr;
            int len, seg_len;
            int num_seg;
            int off, to_copy;

            inl = 0;

            seg = wqe;
            wqe += sizeof *seg;
            off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
            num_seg = 0;
            seg_len = 0;

            for (i = 0; i < wr->num_sge; ++i) {
                addr = (void *) (uintptr_t) wr->sg_list[i].addr;
                len  = wr->sg_list[i].length;
                inl += len;

                if (inl > qp->max_inline_data) {
                    inl = 0;
                    ret = -1;
                    *bad_wr = wr;
                    goto out;
                }

                while (len >= MLX4_INLINE_ALIGN - off) {
                    to_copy = MLX4_INLINE_ALIGN - off;
                    memcpy(wqe, addr, to_copy);
                    len -= to_copy;
                    wqe += to_copy;
                    addr += to_copy;
                    seg_len += to_copy;
                    wmb(); /* see comment below */
                    seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
                    seg_len = 0;
                    seg = wqe;
                    wqe += sizeof *seg;
                    off = sizeof *seg;
                    ++num_seg;
                }

                memcpy(wqe, addr, len);
                wqe += len;
                seg_len += len;
                off += len;
            }

            if (seg_len) {
                ++num_seg;
                /*
                 * Need a barrier here to make sure
                 * all the data is visible before the
                 * byte_count field is set.  Otherwise
                 * the HCA prefetcher could grab the
                 * 64-byte chunk with this inline
                 * segment and get a valid (!=
                 * 0xffffffff) byte count but stale
                 * data, and end up sending the wrong
                 * data.
                 */
                wmb();
                seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
            }

            size += (inl + num_seg * sizeof *seg + 15) / 16;
        } else {
            struct mlx4_wqe_data_seg *seg = wqe;

            for (i = wr->num_sge - 1; i >= 0; --i)
                set_data_seg(seg + i, wr->sg_list + i);

            size += wr->num_sge * (sizeof *seg / 16);
        }

        ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
                            MLX4_WQE_CTRL_FENCE : 0) | size;

        /*
         * Make sure descriptor is fully written before
         * setting ownership bit (because HW can start
         * executing as soon as we do).
         */
        wmb();

        ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
            (ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

        /*
         * We can improve latency by not stamping the last
         * send queue WQE until after ringing the doorbell, so
         * only stamp here if there are still more WQEs to post.
         */
        if (wr->next)
            stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
                           (qp->sq.wqe_cnt - 1));

        ++ind;
    }

out:
    ctx = to_mctx(ibqp->context);

    if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
        ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
        *(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
        /*
         * Make sure that descriptor is written to memory
         * before writing to BlueFlame page.
         */
        wmb();

        ++qp->sq.head;
        pthread_spin_lock(&ctx->bf_lock);

        mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
                     align(size * 16, 64));
        wc_wmb();

        ctx->bf_offset ^= ctx->bf_buf_size;

        pthread_spin_unlock(&ctx->bf_lock);
    } else if (nreq) {
        qp->sq.head += nreq;

        /*
         * Make sure that descriptors are written before
         * doorbell record.
         */
        wmb();

        *(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
    }

    if (nreq)
        stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
                       (qp->sq.wqe_cnt - 1));

    pthread_spin_unlock(&qp->sq.lock);

    return ret;
}
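/*
 * Hedged usage sketch of the inline path handled above: a caller posts a
 * small payload with IBV_SEND_INLINE so the data is copied into the WQE
 * itself and no lkey/MR is needed for the payload. Assumes qp is an RC/UC
 * QP in a sendable state; error handling is elided and the payload must
 * fit within the QP's max_inline_data.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int post_small_inline_send(struct ibv_qp *qp, char *buf, size_t len)
{
    struct ibv_sge sge = {
        .addr   = (uintptr_t) buf,
        .length = (uint32_t) len,
        /* lkey is ignored for inline sends */
    };
    struct ibv_send_wr wr = {
        .wr_id      = 1,
        .sg_list    = &sge,
        .num_sge    = 1,
        .opcode     = IBV_WR_SEND,
        .send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED,
    };
    struct ibv_send_wr *bad_wr;

    /* dispatches into the provider's post_send, e.g. mlx4_post_send above */
    return ibv_post_send(qp, &wr, &bad_wr);
}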
static int mlx4_init_context(struct verbs_device *v_device,
                             struct ibv_context *ibv_ctx, int cmd_fd)
{
    struct mlx4_context *context;
    struct mlx4_alloc_ucontext_req req;
    struct mlx4_alloc_ucontext_resp resp;
    struct mlx4_alloc_ucontext_resp_v3 resp_v3;
    int i;
    struct ibv_exp_device_attr dev_attrs;
    struct ibv_device_attr dev_legacy_attrs;
    struct mlx4_device *dev = to_mdev(&v_device->device);
    unsigned int qp_tab_size;
    unsigned int bf_reg_size;
    unsigned int cqe_size;
    int hca_clock_offset;
    void *hca_clock_page = NULL;
    /*
     * verbs_context should be used for new verbs; the memory footprint
     * of mlx4_context and verbs_context share struct ibv_context.
     */
    struct verbs_context *verbs_ctx = verbs_get_ctx(ibv_ctx);
    struct verbs_context_exp *verbs_exp_ctx = verbs_get_exp_ctx(ibv_ctx);

    memset(&req, 0, sizeof(req));
    context = to_mctx(ibv_ctx);
    ibv_ctx->cmd_fd = cmd_fd;
    ibv_ctx->device = &v_device->device;

    if (pthread_mutex_init(&context->env_mtx, NULL))
        return EIO;

    if (dev->driver_abi_ver > 3) {
#ifdef MLX4_WQE_FORMAT
        req.lib_caps = MLX4_USER_DEV_CAP_WQE_FORMAT;
#endif
        if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req),
                                &resp.ibv_resp, sizeof(resp)))
            return errno;
        VALGRIND_MAKE_MEM_DEFINED(&resp, sizeof(resp));
        qp_tab_size = resp.qp_tab_size;
        bf_reg_size = resp.bf_reg_size;
        context->bf_regs_per_page = resp.bf_regs_per_page;
        cqe_size = resp.cqe_size;
    } else {
        if (ibv_cmd_get_context(ibv_ctx, &req.cmd, sizeof(req.cmd),
                                &resp_v3.ibv_resp, sizeof(resp_v3)))
            return errno;
        VALGRIND_MAKE_MEM_DEFINED(&resp_v3, sizeof(resp_v3));
        qp_tab_size = resp_v3.qp_tab_size;
        bf_reg_size = resp_v3.bf_reg_size;
        context->bf_regs_per_page = resp_v3.bf_regs_per_page;
        cqe_size = 32;
    }

    context->num_qps = qp_tab_size;
    context->qp_table_shift = ffs(context->num_qps) - 1 - MLX4_QP_TABLE_BITS;
    context->qp_table_mask = (1 << context->qp_table_shift) - 1;
    context->cqe_size = cqe_size;

    for (i = 0; i < MLX4_PORTS_NUM; ++i)
        context->port_query_cache[i].valid = 0;

    pthread_mutex_init(&context->qp_table_mutex, NULL);
    for (i = 0; i < MLX4_QP_TABLE_SIZE; ++i)
        context->qp_table[i].refcnt = 0;

    for (i = 0; i < MLX4_NUM_DB_TYPE; ++i)
        context->db_list[i] = NULL;

    mlx4_init_xsrq_table(&context->xsrq_table, qp_tab_size);
    pthread_mutex_init(&context->db_list_mutex, NULL);

    context->uar = mmap(NULL, dev->page_size, PROT_WRITE,
                        MAP_SHARED, cmd_fd, 0);
    if (context->uar == MAP_FAILED)
        return errno;

    if (bf_reg_size) {
        context->bfs.page = mmap(NULL, dev->page_size, PROT_WRITE,
                                 MAP_SHARED, cmd_fd, dev->page_size);
        if (context->bfs.page == MAP_FAILED) {
            fprintf(stderr, PFX "Warning: BlueFlame available, "
                    "but failed to mmap() BlueFlame page.\n");
            context->bfs.page = NULL;
            context->bfs.buf_size = 0;
            context->bfs.num_dedic_bfs = 0;
        } else {
            context->bfs.num_dedic_bfs = min(context->bf_regs_per_page - 1,
                                             MLX4_MAX_BFS_IN_PAGE - 1);
            context->bfs.buf_size = bf_reg_size / 2;
            mlx4_spinlock_init(&context->bfs.dedic_bf_lock,
                               !mlx4_single_threaded);
            context->bfs.cmn_bf.address = context->bfs.page;

            mlx4_lock_init(&context->bfs.cmn_bf.lock,
                           !mlx4_single_threaded,
                           mlx4_get_locktype());

            context->bfs.dedic_bf_free = context->bfs.num_dedic_bfs;
            for (i = 0; i < context->bfs.num_dedic_bfs; i++) {
                context->bfs.dedic_bf[i].address =
                    context->bfs.page + (i + 1) * MLX4_BFS_STRIDE;
                context->bfs.dedic_bf_used[i] = 0;
            }
        }
    } else {
        context->bfs.page = NULL;
        context->bfs.buf_size = 0;
        context->bfs.num_dedic_bfs = 0;
    }

    mlx4_spinlock_init(&context->uar_lock, !mlx4_single_threaded);

    mlx4_spinlock_init(&context->send_db_lock, !mlx4_single_threaded);
    INIT_LIST_HEAD(&context->send_db_list);

    mlx4_spinlock_init(&context->hugetlb_lock, !mlx4_single_threaded);
    INIT_LIST_HEAD(&context->hugetlb_list);

    pthread_mutex_init(&context->task_mutex, NULL);

    memset(&dev_attrs, 0, sizeof(dev_attrs));
    dev_attrs.comp_mask = IBV_EXP_DEVICE_ATTR_WITH_TIMESTAMP_MASK |
                          IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK |
                          IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS |
                          IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN;

    if (mlx4_exp_query_device(ibv_ctx, &dev_attrs)) {
        if (mlx4_query_device(ibv_ctx, &dev_legacy_attrs))
            goto query_free;

        memcpy(&dev_attrs, &dev_legacy_attrs, sizeof(dev_legacy_attrs));
    }

    context->max_qp_wr = dev_attrs.max_qp_wr;
    context->max_sge = dev_attrs.max_sge;
    context->max_cqe = dev_attrs.max_cqe;
    context->exp_device_cap_flags = dev_attrs.exp_device_cap_flags;
    if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN)
        context->max_ctx_res_domain = dev_attrs.max_ctx_res_domain;

    VALGRIND_MAKE_MEM_DEFINED(&context->hca_core_clock,
                              sizeof(context->hca_core_clock));
    if (dev_attrs.comp_mask & IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK) {
        if (dev_attrs.hca_core_clock)
            context->core_clk.mult = ((1ull * 1000) << 29) /
                                     dev_attrs.hca_core_clock;
        else
            context->core_clk.mult = 0;

        context->core_clk.shift = 29;
        context->core_clk.mask = dev_attrs.timestamp_mask;

        if (ioctl(cmd_fd, MLX4_IOCHWCLOCKOFFSET, &hca_clock_offset) >= 0) {
            VALGRIND_MAKE_MEM_DEFINED(&hca_clock_offset,
                                      sizeof(hca_clock_offset));
            context->core_clk.offset = hca_clock_offset;
            hca_clock_page = mmap(NULL, hca_clock_offset +
                                  sizeof(context->core_clk.mask),
                                  PROT_READ, MAP_SHARED, cmd_fd,
                                  dev->page_size * (MLX4_IB_MMAP_GET_HW_CLOCK));
            if (hca_clock_page == MAP_FAILED) {
                fprintf(stderr, PFX
                        "Warning: Timestamp available,\n"
                        "but failed to mmap() hca core clock page.\n");
            } else {
                context->hca_core_clock = hca_clock_page +
                    context->core_clk.offset;
            }
        }
    }

    ibv_ctx->ops = mlx4_ctx_ops;

    verbs_ctx->has_comp_mask |= VERBS_CONTEXT_XRCD | VERBS_CONTEXT_SRQ |
                                VERBS_CONTEXT_QP;

    verbs_set_ctx_op(verbs_ctx, close_xrcd, mlx4_close_xrcd);
    verbs_set_ctx_op(verbs_ctx, open_xrcd, mlx4_open_xrcd);
    verbs_set_ctx_op(verbs_ctx, create_srq_ex, mlx4_create_srq_ex);
    verbs_set_ctx_op(verbs_ctx, get_srq_num, verbs_get_srq_num);
    verbs_set_ctx_op(verbs_ctx, create_qp_ex, mlx4_create_qp_ex);
    verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
    verbs_set_ctx_op(verbs_ctx, create_flow, ibv_cmd_create_flow);
    verbs_set_ctx_op(verbs_ctx, destroy_flow, ibv_cmd_destroy_flow);

    /*
     * Set experimental verbs
     */
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_reg_shared_mr,
                         mlx4_reg_shared_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_flow,
                         ibv_exp_cmd_create_flow);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_destroy_flow,
                         ibv_exp_cmd_destroy_flow);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_ah,
                         mlx4_exp_create_ah);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_device,
                         mlx4_exp_query_device);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_qp,
                         mlx4_exp_create_qp);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_qp,
                         mlx4_exp_modify_qp);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_port,
                         mlx4_exp_query_port);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_cq, mlx4_modify_cq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_task, mlx4_post_task);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_set_legacy_xrc,
                         mlx4_set_legacy_xrc);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_get_legacy_xrc,
                         mlx4_get_legacy_xrc);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq,
                         mlx4_exp_poll_cq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_cq, mlx4_create_cq_ex);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_values,
                         mlx4_query_values);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_reg_mr, mlx4_exp_reg_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_send,
                         mlx4_exp_post_send);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_bind_mw, mlx4_exp_bind_mw);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_rereg_mr, mlx4_exp_rereg_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dereg_mr, mlx4_exp_dereg_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_res_domain,
                         mlx4_exp_create_res_domain);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_res_domain,
                         mlx4_exp_destroy_res_domain);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_query_intf, mlx4_exp_query_intf);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_release_intf,
                         mlx4_exp_release_intf);

    return 0;

query_free:
    munmap(context->uar, dev->page_size);
    if (context->bfs.page)
        munmap(context->bfs.page, dev->page_size);
    if (hca_clock_page)
        munmap(hca_clock_page,
               hca_clock_offset + sizeof(context->core_clk.mask));

    return errno;
}
int mthca_tavor_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr,
                              struct ibv_recv_wr **bad_wr)
{
    struct mthca_srq *srq = to_msrq(ibsrq);
    uint32_t doorbell[2];
    int err = 0;
    int first_ind;
    int ind;
    int next_ind;
    int nreq;
    int i;
    void *wqe;
    void *prev_wqe;

    pthread_spin_lock(&srq->lock);

    first_ind = srq->first_free;

    for (nreq = 0; wr; wr = wr->next) {
        ind = srq->first_free;
        wqe = get_wqe(srq, ind);
        next_ind = *wqe_to_link(wqe);

        if (next_ind < 0) {
            err = -1;
            *bad_wr = wr;
            break;
        }

        prev_wqe = srq->last;
        srq->last = wqe;

        ((struct mthca_next_seg *) wqe)->ee_nds = 0;
        /* flags field will always remain 0 */

        wqe += sizeof (struct mthca_next_seg);

        if (wr->num_sge > srq->max_gs) {
            err = -1;
            *bad_wr = wr;
            srq->last = prev_wqe;
            break;
        }

        for (i = 0; i < wr->num_sge; ++i) {
            ((struct mthca_data_seg *) wqe)->byte_count =
                htonl(wr->sg_list[i].length);
            ((struct mthca_data_seg *) wqe)->lkey =
                htonl(wr->sg_list[i].lkey);
            ((struct mthca_data_seg *) wqe)->addr =
                htonll(wr->sg_list[i].addr);
            wqe += sizeof (struct mthca_data_seg);
        }

        if (i < srq->max_gs) {
            ((struct mthca_data_seg *) wqe)->byte_count = 0;
            ((struct mthca_data_seg *) wqe)->lkey = htonl(MTHCA_INVAL_LKEY);
            ((struct mthca_data_seg *) wqe)->addr = 0;
        }

        ((struct mthca_next_seg *) prev_wqe)->ee_nds = htonl(MTHCA_NEXT_DBD);

        srq->wrid[ind] = wr->wr_id;
        srq->first_free = next_ind;

        if (++nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB) {
            nreq = 0;

            doorbell[0] = htonl(first_ind << srq->wqe_shift);
            doorbell[1] = htonl(srq->srqn << 8);

            /*
             * Make sure that descriptors are written
             * before doorbell is rung.
             */
            wmb();

            mthca_write64(doorbell, to_mctx(ibsrq->context),
                          MTHCA_RECV_DOORBELL);

            first_ind = srq->first_free;
        }
    }

    if (nreq) {
        doorbell[0] = htonl(first_ind << srq->wqe_shift);
        doorbell[1] = htonl((srq->srqn << 8) | nreq);

        /*
         * Make sure that descriptors are written before
         * doorbell is rung.
         */
        wmb();

        mthca_write64(doorbell, to_mctx(ibsrq->context),
                      MTHCA_RECV_DOORBELL);
    }

    pthread_spin_unlock(&srq->lock);
    return err;
}
static int mlx5_alloc_context(struct verbs_device *vdev,
                              struct ibv_context *ctx, int cmd_fd)
{
    struct mlx5_context *context;
    struct mlx5_alloc_ucontext req;
    struct mlx5_alloc_ucontext_resp resp;
    struct ibv_device *ibdev = &vdev->device;
    struct verbs_context *verbs_ctx = verbs_get_ctx(ctx);
    int i;
    int page_size = to_mdev(ibdev)->page_size;
    int tot_uuars;
    int low_lat_uuars;
    int gross_uuars;
    int j;
    off_t offset;

    mlx5_single_threaded = single_threaded_app();

    context = to_mctx(ctx);
    context->ibv_ctx.cmd_fd = cmd_fd;

    memset(&resp, 0, sizeof(resp));
    open_debug_file(context);
    set_debug_mask();
    set_freeze_on_error();
    if (gethostname(context->hostname, sizeof(context->hostname)))
        strcpy(context->hostname, "host_unknown");

    tot_uuars = get_total_uuars();
    if (tot_uuars <= 0) {
        if (tot_uuars == 0)
            errno = EINVAL;
        else
            errno = -tot_uuars;
        goto err_free;
    }

    gross_uuars = tot_uuars / MLX5_NUM_UUARS_PER_PAGE * 4;
    context->bfs = calloc(gross_uuars, sizeof *context->bfs);
    if (!context->bfs) {
        errno = ENOMEM;
        goto err_free;
    }

    low_lat_uuars = get_num_low_lat_uuars();
    if (low_lat_uuars < 0) {
        errno = ENOMEM;
        goto err_free_bf;
    }

    if (low_lat_uuars > tot_uuars - 1) {
        errno = ENOMEM;
        goto err_free_bf;
    }

    memset(&req, 0, sizeof(req));
    req.total_num_uuars = tot_uuars;
    req.num_low_latency_uuars = low_lat_uuars;
    if (ibv_cmd_get_context(&context->ibv_ctx, &req.ibv_req, sizeof req,
                            &resp.ibv_resp, sizeof resp))
        goto err_free_bf;

    context->max_num_qps = resp.qp_tab_size;
    context->bf_reg_size = resp.bf_reg_size;
    context->tot_uuars = resp.tot_uuars;
    context->low_lat_uuars = low_lat_uuars;
    context->cache_line_size = resp.cache_line_size;
    context->max_sq_desc_sz = resp.max_sq_desc_sz;
    context->max_rq_desc_sz = resp.max_rq_desc_sz;
    context->max_send_wqebb = resp.max_send_wqebb;
    context->num_ports = resp.num_ports;
    context->max_recv_wr = resp.max_recv_wr;
    context->max_srq_recv_wr = resp.max_srq_recv_wr;
    context->max_desc_sz_sq_dc = resp.max_desc_sz_sq_dc;
    context->atomic_sizes_dc = resp.atomic_sizes_dc;

    pthread_mutex_init(&context->rsc_table_mutex, NULL);
    pthread_mutex_init(&context->srq_table_mutex, NULL);
    for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
        context->rsc_table[i].refcnt = 0;

    context->db_list = NULL;

    pthread_mutex_init(&context->db_list_mutex, NULL);

    for (i = 0; i < resp.tot_uuars / MLX5_NUM_UUARS_PER_PAGE; ++i) {
        offset = 0;
        set_command(MLX5_MMAP_GET_REGULAR_PAGES_CMD, &offset);
        set_index(i, &offset);
        context->uar[i] = mmap(NULL, to_mdev(ibdev)->page_size,
                               PROT_WRITE, MAP_SHARED, cmd_fd,
                               page_size * offset);
        if (context->uar[i] == MAP_FAILED) {
            context->uar[i] = NULL;
            goto err_free_bf;
        }
    }

    for (j = 0; j < gross_uuars; ++j) {
        context->bfs[j].reg = context->uar[j / 4] + MLX5_BF_OFFSET +
            (j % 4) * context->bf_reg_size;
        context->bfs[j].need_lock = need_uuar_lock(context, j);
        mlx5_spinlock_init(&context->bfs[j].lock);
        context->bfs[j].offset = 0;
        if (j)
            context->bfs[j].buf_size = context->bf_reg_size / 2;
        context->bfs[j].uuarn = j;
    }

    mlx5_spinlock_init(&context->lock32);

    context->prefer_bf = get_always_bf();
    context->shut_up_bf = get_shut_up_bf();
    mlx5_read_env(ibdev, context);

    mlx5_spinlock_init(&context->hugetlb_lock);
    INIT_LIST_HEAD(&context->hugetlb_list);

    pthread_mutex_init(&context->task_mutex, NULL);

    ctx->ops = mlx5_ctx_ops;
    set_extended(verbs_ctx);
    set_experimental(ctx);

    return 0;

err_free_bf:
    free(context->bfs);

err_free:
    for (i = 0; i < MLX5_MAX_UAR_PAGES; ++i) {
        if (context->uar[i])
            munmap(context->uar[i], page_size);
    }
    close_debug_file(context);
    return errno;
}
static void set_experimental(struct ibv_context *ctx)
{
    struct verbs_context_exp *verbs_exp_ctx = verbs_get_exp_ctx(ctx);
    struct mlx5_context *mctx = to_mctx(ctx);

    verbs_set_exp_ctx_op(verbs_exp_ctx, create_dct, mlx5_create_dct);
    verbs_set_exp_ctx_op(verbs_exp_ctx, destroy_dct, mlx5_destroy_dct);
    verbs_set_exp_ctx_op(verbs_exp_ctx, query_dct, mlx5_query_dct);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_arm_dct, mlx5_arm_dct);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_device,
                         mlx5_query_device_ex);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_qp, mlx5_exp_create_qp);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_qp, mlx5_modify_qp_ex);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_get_legacy_xrc,
                         mlx5_get_legacy_xrc);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_set_legacy_xrc,
                         mlx5_set_legacy_xrc);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_modify_cq, mlx5_modify_cq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_cq, mlx5_create_cq_ex);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq, mlx5_poll_cq_ex);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_task, mlx5_post_task);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_reg_mr, mlx5_exp_reg_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_post_send, mlx5_exp_post_send);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_alloc_mkey_list_memory,
                         mlx5_alloc_mkey_mem);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dealloc_mkey_list_memory,
                         mlx5_free_mkey_mem);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_mkey, mlx5_query_mkey);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_create_mr, mlx5_create_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_prefetch_mr,
                         mlx5_prefetch_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_dereg_mr, mlx5_exp_dereg_mr);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_poll_dc_info,
                         mlx5_poll_dc_info);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_wq, mlx5_exp_create_wq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_modify_wq, mlx5_exp_modify_wq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_wq, mlx5_exp_destroy_wq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_flow,
                         ibv_exp_cmd_create_flow);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_destroy_flow,
                         ibv_exp_cmd_destroy_flow);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_rwq_ind_table,
                         mlx5_exp_create_rwq_ind_table);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_rwq_ind_table,
                         mlx5_exp_destroy_rwq_ind_table);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_create_res_domain,
                         mlx5_exp_create_res_domain);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_destroy_res_domain,
                         mlx5_exp_destroy_res_domain);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_query_intf, mlx5_exp_query_intf);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_release_intf,
                         mlx5_exp_release_intf);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_port,
                         mlx5_exp_query_port);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_create_ah,
                         mlx5_exp_create_ah);
    verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_query_values,
                         mlx5_exp_query_values);
    verbs_set_exp_ctx_op(verbs_exp_ctx, alloc_ec_calc, mlx5_alloc_ec_calc);
    verbs_set_exp_ctx_op(verbs_exp_ctx, dealloc_ec_calc, mlx5_dealloc_ec_calc);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_encode_async, mlx5_ec_encode_async);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_encode_sync, mlx5_ec_encode_sync);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_decode_async, mlx5_ec_decode_async);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_decode_sync, mlx5_ec_decode_sync);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_poll, mlx5_ec_poll);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_encode_send, mlx5_ec_encode_send);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_update_async, mlx5_ec_update_async);
    verbs_set_exp_ctx_op(verbs_exp_ctx, ec_update_sync, mlx5_ec_update_sync);

    if (mctx->cqe_version == 1)
        verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq,
                             mlx5_poll_cq_ex_1);
    else
        verbs_set_exp_ctx_op(verbs_exp_ctx, drv_exp_ibv_poll_cq,
                             mlx5_poll_cq_ex);

    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_peer_commit_qp,
                         mlx5_exp_peer_commit_qp);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_rollback_send,
                         mlx5_exp_rollback_send);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_peer_peek_cq,
                         mlx5_exp_peer_peek_cq);
    verbs_set_exp_ctx_op(verbs_exp_ctx, exp_peer_abort_peek_cq,
                         mlx5_exp_peer_abort_peek_cq);
}
static int mlx5_alloc_context(struct verbs_device *vdev,
                              struct ibv_context *ctx, int cmd_fd)
{
    struct mlx5_context *context;
    struct mlx5_alloc_ucontext req;
    struct mlx5_exp_alloc_ucontext_resp resp;
    struct ibv_device *ibdev = &vdev->device;
    struct verbs_context *verbs_ctx = verbs_get_ctx(ctx);
    struct ibv_exp_device_attr attr;
    int i;
    int page_size = to_mdev(ibdev)->page_size;
    int tot_uuars;
    int low_lat_uuars;
    int gross_uuars;
    int j;
    int uar_mapped;
    off_t offset;
    int err;

    context = to_mctx(ctx);
    if (pthread_mutex_init(&context->env_mtx, NULL))
        return -1;

    context->ibv_ctx.cmd_fd = cmd_fd;

    memset(&resp, 0, sizeof(resp));
    if (gethostname(context->hostname, sizeof(context->hostname)))
        strcpy(context->hostname, "host_unknown");

    tot_uuars = get_total_uuars();
    gross_uuars = tot_uuars / MLX5_NUM_UUARS_PER_PAGE * 4;
    context->bfs = calloc(gross_uuars, sizeof *context->bfs);
    if (!context->bfs) {
        errno = ENOMEM;
        goto err_free;
    }

    low_lat_uuars = get_num_low_lat_uuars();
    if (low_lat_uuars > tot_uuars - 1) {
        errno = ENOMEM;
        goto err_free_bf;
    }

    memset(&req, 0, sizeof(req));
    req.total_num_uuars = tot_uuars;
    req.num_low_latency_uuars = low_lat_uuars;
    if (ibv_cmd_get_context(&context->ibv_ctx, &req.ibv_req, sizeof req,
                            &resp.ibv_resp, sizeof resp))
        goto err_free_bf;

    context->max_num_qps = resp.qp_tab_size;
    context->bf_reg_size = resp.bf_reg_size;
    context->tot_uuars = resp.tot_uuars;
    context->low_lat_uuars = low_lat_uuars;
    context->cache_line_size = resp.cache_line_size;
    context->max_sq_desc_sz = resp.max_sq_desc_sz;
    context->max_rq_desc_sz = resp.max_rq_desc_sz;
    context->max_send_wqebb = resp.max_send_wqebb;
    context->num_ports = resp.num_ports;
    context->max_recv_wr = resp.max_recv_wr;
    context->max_srq_recv_wr = resp.max_srq_recv_wr;
    context->max_desc_sz_sq_dc = resp.max_desc_sz_sq_dc;
    context->atomic_sizes_dc = resp.atomic_sizes_dc;
    context->compact_av = resp.flags & MLX5_CAP_COMPACT_AV;

    if (resp.exp_data.comp_mask & MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_COMP_MAX_NUM)
        context->cqe_comp_max_num = resp.exp_data.cqe_comp_max_num;

    if (resp.exp_data.comp_mask & MLX5_EXP_ALLOC_CTX_RESP_MASK_CQE_VERSION)
        context->cqe_version = resp.exp_data.cqe_version;

    if (resp.exp_data.comp_mask & MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MIN)
        context->rroce_udp_sport_min = resp.exp_data.rroce_udp_sport_min;

    if (resp.exp_data.comp_mask & MLX5_EXP_ALLOC_CTX_RESP_MASK_RROCE_UDP_SPORT_MAX)
        context->rroce_udp_sport_max = resp.exp_data.rroce_udp_sport_max;

    ctx->ops = mlx5_ctx_ops;
    if (context->cqe_version) {
        if (context->cqe_version == 1) {
            ctx->ops.poll_cq = mlx5_poll_cq_1;
        } else {
            fprintf(stderr, PFX
                    "Unsupported cqe_version = %d, staying on cqe version 0\n",
                    context->cqe_version);
            context->cqe_version = 0;
        }
    }

    attr.comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1;
    err = mlx5_query_device_ex(ctx, &attr);
    if (!err) {
        if (attr.comp_mask & IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS)
            context->exp_device_cap_flags = attr.exp_device_cap_flags;

        if (attr.comp_mask & IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN) {
            context->max_ctx_res_domain = attr.max_ctx_res_domain;
            mlx5_spinlock_init(&context->send_db_lock,
                               !mlx5_single_threaded);
            INIT_LIST_HEAD(&context->send_wc_db_list);
            INIT_LIST_HEAD(&context->wc_uar_list);
        }

        if (resp.exp_data.comp_mask &
            MLX5_EXP_ALLOC_CTX_RESP_MASK_HCA_CORE_CLOCK_OFFSET) {
            context->core_clock.offset =
                resp.exp_data.hca_core_clock_offset &
                (to_mdev(ibdev)->page_size - 1);
            mlx5_map_internal_clock(to_mdev(ibdev), ctx);
            if (attr.hca_core_clock)
                context->core_clock.mult = ((1ull * 1000) << 21) /
                                           attr.hca_core_clock;
            else
                context->core_clock.mult = 0;

            /*
             * ConnectX-4 supports 64bit timestamps.  We choose these
             * numbers in order to make sure that after arithmetic
             * operations, we don't overflow a 64bit variable.
             */
            context->core_clock.shift = 21;
            context->core_clock.mask = (1ULL << 49) - 1;
        }
    }

    pthread_mutex_init(&context->rsc_table_mutex, NULL);
    pthread_mutex_init(&context->srq_table_mutex, NULL);
    for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
        context->rsc_table[i].refcnt = 0;

    for (i = 0; i < MLX5_QP_TABLE_SIZE; ++i)
        context->uidx_table[i].refcnt = 0;

    context->db_list = NULL;

    pthread_mutex_init(&context->db_list_mutex, NULL);

    context->prefer_bf = get_always_bf(&context->ibv_ctx);
    context->shut_up_bf = get_shut_up_bf(&context->ibv_ctx);
    context->enable_cqe_comp = get_cqe_comp(&context->ibv_ctx);
    mlx5_use_mutex = get_use_mutex(&context->ibv_ctx);

    offset = 0;
    set_command(MLX5_MMAP_MAP_DC_INFO_PAGE, &offset);
    context->cc.buf = mmap(NULL, 4096 * context->num_ports, PROT_READ,
                           MAP_PRIVATE, cmd_fd, page_size * offset);
    if (context->cc.buf == MAP_FAILED)
        context->cc.buf = NULL;

    mlx5_single_threaded = single_threaded_app(&context->ibv_ctx);

    for (i = 0; i < resp.tot_uuars / MLX5_NUM_UUARS_PER_PAGE; ++i) {
        uar_mapped = 0;

        /* Don't map UAR to WC if BF is not used */
        if (!context->shut_up_bf) {
            context->uar[i].regs = mlx5_uar_mmap(i, MLX5_MMAP_GET_WC_PAGES_CMD,
                                                 page_size, cmd_fd);
            if (context->uar[i].regs != MAP_FAILED) {
                context->uar[i].map_type = MLX5_UAR_MAP_WC;
                uar_mapped = 1;
            }
        }

        if (!uar_mapped) {
            context->uar[i].regs = mlx5_uar_mmap(i, MLX5_MMAP_GET_NC_PAGES_CMD,
                                                 page_size, cmd_fd);
            if (context->uar[i].regs != MAP_FAILED) {
                context->uar[i].map_type = MLX5_UAR_MAP_NC;
                uar_mapped = 1;
            }
        }

        if (!uar_mapped) {
            /* for backward compatibility with old kernel driver */
            context->uar[i].regs = mlx5_uar_mmap(i,
                                                 MLX5_MMAP_GET_REGULAR_PAGES_CMD,
                                                 page_size, cmd_fd);
            if (context->uar[i].regs != MAP_FAILED) {
                context->uar[i].map_type = MLX5_UAR_MAP_WC;
                uar_mapped = 1;
            }
        }

        if (!uar_mapped) {
            context->uar[i].regs = NULL;
            goto err_free_cc;
        }
    }

    for (j = 0; j < gross_uuars; ++j) {
        context->bfs[j].reg = context->uar[j / 4].regs + MLX5_BF_OFFSET +
            (j % 4) * context->bf_reg_size;
        context->bfs[j].need_lock = need_uuar_lock(context, j) &&
            context->uar[j / 4].map_type == MLX5_UAR_MAP_WC;
        mlx5_lock_init(&context->bfs[j].lock,
                       !mlx5_single_threaded,
                       mlx5_get_locktype());
        context->bfs[j].offset = 0;
        if (context->uar[j / 4].map_type == MLX5_UAR_MAP_WC) {
            context->bfs[j].buf_size = context->bf_reg_size / 2;
            context->bfs[j].db_method =
                (context->bfs[j].need_lock && !mlx5_single_threaded) ?
                MLX5_DB_METHOD_BF :
                (mlx5_single_threaded && wc_auto_evict_size() == 64 ?
                 MLX5_DB_METHOD_DEDIC_BF_1_THREAD :
                 MLX5_DB_METHOD_DEDIC_BF);
        } else {
            context->bfs[j].db_method = MLX5_DB_METHOD_DB;
        }
        context->bfs[j].uuarn = j;
    }

    mlx5_lock_init(&context->lock32, !mlx5_single_threaded,
                   mlx5_get_locktype());

    mlx5_spinlock_init(&context->hugetlb_lock, !mlx5_single_threaded);
    INIT_LIST_HEAD(&context->hugetlb_list);

    pthread_mutex_init(&context->task_mutex, NULL);

    set_extended(verbs_ctx);
    set_experimental(ctx);

    for (i = 0; i < MLX5_MAX_PORTS_NUM; ++i)
        context->port_query_cache[i].valid = 0;

    return 0;

err_free_cc:
    if (context->cc.buf)
        munmap(context->cc.buf, 4096 * context->num_ports);

    if (context->hca_core_clock)
        munmap(context->hca_core_clock - context->core_clock.offset,
               to_mdev(ibdev)->page_size);

err_free_bf:
    free(context->bfs);

err_free:
    for (i = 0; i < MLX5_MAX_UAR_PAGES; ++i) {
        if (context->uar[i].regs)
            munmap(context->uar[i].regs, page_size);
    }
    close_debug_file(context);
    return errno;
}