/*
 * mthca_post_send() - post a chain of send work requests to a mthca QP.
 *
 * NOTE(review): this definition appears truncated in this chunk -- only the
 * local declarations and the IB->mthca opcode translation table are visible;
 * the posting loop and closing brace are missing. Left byte-identical.
 */
int mthca_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr)
{
	struct mthca_dev *dev = to_mdev(ibqp->device);
	struct mthca_qp *qp = to_mqp(ibqp);
	void *wqe;
	void *prev_wqe;
	unsigned long flags;
	int err = 0;
	int nreq;
	int i;
	int size;
	int size0 = 0;	/* size of the first WQE posted (doorbell needs it) */
	u32 f0 = 0;	/* fence/flags of the first WQE posted */
	int ind;
	u8 op0 = 0;	/* opcode of the first WQE posted */
	/* Translate IB work-request opcodes to mthca hardware opcodes. */
	static const u8 opcode[] = {
		[IB_WR_SEND]                 = MTHCA_OPCODE_SEND,
		[IB_WR_SEND_WITH_IMM]        = MTHCA_OPCODE_SEND_IMM,
		[IB_WR_RDMA_WRITE]           = MTHCA_OPCODE_RDMA_WRITE,
		[IB_WR_RDMA_WRITE_WITH_IMM]  = MTHCA_OPCODE_RDMA_WRITE_IMM,
		[IB_WR_RDMA_READ]            = MTHCA_OPCODE_RDMA_READ,
		[IB_WR_ATOMIC_CMP_AND_SWP]   = MTHCA_OPCODE_ATOMIC_CS,
		[IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
	};
/* * Experimental functions */ struct ib_qp *mlx4_ib_exp_create_qp(struct ib_pd *pd, struct ib_exp_qp_init_attr *init_attr, struct ib_udata *udata) { int rwqe_size; struct ib_qp *qp; struct mlx4_ib_qp *mqp; int use_inlr; struct mlx4_ib_dev *dev; if ((init_attr->create_flags & IB_QP_CREATE_ATOMIC_BE_REPLY) && mlx4_is_little_endian()) return ERR_PTR(-EINVAL); if (init_attr->max_inl_recv && !udata) return ERR_PTR(-EINVAL); use_inlr = mlx4_ib_qp_has_rq((struct ib_qp_init_attr *)init_attr) && init_attr->max_inl_recv && pd; if (use_inlr) { rwqe_size = roundup_pow_of_two(max(1U, init_attr->cap.max_recv_sge)) * sizeof(struct mlx4_wqe_data_seg); if (rwqe_size < init_attr->max_inl_recv) { dev = to_mdev(pd->device); init_attr->max_inl_recv = min(init_attr->max_inl_recv, (u32)(dev->dev->caps.max_rq_sg * sizeof(struct mlx4_wqe_data_seg))); init_attr->cap.max_recv_sge = roundup_pow_of_two(init_attr->max_inl_recv) / sizeof(struct mlx4_wqe_data_seg); } } else { init_attr->max_inl_recv = 0; } qp = mlx4_ib_create_qp(pd, (struct ib_qp_init_attr *)init_attr, udata); if (IS_ERR(qp)) return qp; if (use_inlr) { mqp = to_mqp(qp); mqp->max_inlr_data = 1 << mqp->rq.wqe_shift; init_attr->max_inl_recv = mqp->max_inlr_data; } return qp; }
/*
 * mthca_modify_qp() - move a mthca QP between IB states and program the
 * hardware QP context accordingly.
 *
 * Validates the requested transition against state_table (required/optional
 * attribute masks per transport), builds a mthca_qp_param mailbox describing
 * every attribute being changed, and issues the MODIFY_QP firmware command.
 * For special QPs (QP0) it also brings the IB link up/down on RTR/RESET-ERR
 * transitions.
 *
 * Returns 0 on success or a negative errno (-EINVAL on a bad transition or
 * firmware status, -ENOMEM if the mailbox cannot be allocated).
 *
 * NOTE(review): IB_QP_MAX_DEST_RD_ATOMIC programs params1/OPTPAR_SRA_MAX
 * (the initiator side) while IB_QP_MAX_QP_RD_ATOMIC programs
 * params2/OPTPAR_RRA_MAX and resp_depth (the responder side). The usual
 * verbs convention is the opposite pairing -- confirm these two attribute
 * masks are not swapped.
 */
int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask)
{
	struct mthca_dev *dev = to_mdev(ibqp->device);
	struct mthca_qp *qp = to_mqp(ibqp);
	enum ib_qp_state cur_state, new_state;
	void *mailbox = NULL;
	struct mthca_qp_param *qp_param;
	struct mthca_qp_context *qp_context;
	u32 req_param, opt_param;
	u8 status;
	int err;

	/* Take the caller-asserted current state, or read it under the lock. */
	if (attr_mask & IB_QP_CUR_STATE) {
		if (attr->cur_qp_state != IB_QPS_RTR &&
		    attr->cur_qp_state != IB_QPS_RTS &&
		    attr->cur_qp_state != IB_QPS_SQD &&
		    attr->cur_qp_state != IB_QPS_SQE)
			return -EINVAL;
		else
			cur_state = attr->cur_qp_state;
	} else {
		spin_lock_irq(&qp->lock);
		cur_state = qp->state;
		spin_unlock_irq(&qp->lock);
	}

	if (attr_mask & IB_QP_STATE) {
		/*
		 * NOTE(review): if ib_qp_state is an unsigned enum the < 0
		 * arm can never fire -- confirm the intended range check.
		 */
		if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR)
			return -EINVAL;
		new_state = attr->qp_state;
	} else
		new_state = cur_state;

	if (state_table[cur_state][new_state].trans == MTHCA_TRANS_INVALID) {
		mthca_dbg(dev, "Illegal QP transition "
			  "%d->%d\n", cur_state, new_state);
		return -EINVAL;
	}

	/* Per-transport required/optional attribute masks for this edge. */
	req_param = state_table[cur_state][new_state].req_param[qp->transport];
	opt_param = state_table[cur_state][new_state].opt_param[qp->transport];

	if ((req_param & attr_mask) != req_param) {
		mthca_dbg(dev, "QP transition "
			  "%d->%d missing req attr 0x%08x\n",
			  cur_state, new_state,
			  req_param & ~attr_mask);
		return -EINVAL;
	}

	if (attr_mask & ~(req_param | opt_param | IB_QP_STATE)) {
		mthca_dbg(dev, "QP transition (transport %d) "
			  "%d->%d has extra attr 0x%08x\n",
			  qp->transport,
			  cur_state, new_state,
			  attr_mask & ~(req_param | opt_param | IB_QP_STATE));
		return -EINVAL;
	}

	/* Firmware mailbox holding the new QP context. */
	mailbox = kmalloc(sizeof (*qp_param) + MTHCA_CMD_MAILBOX_EXTRA, GFP_KERNEL);
	if (!mailbox)
		return -ENOMEM;
	qp_param = MAILBOX_ALIGN(mailbox);
	qp_context = &qp_param->context;
	memset(qp_param, 0, sizeof *qp_param);

	/* Target state and service type live in the top of the flags word. */
	qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) |
					(to_mthca_st(qp->transport) << 16));
	qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE);
	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
		qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
	else {
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
		switch (attr->path_mig_state) {
		case IB_MIG_MIGRATED:
			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
			break;
		case IB_MIG_REARM:
			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
			break;
		case IB_MIG_ARMED:
			qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
			break;
		}
	}
	/* leave sched_queue as 0 */
	/* MLX/UD transports use a fixed MTU; others take it from the attrs. */
	if (qp->transport == MLX || qp->transport == UD)
		qp_context->mtu_msgmax = cpu_to_be32((IB_MTU_2048 << 29) |
						     (11 << 24));
	else if (attr_mask & IB_QP_PATH_MTU) {
		qp_context->mtu_msgmax = cpu_to_be32((attr->path_mtu << 29) |
						     (31 << 24));
	}
	qp_context->usr_page = cpu_to_be32(MTHCA_KAR_PAGE);
	qp_context->local_qpn = cpu_to_be32(qp->qpn);
	if (attr_mask & IB_QP_DEST_QPN) {
		qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
	}
	/* MLX (special) QPs are pinned to their creation port. */
	if (qp->transport == MLX)
		qp_context->pri_path.port_pkey |=
			cpu_to_be32(to_msqp(qp)->port << 24);
	else {
		if (attr_mask & IB_QP_PORT) {
			qp_context->pri_path.port_pkey |=
				cpu_to_be32(attr->port_num << 24);
			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
		}
	}
	if (attr_mask & IB_QP_PKEY_INDEX) {
		qp_context->pri_path.port_pkey |=
			cpu_to_be32(attr->pkey_index);
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
	}
	if (attr_mask & IB_QP_RNR_RETRY) {
		qp_context->pri_path.rnr_retry = attr->rnr_retry << 5;
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY);
	}
	/* Primary path: LID, SL, optional GRH fields from the AH attributes. */
	if (attr_mask & IB_QP_AV) {
		qp_context->pri_path.g_mylmc = attr->ah_attr.src_path_bits & 0x7f;
		qp_context->pri_path.rlid = cpu_to_be16(attr->ah_attr.dlid);
		qp_context->pri_path.static_rate = (!!attr->ah_attr.static_rate) << 3;
		if (attr->ah_attr.ah_flags & IB_AH_GRH) {
			qp_context->pri_path.g_mylmc |= 1 << 7;
			qp_context->pri_path.mgid_index = attr->ah_attr.grh.sgid_index;
			qp_context->pri_path.hop_limit = attr->ah_attr.grh.hop_limit;
			qp_context->pri_path.sl_tclass_flowlabel =
				cpu_to_be32((attr->ah_attr.sl << 28) |
					    (attr->ah_attr.grh.traffic_class << 20) |
					    (attr->ah_attr.grh.flow_label));
			memcpy(qp_context->pri_path.rgid,
			       attr->ah_attr.grh.dgid.raw, 16);
		} else {
			qp_context->pri_path.sl_tclass_flowlabel =
				cpu_to_be32(attr->ah_attr.sl << 28);
		}
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
	}
	if (attr_mask & IB_QP_TIMEOUT) {
		qp_context->pri_path.ackto = attr->timeout;
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
	}
	/* XXX alt_path */
	/* leave rdd as 0 */
	qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
	/* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
	qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey);
	/* params1: initiator-side (send) settings. */
	qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
					  (MTHCA_FLIGHT_LIMIT << 24) |
					  MTHCA_QP_BIT_SRE |
					  MTHCA_QP_BIT_SWE |
					  MTHCA_QP_BIT_SAE);
	if (qp->sq.policy == IB_SIGNAL_ALL_WR)
		qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
	if (attr_mask & IB_QP_RETRY_CNT) {
		qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
	}
	/* See NOTE(review) in the header about the SRA/RRA pairing. */
	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
		/* log2 of the depth, clamped to 7 (hardware field width). */
		qp_context->params1 |= cpu_to_be32(min(attr->max_dest_rd_atomic ?
						       ffs(attr->max_dest_rd_atomic) - 1 : 0,
						       7) << 21);
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
	}
	if (attr_mask & IB_QP_SQ_PSN)
		qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
	qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
	if (attr_mask & IB_QP_ACCESS_FLAGS) {
		/*
		 * Only enable RDMA/atomics if we have responder
		 * resources set to a non-zero value.
		 */
		if (qp->resp_depth) {
			qp_context->params2 |=
				cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE ?
					    MTHCA_QP_BIT_RWE : 0);
			qp_context->params2 |=
				cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_READ ?
					    MTHCA_QP_BIT_RRE : 0);
			qp_context->params2 |=
				cpu_to_be32(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC ?
					    MTHCA_QP_BIT_RAE : 0);
		}
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
							MTHCA_QP_OPTPAR_RRE |
							MTHCA_QP_OPTPAR_RAE);
		/* Remember flags so a later resp_depth change can re-apply. */
		qp->atomic_rd_en = attr->qp_access_flags;
	}
	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
		u8 rra_max;
		if (qp->resp_depth && !attr->max_rd_atomic) {
			/*
			 * Lowering our responder resources to zero.
			 * Turn off RDMA/atomics as responder.
			 * (RWE/RRE/RAE in params2 already zero)
			 */
			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
								MTHCA_QP_OPTPAR_RRE |
								MTHCA_QP_OPTPAR_RAE);
		}
		if (!qp->resp_depth && attr->max_rd_atomic) {
			/*
			 * Increasing our responder resources from
			 * zero. Turn on RDMA/atomics as appropriate.
			 */
			qp_context->params2 |=
				cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_WRITE ?
					    MTHCA_QP_BIT_RWE : 0);
			qp_context->params2 |=
				cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_READ ?
					    MTHCA_QP_BIT_RRE : 0);
			qp_context->params2 |=
				cpu_to_be32(qp->atomic_rd_en & IB_ACCESS_REMOTE_ATOMIC ?
					    MTHCA_QP_BIT_RAE : 0);
			qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
								MTHCA_QP_OPTPAR_RRE |
								MTHCA_QP_OPTPAR_RAE);
		}
		/* Smallest rra_max with 2^rra_max >= max_rd_atomic, capped. */
		for (rra_max = 0;
		     1 << rra_max < attr->max_rd_atomic &&
			     rra_max < dev->qp_table.rdb_shift;
		     ++rra_max)
			; /* nothing */
		qp_context->params2 |= cpu_to_be32(rra_max << 21);
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
		qp->resp_depth = attr->max_rd_atomic;
	}
	if (qp->rq.policy == IB_SIGNAL_ALL_WR)
		qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
	}
	if (attr_mask & IB_QP_RQ_PSN)
		qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
	/*
	 * NOTE(review): ra_buff_indx is stored in host byte order while the
	 * surrounding context fields go through cpu_to_be32 -- confirm this
	 * is intentional.
	 */
	qp_context->ra_buff_indx = dev->qp_table.rdb_base +
		((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
		 dev->qp_table.rdb_shift);
	qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
	if (attr_mask & IB_QP_QKEY) {
		qp_context->qkey = cpu_to_be32(attr->qkey);
		qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
	}

	/* Hand the context to firmware; status is the hardware return code. */
	err = mthca_MODIFY_QP(dev, state_table[cur_state][new_state].trans,
			      qp->qpn, 0, qp_param, 0, &status);
	if (status) {
		mthca_warn(dev, "modify QP %d returned status %02x.\n",
			   state_table[cur_state][new_state].trans, status);
		err = -EINVAL;
	}
	if (!err)
		qp->state = new_state;
	kfree(mailbox);
	if (is_sqp(dev, qp))
		store_attrs(to_msqp(qp), attr, attr_mask);
	/*
	 * If we are moving QP0 to RTR, bring the IB link up; if we
	 * are moving QP0 to RESET or ERROR, bring the link back down.
	 */
	if (is_qp0(dev, qp)) {
		if (cur_state != IB_QPS_RTR &&
		    new_state == IB_QPS_RTR)
			init_port(dev, to_msqp(qp)->port);
		if (cur_state != IB_QPS_RESET &&
		    cur_state != IB_QPS_ERR &&
		    (new_state == IB_QPS_RESET ||
		     new_state == IB_QPS_ERR))
			mthca_CLOSE_IB(dev, to_msqp(qp)->port, &status);
	}
	return err;
}
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct mlx4_qp *qp = to_mqp(ibqp); struct mlx4_wqe_data_seg *scat; int ret = 0; int nreq; int ind; int i; pthread_spin_lock(&qp->rq.lock); /* XXX check that state is OK to post receive */ ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) { ret = -1; *bad_wr = wr; goto out; } if (wr->num_sge > qp->rq.max_gs) { ret = -1; *bad_wr = wr; goto out; } scat = get_recv_wqe(qp, ind); for (i = 0; i < wr->num_sge; ++i) __set_data_seg(scat + i, wr->sg_list + i); if (i < qp->rq.max_gs) { scat[i].byte_count = 0; scat[i].lkey = htonl(MLX4_INVALID_LKEY); scat[i].addr = 0; } qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (qp->rq.wqe_cnt - 1); } out: if (nreq) { qp->rq.head += nreq; /* * Make sure that descriptors are written before * doorbell record. */ wmb(); *qp->db = htonl(qp->rq.head & 0xffff); } pthread_spin_unlock(&qp->rq.lock); return ret; }
/*
 * mlx4_post_send() - post a chain of send work requests to a mlx4 QP.
 *
 * For each request this builds a WQE: control segment, transport-specific
 * segment (remote address / atomic / datagram), then either inline data
 * (chunked into MLX4_INLINE_ALIGN-sized inline segments) or a gather list.
 * Ownership is handed to hardware per-WQE with a write barrier; the chain
 * is then kicked either through the BlueFlame page (single small inline
 * WQE) or the send doorbell register.
 *
 * Returns 0 on success, -1 on failure with *bad_wr pointing at the failing
 * request; WQEs built before the failure are still submitted.
 */
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl;
	int ind;
	int nreq;
	int inl = 0;	/* inline byte count; doubles as BlueFlame eligibility */
	int ret = 0;
	int size;	/* WQE size in 16-byte units */
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		/* Opcode must index into the translation table. */
		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = -1;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htonl(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htonl(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		/* Transport-specific segment after the control segment. */
		switch (ibqp->qp_type) {
		case IBV_QPT_XRC:
			ctrl->xrcrb_flags |= htonl(wr->xrc_remote_srq_num << 8);
			/* fall thru */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe += sizeof (struct mlx4_wqe_atomic_seg);

				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe += sizeof (struct mlx4_wqe_raddr_seg);

				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			/* VLAN-tagged address handles set the insert-VLAN bit. */
			if (to_mah(wr->wr.ud.ah)->tagged) {
				ctrl->ins_vlan = 1 << 6;
				ctrl->vlan_tag = htons(to_mah(wr->wr.ud.ah)->vlan);
			}
			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len = wr->sg_list[i].length;
				inl += len;

				/* Inline payload must fit the QP's limit. */
				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = -1;
					*bad_wr = wr;
					goto out;
				}

				/*
				 * Split the copy at each MLX4_INLINE_ALIGN
				 * boundary, closing the current inline segment
				 * (byte_count last, after a barrier) and
				 * opening a new one.
				 */
				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;

					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof * seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			/*
			 * Written in reverse so byte counts become valid
			 * back-to-front as hardware prefetches forward.
			 */
			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		/* Ownership bit flips each time ind wraps the queue. */
		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	/*
	 * BlueFlame fast path: a single small inline WQE is copied straight
	 * to the write-combining page instead of ringing the doorbell.
	 * (ctrl/size are only read here when nreq == 1, so they are set.)
	 */
	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		/* Fold the QPN into the control segment (aliases vlan_tag). */
		*(uint32_t *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		wc_wmb();

		/* Alternate between the two BlueFlame buffers. */
		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	/* Stamp the deferred last WQE now that the doorbell has been rung. */
	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);
	return ret;
}
/*
 * mlx4_ib_mcg_detach() - detach a QP from a multicast group.
 *
 * Thin wrapper forwarding to the mlx4 core; the @lid argument is not used
 * by the underlying call.
 */
static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
{
	struct mlx4_ib_dev *ibdev = to_mdev(ibqp->device);
	struct mlx4_ib_qp *mqp = to_mqp(ibqp);

	return mlx4_multicast_detach(ibdev->dev, &mqp->mqp, gid->raw);
}