/* ARGSUSED */
static void
tavor_agent_mad_resp_handling(tavor_state_t *state, ibmf_msg_t *msgp,
    uint_t port)
{
	ib_mad_hdr_t	*rmadhdrp = msgp->im_msgbufs_recv.im_bufs_mad_hdr;
	ib_mad_hdr_t	*smadhdrp = msgp->im_msgbufs_send.im_bufs_mad_hdr;
	uint_t		hop_count, hop_point;
	uchar_t		*resp, *ret_path;

	resp = (uchar_t *)msgp->im_msgbufs_send.im_bufs_cl_data;

	/*
	 * Handle directed route MADs as a special case.  Tavor firmware
	 * does not update the "direction" bit, "hop pointer", "Return
	 * Path" or, in fact, any of the "directed route" parameters.  So
	 * the responsibility falls on Tavor driver software to inspect the
	 * MADs and update those fields as appropriate (see section 14.2.2
	 * of the IBA specification, rev 1.1)
	 */
	if (TAVOR_MAD_IS_DR(rmadhdrp)) {

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*((sm_dr_mad_hdr_t *)rmadhdrp)))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*((sm_dr_mad_hdr_t *)smadhdrp)))

		/*
		 * Set the "Direction" bit to one.  This indicates that this
		 * is now a directed route response.
		 */
		TAVOR_DRMAD_SET_DIRECTION(rmadhdrp);

		/* Extract the "hop pointer" and "hop count" from the MAD */
		hop_count = TAVOR_DRMAD_GET_HOPCOUNT(rmadhdrp);
		hop_point = TAVOR_DRMAD_GET_HOPPOINTER(rmadhdrp);

		/* Append the port we came in on to the "Return Path" */
		if ((hop_count != 0) && ((hop_point == hop_count) ||
		    (hop_point == hop_count + 1))) {
			ret_path = &resp[TAVOR_DRMAD_RETURN_PATH_OFFSET];
			ret_path[hop_point] = port;
		}

		/* Then increment the "hop pointer" in the MAD */
		hop_point++;
		TAVOR_DRMAD_SET_HOPPOINTER(smadhdrp, hop_point);
	}
}
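/*
 * Illustrative sketch (not part of the driver above): the same directed
 * route fix-up expressed against a hypothetical plain-struct SMP header.
 * The struct and field names below are invented for illustration; the
 * real layout is hidden behind the TAVOR_DRMAD_* macros.
 */
#include <stdint.h>

typedef struct dr_smp_s {
	uint8_t	dr_direction;		/* 0 = request, 1 = response */
	uint8_t	dr_hop_count;
	uint8_t	dr_hop_pointer;
	uint8_t	dr_return_path[64];	/* one port entry per hop */
} dr_smp_t;

static void
dr_smp_fixup_response(dr_smp_t *smp, uint8_t ingress_port)
{
	/* Mark the MAD as a directed route response (IBA rev 1.1, 14.2.2) */
	smp->dr_direction = 1;

	/* Record the ingress port in the "Return Path", as above */
	if ((smp->dr_hop_count != 0) &&
	    ((smp->dr_hop_pointer == smp->dr_hop_count) ||
	    (smp->dr_hop_pointer == smp->dr_hop_count + 1))) {
		smp->dr_return_path[smp->dr_hop_pointer] = ingress_port;
	}

	/* Advance the hop pointer for the reverse traversal */
	smp->dr_hop_pointer++;
}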
Example 2
Dnode *
_elf_dnode()
{
	register Dnode	*d;

	if ((d = (Dnode *)malloc(sizeof (Dnode))) == 0) {
		_elf_seterr(EMEM_DNODE, errno);
		return (0);
	}
	NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*d))
	*d = _elf_dnode_init;
	d->db_myflags = DBF_ALLOC;
	NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*d))
	return (d);
}
Example 3
Elf *
_elf_regular(int fd, unsigned flags)		/* initialize regular file */
{
	Elf		*elf;

	if ((elf = (Elf *)calloc(1, sizeof (Elf))) == 0) {
		_elf_seterr(EMEM_ELF, errno);
		return (0);
	}

	NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*elf))
	elf->ed_fd = fd;
	elf->ed_myflags |= flags;
	if (_elf_inmap(elf) != OK_YES) {
		free(elf);
		return (0);
	}
	NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*elf))
	return (elf);
}
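/*
 * Illustrative sketch (not from libelf): the bracket pattern shared by the
 * two libelf routines above.  A freshly allocated object is marked
 * invisible to other threads for the benefit of the static lock checker,
 * initialized without locks, then marked visible once fully constructed.
 * The widget_t type and its fields are invented for illustration.
 */
#include <stdlib.h>
#include <note.h>

typedef struct widget {
	int	w_state;
	int	w_flags;
} widget_t;

widget_t *
widget_alloc(void)
{
	widget_t	*w;

	if ((w = malloc(sizeof (widget_t))) == NULL)
		return (NULL);

	NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*w))
	w->w_state = 0;
	w->w_flags = 0;
	NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*w))

	return (w);
}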
Example 4
/*
 * tavor_srq_numcalc()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_numcalc(tavor_state_t *state, uint32_t indx, uint32_t *key)
{
	uint32_t	tmp, log_num_srq;

	/*
	 * Generate a simple key from counter.  Note:  We increment this
	 * static variable _intentionally_ without any kind of mutex around
	 * it.  First, single-threading all operations through a single lock
	 * would be a bad idea (from a performance point-of-view).  Second,
	 * the upper "unconstrained" bits don't really have to be unique
	 * because the lower bits are guaranteed to be (although we do make a
	 * best effort to ensure that they are).  Third, the window for the
	 * race (where both threads read and update the counter at the same
	 * time) is incredibly small.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(tavor_debug_srqnum_cnt))
	log_num_srq = state->ts_cfg_profile->cp_log_num_srq;
	tmp = (tavor_debug_srqnum_cnt++) << log_num_srq;
	*key = (tmp | indx) & TAVOR_CQ_MAXNUMBER_MSK;
}
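/*
 * Illustrative sketch (standalone, not driver code): how the counter and
 * index combine into the key produced by tavor_srq_numcalc() above.  The
 * numbers and the 22-bit mask are made up purely for demonstration.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t	log_num_srq = 10;	/* e.g. 2^10 SRQs configured */
	uint32_t	counter = 5;		/* unsynchronized counter value */
	uint32_t	indx = 0x2a;		/* SRQ context index (low bits) */
	uint32_t	mask = 0x3fffff;	/* illustrative number mask */
	uint32_t	key;

	/* Upper bits come from the counter, lower bits from the index */
	key = ((counter << log_num_srq) | indx) & mask;

	(void) printf("key = 0x%x\n", key);	/* prints "key = 0x142a" */
	return (0);
}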
Example 5
/*
 * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
 * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
 */
static int
ibcm_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep, sa_family_t family_loc)
{
	int err;
	struct lifnum lifn;

	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = family_loc;
	lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;

	err = ibcm_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	IBTF_DPRINTF_L3(cmlog, "ibcm_do_lifconf: Family %d, lifn_count %d",
	    family_loc, lifn.lifn_count);
	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*lifcp))
	lifcp->lifc_family = family_loc;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
	lifcp->lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;

	err = ibcm_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	return (0);
}
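/*
 * Illustrative sketch (kernel context assumed, not part of ibcm): one way
 * a caller might consume the lifconf filled in by ibcm_do_lifconf() above
 * and release the buffer it allocated.  The function name and the AF_INET
 * choice are for illustration only.
 */
static int
example_walk_interfaces(void)
{
	struct lifconf	lifc;
	struct lifreq	*lifrp;
	uint_t		bufsize;
	int		i, nifs, err;

	err = ibcm_do_lifconf(&lifc, &bufsize, AF_INET);
	if (err != 0)
		return (err);

	/* lifc_len reflects the bytes actually filled in by IP */
	nifs = lifc.lifc_len / sizeof (struct lifreq);
	for (i = 0, lifrp = lifc.lifc_req; i < nifs; i++, lifrp++) {
		/* inspect lifrp->lifr_name, lifrp->lifr_addr, ... */
	}

	/* ibcm_do_lifconf() allocated lifc_buf with kmem_zalloc(bufsize) */
	kmem_free(lifc.lifc_buf, bufsize);
	return (0);
}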
Example 6
/*
 * ibmf_i_issue_pkt():
 *	Post an IB packet on the specified QP's send queue
 */
int
ibmf_i_issue_pkt(ibmf_client_t *clientp, ibmf_msg_impl_t *msgimplp,
    ibmf_qp_handle_t ibmf_qp_handle, ibmf_send_wqe_t *send_wqep)
{
	int			ret;
	ibt_status_t		status;
	ibt_wr_ds_t		sgl[1];
	ibt_qp_hdl_t		ibt_qp_handle;

	_NOTE(ASSUMING_PROTECTED(*send_wqep))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*send_wqep))

	IBMF_TRACE_4(IBMF_TNF_DEBUG, DPRINT_L4,
	    ibmf_i_issue_pkt_start, IBMF_TNF_TRACE, "",
	    "ibmf_i_issue_pkt() enter, clientp = %p, msg = %p, "
	    "qp_hdl = %p,  swqep = %p\n", tnf_opaque, clientp, clientp,
	    tnf_opaque, msg, msgimplp, tnf_opaque, ibmf_qp_handle,
	    ibmf_qp_handle, tnf_opaque, send_wqep, send_wqep);

	ASSERT(MUTEX_HELD(&msgimplp->im_mutex));
	ASSERT(MUTEX_NOT_HELD(&clientp->ic_mutex));

	/*
	 * if the qp handle provided in ibmf_send_pkt()
	 * is not the default qp handle for this client,
	 * then the wqe must be sent on this qp,
	 * else use the default qp handle set up during ibmf_register()
	 */
	if (ibmf_qp_handle == IBMF_QP_HANDLE_DEFAULT) {
		ibt_qp_handle = clientp->ic_qp->iq_qp_handle;
	} else {
		ibt_qp_handle =
		    ((ibmf_alt_qp_t *)ibmf_qp_handle)->isq_qp_handle;
	}

	/* initialize the send WQE */
	ibmf_i_init_send_wqe(clientp, msgimplp, sgl, send_wqep,
	    msgimplp->im_ud_dest, ibt_qp_handle, ibmf_qp_handle);

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*send_wqep))

	/*
	 * Issue the wqe to the transport.
	 * NOTE: ibt_post_send() will not block, so, it is ok
	 * to hold the msgimpl mutex across this call.
	 */
	status = ibt_post_send(send_wqep->send_qp_handle, &send_wqep->send_wr,
	    1, NULL);
	if (status != IBT_SUCCESS) {
		mutex_enter(&clientp->ic_kstat_mutex);
		IBMF_ADD32_KSTATS(clientp, send_pkt_failed, 1);
		mutex_exit(&clientp->ic_kstat_mutex);
		IBMF_TRACE_2(IBMF_TNF_NODEBUG, DPRINT_L1,
		    ibmf_i_issue_pkt_err, IBMF_TNF_TRACE, "",
		    "ibmf_i_issue_pkt(): %s, status = %d\n",
		    tnf_string, msg, "post send failure",
		    tnf_uint, ibt_status, status);
		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4, ibmf_i_issue_pkt_end,
		    IBMF_TNF_TRACE, "", "ibmf_i_issue_pkt() exit\n");
		return (IBMF_TRANSPORT_FAILURE);
	}

	ret = IBMF_SUCCESS;

	/* bump the number of active sends */
	if (ibmf_qp_handle == IBMF_QP_HANDLE_DEFAULT) {
		mutex_enter(&clientp->ic_mutex);
		clientp->ic_sends_active++;
		mutex_exit(&clientp->ic_mutex);
		mutex_enter(&clientp->ic_kstat_mutex);
		IBMF_ADD32_KSTATS(clientp, sends_active, 1);
		mutex_exit(&clientp->ic_kstat_mutex);
	} else {
		ibmf_alt_qp_t *qpp = (ibmf_alt_qp_t *)ibmf_qp_handle;
		mutex_enter(&qpp->isq_mutex);
		qpp->isq_sends_active++;
		mutex_exit(&qpp->isq_mutex);
		mutex_enter(&clientp->ic_kstat_mutex);
		IBMF_ADD32_KSTATS(clientp, sends_active, 1);
		mutex_exit(&clientp->ic_kstat_mutex);
	}

	IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4, ibmf_i_issue_pkt_end,
	    IBMF_TNF_TRACE, "", "ibmf_i_issue_pkt() exit\n");
	return (ret);
}
Example 7
/* ARGSUSED */
void
ibmf_i_handle_send_completion(ibmf_ci_t *cip, ibt_wc_t *wcp)
{
	ibmf_client_t		*clientp, *cclientp;
	ibmf_send_wqe_t		*send_wqep;
	ibmf_qp_handle_t	ibmf_qp_handle;
	ibmf_alt_qp_t		*qpp;
	int			ret;

	IBMF_TRACE_2(IBMF_TNF_DEBUG, DPRINT_L4,
	    ibmf_i_handle_send_completion_start, IBMF_TNF_TRACE, "",
	    "ibmf_i_handle_send_completion() enter, cip = %p, wcp = %p\n",
	    tnf_opaque, cip, cip, tnf_opaque, wcp, wcp);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*send_wqep))

	ASSERT(wcp->wc_id != NULL);

	ASSERT(IBMF_IS_SEND_WR_ID(wcp->wc_id));

	/* get the IBMF send WQE context */
	IBMF_SEND_WR_ID_TO_ADDR(wcp->wc_id, send_wqep);

	ASSERT(send_wqep != NULL);

	/* get the client context */
	cclientp =  clientp = send_wqep->send_client;

	/* Check if this is a completion for a BUSY MAD sent by IBMF */
	if (clientp == NULL) {
		ibmf_msg_impl_t		*msgimplp;

		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L3,
		    ibmf_i_handle_send_completion, IBMF_TNF_TRACE, "",
		    "ibmf_i_handle_send_completion(): NULL client\n");

		msgimplp = send_wqep->send_msg;

		/*
		 * Deregister registered memory and free it, and
		 * free up the send WQE context
		 */
		(void) ibt_deregister_mr(cip->ci_ci_handle,
		    send_wqep->send_mem_hdl);
		kmem_free(send_wqep->send_mem, IBMF_MEM_PER_WQE);
		kmem_free(send_wqep, sizeof (ibmf_send_wqe_t));

		/* Free up the message context */
		ibmf_i_put_ud_dest(cip, msgimplp->im_ibmf_ud_dest);
		ibmf_i_clean_ud_dest_list(cip, B_FALSE);
		kmem_free(msgimplp, sizeof (ibmf_msg_impl_t));
		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
		    ibmf_i_handle_send_completion_end, IBMF_TNF_TRACE, "",
		    "ibmf_i_handle_send_completion() exit\n");
		return;
	}

	/* get the QP handle */
	ibmf_qp_handle = send_wqep->send_ibmf_qp_handle;
	qpp = (ibmf_alt_qp_t *)ibmf_qp_handle;

	ASSERT(clientp != NULL);

	/* decrement the number of active sends */
	if (ibmf_qp_handle == IBMF_QP_HANDLE_DEFAULT) {
		mutex_enter(&clientp->ic_mutex);
		clientp->ic_sends_active--;
		mutex_exit(&clientp->ic_mutex);
	} else {
		mutex_enter(&qpp->isq_mutex);
		qpp->isq_sends_active--;
		mutex_exit(&qpp->isq_mutex);
	}

	mutex_enter(&clientp->ic_kstat_mutex);
	IBMF_SUB32_KSTATS(clientp, sends_active, 1);
	mutex_exit(&clientp->ic_kstat_mutex);

	send_wqep->send_status = ibmf_i_ibt_wc_to_ibmf_status(wcp->wc_status);

	/*
	 * issue the callback using taskq. If no taskq or if the
	 * dispatch fails, we do the send processing in the callback context
	 * which is the interrupt context
	 */
	if (cclientp->ic_send_taskq == NULL) {
		/* Do the processing in callback context */
		mutex_enter(&clientp->ic_kstat_mutex);
		IBMF_ADD32_KSTATS(clientp, send_cb_active, 1);
		mutex_exit(&clientp->ic_kstat_mutex);
		ibmf_i_do_send_cb((void *)send_wqep);
		mutex_enter(&clientp->ic_kstat_mutex);
		IBMF_SUB32_KSTATS(clientp, send_cb_active, 1);
		mutex_exit(&clientp->ic_kstat_mutex);
		IBMF_TRACE_1(IBMF_TNF_DEBUG, DPRINT_L4,
		    ibmf_i_handle_send_err, IBMF_TNF_ERROR, "",
		    "ibmf_i_handle_send_completion(): %s\n",
		    tnf_string, msg, "ci_send_taskq == NULL");
		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
		    ibmf_i_handle_send_completion_end, IBMF_TNF_TRACE, "",
		    "ibmf_i_handle_send_completion() exit\n");
		return;
	}

	mutex_enter(&clientp->ic_kstat_mutex);
	IBMF_ADD32_KSTATS(clientp, send_cb_active, 1);
	mutex_exit(&clientp->ic_kstat_mutex);

	/* Use taskq for processing if the IBMF_REG_FLAG_NO_OFFLOAD isn't set */
	if ((clientp->ic_reg_flags & IBMF_REG_FLAG_NO_OFFLOAD) == 0) {
		ret = taskq_dispatch(cclientp->ic_send_taskq, ibmf_i_do_send_cb,
		    send_wqep, TQ_NOSLEEP);
		if (ret == 0) {
			IBMF_TRACE_1(IBMF_TNF_DEBUG, DPRINT_L4,
			    ibmf_i_handle_send_err, IBMF_TNF_ERROR, "",
			    "ibmf_i_handle_send_completion(): %s\n",
			    tnf_string, msg, "send: dispatch failed");
			ibmf_i_do_send_cb((void *)send_wqep);
		}
	} else {
		ibmf_i_do_send_cb((void *)send_wqep);
	}

	mutex_enter(&clientp->ic_kstat_mutex);
	IBMF_SUB32_KSTATS(clientp, send_cb_active, 1);
	mutex_exit(&clientp->ic_kstat_mutex);

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*send_wqep))

	IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
	    ibmf_i_handle_send_completion_end, IBMF_TNF_TRACE, "",
	    "ibmf_i_handle_send_completion() exit\n");
}
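/*
 * Illustrative sketch (not part of ibmf): the "offload to a taskq when
 * possible, run inline otherwise" pattern used by the completion handler
 * above, reduced to its core.  The function and parameter names are
 * invented; kstat and tracing details are omitted.
 */
#include <sys/taskq.h>

static void
dispatch_or_inline(taskq_t *tq, task_func_t *func, void *arg)
{
	/* No taskq configured, or dispatch failed: do the work right here */
	if ((tq == NULL) ||
	    (taskq_dispatch(tq, func, arg, TQ_NOSLEEP) == 0)) {
		func(arg);
	}
}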
Example 8
/*
 * ibmf_i_send_pkt()
 *	Send an IB packet after allocating send resources
 */
int
ibmf_i_send_pkt(ibmf_client_t *clientp, ibmf_qp_handle_t ibmf_qp_handle,
    ibmf_msg_impl_t *msgimplp, int block)
{
	ibmf_send_wqe_t	*send_wqep;
	int		status;

	IBMF_TRACE_4(IBMF_TNF_DEBUG, DPRINT_L4, ibmf_i_send_pkt_start,
	    IBMF_TNF_TRACE, "",
	    "ibmf_i_send_pkt(): clientp = 0x%p, qp_hdl = 0x%p, "
	    "msgp = 0x%p, block = %d\n", tnf_opaque, clientp, clientp,
	    tnf_opaque, qp_hdl, ibmf_qp_handle, tnf_opaque, msg, msgimplp,
	    tnf_uint, block, block);

	ASSERT(MUTEX_HELD(&msgimplp->im_mutex));

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*send_wqep))

	/*
	 * Reset send_done to indicate we have not received the completion
	 * for this send yet.
	 */
	msgimplp->im_trans_state_flags &= ~IBMF_TRANS_STATE_FLAG_SEND_DONE;

	/*
	 * Allocate resources needed to send a UD packet including the
	 * send WQE context
	 */
	status = ibmf_i_alloc_send_resources(clientp->ic_myci,
	    msgimplp, block, &send_wqep);
	if (status != IBMF_SUCCESS) {
		IBMF_TRACE_2(IBMF_TNF_NODEBUG, DPRINT_L1, ibmf_i_send_pkt_err,
		    IBMF_TNF_ERROR, "", "ibmf_i_send_pkt(): %s, status = %d\n",
		    tnf_string, msg, "unable to allocate send resources",
		    tnf_uint, status, status);
		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,	ibmf_i_send_pkt_end,
		    IBMF_TNF_TRACE, "", "ibmf_i_send_pkt() exit\n");
		return (status);
	}

	/* Set the segment number in the send WQE context */
	if (msgimplp->im_flags & IBMF_MSG_FLAGS_SEND_RMPP)
		send_wqep->send_rmpp_segment = msgimplp->im_rmpp_ctx.rmpp_ns;

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*send_wqep))

	/*
	 * Increment the count of pending send completions.
	 * Only when this count is zero should the client be notified
	 * of completion of the transaction.
	 */
	msgimplp->im_pending_send_compls += 1;

	/* Send the packet */
	status = ibmf_i_issue_pkt(clientp, msgimplp, ibmf_qp_handle, send_wqep);
	if (status != IBMF_SUCCESS) {
		ibmf_i_free_send_resources(clientp->ic_myci, msgimplp,
		    send_wqep);
		IBMF_TRACE_2(IBMF_TNF_NODEBUG, DPRINT_L1, ibmf_i_send_pkt_err,
		    IBMF_TNF_ERROR, "", "ibmf_i_send_pkt(): %s, status = %d\n",
		    tnf_string, msg, "unable to issue packet",
		    tnf_uint, status, status);
		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,	ibmf_i_send_pkt_end,
		    IBMF_TNF_TRACE, "", "ibmf_i_send_pkt() exit\n");
		return (status);
	}

	IBMF_TRACE_1(IBMF_TNF_DEBUG, DPRINT_L4,	ibmf_i_send_pkt_end,
	    IBMF_TNF_TRACE, "", "ibmf_i_send_pkt() exit, status = %d\n",
	    tnf_uint, status, status);

	return (IBMF_SUCCESS);
}
Example 9
/*
 * ibmf_i_find_msg():
 *	Walk the client message list for the message corresponding to
 *	the parameters specified
 *	The msg_list parameter should be either IBMF_REG_MSG_LIST
 *	or IBMF_TERM_MSG_LIST for the termination message list.
 */
ibmf_msg_impl_t *
ibmf_i_find_msg(ibmf_client_t *clientp, uint64_t tid, uint8_t mgt_class,
                uint8_t r_method, ib_lid_t lid, ib_gid_t *gid, boolean_t gid_pr,
                ibmf_rmpp_hdr_t *rmpp_hdr, boolean_t msg_list)
{
    ibmf_msg_impl_t *msgimplp;
    ib_gid_t	*ctx_gidp;
    int		msg_found;

    IBMF_TRACE_5(IBMF_TNF_DEBUG, DPRINT_L4,
                 ibmf_i_find_msg_start, IBMF_TNF_TRACE, "",
                 "ibmf_i_find_msg(): clientp = 0x%p, tid = 0x%p, mgmt_class = 0x%x, "
                 "lid = 0x%x, gidp = 0x%p\n", tnf_opaque, clientp, clientp,
                 tnf_opaque, tid, tid, tnf_opaque, mgt_class, mgt_class,
                 tnf_opaque, lid, lid, tnf_opaque, gid, gid);

    msg_found = B_FALSE;

    mutex_enter(&clientp->ic_msg_mutex);

    if (msg_list == IBMF_REG_MSG_LIST)
        msgimplp = clientp->ic_msg_list;
    else
        msgimplp = clientp->ic_term_msg_list;

    /*
     * Look for a transaction (message) context that matches the
     * transaction ID, gid or lid, and management class of the
     * incoming packet.
     *
     * If the client decides to do a non-rmpp or rmpp send only,
     * despite expecting a response, then the response should check
     * if the message context for the send still exists.
     * If it does, it should be skipped.
     */
    while (msgimplp != NULL) {

        if (gid_pr == B_TRUE) {

            ctx_gidp = &msgimplp->im_global_addr.ig_sender_gid;

            /* first match gid */
            if ((ctx_gidp->gid_prefix != gid->gid_prefix) ||
                    (ctx_gidp->gid_guid != gid->gid_guid)) {

                msgimplp = msgimplp->im_msg_next;
                continue;
            }
        } else  {

            IBMF_TRACE_5(IBMF_TNF_DEBUG, DPRINT_L3,
                         ibmf_i_find_msg, IBMF_TNF_TRACE, "",
                         "ibmf_i_find_msg(): %s, msgp = 0x%p, tid = 0x%p, "
                         "remote_lid = 0x%x, mgmt_class = 0x%x\n",
                         tnf_string, msg, "Comparing to msg",
                         tnf_opaque, msg, msgimplp,
                         tnf_opaque, tid, msgimplp->im_tid,
                         tnf_opaque, remote_lid,
                         msgimplp->im_local_addr.ia_remote_lid,
                         tnf_opaque, class, msgimplp->im_mgt_class);

            /* first match lid */
            if (msgimplp->im_local_addr.ia_remote_lid != lid) {
                msgimplp = msgimplp->im_msg_next;
                continue;
            }
        }

        /* next match tid and class */
        if ((msgimplp->im_tid != tid) ||
                (msgimplp->im_mgt_class != mgt_class)) {

            msgimplp = msgimplp->im_msg_next;
            continue;
        }

        /*
         * For unsolicited transactions, the message is found if the
         * method matches.  But if the response is an ACK and the
         * transaction is in RMPP receiver mode, then skip this message.
         */
        if (msgimplp->im_unsolicited == B_TRUE) {
            ibmf_rmpp_ctx_t *rmpp_ctx;
            ibmf_msg_bufs_t *msgbufp;

            mutex_enter(&msgimplp->im_mutex);
            rmpp_ctx = &msgimplp->im_rmpp_ctx;

            if ((msgimplp->im_flags & IBMF_MSG_FLAGS_RECV_RMPP) &&
                    ((rmpp_ctx->rmpp_state ==
                      IBMF_RMPP_STATE_RECEVR_ACTIVE) ||
                     (rmpp_ctx->rmpp_state ==
                      IBMF_RMPP_STATE_RECEVR_TERMINATE))) {
                /* Continue if ACK packet */
                if (rmpp_hdr->rmpp_type == IBMF_RMPP_TYPE_ACK) {
                    mutex_exit(&msgimplp->im_mutex);
                    msgimplp = msgimplp->im_msg_next;
                    continue;
                }
            }

            if (msgimplp->im_trans_state_flags ==
                    IBMF_TRANS_STATE_FLAG_RECV_ACTIVE) {
                msgbufp = &msgimplp->im_msgbufs_recv;
                if (msgbufp->im_bufs_mad_hdr->R_Method ==
                        r_method) {
                    mutex_exit(&msgimplp->im_mutex);
                    msg_found = B_TRUE;
                    break;
                }
            }

            mutex_exit(&msgimplp->im_mutex);
        }

        /*
         * if this was an unsequenced, non-RMPP transaction there should
         * be no incoming packets
         */
        if ((!(msgimplp->im_transp_op_flags &
                IBMF_MSG_TRANS_FLAG_RMPP)) &&
                (!(msgimplp->im_transp_op_flags &
                   IBMF_MSG_TRANS_FLAG_SEQ))) {

            msgimplp = msgimplp->im_msg_next;
            continue;
        }


        /*
         * if this is a sequenced transaction,
         * (the send and response may or may not be RMPP)
         * and the method of the incoming MAD is the same as the
         * method in the send message context with the response bit
         * set then this message matches.
         */
        if (msgimplp->im_transp_op_flags & IBMF_MSG_TRANS_FLAG_SEQ) {
            ibmf_msg_bufs_t *msgbufp;

            mutex_enter(&msgimplp->im_mutex);

            msgbufp = &msgimplp->im_msgbufs_send;

            if ((msgbufp->im_bufs_mad_hdr->R_Method |
                    IBMF_RMPP_METHOD_RESP_BIT) == r_method) {
                mutex_exit(&msgimplp->im_mutex);
                msg_found = B_TRUE;
                break;
            }

            mutex_exit(&msgimplp->im_mutex);
        }

        /*
         * if this is an RMPP SEND transaction there should only
         * be ACK, STOP, and ABORT RMPP packets.
         * The response data packets would have been detected in
         * the check above.
         */
        if (msgimplp->im_transp_op_flags & IBMF_MSG_TRANS_FLAG_RMPP) {
            ibmf_rmpp_ctx_t *rmpp_ctx = &msgimplp->im_rmpp_ctx;
            ibmf_msg_bufs_t *msgbufp;

            _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*rmpp_ctx))

            if ((rmpp_hdr != NULL) &&
                    (rmpp_hdr->rmpp_flags & IBMF_RMPP_FLAGS_ACTIVE)) {

                /*
                 * If non-sequenced, then there should be
                 * no DATA packets incoming for this transaction
                 */
                if (!(msgimplp->im_transp_op_flags &
                        IBMF_MSG_TRANS_FLAG_SEQ)) {
                    /* Continue if DATA packet */
                    if (rmpp_hdr->rmpp_type ==
                            IBMF_RMPP_TYPE_DATA) {
                        msgimplp =
                            msgimplp->im_msg_next;
                        continue;
                    }
                }


                /* Skip if R_Method does not match */
                if ((rmpp_ctx->rmpp_state ==
                        IBMF_RMPP_STATE_SENDER_ACTIVE) ||
                        (rmpp_ctx->rmpp_state ==
                         IBMF_RMPP_STATE_SENDER_SWITCH)) {
                    /* Continue if DATA packet */
                    if (rmpp_hdr->rmpp_type ==
                            IBMF_RMPP_TYPE_DATA) {
                        msgimplp =
                            msgimplp->im_msg_next;
                        continue;
                    }

                    /*
                     * Continue if method does not match
                     * Ignore response bit during match.
                     */
                    msgbufp = &msgimplp->im_msgbufs_send;
                    if ((msgbufp->im_bufs_mad_hdr->
                            R_Method & MAD_METHOD_MASK) !=
                            (r_method & MAD_METHOD_MASK)) {
                        msgimplp = msgimplp->
                                   im_msg_next;
                        continue;
                    }
                }

                /* Skip if R_Method does not match */
                if ((rmpp_ctx->rmpp_state ==
                        IBMF_RMPP_STATE_RECEVR_ACTIVE) ||
                        (rmpp_ctx->rmpp_state ==
                         IBMF_RMPP_STATE_RECEVR_TERMINATE)) {
                    /* Continue if ACK packet */
                    if (rmpp_hdr->rmpp_type ==
                            IBMF_RMPP_TYPE_ACK) {
                        msgimplp =
                            msgimplp->im_msg_next;
                        continue;
                    }

                    /* Continue if method does not match */
                    msgbufp = &msgimplp->im_msgbufs_recv;
                    if (msgbufp->im_bufs_mad_hdr->
                            R_Method != r_method) {
                        msgimplp = msgimplp->
                                   im_msg_next;
                        continue;
                    }
                }
            }
        }

        /*
         * For a sequenced non-RMPP transaction, if the
         * TID/LID/MgtClass are the same, and if the method
         * of the incoming MAD and the message context are the
         * same, then the MAD is likely to be a new request from
         * the remote entity, so skip this message.
         */
        if ((msgimplp->im_transp_op_flags & IBMF_MSG_TRANS_FLAG_SEQ) &&
                !(msgimplp->im_transp_op_flags &
                  IBMF_MSG_TRANS_FLAG_RMPP)) {
            ibmf_msg_bufs_t *msgbufp;

            mutex_enter(&msgimplp->im_mutex);

            msgbufp = &msgimplp->im_msgbufs_send;

            mutex_exit(&msgimplp->im_mutex);

            /* Continue if method is the same */
            if (msgbufp->im_bufs_mad_hdr->
                    R_Method == r_method) {
                msgimplp = msgimplp->im_msg_next;
                continue;
            }
        }

        /* everything matches, found the correct message */
        msg_found = B_TRUE;
        break;
    }

    mutex_exit(&clientp->ic_msg_mutex);

    if (msg_found == B_FALSE)
        msgimplp = NULL;

    return (msgimplp);
}
Example 10
/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
	tavor_qalloc_info_t	new_srqinfo, old_srqinfo;
	tavor_rsrc_t		*mtt, *mpt, *old_mtt;
	tavor_bind_info_t	bind;
	tavor_bind_info_t	old_bind;
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_mrhdl_t		mr;
	tavor_hw_mpt_t		mpt_entry;
	tavor_wrid_entry_t	*wre_new, *wre_old;
	uint64_t		mtt_ddrbaseaddr, mtt_addr;
	uint64_t		srq_desc_off;
	uint32_t		*buf, srq_old_bufsz;
	uint32_t		wqesz;
	uint_t			max_srq_size;
	uint_t			dma_xfer_mode, mtt_pgsize_bits;
	uint_t			srq_sync, log_srq_size, maxprot;
	uint_t			wq_location;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_srq_modify);

	/*
	 * Check the "inddr" flag.  This flag tells the driver whether or not
	 * the SRQ's work queues should come from normal system memory or
	 * whether they should be allocated from DDR memory.
	 */
	wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;

	/*
	 * If size requested is larger than device capability, return
	 * Insufficient Resources
	 */
	max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
	if (size > max_srq_size) {
		TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_modify);
		return (IBT_HCA_WR_EXCEEDED);
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	size = max(size, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(size);
	if ((size & (size - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqmodify_fail;
	}

	/*
	 * Allocate the memory for newly resized Shared Receive Queue.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0,
	 * for a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as it was
	 * when we allocated it at tavor_srq_alloc() time.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
	new_srqinfo.qa_alloc_align = PAGESIZE;
	new_srqinfo.qa_bind_align  = PAGESIZE;
	if (srq->srq_is_umap) {
		new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		new_srqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqmodify_fail;
	}
	buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Allocate the memory for the new WRE list.  This will be used later
	 * when we resize the wridlist based on the new SRQ size.
	 */
	wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
	    sizeof (tavor_wrid_entry_t), sleepflag);
	if (wre_new == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
		    "failed wre_new alloc");
		goto srqmodify_fail;
	}

	/*
	 * Fill in the "bind" struct.  This struct provides the majority
	 * of the information that will be used to distinguish between an
	 * "addr" binding (as is the case here) and a "buf" binding (see
	 * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
	 * which does most of the "heavy lifting" for the Tavor memory
	 * registration routines.
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
	bzero(&bind, sizeof (tavor_bind_info_t));
	bind.bi_type  = TAVOR_BINDHDL_VADDR;
	bind.bi_addr  = (uint64_t)(uintptr_t)buf;
	bind.bi_len   = new_srqinfo.qa_size;
	bind.bi_as    = NULL;
	bind.bi_flags = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	bind.bi_flags |= IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq->srq_is_umap) {
		bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			bind.bi_bypass =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				bind.bi_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
		}
	}
	status = tavor_mr_mtt_bind(state, &bind, new_srqinfo.qa_dmahdl, &mtt,
	    &mtt_pgsize_bits);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed mtt bind");
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		goto srqmodify_fail;
	}

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 *
	 * Note: bind addr is zero-based (from alloc) so we calculate the
	 * correct new offset here.
	 */
	bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
	srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
	    (uint64_t)bind.bi_addr;

	/*
	 * Get the base address for the MTT table.  This will be necessary
	 * below when we are modifying the MPT entry.
	 */
	rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
	mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;

	/*
	 * Fill in the MPT entry.  This is the final step before passing
	 * ownership of the MPT entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the MPT.
	 */
	bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
	mpt_entry.reg_win_len	= bind.bi_len;
	mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
	mpt_entry.mttseg_addr_h = mtt_addr >> 32;
	mpt_entry.mttseg_addr_l = mtt_addr >> 6;

	/*
	 * Now we grab the SRQ lock.  Since we will be updating the actual
	 * SRQ location and the producer/consumer indexes, we should hold
	 * the lock.
	 *
	 * We do a TAVOR_NOSLEEP here (and below), though, because we are
	 * holding the "srq_lock" and if we got raised to interrupt level
	 * by priority inversion, we would not want to block in this routine
	 * waiting for success.
	 */
	mutex_enter(&srq->srq_lock);

	/*
	 * Copy old entries to new buffer
	 */
	srq_old_bufsz = srq->srq_wq_bufsz;
	bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);

	/* Determine if later ddi_dma_sync will be necessary */
	srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/* Sync entire "new" SRQ for use by hardware (if necessary) */
	if (srq_sync) {
		(void) ddi_dma_sync(bind.bi_dmahdl, 0,
		    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
	}

	/*
	 * Setup MPT information for use in the MODIFY_MPT command
	 */
	mr = srq->srq_mrhdl;
	mutex_enter(&mr->mr_lock);
	mpt = srq->srq_mrhdl->mr_mptrsrcp;

	/*
	 * MODIFY_MPT
	 *
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
	    TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
		(void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
		    srq->srq_mrhdl->mr_mttrsrcp);
		kmem_free(wre_new, (1 << log_srq_size) *
		    sizeof (tavor_wrid_entry_t));
		tavor_queue_free(state, &new_srqinfo);
		mutex_exit(&mr->mr_lock);
		mutex_exit(&srq->srq_lock);
		return (ibc_get_ci_failure(0));
	}

	/*
	 * Update the Tavor Shared Receive Queue handle with all the new
	 * information.  At the same time, save away all the necessary
	 * information for freeing up the old resources
	 */
	old_srqinfo	   = srq->srq_wqinfo;
	old_mtt		   = srq->srq_mrhdl->mr_mttrsrcp;
	bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
	    sizeof (tavor_bind_info_t));

	/* Now set the new info */
	srq->srq_wqinfo	   = new_srqinfo;
	srq->srq_wq_buf	   = buf;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
	srq->srq_mrhdl->mr_mttrsrcp = mtt;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

	/* Update MR mtt pagesize */
	mr->mr_logmttpgsz = mtt_pgsize_bits;
	mutex_exit(&mr->mr_lock);

#ifdef __lock_lint
	mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * Initialize new wridlist, if needed.
	 *
	 * If a wridlist already is setup on an SRQ (the QP associated with an
	 * SRQ has moved "from_reset") then we must update this wridlist based
	 * on the new SRQ size.  We allocate the new size of Work Request ID
	 * Entries, copy over the old entries to the new list, and
	 * re-initialize the srq wridlist in the non-umap case.
	 */
	wre_old = NULL;
	if (srq->srq_wridlist != NULL) {
		wre_old = srq->srq_wridlist->wl_wre;

		bcopy(wre_old, wre_new, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));

		/* Setup new sizes in wre */
		srq->srq_wridlist->wl_wre = wre_new;
		srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;

		if (!srq->srq_is_umap) {
			tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
			    srq_old_bufsz);
		}
	}

#ifdef __lock_lint
	mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
	if (srq->srq_wrid_wql != NULL) {
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}
#endif

	/*
	 * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
	 * to a user process, then we need to call devmap_devmem_remap() to
	 * invalidate the mapping to the SRQ memory.  We also need to
	 * invalidate the SRQ tracking information for the user mapping.
	 *
	 * Note: Failure of the remap really shouldn't ever happen.  So, if it
	 * does, it is an indication that something has gone seriously wrong.
	 * So we print a warning message and return error (knowing, of course,
	 * that the "old" SRQ memory will be leaked)
	 */
	if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
		maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
		status = devmap_devmem_remap(srq->srq_umap_dhp,
		    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
		    DEVMAP_MAPPING_INVALID, NULL);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed in SRQ memory "
			    "devmap_devmem_remap()");
			/* We can, however, free the memory for old wre */
			if (wre_old != NULL) {
				kmem_free(wre_old, srq_old_bufsz *
				    sizeof (tavor_wrid_entry_t));
			}
			TAVOR_TNF_EXIT(tavor_srq_modify);
			return (ibc_get_ci_failure(0));
		}
		srq->srq_umap_dhp = (devmap_cookie_t)NULL;
	}

	/*
	 * Drop the SRQ lock now.  The only thing left to do is to free up
	 * the old resources.
	 */
	mutex_exit(&srq->srq_lock);

	/*
	 * Unbind the MTT entries.
	 */
	status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to unbind old SRQ memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
		    "failed to unbind (old)");
		goto srqmodify_fail;
	}

	/* Free the memory for old wre */
	if (wre_old != NULL) {
		kmem_free(wre_old, srq_old_bufsz *
		    sizeof (tavor_wrid_entry_t));
	}

	/* Free the memory for the old SRQ */
	tavor_queue_free(state, &old_srqinfo);

	/*
	 * Fill in the return arguments (if necessary).  This includes the
	 * real new SRQ size.
	 */
	if (real_size != NULL) {
		*real_size = (1 << log_srq_size);
	}

	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (DDI_SUCCESS);

srqmodify_fail:
	TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_modify);
	return (status);
}
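/*
 * Illustrative sketch (standalone, not driver code): the power-of-2
 * rounding used by the SRQ sizing logic above.  highbit() is reimplemented
 * here only so the example runs outside the kernel; like the kernel
 * routine, it returns the 1-based index of the most significant set bit.
 */
#include <stdio.h>
#include <stdint.h>

static int
highbit(uint32_t v)
{
	int	h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint32_t	sizes[] = { 7, 8, 9, 256, 1000 };
	unsigned	i;

	for (i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++) {
		uint32_t size = sizes[i];
		uint32_t log_srq_size = highbit(size);

		/* Exact powers of two need no rounding up */
		if ((size & (size - 1)) == 0)
			log_srq_size--;

		(void) printf("size %4u -> 1 << %u = %u\n",
		    size, log_srq_size, 1U << log_srq_size);
	}
	return (0);
}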
Example 11
/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
	ibt_srq_hdl_t		ibt_srqhdl;
	tavor_pdhdl_t		pd;
	ibt_srq_sizes_t		*sizes;
	ibt_srq_sizes_t		*real_sizes;
	tavor_srqhdl_t		*srqhdl;
	ibt_srq_flags_t		flags;
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		*buf;
	tavor_srqhdl_t		srq;
	tavor_umap_db_entry_t	*umapdb;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_mrhdl_t		mr;
	uint64_t		addr;
	uint64_t		value, srq_desc_off;
	uint32_t		lkey;
	uint32_t		log_srq_size;
	uint32_t		uarpg;
	uint_t			wq_location, dma_xfer_mode, srq_is_umap;
	int			flag, status;
	char			*errormsg;
	uint_t			max_sgl;
	uint_t			wqesz;

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

	TAVOR_TNF_ENTER(tavor_srq_alloc);

	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the SRQ's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->srqo_wq_loc;
	}

	/*
	 * Extract the necessary info from the tavor_srq_info_t structure
	 */
	real_sizes = srqinfo->srqi_real_sizes;
	sizes	   = srqinfo->srqi_sizes;
	pd	   = srqinfo->srqi_pd;
	ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
	flags	   = srqinfo->srqi_flags;
	srqhdl	   = srqinfo->srqi_srqhdl;

	/*
	 * Determine whether SRQ is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the SRQ is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
	if (srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto srqalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}

	/* Increase PD refcnt */
	tavor_pd_refcnt_inc(pd);

	/* Allocate an SRQ context entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
		goto srqalloc_fail1;
	}

	/* Allocate the SRQ Handle entry */
	status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
		goto srqalloc_fail2;
	}

	srq = (tavor_srqhdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

	/* Calculate the SRQ number */
	tavor_srq_numcalc(state, srqc->tr_indx, &srq->srq_srqnum);

	/*
	 * If this will be a user-mappable SRQ, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further SRQ operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (srq_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance,
		    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
		    (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto srqalloc_fail3;
		}
	}

	/*
	 * Calculate the appropriate size for the SRQ.
	 * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
	 * is to round the requested size up to the next highest power-of-2
	 */
	sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
	log_srq_size = highbit(sizes->srq_wr_sz);
	if ((sizes->srq_wr_sz & (sizes->srq_wr_sz - 1)) == 0) {
		log_srq_size = log_srq_size - 1;
	}

	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
		goto srqalloc_fail4;
	}

	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
	if (sizes->srq_sgl_sz > max_sgl) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
		goto srqalloc_fail4;
	}

	/*
	 * Determine the SRQ's WQE sizes.  This depends on the requested
	 * number of SGLs.  Note: This also has the side-effect of
	 * calculating the real number of SGLs (for the calculated WQE size)
	 */
	tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
	    TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
	    &srq->srq_wq_sgl);

	/*
	 * Allocate the memory for SRQ work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in through
	 * the tavor_srq_options_t structure.  Since Tavor work queues are not
	 * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
	 * queue memory is very important.  We used to allocate work queues
	 * (the combined receive and send queues) so that they would be aligned
	 * on their combined size.  That alignment guaranteed that they would
	 * never cross the 4GB boundary (Tavor work queues are on the order of
	 * MBs at maximum).  Now we are able to relax this alignment constraint
	 * by ensuring that the IB address assigned to the queue memory (as a
	 * result of the tavor_mr_register() call) is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment that
	 * was more restrictive than the system page size.  So we avoided this
	 * constraint by passing two alignment values, one for the memory
	 * allocation itself and the other for the DMA handle (for later bind).
	 * This used to cause more memory than necessary to be allocated (in
	 * order to guarantee the more restrictive alignment constraint).  But
	 * by guaranteeing the zero-based IB virtual address for the queue, we
	 * are able to conserve this memory.
	 *
	 * Note: If SRQ is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 *
	 * Note2: We align this queue on a pagesize boundary.  This is required
	 * to make sure that all the resulting IB addresses will start at 0, for
	 * a zero-based queue.  By making sure we are aligned on at least a
	 * page, any offset we use into our queue will be the same as when we
	 * perform tavor_srq_modify() operations later.
	 */
	wqesz = (1 << srq->srq_wq_log_wqesz);
	srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
	srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
	srq->srq_wqinfo.qa_bind_align = PAGESIZE;
	if (srq_is_umap) {
		srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		srq->srq_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
		goto srqalloc_fail4;
	}
	buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))

	/*
	 * Register the memory for the SRQ work queues.  The memory for the SRQ
	 * must be registered in the Tavor TPT tables.  This gives us the LKey
	 * to specify in the SRQ context later.  Note: If the work queue is to
	 * be allocated from DDR memory, then only a "bypass" mapping is
	 * appropriate.  And if the SRQ memory is user-mappable, then we force
	 * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
	 * restriction, we pass the "mro_bind_override_addr" flag in the call
	 * to tavor_mr_register().  This guarantees that the resulting IB vaddr
	 * will be zero-based (modulo the offset into the first page).  If we
	 * fail here, we still have the bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
	mr_attr.mr_len   = srq->srq_wqinfo.qa_size;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
	if (srq_is_umap) {
		mr_op.mro_bind_type   = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto srqalloc_fail5;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
	addr = mr->mr_bindinfo.bi_addr;
	lkey = mr->mr_lkey;

	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;

	/*
	 * Create WQL and Wridlist for use by this SRQ
	 */
	srq->srq_wrid_wql = tavor_wrid_wql_create(state);
	if (srq->srq_wrid_wql == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
		goto srqalloc_fail6;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

	srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
	if (srq->srq_wridlist == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
		goto srqalloc_fail7;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

	srq->srq_wridlist->wl_srq_en = 1;
	srq->srq_wridlist->wl_free_list_indx = -1;

	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real queue size and real SGLs.
	 */
	if (real_sizes != NULL) {
		real_sizes->srq_wr_sz = (1 << log_srq_size);
		real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
	}

	/*
	 * Fill in the SRQC entry.  This is the final step before passing
	 * ownership of the SRQC entry to the Tavor hardware.  We use all of
	 * the information collected/calculated above to fill in the
	 * requisite portions of the SRQC.  Note: If this SRQ is going to be
	 * used for userland access, then we need to set the UAR page number
	 * appropriately (otherwise it's a "don't care")
	 */
	bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
	srqc_entry.wqe_addr_h	   = (addr >> 32);
	srqc_entry.next_wqe_addr_l = 0;
	srqc_entry.ds		   = (wqesz >> 4);
	srqc_entry.state	   = TAVOR_SRQ_STATE_HW_OWNER;
	srqc_entry.pd		   = pd->pd_pdnum;
	srqc_entry.lkey		   = lkey;
	srqc_entry.wqe_cnt	   = 0;
	if (srq_is_umap) {
		srqc_entry.uar	   = uarpg;
	} else {
		srqc_entry.uar	   = 0;
	}

	/*
	 * Write the SRQC entry to hardware.  Lastly, we pass ownership of
	 * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
	 * command).  Note: In general, this operation shouldn't fail.  But
	 * if it does, we have to undo everything we've done above before
	 * returning error.
	 */
	status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
	    sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
		goto srqalloc_fail8;
	}

	/*
	 * Fill in the rest of the Tavor SRQ handle.  We can update
	 * the following fields for use in further operations on the SRQ.
	 */
	srq->srq_srqcrsrcp = srqc;
	srq->srq_rsrcp	   = rsrc;
	srq->srq_mrhdl	   = mr;
	srq->srq_refcnt	   = 0;
	srq->srq_is_umap   = srq_is_umap;
	srq->srq_uarpg	   = (srq->srq_is_umap) ? uarpg : 0;
	srq->srq_umap_dhp  = (devmap_cookie_t)NULL;
	srq->srq_pdhdl	   = pd;
	srq->srq_wq_lastwqeindx = -1;
	srq->srq_wq_bufsz  = (1 << log_srq_size);
	srq->srq_wq_buf	   = buf;
	srq->srq_desc_off  = srq_desc_off;
	srq->srq_hdlrarg   = (void *)ibt_srqhdl;
	srq->srq_state	   = 0;
	srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
	srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

	/* Determine if later ddi_dma_sync will be necessary */
	srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

	/*
	 * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
	 * "srqhdl" and return success
	 */
	ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
	state->ts_srqhdl[srqc->tr_indx] = srq;

	/*
	 * If this is a user-mappable SRQ, then we need to insert the
	 * previously allocated entry into the "userland resources database".
	 * This will allow for later lookup during devmap() (i.e. mmap())
	 * calls.
	 */
	if (srq->srq_is_umap) {
		tavor_umap_db_add(umapdb);
	} else {
		mutex_enter(&srq->srq_wrid_wql->wql_lock);
		tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
		mutex_exit(&srq->srq_wrid_wql->wql_lock);
	}

	*srqhdl = srq;

	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
srqalloc_fail8:
	kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
	    sizeof (tavor_wrid_entry_t));
	kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
srqalloc_fail7:
	tavor_wql_refcnt_dec(srq->srq_wrid_wql);
srqalloc_fail6:
	if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
	}
srqalloc_fail5:
	tavor_queue_free(state, &srq->srq_wqinfo);
srqalloc_fail4:
	if (srq_is_umap) {
		tavor_umap_db_free(umapdb);
	}
srqalloc_fail3:
	tavor_rsrc_free(state, &rsrc);
srqalloc_fail2:
	tavor_rsrc_free(state, &srqc);
srqalloc_fail1:
	tavor_pd_refcnt_dec(pd);
srqalloc_fail:
	TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_srq_alloc);
	return (status);
}
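/*
 * Illustrative sketch (standalone, not driver code): the reasoning behind
 * the zero-based IB virtual address discussed in the comment above.  If
 * the IB vaddr of the queue starts at (nearly) zero and the queue is only
 * a few MB, it can never straddle a 4GB boundary.  The values are made up.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t	ib_vaddr = 0x1c0;		/* zero-based + page offset */
	uint64_t	qsize = 2 * 1024 * 1024;	/* 2MB work queue */
	uint64_t	first_4g_seg = ib_vaddr >> 32;
	uint64_t	last_4g_seg = (ib_vaddr + qsize - 1) >> 32;

	(void) printf("crosses 4GB boundary: %s\n",
	    (first_4g_seg == last_4g_seg) ? "no" : "yes");
	return (0);
}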
Example 12
/* ARGSUSED */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
	tavor_rsrc_t		*srqc, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	uint64_t		value;
	tavor_srqhdl_t		srq;
	tavor_mrhdl_t		mr;
	tavor_pdhdl_t		pd;
	tavor_hw_srqc_t		srqc_entry;
	uint32_t		srqnum;
	uint32_t		size;
	uint_t			maxprot;
	int			status;

	TAVOR_TNF_ENTER(tavor_srq_free);

	/*
	 * Pull all the necessary information from the Tavor Shared Receive
	 * Queue handle.  This is necessary here because the resource for the
	 * SRQ handle is going to be freed up as part of this operation.
	 */
	srq	= *srqhdl;
	mutex_enter(&srq->srq_lock);
	srqc	= srq->srq_srqcrsrcp;
	rsrc	= srq->srq_rsrcp;
	pd	= srq->srq_pdhdl;
	mr	= srq->srq_mrhdl;
	srqnum	= srq->srq_srqnum;

	/*
	 * If there are work queues still associated with the SRQ, then return
	 * an error.  Otherwise, we will be holding the SRQ lock.
	 */
	if (srq->srq_refcnt != 0) {
		mutex_exit(&srq->srq_lock);
		TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
		    tnf_int, refcnt, srq->srq_refcnt);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_SRQ_IN_USE);
	}

	/*
	 * If this was a user-mappable SRQ, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
	 * We also need to invalidate the SRQ tracking information for the
	 * user mapping.
	 */
	if (srq->srq_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, srq->srq_srqnum,
		    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&srq->srq_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_srq_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (srq->srq_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(srq->srq_umap_dhp,
			    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&srq->srq_lock);
				TAVOR_WARNING(state, "failed in SRQ memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_srq_free);
				return (ibc_get_ci_failure(0));
			}
			srq->srq_umap_dhp = (devmap_cookie_t)NULL;
		}
	}

	/*
	 * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
	 * in-progress events to detect that the SRQ corresponding to this
	 * number has been freed.
	 */
	state->ts_srqhdl[srqc->tr_indx] = NULL;

	mutex_exit(&srq->srq_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));

	/*
	 * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
	 * firmware command).  If the ownership transfer fails for any reason,
	 * then it is an indication that something (either in HW or SW) has
	 * gone seriously wrong.
	 */
	status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
	    sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
	if (status != TAVOR_CMD_SUCCESS) {
		TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
		cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/*
	 * Deregister the memory for the Shared Receive Queue.  If this fails
	 * for any reason, then it is an indication that something (either
	 * in HW or SW) has gone seriously wrong.  So we print a warning
	 * message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister SRQ memory");
		TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_srq_free);
		return (IBT_FAILURE);
	}

	/* Calculate the size and free the wridlist container */
	if (srq->srq_wridlist != NULL) {
		size = (srq->srq_wridlist->wl_size *
		    sizeof (tavor_wrid_entry_t));
		kmem_free(srq->srq_wridlist->wl_wre, size);
		kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

		/*
		 * Release reference to WQL; If this is the last reference,
		 * this call also has the side effect of freeing up the
		 * 'srq_wrid_wql' memory.
		 */
		tavor_wql_refcnt_dec(srq->srq_wrid_wql);
	}

	/* Free the memory for the SRQ */
	tavor_queue_free(state, &srq->srq_wqinfo);

	/* Free the Tavor SRQ Handle */
	tavor_rsrc_free(state, &rsrc);

	/* Free the SRQC entry resource */
	tavor_rsrc_free(state, &srqc);

	/* Decrement the reference count on the protection domain (PD) */
	tavor_pd_refcnt_dec(pd);

	/* Set the srqhdl pointer to NULL and return success */
	*srqhdl = NULL;

	TAVOR_TNF_EXIT(tavor_srq_free);
	return (DDI_SUCCESS);
}
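
The teardown sequence above leans on reference-count helpers (tavor_wql_refcnt_dec() and tavor_pd_refcnt_dec()) whose bodies are not shown in this example.  Purely as an illustration of the pattern those calls imply, a mutex-protected decrement-and-free-on-zero helper might look like the sketch below; the type and names here are hypothetical and are not taken from the driver.

/*
 * Hypothetical refcount helper (illustration only, not driver code):
 * drop one reference and free the object when the count reaches zero.
 */
typedef struct example_refcnt_s {
	kmutex_t	er_lock;
	uint32_t	er_refcnt;
} example_refcnt_t;

static void
example_refcnt_dec(example_refcnt_t *erp)
{
	uint32_t	refcnt;

	mutex_enter(&erp->er_lock);
	refcnt = --erp->er_refcnt;
	mutex_exit(&erp->er_lock);

	if (refcnt == 0) {
		/* Last reference dropped: tear down and free the object */
		mutex_destroy(&erp->er_lock);
		kmem_free(erp, sizeof (example_refcnt_t));
	}
}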
Esempio n. 13
0
/*
 * ibmf_i_notify_client():
 * 	If the transaction is done, call the appropriate callback
 */
void
ibmf_i_notify_client(ibmf_msg_impl_t *msgimplp)
{
	ibmf_client_t	*clientp;
	ibmf_msg_cb_t	async_cb;
	void		*async_cb_arg;

	IBMF_TRACE_1(IBMF_TNF_DEBUG, DPRINT_L4, ibmf_i_notify_client_start,
	    IBMF_TNF_TRACE, "", "ibmf_i_notify_client(): msgp = 0x%p\n",
	    tnf_opaque, msgimplp, msgimplp);

	clientp = msgimplp->im_client;

	/*
	 * The message has been removed from the client's list, so no new
	 * threads will find it; wait for any threads currently working on
	 * it to finish.
	 */
	mutex_enter(&msgimplp->im_mutex);

	ASSERT(msgimplp->im_trans_state_flags & IBMF_TRANS_STATE_FLAG_DONE);

	/*
	 * If the message reference count is not zero, some duplicate MAD
	 * has arrived for this message.  The thread processing that MAD
	 * found the message on the client's list before this thread was
	 * able to remove it from the list.  Since we should not notify
	 * the client of the transaction completion until every thread
	 * working on this message has finished (the client must not free
	 * the message while a thread is still using it), we let one of
	 * the other threads notify the client of the completion once the
	 * message reference count drops to zero.
	 */
	if (msgimplp->im_ref_count != 0) {
		mutex_exit(&msgimplp->im_mutex);
		IBMF_TRACE_1(IBMF_TNF_DEBUG, DPRINT_L3,
		    ibmf_i_notify_client_err, IBMF_TNF_TRACE,
		    "", "ibmf_i_notify_client(): %s\n",
		    tnf_string, msg, "message reference count != 0");
		IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
		    ibmf_i_notify_client_end, IBMF_TNF_TRACE, "",
		    "ibmf_i_notify_client() exit\n");
		return;
	}

	mutex_exit(&msgimplp->im_mutex);

	/*
	 * Free up the UD dest resource so it is not tied down by
	 * the message in case the message is not freed immediately.
	 * Clean up the UD dest list as well so that excess UD dest
	 * resources are returned to the CI.
	 */
	if (msgimplp->im_ibmf_ud_dest != NULL) {
		ibmf_i_free_ud_dest(clientp, msgimplp);
		ibmf_i_clean_ud_dest_list(clientp->ic_myci, B_FALSE);
	}

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*msgimplp))

	if (msgimplp->im_unsolicited == B_TRUE) {

		/*
		 * Do nothing if error status
		 */
		if (msgimplp->im_msg_status != IBMF_SUCCESS) {

			if (msgimplp->im_qp_hdl == IBMF_QP_HANDLE_DEFAULT) {
				mutex_enter(&clientp->ic_mutex);
				IBMF_RECV_CB_CLEANUP(clientp);
				mutex_exit(&clientp->ic_mutex);
			} else {
				ibmf_alt_qp_t *qpp =
				    (ibmf_alt_qp_t *)msgimplp->im_qp_hdl;
				mutex_enter(&qpp->isq_mutex);
				IBMF_ALT_RECV_CB_CLEANUP(qpp);
				mutex_exit(&qpp->isq_mutex);
			}

			IBMF_TRACE_2(IBMF_TNF_NODEBUG, DPRINT_L1,
			    ibmf_i_notify_client_err, IBMF_TNF_ERROR, "",
			    "ibmf_i_notify_client(): %s, status = %d\n",
			    tnf_string, msg, "message status not success",
			    tnf_opaque, status, msgimplp->im_msg_status);

			ibmf_i_free_msg(msgimplp);

			IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
			    ibmf_i_notify_client_end, IBMF_TNF_TRACE, "",
			    "ibmf_i_notify_client() exit\n");

			return;
		}

		/*
		 * Check whether a callback has been registered with the
		 * client for this unsolicited message.  If one has been
		 * registered, bump the active receive count so that the
		 * teardown routine waits until this callback completes.
		 */
		if (msgimplp->im_qp_hdl == IBMF_QP_HANDLE_DEFAULT) {

			mutex_enter(&clientp->ic_mutex);

			if ((clientp->ic_recv_cb == NULL) ||
			    (clientp->ic_flags & IBMF_CLIENT_TEAR_DOWN_CB)) {
				IBMF_RECV_CB_CLEANUP(clientp);
				mutex_exit(&clientp->ic_mutex);
				ibmf_i_free_msg(msgimplp);
				IBMF_TRACE_1(IBMF_TNF_NODEBUG, DPRINT_L1,
				    ibmf_i_notify_client_err, IBMF_TNF_ERROR,
				    "", "ibmf_i_notify_client(): %s\n",
				    tnf_string, msg,
				    "ibmf_tear_down_recv_cb already occurred");
				IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
				    ibmf_i_notify_client_end,
				    IBMF_TNF_TRACE, "",
				    "ibmf_i_notify_client() exit\n");
				return;
			}

			clientp->ic_msgs_alloced++;
			mutex_enter(&clientp->ic_kstat_mutex);
			IBMF_ADD32_KSTATS(clientp, msgs_alloced, 1);
			mutex_exit(&clientp->ic_kstat_mutex);

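			/*
			 * Snapshot the receive callback and its argument
			 * while ic_mutex is held; the mutex is dropped
			 * before the callback itself is invoked below.
			 */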
			async_cb = clientp->ic_recv_cb;
			async_cb_arg = clientp->ic_recv_cb_arg;

			mutex_exit(&clientp->ic_mutex);

			async_cb((ibmf_handle_t)clientp, (ibmf_msg_t *)msgimplp,
			    async_cb_arg);

			mutex_enter(&clientp->ic_mutex);
			IBMF_RECV_CB_CLEANUP(clientp);
			mutex_exit(&clientp->ic_mutex);

		} else {
			ibmf_alt_qp_t *qpp =
			    (ibmf_alt_qp_t *)msgimplp->im_qp_hdl;

			mutex_enter(&qpp->isq_mutex);

			if ((qpp->isq_recv_cb == NULL) ||
			    (qpp->isq_flags & IBMF_CLIENT_TEAR_DOWN_CB)) {
				IBMF_ALT_RECV_CB_CLEANUP(qpp);
				mutex_exit(&qpp->isq_mutex);
				ibmf_i_free_msg(msgimplp);
				IBMF_TRACE_1(IBMF_TNF_NODEBUG, DPRINT_L1,
				    ibmf_i_notify_client_err, IBMF_TNF_ERROR,
				    "", "ibmf_i_notify_client(): %s\n",
				    tnf_string, msg,
				    "ibmf_tear_down_recv_cb already occurred");
				IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4,
				    ibmf_i_notify_client_end,
				    IBMF_TNF_TRACE, "",
				    "ibmf_i_notify_client() exit\n");
				return;
			}

			async_cb = qpp->isq_recv_cb;
			async_cb_arg = qpp->isq_recv_cb_arg;

			mutex_exit(&qpp->isq_mutex);

			mutex_enter(&clientp->ic_mutex);

			clientp->ic_msgs_alloced++;

			mutex_exit(&clientp->ic_mutex);

			mutex_enter(&clientp->ic_kstat_mutex);
			IBMF_ADD32_KSTATS(clientp, msgs_alloced, 1);
			mutex_exit(&clientp->ic_kstat_mutex);

			async_cb((ibmf_handle_t)clientp, (ibmf_msg_t *)msgimplp,
			    async_cb_arg);

			mutex_enter(&qpp->isq_mutex);
			IBMF_ALT_RECV_CB_CLEANUP(qpp);
			mutex_exit(&qpp->isq_mutex);
		}
	} else {

		/* Solicited transaction processing */

		if (msgimplp->im_trans_cb == NULL) {

			/* Processing for a blocking transaction */

			mutex_enter(&msgimplp->im_mutex);

			if (msgimplp->im_trans_state_flags &
			    IBMF_TRANS_STATE_FLAG_WAIT) {

				IBMF_TRACE_2(IBMF_TNF_DEBUG, DPRINT_L3,
				    ibmf_i_notify_client, IBMF_TNF_TRACE, "",
				    "ibmf_i_notify_client(): %s, msg = 0x%p\n",
				    tnf_string, msg, "Awaking thread",
				    tnf_opaque, msgimplp, msgimplp);

				cv_signal(&msgimplp->im_trans_cv);
			} else {
				IBMF_TRACE_2(IBMF_TNF_DEBUG, DPRINT_L3,
				    ibmf_i_notify_client, IBMF_TNF_TRACE, "",
				    "ibmf_i_notify_client(): %s, msg = 0x%p\n",
				    tnf_string, msg, "Notify client, no wait",
				    tnf_opaque, msgimplp, msgimplp);
			}

			msgimplp->im_trans_state_flags |=
			    IBMF_TRANS_STATE_FLAG_SIGNALED;

			mutex_exit(&msgimplp->im_mutex);

		} else {

			/* Processing for a non-blocking transaction */

			mutex_enter(&msgimplp->im_mutex);
			msgimplp->im_flags &= ~IBMF_MSG_FLAGS_BUSY;
			mutex_exit(&msgimplp->im_mutex);

			IBMF_TRACE_2(IBMF_TNF_DEBUG, DPRINT_L3,
			    ibmf_i_notify_client, IBMF_TNF_TRACE, "",
			    "ibmf_i_notify_client(): %s, msg = 0x%p\n",
			    tnf_string, msg, "No thread is blocking",
			    tnf_opaque, msgimplp, msgimplp);

			if (msgimplp->im_trans_cb != NULL) {
				msgimplp->im_trans_cb(
				    (ibmf_handle_t)clientp,
				    (ibmf_msg_t *)msgimplp,
				    msgimplp->im_trans_cb_arg);
			}
		}
	}

	IBMF_TRACE_0(IBMF_TNF_DEBUG, DPRINT_L4, ibmf_i_notify_client_end,
	    IBMF_TNF_TRACE, "", "ibmf_i_notify_client() exit\n");
}
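
For the blocking case above (im_trans_cb == NULL), ibmf_i_notify_client() only signals im_trans_cv and sets IBMF_TRANS_STATE_FLAG_SIGNALED; the thread that actually blocks on the transaction lives in the IBMF send/transport path and is not part of this example.  Purely as a sketch of how the two sides pair up, the waiting side could look roughly like the fragment below; this is an assumption for illustration, not the real IBMF wait loop.

/*
 * Hypothetical waiter pairing with the cv_signal() above (sketch only).
 */
static void
example_wait_for_completion(ibmf_msg_impl_t *msgimplp)
{
	mutex_enter(&msgimplp->im_mutex);

	/* Tell the completion path that a thread is blocked on this msg */
	msgimplp->im_trans_state_flags |= IBMF_TRANS_STATE_FLAG_WAIT;

	/* Sleep until the completion path marks the transaction signaled */
	while ((msgimplp->im_trans_state_flags &
	    IBMF_TRANS_STATE_FLAG_SIGNALED) == 0)
		cv_wait(&msgimplp->im_trans_cv, &msgimplp->im_mutex);

	mutex_exit(&msgimplp->im_mutex);
}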
/*
 * tavor_agent_handle_req()
 *    Context: Called with priority of taskQ thread
 */
static void
tavor_agent_handle_req(void *cb_args)
{
	tavor_agent_handler_arg_t	*agent_args;
	tavor_agent_list_t		*curr;
	tavor_state_t			*state;
	ibmf_handle_t			ibmf_handle;
	ibmf_msg_t			*msgp;
	ibmf_msg_bufs_t			*recv_msgbufp;
	ibmf_msg_bufs_t			*send_msgbufp;
	ibmf_retrans_t			retrans;
	uint_t				port;
	int				status;

	TAVOR_TNF_ENTER(tavor_agent_handle_req);

	/* Extract the necessary info from the callback args parameter */
	agent_args  = (tavor_agent_handler_arg_t *)cb_args;
	ibmf_handle = agent_args->ahd_ibmfhdl;
	msgp	    = agent_args->ahd_ibmfmsg;
	curr	    = agent_args->ahd_agentlist;
	state	    = curr->agl_state;
	port	    = curr->agl_port;

	/*
	 * Copy the receive buffer descriptors into the send buffer
	 * descriptors so that the IBMF-provided receive buffers are
	 * reused for the response.
	 */
	recv_msgbufp = &msgp->im_msgbufs_recv;
	send_msgbufp = &msgp->im_msgbufs_send;
	bcopy(recv_msgbufp, send_msgbufp, sizeof (ibmf_msg_bufs_t));

	/*
	 * Check if the incoming packet is a special "Tavor Trap" MAD.  If it
	 * is, then do the special handling.  If it isn't, then simply pass it
	 * on to the firmware and forward the response back to the IBMF.
	 *
	 * Note: Tavor has a unique method for handling internally generated
	 * Traps.  All internally detected/generated Trap messages are
	 * automatically received by the IBMF (as receive completions on QP0),
	 * which (because all Tavor Trap MADs have SLID == 0) detects it as a
	 * special "Tavor Trap" and forwards it here to the driver's SMA.
	 * It is then our responsibility here to fill in the Trap MAD's DLID
	 * for forwarding to the real Master SM (as programmed in the port's
	 * PortInfo.MasterSMLID field.)
	 */
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(msgp->im_local_addr))
	if (TAVOR_IS_SPECIAL_TRAP_MAD(msgp)) {
		msgp->im_local_addr.ia_remote_lid =
		    TAVOR_PORT_MASTERSMLID_GET(state, port - 1);
	} else {
		/*
		 * Post the command to the firmware (using the MAD_IFC
		 * command).  Note: We also reuse the MAD that was passed
		 * in.  We pass the pointer to the original MAD payload as
		 * both the source of the incoming MAD and the destination
		 * for the response.  This is acceptable and saves us the
		 * step of one additional copy.  Note:  If this command
		 * fails for any reason other than TAVOR_CMD_BAD_PKT or
		 * TAVOR_CMD_INSUFF_RSRC, it probably indicates a serious
		 * problem.
		 */
		status = tavor_mad_ifc_cmd_post(state, port,
		    TAVOR_CMD_SLEEP_NOSPIN,
		    (uint32_t *)recv_msgbufp->im_bufs_mad_hdr,
		    (uint32_t *)send_msgbufp->im_bufs_mad_hdr);
		if (status != TAVOR_CMD_SUCCESS) {
			if ((status != TAVOR_CMD_BAD_PKT) &&
			    (status != TAVOR_CMD_INSUFF_RSRC)) {
				cmn_err(CE_CONT, "Tavor: MAD_IFC (port %02d) "
				    "command failed: %08x\n", port, status);
				TNF_PROBE_1(tavor_agent_handle_req_madifc_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, cmd_status,
				    status);
			}

			/* finish cleanup */
			goto tavor_agent_handle_req_skip_response;
		}
	}

	/*
	 * If the incoming MAD was a "TrapRepress", then no response is
	 * necessary.  Free the IBMF message and return.
	 */
	if (TAVOR_IS_TRAP_REPRESS_MAD(msgp)) {
		goto tavor_agent_handle_req_skip_response;
	}

	/*
	 * Modify the response MAD as necessary (for any special cases).
	 * Specifically, if this MAD was a directed route MAD, then some
	 * additional packet manipulation may be necessary because the Tavor
	 * firmware does not do all the required steps to respond to the
	 * MAD.
	 */
	tavor_agent_mad_resp_handling(state, msgp, port);

	/*
	 * Send response (or forwarded "Trap" MAD) back to IBMF.  We use the
	 * "response callback" to indicate when it is appropriate (later) to
	 * free the IBMF msg.
	 */
	status = ibmf_msg_transport(ibmf_handle, IBMF_QP_HANDLE_DEFAULT,
	    msgp, &retrans, tavor_agent_response_cb, state, 0);
	if (status != IBMF_SUCCESS) {
		TNF_PROBE_1(tavor_ibmf_send_msg_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, ibmf_status, status);
		goto tavor_agent_handle_req_skip_response;
	}

	/* Free up the callback args parameter */
	kmem_free(agent_args, sizeof (tavor_agent_handler_arg_t));
	TAVOR_TNF_EXIT(tavor_agent_handle_req);
	return;

tavor_agent_handle_req_skip_response:
	/* Free up the ibmf message */
	status = ibmf_free_msg(ibmf_handle, &msgp);
	if (status != IBMF_SUCCESS) {
		TNF_PROBE_1(tavor_agent_handle_req_ibmf_free_msg_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, ibmf_status,
		    status);
	}
	/* Free up the callback args parameter */
	kmem_free(agent_args, sizeof (tavor_agent_handler_arg_t));
	TAVOR_TNF_EXIT(tavor_agent_handle_req);
}
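
The comment block inside tavor_agent_handle_req() notes that internally generated Tavor Trap MADs reach the SMA with a source LID of zero, which no MAD arriving off the wire can have.  That suggests TAVOR_IS_SPECIAL_TRAP_MAD() can reduce to a check of the remote LID recorded in the message's receive address; the definition below is only a guess for illustration, not the driver's actual macro.

/*
 * Hypothetical definition (illustration only): an internally generated
 * Trap MAD is recognized by the zero source LID in the receive address.
 */
#define	TAVOR_IS_SPECIAL_TRAP_MAD(msgp)					\
	(((ibmf_msg_t *)(msgp))->im_local_addr.ia_remote_lid == 0)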
/*
 * tavor_agent_request_cb()
 *    Context: Called from the IBMF context
 */
static void
tavor_agent_request_cb(ibmf_handle_t ibmf_handle, ibmf_msg_t *msgp,
    void *args)
{
	tavor_agent_handler_arg_t	*cb_args;
	tavor_agent_list_t		*curr;
	tavor_state_t			*state;
	int				status;
	int				ibmf_status;

	TAVOR_TNF_ENTER(tavor_agent_request_cb);

	curr  = (tavor_agent_list_t *)args;
	state = curr->agl_state;

	/*
	 * Allocate space to hold the callback args (for passing to the
	 * task queue).  Note: If we are unable to allocate space for the
	 * callback args here, then we just return.  But we must ensure
	 * that we call ibmf_free_msg() to free up the message.
	 */
	cb_args = (tavor_agent_handler_arg_t *)kmem_zalloc(
	    sizeof (tavor_agent_handler_arg_t), KM_NOSLEEP);
	if (cb_args == NULL) {
		ibmf_status = ibmf_free_msg(ibmf_handle, &msgp);
		if (ibmf_status != IBMF_SUCCESS) {
			TNF_PROBE_1(tavor_agent_request_cb_ibmf_free_msg_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, ibmf_status,
			    ibmf_status);
		}
		TNF_PROBE_0(tavor_agent_request_cb_kma_fail,
		    TAVOR_TNF_ERROR, "");
		TAVOR_TNF_EXIT(tavor_agent_request_cb);
		return;
	}
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*cb_args))

	/* Fill in the callback args */
	cb_args->ahd_ibmfhdl	= ibmf_handle;
	cb_args->ahd_ibmfmsg	= msgp;
	cb_args->ahd_agentlist	= args;

	/*
	 * Dispatch the message to the task queue.  Note: Just as above,
	 * if the dispatch fails for any reason, make sure to free up the
	 * IBMF message and then return.
	 */
	status = ddi_taskq_dispatch(state->ts_taskq_agents,
	    tavor_agent_handle_req, cb_args, DDI_NOSLEEP);
	if (status == DDI_FAILURE) {
		kmem_free(cb_args, sizeof (tavor_agent_handler_arg_t));
		ibmf_status = ibmf_free_msg(ibmf_handle, &msgp);
		if (ibmf_status != IBMF_SUCCESS) {
			TNF_PROBE_1(tavor_agent_request_cb_ibmf_free_msg_fail,
			    TAVOR_TNF_ERROR, "", tnf_uint, ibmf_status,
			    ibmf_status);
		}
		TNF_PROBE_0(tavor_agent_request_cb_taskq_fail,
		    TAVOR_TNF_ERROR, "");
	}
	TAVOR_TNF_EXIT(tavor_agent_request_cb);
}
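
tavor_agent_request_cb() runs in IBMF callback context, so it defers the real work to a taskq and avoids anything that could block: the allocation uses KM_NOSLEEP, the dispatch uses DDI_NOSLEEP, and every failure path frees the IBMF message.  A stripped-down version of that hand-off pattern, with hypothetical names and none of the IBMF specifics, might look like this:

/*
 * Sketch of the callback-to-taskq hand-off pattern used above
 * (hypothetical names; error handling reduced to the essentials).
 */
typedef struct example_work_s {
	void	*ew_payload;
} example_work_t;

static void
example_worker(void *arg)
{
	example_work_t	*work = arg;

	/* ... process work->ew_payload at taskq priority ... */
	kmem_free(work, sizeof (example_work_t));
}

static void
example_callback(ddi_taskq_t *tq, void *payload)
{
	example_work_t	*work;

	/* Callback context: no blocking allocation, no blocking dispatch */
	work = kmem_zalloc(sizeof (example_work_t), KM_NOSLEEP);
	if (work == NULL) {
		/* release "payload" here, just as the IBMF msg is freed */
		return;
	}
	work->ew_payload = payload;

	if (ddi_taskq_dispatch(tq, example_worker, work,
	    DDI_NOSLEEP) == DDI_FAILURE) {
		kmem_free(work, sizeof (example_work_t));
		/* release "payload" here as well */
	}
}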