/*
 * XDR routine for a single struct clist entry.
 *
 * Not all fields in struct clist are interesting to the RPC over RDMA
 * protocol, so only the interesting ones are processed, in this order:
 * c_xdroff, c_smemhandle.mrc_rmr, c_len, w.c_saddr and finally the rest
 * of the list (c_next) through xdr_do_clist().
 *
 * Returns TRUE when every field was processed.  Returns FALSE as soon as
 * one of the underlying XDR routines fails; the fields that follow the
 * failing one are not touched.
 */
bool_t
xdr_clist(XDR *xdrs, clist *objp)
{
	/* && short-circuits, so the fields are visited in the listed order. */
	if (xdr_uint32(xdrs, &objp->c_xdroff) &&
	    xdr_uint32(xdrs, &objp->c_smemhandle.mrc_rmr) &&
	    xdr_uint32(xdrs, &objp->c_len) &&
	    xdr_uint64(xdrs, &objp->w.c_saddr) &&
	    xdr_do_clist(xdrs, &objp->c_next))
		return (TRUE);

	return (FALSE);
}
/*
 * Insert the RDMA READ chunk list of an outbound call into the XDR
 * stream `xdrs'.
 *
 * The read list is fetched from `call_xdrp' with XDR_RDMA_GET_RLIST.
 *
 * If the list is NULL, the RPC payload will NOT carry an RDMA READ chunk
 * list; in this case we insert FALSE into the XDR stream.  Otherwise we
 * RDMA register the memory described by the clist (as CLIST_REG_SOURCE),
 * record XDR_RDMA_RLIST_REG on `call_xdrp' through XDR_RDMA_SET_FLAGS,
 * and encode the clist into the outbound XDR stream.
 *
 * Returns CLNT_RDMA_FAIL when clist_register() does not report
 * RDMA_SUCCESS (nothing is encoded in that case), CLNT_RDMA_SUCCESS
 * otherwise.  The result of xdr_do_clist() is deliberately ignored,
 * as it always has been.
 */
static int
clnt_setup_rlist(CONN *conn, XDR *xdrs, XDR *call_xdrp)
{
	/*
	 * Start from NULL so that a control request which leaves rclp
	 * untouched is treated as "no read list" instead of acting on an
	 * indeterminate pointer.
	 */
	struct clist *rclp = NULL;
	int32_t xdr_flag = XDR_RDMA_RLIST_REG;

	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &rclp);

	if (rclp != NULL) {
		if (clist_register(conn, rclp, CLIST_REG_SOURCE) !=
		    RDMA_SUCCESS)
			return (CLNT_RDMA_FAIL);

		/* Note on the call stream that the read list is registered. */
		XDR_CONTROL(call_xdrp, XDR_RDMA_SET_FLAGS, &xdr_flag);
	}

	/* Encodes the list, or FALSE when there is none. */
	(void) xdr_do_clist(xdrs, &rclp);

	return (CLNT_RDMA_SUCCESS);
}
/* ARGSUSED */ static enum clnt_stat clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait) { cku_private_t *p = htop(h); int try_call_again; int refresh_attempt = AUTH_REFRESH_COUNT; int status; int msglen; XDR *call_xdrp, callxdr; /* for xdrrdma encoding the RPC call */ XDR *reply_xdrp, replyxdr; /* for xdrrdma decoding the RPC reply */ XDR *rdmahdr_o_xdrs, *rdmahdr_i_xdrs; struct rpc_msg reply_msg; rdma_registry_t *m; struct clist *cl_sendlist; struct clist *cl_recvlist; struct clist *cl; struct clist *cl_rpcmsg; struct clist *cl_rdma_reply; struct clist *cl_rpcreply_wlist; struct clist *cl_long_reply; rdma_buf_t rndup; uint_t vers; uint_t op; uint_t off; uint32_t seg_array_len; uint_t long_reply_len; uint_t rpcsec_gss; uint_t gss_i_or_p; CONN *conn = NULL; rdma_buf_t clmsg; rdma_buf_t rpcmsg; rdma_chunkinfo_lengths_t rcil; clock_t ticks; bool_t wlist_exists_reply; uint32_t rdma_credit = rdma_bufs_rqst; RCSTAT_INCR(rccalls); call_again: bzero(&clmsg, sizeof (clmsg)); bzero(&rpcmsg, sizeof (rpcmsg)); bzero(&rndup, sizeof (rndup)); try_call_again = 0; cl_sendlist = NULL; cl_recvlist = NULL; cl = NULL; cl_rpcmsg = NULL; cl_rdma_reply = NULL; call_xdrp = NULL; reply_xdrp = NULL; wlist_exists_reply = FALSE; cl_rpcreply_wlist = NULL; cl_long_reply = NULL; rcil.rcil_len = 0; rcil.rcil_len_alt = 0; long_reply_len = 0; rw_enter(&rdma_lock, RW_READER); m = (rdma_registry_t *)p->cku_rd_handle; if (m->r_mod_state == RDMA_MOD_INACTIVE) { /* * If we didn't find a matching RDMA module in the registry * then there is no transport. 
*/ rw_exit(&rdma_lock); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; ticks = clnt_rdma_min_delay * drv_usectohz(1000000); if (h->cl_nosignal == TRUE) { delay(ticks); } else { if (delay_sig(ticks) == EINTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; } } return (RPC_CANTSEND); } /* * Get unique xid */ if (p->cku_xid == 0) p->cku_xid = alloc_xid(); status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_srcaddr, &p->cku_addr, p->cku_addrfmly, p->cku_rd_handle, &conn); rw_exit(&rdma_lock); /* * If there is a problem with the connection reflect the issue * back to the higher level to address, we MAY delay for a short * period so that we are kind to the transport. */ if (conn == NULL) { /* * Connect failed to server. Could be because of one * of several things. In some cases we don't want * the caller to retry immediately - delay before * returning to caller. */ switch (status) { case RDMA_TIMEDOUT: /* * Already timed out. No need to delay * some more. */ p->cku_err.re_status = RPC_TIMEDOUT; p->cku_err.re_errno = ETIMEDOUT; break; case RDMA_INTR: /* * Failed because of an signal. Very likely * the caller will not retry. */ p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; break; default: /* * All other failures - server down or service * down or temporary resource failure. Delay before * returning to caller. 
*/ ticks = clnt_rdma_min_delay * drv_usectohz(1000000); p->cku_err.re_status = RPC_CANTCONNECT; p->cku_err.re_errno = EIO; if (h->cl_nosignal == TRUE) { delay(ticks); } else { if (delay_sig(ticks) == EINTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; } } break; } return (p->cku_err.re_status); } if (p->cku_srcaddr.maxlen < conn->c_laddr.len) { if ((p->cku_srcaddr.maxlen != 0) && (p->cku_srcaddr.buf != NULL)) kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen); p->cku_srcaddr.buf = kmem_zalloc(conn->c_laddr.maxlen, KM_SLEEP); p->cku_srcaddr.maxlen = conn->c_laddr.maxlen; } p->cku_srcaddr.len = conn->c_laddr.len; bcopy(conn->c_laddr.buf, p->cku_srcaddr.buf, conn->c_laddr.len); clnt_check_credit(conn); status = CLNT_RDMA_FAIL; rpcsec_gss = gss_i_or_p = FALSE; if (IS_RPCSEC_GSS(h)) { rpcsec_gss = TRUE; if (rpc_gss_get_service_type(h->cl_auth) == rpc_gss_svc_integrity || rpc_gss_get_service_type(h->cl_auth) == rpc_gss_svc_privacy) gss_i_or_p = TRUE; } /* * Try a regular RDMA message if RPCSEC_GSS is not being used * or if RPCSEC_GSS is being used for authentication only. */ if (rpcsec_gss == FALSE || (rpcsec_gss == TRUE && gss_i_or_p == FALSE)) { /* * Grab a send buffer for the request. Try to * encode it to see if it fits. If not, then it * needs to be sent in a chunk. 
*/ rpcmsg.type = SEND_BUFFER; if (rdma_buf_alloc(conn, &rpcmsg)) { DTRACE_PROBE(krpc__e__clntrdma__callit_nobufs); goto done; } /* First try to encode into regular send buffer */ op = RDMA_MSG; call_xdrp = &callxdr; xdrrdma_create(call_xdrp, rpcmsg.addr, rpcmsg.len, rdma_minchunk, NULL, XDR_ENCODE, conn); status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp, xdr_args, argsp); if (status != CLNT_RDMA_SUCCESS) { /* Clean up from previous encode attempt */ rdma_buf_free(conn, &rpcmsg); XDR_DESTROY(call_xdrp); } else { XDR_CONTROL(call_xdrp, XDR_RDMA_GET_CHUNK_LEN, &rcil); } } /* If the encode didn't work, then try a NOMSG */ if (status != CLNT_RDMA_SUCCESS) { msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT + MAX_AUTH_BYTES + xdr_sizeof(xdr_args, argsp); msglen = calc_length(msglen); /* pick up the lengths for the reply buffer needed */ (void) xdrrdma_sizeof(xdr_args, argsp, 0, &rcil.rcil_len, &rcil.rcil_len_alt); /* * Construct a clist to describe the CHUNK_BUFFER * for the rpcmsg. */ cl_rpcmsg = clist_alloc(); cl_rpcmsg->c_len = msglen; cl_rpcmsg->rb_longbuf.type = RDMA_LONG_BUFFER; cl_rpcmsg->rb_longbuf.len = msglen; if (rdma_buf_alloc(conn, &cl_rpcmsg->rb_longbuf)) { clist_free(cl_rpcmsg); goto done; } cl_rpcmsg->w.c_saddr3 = cl_rpcmsg->rb_longbuf.addr; op = RDMA_NOMSG; call_xdrp = &callxdr; xdrrdma_create(call_xdrp, cl_rpcmsg->rb_longbuf.addr, cl_rpcmsg->rb_longbuf.len, 0, cl_rpcmsg, XDR_ENCODE, conn); status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp, xdr_args, argsp); if (status != CLNT_RDMA_SUCCESS) { p->cku_err.re_status = RPC_CANTENCODEARGS; p->cku_err.re_errno = EIO; DTRACE_PROBE(krpc__e__clntrdma__callit__composemsg); goto done; } } /* * During the XDR_ENCODE we may have "allocated" an RDMA READ or * RDMA WRITE clist. * * First pull the RDMA READ chunk list from the XDR private * area to keep it handy. 
*/ XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &cl); if (gss_i_or_p) { long_reply_len = rcil.rcil_len + rcil.rcil_len_alt; long_reply_len += MAX_AUTH_BYTES; } else { long_reply_len = rcil.rcil_len; } /* * Update the chunk size information for the Long RPC msg. */ if (cl && op == RDMA_NOMSG) cl->c_len = p->cku_outsz; /* * Prepare the RDMA header. On success xdrs will hold the result * of xdrmem_create() for a SEND_BUFFER. */ status = clnt_compose_rdma_header(conn, h, &clmsg, &rdmahdr_o_xdrs, &op); if (status != CLNT_RDMA_SUCCESS) { p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; RCSTAT_INCR(rcnomem); DTRACE_PROBE(krpc__e__clntrdma__callit__nobufs2); goto done; } /* * Now insert the RDMA READ list iff present */ status = clnt_setup_rlist(conn, rdmahdr_o_xdrs, call_xdrp); if (status != CLNT_RDMA_SUCCESS) { DTRACE_PROBE(krpc__e__clntrdma__callit__clistreg); rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * Setup RDMA WRITE chunk list for nfs read operation * other operations will have a NULL which will result * as a NULL list in the XDR stream. */ status = clnt_setup_wlist(conn, rdmahdr_o_xdrs, call_xdrp, &rndup); if (status != CLNT_RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * If NULL call and RPCSEC_GSS, provide a chunk such that * large responses can flow back to the client. * If RPCSEC_GSS with integrity or privacy is in use, get chunk. */ if ((procnum == 0 && rpcsec_gss == TRUE) || (rpcsec_gss == TRUE && gss_i_or_p == TRUE)) long_reply_len += 1024; status = clnt_setup_long_reply(conn, &cl_long_reply, long_reply_len); if (status != CLNT_RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * XDR encode the RDMA_REPLY write chunk */ seg_array_len = (cl_long_reply ? 
1 : 0); (void) xdr_encode_reply_wchunk(rdmahdr_o_xdrs, cl_long_reply, seg_array_len); /* * Construct a clist in "sendlist" that represents what we * will push over the wire. * * Start with the RDMA header and clist (if any) */ clist_add(&cl_sendlist, 0, XDR_GETPOS(rdmahdr_o_xdrs), &clmsg.handle, clmsg.addr, NULL, NULL); /* * Put the RPC call message in sendlist if small RPC */ if (op == RDMA_MSG) { clist_add(&cl_sendlist, 0, p->cku_outsz, &rpcmsg.handle, rpcmsg.addr, NULL, NULL); } else { /* Long RPC already in chunk list */ RCSTAT_INCR(rclongrpcs); } /* * Set up a reply buffer ready for the reply */ status = rdma_clnt_postrecv(conn, p->cku_xid); if (status != RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * sync the memory for dma */ if (cl != NULL) { status = clist_syncmem(conn, cl, CLIST_REG_SOURCE); if (status != RDMA_SUCCESS) { (void) rdma_clnt_postrecv_remove(conn, p->cku_xid); rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } } /* * Send the RDMA Header and RPC call message to the server */ status = RDMA_SEND(conn, cl_sendlist, p->cku_xid); if (status != RDMA_SUCCESS) { (void) rdma_clnt_postrecv_remove(conn, p->cku_xid); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * RDMA plugin now owns the send msg buffers. * Clear them out and don't free them. 
*/ clmsg.addr = NULL; if (rpcmsg.type == SEND_BUFFER) rpcmsg.addr = NULL; /* * Recv rpc reply */ status = RDMA_RECV(conn, &cl_recvlist, p->cku_xid); /* * Now check recv status */ if (status != 0) { if (status == RDMA_INTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; RCSTAT_INCR(rcintrs); } else if (status == RPC_TIMEDOUT) { p->cku_err.re_status = RPC_TIMEDOUT; p->cku_err.re_errno = ETIMEDOUT; RCSTAT_INCR(rctimeouts); } else { p->cku_err.re_status = RPC_CANTRECV; p->cku_err.re_errno = EIO; } goto done; } /* * Process the reply message. * * First the chunk list (if any) */ rdmahdr_i_xdrs = &(p->cku_inxdr); xdrmem_create(rdmahdr_i_xdrs, (caddr_t)(uintptr_t)cl_recvlist->w.c_saddr3, cl_recvlist->c_len, XDR_DECODE); /* * Treat xid as opaque (xid is the first entity * in the rpc rdma message). * Skip xid and set the xdr position accordingly. */ XDR_SETPOS(rdmahdr_i_xdrs, sizeof (uint32_t)); (void) xdr_u_int(rdmahdr_i_xdrs, &vers); (void) xdr_u_int(rdmahdr_i_xdrs, &rdma_credit); (void) xdr_u_int(rdmahdr_i_xdrs, &op); (void) xdr_do_clist(rdmahdr_i_xdrs, &cl); clnt_update_credit(conn, rdma_credit); wlist_exists_reply = FALSE; if (! xdr_decode_wlist(rdmahdr_i_xdrs, &cl_rpcreply_wlist, &wlist_exists_reply)) { DTRACE_PROBE(krpc__e__clntrdma__callit__wlist_decode); p->cku_err.re_status = RPC_CANTDECODERES; p->cku_err.re_errno = EIO; goto done; } /* * The server shouldn't have sent a RDMA_SEND that * the client needs to RDMA_WRITE a reply back to * the server. So silently ignoring what the * server returns in the rdma_reply section of the * header. 
*/ (void) xdr_decode_reply_wchunk(rdmahdr_i_xdrs, &cl_rdma_reply); off = xdr_getpos(rdmahdr_i_xdrs); clnt_decode_long_reply(conn, cl_long_reply, cl_rdma_reply, &replyxdr, &reply_xdrp, cl, cl_recvlist, op, off); if (reply_xdrp == NULL) goto done; if (wlist_exists_reply) { XDR_CONTROL(reply_xdrp, XDR_RDMA_SET_WLIST, cl_rpcreply_wlist); } reply_msg.rm_direction = REPLY; reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; reply_msg.acpted_rply.ar_stat = SUCCESS; reply_msg.acpted_rply.ar_verf = _null_auth; /* * xdr_results will be done in AUTH_UNWRAP. */ reply_msg.acpted_rply.ar_results.where = NULL; reply_msg.acpted_rply.ar_results.proc = xdr_void; /* * Decode and validate the response. */ if (xdr_replymsg(reply_xdrp, &reply_msg)) { enum clnt_stat re_status; _seterr_reply(&reply_msg, &(p->cku_err)); re_status = p->cku_err.re_status; if (re_status == RPC_SUCCESS) { /* * Reply is good, check auth. */ if (!AUTH_VALIDATE(h->cl_auth, &reply_msg.acpted_rply.ar_verf)) { p->cku_err.re_status = RPC_AUTHERROR; p->cku_err.re_why = AUTH_INVALIDRESP; RCSTAT_INCR(rcbadverfs); DTRACE_PROBE( krpc__e__clntrdma__callit__authvalidate); } else if (!AUTH_UNWRAP(h->cl_auth, reply_xdrp, xdr_results, resultsp)) { p->cku_err.re_status = RPC_CANTDECODERES; p->cku_err.re_errno = EIO; DTRACE_PROBE( krpc__e__clntrdma__callit__authunwrap); } } else { /* set errno in case we can't recover */ if (re_status != RPC_VERSMISMATCH && re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH) p->cku_err.re_errno = EIO; if (re_status == RPC_AUTHERROR) { if ((refresh_attempt > 0) && AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) { refresh_attempt--; try_call_again = 1; goto done; } try_call_again = 0; /* * We have used the client handle to * do an AUTH_REFRESH and the RPC status may * be set to RPC_SUCCESS; Let's make sure to * set it to RPC_AUTHERROR. 
*/ p->cku_err.re_status = RPC_AUTHERROR; /* * Map recoverable and unrecoverable * authentication errors to appropriate * errno */ switch (p->cku_err.re_why) { case AUTH_BADCRED: case AUTH_BADVERF: case AUTH_INVALIDRESP: case AUTH_TOOWEAK: case AUTH_FAILED: case RPCSEC_GSS_NOCRED: case RPCSEC_GSS_FAILED: p->cku_err.re_errno = EACCES; break; case AUTH_REJECTEDCRED: case AUTH_REJECTEDVERF: default: p->cku_err.re_errno = EIO; break; } } DTRACE_PROBE1(krpc__e__clntrdma__callit__rpcfailed, int, p->cku_err.re_why); } } else {