/*
 * Decode the long-reply write chunk from the RDMA header.
 *
 * Returns TRUE with *clist pointing at the decoded chunk list, TRUE
 * (with *clist untouched) when the leading boolean says no chunk is
 * present, or FALSE on any decode failure or a zero-length list.
 */
bool_t
xdr_decode_reply_wchunk(XDR *xdrs, struct clist **clist)
{
	struct clist *head = NULL;
	struct clist *tail = NULL;
	bool_t present = FALSE;
	uint32_t nchunks;
	uint32_t seg;

	/* Leading boolean: is a reply write chunk present at all? */
	if (!xdr_bool(xdrs, &present))
		return (FALSE);

	if (present == FALSE)
		return (TRUE);

	if (!xdr_uint32(xdrs, &nchunks)) {
		DTRACE_PROBE(krpc__e__xdrrdma__replywchunk__listlength);
		return (FALSE);
	}

	/* A chunk marked present but with zero segments is malformed. */
	if (nchunks == 0)
		return (FALSE);

	head = tail = clist_alloc();

	for (seg = 0; seg < nchunks; seg++) {
		if (seg != 0) {
			tail->c_next = clist_alloc();
			tail = tail->c_next;
		}

		/* Each segment is <rkey, length, remote address>. */
		if (!xdr_uint32(xdrs, &tail->c_dmemhandle.mrc_rmr))
			goto err_out;
		if (!xdr_uint32(xdrs, &tail->c_len))
			goto err_out;
		if (!xdr_uint64(xdrs, &tail->u.c_daddr))
			goto err_out;

		/* Clamp oversized segments to the server transfer limit. */
		if (tail->c_len > MAX_SVC_XFER_SIZE) {
			DTRACE_PROBE(
			    krpc__e__xdrrdma__replywchunk__chunklist_toobig);
			tail->c_len = MAX_SVC_XFER_SIZE;
		}

		/* Trace (but tolerate) segments with suspect fields. */
		if (!(tail->c_dmemhandle.mrc_rmr &&
		    (tail->c_len > 0) && tail->u.c_daddr))
			DTRACE_PROBE(
			    krpc__e__xdrrdma__replywchunk__invalid_segaddr);

		DTRACE_PROBE1(krpc__i__xdr_decode_reply_wchunk_c_len,
		    uint32_t, tail->c_len);
	}

	*clist = head;
	return (TRUE);

err_out:
	clist_free(head);
	return (FALSE);
}
/*
 * Decode an optional RDMA WRITE chunk list from the XDR stream.
 *
 * The stream carries a boolean first: FALSE means no write list is
 * present and *wlist_exists is cleared.  Otherwise a segment count is
 * read and one struct clist is allocated and decoded per segment
 * (<rkey, length, remote address>); a trailing boolean terminates the
 * list on the wire.  *wlist_exists reports whether a list was seen.
 */
bool_t
xdr_decode_wlist(XDR *xdrs, struct clist **w, bool_t *wlist_exists)
{
	struct clist *node;
	bool_t flag = FALSE;
	uint32_t nsegs;
	uint32_t idx;

	if (!xdr_bool(xdrs, &flag))
		return (FALSE);

	/* No write list on the wire? */
	if (flag == FALSE) {
		*wlist_exists = FALSE;
		return (TRUE);
	}

	*wlist_exists = TRUE;

	if (!xdr_uint32(xdrs, &nsegs))
		return (FALSE);

	node = *w = clist_alloc();
	for (idx = 0; idx < nsegs; idx++) {
		if (!xdr_uint32(xdrs, &node->c_dmemhandle.mrc_rmr))
			return (FALSE);
		if (!xdr_uint32(xdrs, &node->c_len))
			return (FALSE);

		DTRACE_PROBE1(krpc__i__xdr_decode_wlist_len,
		    uint_t, node->c_len);

		if (!xdr_uint64(xdrs, &node->u.c_daddr))
			return (FALSE);

		/* Grow the list unless this was the final segment. */
		if (idx + 1 < nsegs) {
			node->c_next = clist_alloc();
			node = node->c_next;
		} else {
			node->c_next = NULL;
		}
	}

	/* Consume the list terminator boolean. */
	flag = FALSE;
	if (!xdr_bool(xdrs, &flag))
		return (FALSE);

	return (TRUE);
}
/*
 * Allocate and register a long-reply destination buffer of at least
 * `length` bytes, returning it as a single-entry clist via *clpp.
 *
 * On success *clpp describes the registered RDMA_LONG_BUFFER; when
 * length == 0 no buffer is needed and *clpp is set to NULL.  On any
 * failure *clpp is NULL and CLNT_RDMA_FAIL is returned.
 */
static int
clnt_setup_long_reply(CONN *conn, struct clist **clpp, uint_t length)
{
	if (length == 0) {
		*clpp = NULL;
		return (CLNT_RDMA_SUCCESS);
	}

	*clpp = clist_alloc();

	(*clpp)->rb_longbuf.len = calc_length(length);
	(*clpp)->rb_longbuf.type = RDMA_LONG_BUFFER;

	if (rdma_buf_alloc(conn, &((*clpp)->rb_longbuf))) {
		clist_free(*clpp);
		*clpp = NULL;
		return (CLNT_RDMA_FAIL);
	}

	(*clpp)->u.c_daddr3 = (*clpp)->rb_longbuf.addr;
	(*clpp)->c_len = (*clpp)->rb_longbuf.len;
	(*clpp)->c_next = NULL;
	(*clpp)->c_dmemhandle = (*clpp)->rb_longbuf.handle;

	if (clist_register(conn, *clpp, CLIST_REG_DST)) {
		DTRACE_PROBE(krpc__e__clntrdma__longrep_regbuf);
		rdma_buf_free(conn, &((*clpp)->rb_longbuf));
		clist_free(*clpp);
		/*
		 * BUGFIX: clear the caller's pointer, matching the
		 * rdma_buf_alloc() failure path above.  Leaving *clpp
		 * pointing at the freed clist invited a double free or
		 * use-after-free in the caller's error handling.
		 */
		*clpp = NULL;
		return (CLNT_RDMA_FAIL);
	}

	return (CLNT_RDMA_SUCCESS);
}
/*
 * XDR a reference to a clist backed by the "rdma_clist" kmem cache.
 *
 * XDR_DECODE with a NULL reference allocates a fresh clist for the
 * caller; XDR_FREE runs xdr_clist() over the object and then returns
 * it to the cache, clearing *pp.  XDR_ENCODE requires a non-NULL
 * reference (asserted).
 */
bool_t
xdr_ref_clist(XDR *xdrs, caddr_t *pp)
{
	bool_t res;
	caddr_t obj = *pp;

	if (obj == NULL) {
		if (xdrs->x_op == XDR_FREE)
			return (TRUE);	/* nothing to release */
		if (xdrs->x_op == XDR_DECODE)
			*pp = obj = (caddr_t)clist_alloc();
		if (xdrs->x_op == XDR_ENCODE)
			ASSERT(obj);	/* encoding a NULL clist is a caller bug */
	}

	res = xdr_clist(xdrs, (struct clist *)obj);

	if (xdrs->x_op == XDR_FREE) {
		kmem_cache_free(clist_cache, obj);
		*pp = NULL;
	}

	return (res);
}
/*
 * ENCODE `len` bytes into an RDMA XDR stream.
 *
 * When the stream accepts chunks (XDR_RDMA_CHUNK) and either the data
 * meets the minimum chunk size (xp_min_chunk != 0) or it would not fit
 * in the remaining inline space, the bytes are not copied at all:
 * instead a clist entry recording their address and length is appended
 * to the read-chunk list, to be moved by RDMA later.  Otherwise the
 * bytes are copied inline, failing if the buffer is exhausted.
 * (xp_min_chunk == 0 means the byte stream carries no separable chunks.)
 */
static bool_t
xdrrdma_putbytes(XDR *xdrs, caddr_t addr, int len)
{
	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
	bool_t make_chunk = FALSE;

	/* Decide whether this run of bytes becomes an RDMA chunk. */
	if (xdrp->xp_flags & XDR_RDMA_CHUNK) {
		if (xdrp->xp_min_chunk != 0 && len >= xdrp->xp_min_chunk)
			make_chunk = TRUE;
		else if (xdrs->x_handy - len < 0)
			make_chunk = TRUE;
	}

	if (make_chunk) {
		struct clist *cle;
		int offset = xdrp->xp_offp - xdrs->x_base;

		/* Record the chunk; the data itself is not copied here. */
		cle = clist_alloc();
		cle->c_xdroff = offset;
		cle->c_len = len;
		cle->w.c_saddr = (uint64)(uintptr_t)addr;
		cle->c_next = NULL;

		*(xdrp->xp_rcl_next) = cle;
		xdrp->xp_rcl_next = &(cle->c_next);

		return (TRUE);
	}

	/* Inline path: is there enough space left in the buffer? */
	xdrs->x_handy -= len;
	if (xdrs->x_handy < 0)
		return (FALSE);

	bcopy(addr, xdrp->xp_offp, len);
	xdrp->xp_offp += len;

	return (TRUE);
}
/* ARGSUSED */ static enum clnt_stat clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait) { cku_private_t *p = htop(h); int try_call_again; int refresh_attempt = AUTH_REFRESH_COUNT; int status; int msglen; XDR *call_xdrp, callxdr; /* for xdrrdma encoding the RPC call */ XDR *reply_xdrp, replyxdr; /* for xdrrdma decoding the RPC reply */ XDR *rdmahdr_o_xdrs, *rdmahdr_i_xdrs; struct rpc_msg reply_msg; rdma_registry_t *m; struct clist *cl_sendlist; struct clist *cl_recvlist; struct clist *cl; struct clist *cl_rpcmsg; struct clist *cl_rdma_reply; struct clist *cl_rpcreply_wlist; struct clist *cl_long_reply; rdma_buf_t rndup; uint_t vers; uint_t op; uint_t off; uint32_t seg_array_len; uint_t long_reply_len; uint_t rpcsec_gss; uint_t gss_i_or_p; CONN *conn = NULL; rdma_buf_t clmsg; rdma_buf_t rpcmsg; rdma_chunkinfo_lengths_t rcil; clock_t ticks; bool_t wlist_exists_reply; uint32_t rdma_credit = rdma_bufs_rqst; RCSTAT_INCR(rccalls); call_again: bzero(&clmsg, sizeof (clmsg)); bzero(&rpcmsg, sizeof (rpcmsg)); bzero(&rndup, sizeof (rndup)); try_call_again = 0; cl_sendlist = NULL; cl_recvlist = NULL; cl = NULL; cl_rpcmsg = NULL; cl_rdma_reply = NULL; call_xdrp = NULL; reply_xdrp = NULL; wlist_exists_reply = FALSE; cl_rpcreply_wlist = NULL; cl_long_reply = NULL; rcil.rcil_len = 0; rcil.rcil_len_alt = 0; long_reply_len = 0; rw_enter(&rdma_lock, RW_READER); m = (rdma_registry_t *)p->cku_rd_handle; if (m->r_mod_state == RDMA_MOD_INACTIVE) { /* * If we didn't find a matching RDMA module in the registry * then there is no transport. 
*/ rw_exit(&rdma_lock); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; ticks = clnt_rdma_min_delay * drv_usectohz(1000000); if (h->cl_nosignal == TRUE) { delay(ticks); } else { if (delay_sig(ticks) == EINTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; } } return (RPC_CANTSEND); } /* * Get unique xid */ if (p->cku_xid == 0) p->cku_xid = alloc_xid(); status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_srcaddr, &p->cku_addr, p->cku_addrfmly, p->cku_rd_handle, &conn); rw_exit(&rdma_lock); /* * If there is a problem with the connection reflect the issue * back to the higher level to address, we MAY delay for a short * period so that we are kind to the transport. */ if (conn == NULL) { /* * Connect failed to server. Could be because of one * of several things. In some cases we don't want * the caller to retry immediately - delay before * returning to caller. */ switch (status) { case RDMA_TIMEDOUT: /* * Already timed out. No need to delay * some more. */ p->cku_err.re_status = RPC_TIMEDOUT; p->cku_err.re_errno = ETIMEDOUT; break; case RDMA_INTR: /* * Failed because of an signal. Very likely * the caller will not retry. */ p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; break; default: /* * All other failures - server down or service * down or temporary resource failure. Delay before * returning to caller. 
*/ ticks = clnt_rdma_min_delay * drv_usectohz(1000000); p->cku_err.re_status = RPC_CANTCONNECT; p->cku_err.re_errno = EIO; if (h->cl_nosignal == TRUE) { delay(ticks); } else { if (delay_sig(ticks) == EINTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; } } break; } return (p->cku_err.re_status); } if (p->cku_srcaddr.maxlen < conn->c_laddr.len) { if ((p->cku_srcaddr.maxlen != 0) && (p->cku_srcaddr.buf != NULL)) kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen); p->cku_srcaddr.buf = kmem_zalloc(conn->c_laddr.maxlen, KM_SLEEP); p->cku_srcaddr.maxlen = conn->c_laddr.maxlen; } p->cku_srcaddr.len = conn->c_laddr.len; bcopy(conn->c_laddr.buf, p->cku_srcaddr.buf, conn->c_laddr.len); clnt_check_credit(conn); status = CLNT_RDMA_FAIL; rpcsec_gss = gss_i_or_p = FALSE; if (IS_RPCSEC_GSS(h)) { rpcsec_gss = TRUE; if (rpc_gss_get_service_type(h->cl_auth) == rpc_gss_svc_integrity || rpc_gss_get_service_type(h->cl_auth) == rpc_gss_svc_privacy) gss_i_or_p = TRUE; } /* * Try a regular RDMA message if RPCSEC_GSS is not being used * or if RPCSEC_GSS is being used for authentication only. */ if (rpcsec_gss == FALSE || (rpcsec_gss == TRUE && gss_i_or_p == FALSE)) { /* * Grab a send buffer for the request. Try to * encode it to see if it fits. If not, then it * needs to be sent in a chunk. 
*/ rpcmsg.type = SEND_BUFFER; if (rdma_buf_alloc(conn, &rpcmsg)) { DTRACE_PROBE(krpc__e__clntrdma__callit_nobufs); goto done; } /* First try to encode into regular send buffer */ op = RDMA_MSG; call_xdrp = &callxdr; xdrrdma_create(call_xdrp, rpcmsg.addr, rpcmsg.len, rdma_minchunk, NULL, XDR_ENCODE, conn); status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp, xdr_args, argsp); if (status != CLNT_RDMA_SUCCESS) { /* Clean up from previous encode attempt */ rdma_buf_free(conn, &rpcmsg); XDR_DESTROY(call_xdrp); } else { XDR_CONTROL(call_xdrp, XDR_RDMA_GET_CHUNK_LEN, &rcil); } } /* If the encode didn't work, then try a NOMSG */ if (status != CLNT_RDMA_SUCCESS) { msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT + MAX_AUTH_BYTES + xdr_sizeof(xdr_args, argsp); msglen = calc_length(msglen); /* pick up the lengths for the reply buffer needed */ (void) xdrrdma_sizeof(xdr_args, argsp, 0, &rcil.rcil_len, &rcil.rcil_len_alt); /* * Construct a clist to describe the CHUNK_BUFFER * for the rpcmsg. */ cl_rpcmsg = clist_alloc(); cl_rpcmsg->c_len = msglen; cl_rpcmsg->rb_longbuf.type = RDMA_LONG_BUFFER; cl_rpcmsg->rb_longbuf.len = msglen; if (rdma_buf_alloc(conn, &cl_rpcmsg->rb_longbuf)) { clist_free(cl_rpcmsg); goto done; } cl_rpcmsg->w.c_saddr3 = cl_rpcmsg->rb_longbuf.addr; op = RDMA_NOMSG; call_xdrp = &callxdr; xdrrdma_create(call_xdrp, cl_rpcmsg->rb_longbuf.addr, cl_rpcmsg->rb_longbuf.len, 0, cl_rpcmsg, XDR_ENCODE, conn); status = clnt_compose_rpcmsg(h, procnum, &rpcmsg, call_xdrp, xdr_args, argsp); if (status != CLNT_RDMA_SUCCESS) { p->cku_err.re_status = RPC_CANTENCODEARGS; p->cku_err.re_errno = EIO; DTRACE_PROBE(krpc__e__clntrdma__callit__composemsg); goto done; } } /* * During the XDR_ENCODE we may have "allocated" an RDMA READ or * RDMA WRITE clist. * * First pull the RDMA READ chunk list from the XDR private * area to keep it handy. 
*/ XDR_CONTROL(call_xdrp, XDR_RDMA_GET_RLIST, &cl); if (gss_i_or_p) { long_reply_len = rcil.rcil_len + rcil.rcil_len_alt; long_reply_len += MAX_AUTH_BYTES; } else { long_reply_len = rcil.rcil_len; } /* * Update the chunk size information for the Long RPC msg. */ if (cl && op == RDMA_NOMSG) cl->c_len = p->cku_outsz; /* * Prepare the RDMA header. On success xdrs will hold the result * of xdrmem_create() for a SEND_BUFFER. */ status = clnt_compose_rdma_header(conn, h, &clmsg, &rdmahdr_o_xdrs, &op); if (status != CLNT_RDMA_SUCCESS) { p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; RCSTAT_INCR(rcnomem); DTRACE_PROBE(krpc__e__clntrdma__callit__nobufs2); goto done; } /* * Now insert the RDMA READ list iff present */ status = clnt_setup_rlist(conn, rdmahdr_o_xdrs, call_xdrp); if (status != CLNT_RDMA_SUCCESS) { DTRACE_PROBE(krpc__e__clntrdma__callit__clistreg); rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * Setup RDMA WRITE chunk list for nfs read operation * other operations will have a NULL which will result * as a NULL list in the XDR stream. */ status = clnt_setup_wlist(conn, rdmahdr_o_xdrs, call_xdrp, &rndup); if (status != CLNT_RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * If NULL call and RPCSEC_GSS, provide a chunk such that * large responses can flow back to the client. * If RPCSEC_GSS with integrity or privacy is in use, get chunk. */ if ((procnum == 0 && rpcsec_gss == TRUE) || (rpcsec_gss == TRUE && gss_i_or_p == TRUE)) long_reply_len += 1024; status = clnt_setup_long_reply(conn, &cl_long_reply, long_reply_len); if (status != CLNT_RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * XDR encode the RDMA_REPLY write chunk */ seg_array_len = (cl_long_reply ? 
1 : 0); (void) xdr_encode_reply_wchunk(rdmahdr_o_xdrs, cl_long_reply, seg_array_len); /* * Construct a clist in "sendlist" that represents what we * will push over the wire. * * Start with the RDMA header and clist (if any) */ clist_add(&cl_sendlist, 0, XDR_GETPOS(rdmahdr_o_xdrs), &clmsg.handle, clmsg.addr, NULL, NULL); /* * Put the RPC call message in sendlist if small RPC */ if (op == RDMA_MSG) { clist_add(&cl_sendlist, 0, p->cku_outsz, &rpcmsg.handle, rpcmsg.addr, NULL, NULL); } else { /* Long RPC already in chunk list */ RCSTAT_INCR(rclongrpcs); } /* * Set up a reply buffer ready for the reply */ status = rdma_clnt_postrecv(conn, p->cku_xid); if (status != RDMA_SUCCESS) { rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * sync the memory for dma */ if (cl != NULL) { status = clist_syncmem(conn, cl, CLIST_REG_SOURCE); if (status != RDMA_SUCCESS) { (void) rdma_clnt_postrecv_remove(conn, p->cku_xid); rdma_buf_free(conn, &clmsg); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } } /* * Send the RDMA Header and RPC call message to the server */ status = RDMA_SEND(conn, cl_sendlist, p->cku_xid); if (status != RDMA_SUCCESS) { (void) rdma_clnt_postrecv_remove(conn, p->cku_xid); p->cku_err.re_status = RPC_CANTSEND; p->cku_err.re_errno = EIO; goto done; } /* * RDMA plugin now owns the send msg buffers. * Clear them out and don't free them. 
*/ clmsg.addr = NULL; if (rpcmsg.type == SEND_BUFFER) rpcmsg.addr = NULL; /* * Recv rpc reply */ status = RDMA_RECV(conn, &cl_recvlist, p->cku_xid); /* * Now check recv status */ if (status != 0) { if (status == RDMA_INTR) { p->cku_err.re_status = RPC_INTR; p->cku_err.re_errno = EINTR; RCSTAT_INCR(rcintrs); } else if (status == RPC_TIMEDOUT) { p->cku_err.re_status = RPC_TIMEDOUT; p->cku_err.re_errno = ETIMEDOUT; RCSTAT_INCR(rctimeouts); } else { p->cku_err.re_status = RPC_CANTRECV; p->cku_err.re_errno = EIO; } goto done; } /* * Process the reply message. * * First the chunk list (if any) */ rdmahdr_i_xdrs = &(p->cku_inxdr); xdrmem_create(rdmahdr_i_xdrs, (caddr_t)(uintptr_t)cl_recvlist->w.c_saddr3, cl_recvlist->c_len, XDR_DECODE); /* * Treat xid as opaque (xid is the first entity * in the rpc rdma message). * Skip xid and set the xdr position accordingly. */ XDR_SETPOS(rdmahdr_i_xdrs, sizeof (uint32_t)); (void) xdr_u_int(rdmahdr_i_xdrs, &vers); (void) xdr_u_int(rdmahdr_i_xdrs, &rdma_credit); (void) xdr_u_int(rdmahdr_i_xdrs, &op); (void) xdr_do_clist(rdmahdr_i_xdrs, &cl); clnt_update_credit(conn, rdma_credit); wlist_exists_reply = FALSE; if (! xdr_decode_wlist(rdmahdr_i_xdrs, &cl_rpcreply_wlist, &wlist_exists_reply)) { DTRACE_PROBE(krpc__e__clntrdma__callit__wlist_decode); p->cku_err.re_status = RPC_CANTDECODERES; p->cku_err.re_errno = EIO; goto done; } /* * The server shouldn't have sent a RDMA_SEND that * the client needs to RDMA_WRITE a reply back to * the server. So silently ignoring what the * server returns in the rdma_reply section of the * header. 
*/ (void) xdr_decode_reply_wchunk(rdmahdr_i_xdrs, &cl_rdma_reply); off = xdr_getpos(rdmahdr_i_xdrs); clnt_decode_long_reply(conn, cl_long_reply, cl_rdma_reply, &replyxdr, &reply_xdrp, cl, cl_recvlist, op, off); if (reply_xdrp == NULL) goto done; if (wlist_exists_reply) { XDR_CONTROL(reply_xdrp, XDR_RDMA_SET_WLIST, cl_rpcreply_wlist); } reply_msg.rm_direction = REPLY; reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; reply_msg.acpted_rply.ar_stat = SUCCESS; reply_msg.acpted_rply.ar_verf = _null_auth; /* * xdr_results will be done in AUTH_UNWRAP. */ reply_msg.acpted_rply.ar_results.where = NULL; reply_msg.acpted_rply.ar_results.proc = xdr_void; /* * Decode and validate the response. */ if (xdr_replymsg(reply_xdrp, &reply_msg)) { enum clnt_stat re_status; _seterr_reply(&reply_msg, &(p->cku_err)); re_status = p->cku_err.re_status; if (re_status == RPC_SUCCESS) { /* * Reply is good, check auth. */ if (!AUTH_VALIDATE(h->cl_auth, &reply_msg.acpted_rply.ar_verf)) { p->cku_err.re_status = RPC_AUTHERROR; p->cku_err.re_why = AUTH_INVALIDRESP; RCSTAT_INCR(rcbadverfs); DTRACE_PROBE( krpc__e__clntrdma__callit__authvalidate); } else if (!AUTH_UNWRAP(h->cl_auth, reply_xdrp, xdr_results, resultsp)) { p->cku_err.re_status = RPC_CANTDECODERES; p->cku_err.re_errno = EIO; DTRACE_PROBE( krpc__e__clntrdma__callit__authunwrap); } } else { /* set errno in case we can't recover */ if (re_status != RPC_VERSMISMATCH && re_status != RPC_AUTHERROR && re_status != RPC_PROGVERSMISMATCH) p->cku_err.re_errno = EIO; if (re_status == RPC_AUTHERROR) { if ((refresh_attempt > 0) && AUTH_REFRESH(h->cl_auth, &reply_msg, p->cku_cred)) { refresh_attempt--; try_call_again = 1; goto done; } try_call_again = 0; /* * We have used the client handle to * do an AUTH_REFRESH and the RPC status may * be set to RPC_SUCCESS; Let's make sure to * set it to RPC_AUTHERROR. 
*/ p->cku_err.re_status = RPC_AUTHERROR; /* * Map recoverable and unrecoverable * authentication errors to appropriate * errno */ switch (p->cku_err.re_why) { case AUTH_BADCRED: case AUTH_BADVERF: case AUTH_INVALIDRESP: case AUTH_TOOWEAK: case AUTH_FAILED: case RPCSEC_GSS_NOCRED: case RPCSEC_GSS_FAILED: p->cku_err.re_errno = EACCES; break; case AUTH_REJECTEDCRED: case AUTH_REJECTEDVERF: default: p->cku_err.re_errno = EIO; break; } } DTRACE_PROBE1(krpc__e__clntrdma__callit__rpcfailed, int, p->cku_err.re_why); } } else {
/*
 * Build and encode the RDMA WRITE chunk list for an outbound call.
 *
 * If the call stream has no write chunk list (xp_wcl == NULL) we simply
 * encode a FALSE marker into the header stream.  Otherwise the clist is
 * RDMA-registered and encoded.  When the total write length is not
 * 4-byte aligned, a trailing "roundup" chunk is appended so the
 * server's rounded-up write has somewhere to land; that buffer is
 * handed back to the caller via rndbuf for later release.
 *
 * Returns CLNT_RDMA_SUCCESS or CLNT_RDMA_FAIL.
 */
static int
clnt_setup_wlist(CONN *conn, XDR *xdrs, XDR *call_xdrp, rdma_buf_t *rndbuf)
{
	int status;
	struct clist *wlist, *rndcl;
	int wlen;
	/*
	 * BUGFIX: rndlen must be initialized.  It is tested in the
	 * xdr_encode_wlist() failure path below even when wlist == NULL,
	 * in which case it was previously read uninitialized (UB).
	 */
	int rndlen = 0;
	int32_t xdr_flag = XDR_RDMA_WLIST_REG;

	XDR_CONTROL(call_xdrp, XDR_RDMA_GET_WLIST, &wlist);

	if (wlist != NULL) {
		/*
		 * If we are sending a non 4-byte aligned length the
		 * server will round the length up to a 4-byte boundary.
		 * In such a case, a trailing chunk is added to absorb
		 * the spill-over roundup bytes.
		 */
		wlen = clist_len(wlist);
		rndlen = (roundup(wlen, BYTES_PER_XDR_UNIT) - wlen);
		if (rndlen) {
			rndcl = clist_alloc();
			/*
			 * calc_length() will allocate a PAGESIZE
			 * buffer below.
			 */
			rndcl->c_len = calc_length(rndlen);
			rndcl->rb_longbuf.type = RDMA_LONG_BUFFER;
			rndcl->rb_longbuf.len = rndcl->c_len;
			if (rdma_buf_alloc(conn, &rndcl->rb_longbuf)) {
				clist_free(rndcl);
				return (CLNT_RDMA_FAIL);
			}

			/* Roundup buffer freed back in caller */
			*rndbuf = rndcl->rb_longbuf;

			rndcl->u.c_daddr3 = rndcl->rb_longbuf.addr;
			rndcl->c_next = NULL;
			rndcl->c_dmemhandle = rndcl->rb_longbuf.handle;
			wlist->c_next = rndcl;
		}

		status = clist_register(conn, wlist, CLIST_REG_DST);
		if (status != RDMA_SUCCESS) {
			rdma_buf_free(conn, rndbuf);
			bzero(rndbuf, sizeof (rdma_buf_t));
			return (CLNT_RDMA_FAIL);
		}
		XDR_CONTROL(call_xdrp, XDR_RDMA_SET_FLAGS, &xdr_flag);
	}

	if (!xdr_encode_wlist(xdrs, wlist)) {
		/* Only a roundup buffer (if any) needs cleanup here. */
		if (rndlen) {
			rdma_buf_free(conn, rndbuf);
			bzero(rndbuf, sizeof (rdma_buf_t));
		}
		return (CLNT_RDMA_FAIL);
	}

	return (CLNT_RDMA_SUCCESS);
}
/* ioctl(2) ※file_operations->unlocked_ioctl対応 */ static long clbench_ioctl(struct file *flip, unsigned int cmd, unsigned long arg) { int retval = -1; struct pid *p; struct task_struct *t; struct ioc_submit_spec submit_spec; switch(cmd){ case IOC_USEREND_NOTIFY: /* USEREND_NOTIFYがioctl(2)される前にユーザ側でsleep(PERIOD)してくれている */ /* signal送信を止める処理 */ if(sigspec.sr_status == SIG_READY){ int nr_objs, nr_first, nr_burst; sigspec.sr_status = SIGRESET_REQUEST; printk(KERN_INFO "%s : IOC_USEREND_NOTIFY recieved\n", log_prefix); /* ユーザに通知してユーザにread(2)してもらう */ nr_objs = clist_set_end(clist_ctl, &nr_first, &nr_burst); nr_objs += nr_first + (nr_burst * clist_ctl->nr_composed); put_user(nr_objs, (unsigned int __user *)arg); retval = 1; } else{ printk(KERN_INFO "%s : IOC_USEREND_NOTIFY was regarded\n", log_prefix); retval = -EPERM; } break; case IOC_SIGRESET_REQUEST: /* シグナルを止める処理 */ if(sigspec.sr_status == SIG_READY){ sigspec.sr_status = SIGRESET_REQUEST; printk(KERN_INFO "%s : IOC_SIGRESET_REQUES recieved\n", log_prefix); retval = 1; } else{ printk(KERN_INFO "%s : IOC_SIGRESET_REQUEST was regarded\n", log_prefix); retval = -EPERM; } break; case IOC_SUBMIT_SPEC: copy_from_user(&submit_spec, (struct ioc_submit_spec __user *)arg, sizeof(struct ioc_submit_spec)); printk(KERN_INFO "%s : IOC_SET_SPEC pid:%d, flush_period:%d signo:%d nr_node:%d node_nr_cmposed:%d\n", log_prefix, submit_spec.pid, submit_spec.flush_period, submit_spec.signo, submit_spec.nr_node, submit_spec.node_nr_composed); /* pidの準備 */ p = find_vpid(submit_spec.pid); t = pid_task(p, PIDTYPE_PID); sigspec.t = t; sigspec.info.si_errno = 0; sigspec.info.si_code = SI_KERNEL; sigspec.info.si_pid = 0; sigspec.info.si_uid = 0; /* signoの準備 */ sigspec.signo = submit_spec.signo; sigspec.info.si_signo = submit_spec.signo; /* flush_periodの準備 */ sigspec.flush_period = submit_spec.flush_period; /* 準備完了 */ sigspec.sr_status = SIG_READY; printk(KERN_INFO "%s : signal ready, object-size is %ld byte\n", log_prefix, sizeof(struct object)); 
clist_ctl = clist_alloc(submit_spec.nr_node, submit_spec.node_nr_composed, sizeof(struct object)); if(clist_ctl == NULL){ /* エラー処理 */ printk(KERN_INFO "%s : clist_alloc() failed returned NULL\n"); retval = -ENOMEM; } else{ mod_timer(&sigspec.flush_timer, jiffies + msecs_to_jiffies(sigspec.flush_period)); printk(KERN_INFO "%s : device setup complete\n", log_prefix); retval = 1; } break; } return retval; }
/*
 * XDR_CONTROL handler for RDMA XDR streams.
 *
 * Services generic requests (XDR_PEEK, XDR_SKIPBYTES) against the inline
 * buffer, and RDMA-specific requests that get/set stream flags, report
 * reply-chunk length estimates, build write chunk lists from uio or
 * address descriptions (XDR_RDMA_ADD_CHUNK), and expose the stream's
 * read/write lists and connection.  Returns TRUE on success, FALSE for
 * an unknown request or an out-of-bounds PEEK/SKIPBYTES.
 */
static bool_t xdrrdma_control(XDR *xdrs, int request, void *info) { int32_t *int32p; int len, i; uint_t in_flags; xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private); rdma_chunkinfo_t *rcip = NULL; rdma_wlist_conn_info_t *rwcip = NULL; rdma_chunkinfo_lengths_t *rcilp = NULL; struct uio *uiop; struct clist *rwl = NULL, *first = NULL; struct clist *prev = NULL; switch (request) { case XDR_PEEK: /*
	 * Return the next 4 byte unit in the XDR stream without
	 * consuming it (network to host byte order).
	 */ if (xdrs->x_handy < sizeof (int32_t)) return (FALSE); int32p = (int32_t *)info; *int32p = (int32_t)ntohl((uint32_t) (*((int32_t *)(xdrp->xp_offp)))); return (TRUE); case XDR_SKIPBYTES: /*
	 * Skip the next N bytes in the XDR stream (rounded up to the
	 * 4-byte XDR unit).
	 */ int32p = (int32_t *)info; len = RNDUP((int)(*int32p)); if ((xdrs->x_handy -= len) < 0) return (FALSE); xdrp->xp_offp += len; return (TRUE); case XDR_RDMA_SET_FLAGS: /*
	 * OR the flags provided in *info into xp_flags for rdma
	 * xdr stream control.
	 */ int32p = (int32_t *)info; in_flags = (uint_t)(*int32p); xdrp->xp_flags |= in_flags; return (TRUE); case XDR_RDMA_GET_FLAGS: /*
	 * Get the flags provided in xp_flags return through *info
	 */ int32p = (int32_t *)info; *int32p = (int32_t)xdrp->xp_flags; return (TRUE); case XDR_RDMA_GET_CHUNK_LEN: /* Report accumulated reply-chunk length estimates. */ rcilp = (rdma_chunkinfo_lengths_t *)info; rcilp->rcil_len = xdrp->xp_reply_chunk_len; rcilp->rcil_len_alt = xdrp->xp_reply_chunk_len_alt; return (TRUE); case XDR_RDMA_ADD_CHUNK: /*
	 * Store wlist information describing where reply data may be
	 * RDMA-written.
	 */ rcip = (rdma_chunkinfo_t *)info; DTRACE_PROBE2(krpc__i__xdrrdma__control__add__chunk, rci_type_t, rcip->rci_type, uint32, rcip->rci_len); switch (rcip->rci_type) { case RCI_WRITE_UIO_CHUNK: xdrp->xp_reply_chunk_len_alt += rcip->rci_len; /* Too small to be worth a chunk: leave wlist empty. */ if ((rcip->rci_len + XDR_RDMA_BUF_OVERHEAD) < xdrp->xp_min_chunk) { xdrp->xp_wcl = NULL; *(rcip->rci_clpp) = NULL; return (TRUE); } /*
		 * Build one clist entry per iovec.
		 * NOTE(review): assumes uio_iovcnt >= 1 — with an empty
		 * uio the rwl->c_next store below would dereference NULL.
		 */ uiop = rcip->rci_a.rci_uiop; for (i = 0; i < uiop->uio_iovcnt; i++) { rwl = clist_alloc(); if (first == NULL) first = rwl; rwl->c_len = uiop->uio_iov[i].iov_len; rwl->u.c_daddr =
    (uint64)(uintptr_t)
    (uiop->uio_iov[i].iov_base); /*
			 * if userspace address, put adspace ptr in
			 * clist. If not, then do nothing since it's
			 * already set to NULL (from kmem_zalloc)
			 */ if (uiop->uio_segflg == UIO_USERSPACE) { rwl->c_adspc = ttoproc(curthread)->p_as; } if (prev == NULL) prev = rwl; else { prev->c_next = rwl; prev = rwl; } } rwl->c_next = NULL; xdrp->xp_wcl = first; *(rcip->rci_clpp) = first; break; case RCI_WRITE_ADDR_CHUNK: /* Single chunk described directly by address/length. */ rwl = clist_alloc(); rwl->c_len = rcip->rci_len; rwl->u.c_daddr3 = rcip->rci_a.rci_addr; rwl->c_next = NULL; xdrp->xp_reply_chunk_len_alt += rcip->rci_len; xdrp->xp_wcl = rwl; *(rcip->rci_clpp) = rwl; break; case RCI_REPLY_CHUNK: /* Only the length estimate is tracked for reply chunks. */ xdrp->xp_reply_chunk_len += rcip->rci_len; break; } return (TRUE); case XDR_RDMA_GET_WLIST: *((struct clist **)info) = xdrp->xp_wcl; return (TRUE); case XDR_RDMA_SET_WLIST: /* NOTE(review): takes ownership of the caller's clist pointer. */ xdrp->xp_wcl = (struct clist *)info; return (TRUE); case XDR_RDMA_GET_RLIST: *((struct clist **)info) = xdrp->xp_rcl; return (TRUE); case XDR_RDMA_GET_WCINFO: rwcip = (rdma_wlist_conn_info_t *)info; rwcip->rwci_wlist = xdrp->xp_wcl; rwcip->rwci_conn = xdrp->xp_conn; return (TRUE); default: return (FALSE); } }
/*
 * Consume an XDR-counted opaque block whose data arrives via RDMA READ
 * chunks rather than inline.
 *
 * Decodes the length (into *sizep, bounded by maxsize), exposes the
 * stream's connection through *conn, then walks the pending read chunk
 * list, assigning each chunk a destination offset and appending a copy
 * to *rlist for the later RDMA READ.  If only part of the final chunk
 * is consumed, that chunk's source address/length are advanced past the
 * consumed portion.  Returns FALSE on decode failure, oversize data, a
 * missing/misaligned chunk, or a chunk list shorter than the data.
 */
bool_t
xdrrdma_getrdmablk(XDR *xdrs, struct clist **rlist, uint_t *sizep,
    CONN **conn, const uint_t maxsize)
{
	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct clist *cle = *(xdrp->xp_rcl_next);
	struct clist *rdclist = NULL, *prev = NULL;
	bool_t retval = TRUE;
	uint32_t cur_offset = 0;
	uint32_t total_segments = 0;
	uint32_t actual_segments = 0;
	uint32_t alen;	/* nonzero = original length of a partially-used chunk */
	uint_t total_len;

	ASSERT(xdrs->x_op != XDR_FREE);

	/*
	 * first deal with the length since xdr bytes are counted
	 */
	if (!xdr_u_int(xdrs, sizep)) {
		DTRACE_PROBE(xdr__e__getrdmablk_sizep_fail);
		return (FALSE);
	}
	total_len = *sizep;
	if (total_len > maxsize) {
		DTRACE_PROBE2(xdr__e__getrdmablk_bad_size,
		    int, total_len, int, maxsize);
		return (FALSE);
	}
	(*conn) = xdrp->xp_conn;

	/*
	 * if no data we are done
	 */
	if (total_len == 0)
		return (TRUE);

	/* Count the chunks still pending on the read chunk list. */
	while (cle) {
		total_segments++;
		cle = cle->c_next;
	}

	cle = *(xdrp->xp_rcl_next);

	/*
	 * If there was a chunk at the current offset, then setup a read
	 * chunk list which records the destination address and length
	 * and will RDMA READ the data in later.
	 */
	if (cle == NULL)
		return (FALSE);

	/* The chunk must sit exactly at the current XDR offset. */
	if (cle->c_xdroff != (xdrp->xp_offp - xdrs->x_base))
		return (FALSE);

	/*
	 * Setup the chunk list with appropriate
	 * address (offset) and length
	 */
	for (actual_segments = 0;
	    actual_segments < total_segments; actual_segments++) {
		DTRACE_PROBE3(krpc__i__xdrrdma_getrdmablk, uint32_t, cle->c_len,
		    uint32_t, total_len, uint32_t, cle->c_xdroff);
		/* total_len is unsigned, so this only triggers at zero. */
		if (total_len <= 0)
			break;
		/*
		 * not the first time in the loop
		 */
		if (actual_segments > 0)
			cle = cle->c_next;
		cle->u.c_daddr = (uint64) cur_offset;
		alen = 0;
		/* Chunk larger than remaining data: use only a prefix. */
		if (cle->c_len > total_len) {
			alen = cle->c_len;
			cle->c_len = total_len;
		}
		/* Fully-consumed chunk: advance the pending-list cursor. */
		if (!alen)
			xdrp->xp_rcl_next = &cle->c_next;

		cur_offset += cle->c_len;
		total_len -= cle->c_len;

		/* Last segment but data remains: chunk list too short. */
		if ((total_segments - actual_segments - 1) == 0 &&
		    total_len > 0) {
			DTRACE_PROBE(krpc__e__xdrrdma_getblk_chunktooshort);
			retval = FALSE;
		}

		/* Data exhausted with segments left over: trace only. */
		if ((total_segments - actual_segments - 1) > 0 &&
		    total_len == 0) {
			DTRACE_PROBE2(krpc__e__xdrrdma_getblk_toobig,
			    int, total_segments, int, actual_segments);
		}

		/* Append a copy of this chunk to the caller's read list. */
		rdclist = clist_alloc();
		(*rdclist) = (*cle);
		if ((*rlist) == NULL)
			(*rlist) = rdclist;
		if (prev == NULL)
			prev = rdclist;
		else {
			prev->c_next = rdclist;
			prev = rdclist;
		}
	}

/* NOTE(review): label appears unused — no goto targets it in this view. */
out:
	if (prev != NULL)
		prev->c_next = NULL;

	/*
	 * Adjust the chunk length, if we read only a part of
	 * a chunk.  (alen is always assigned before use: the loop body
	 * runs at least once since cle != NULL and total_len > 0 here.)
	 */
	if (alen) {
		cle->w.c_saddr =
		    (uint64)(uintptr_t)cle->w.c_saddr + cle->c_len;
		cle->c_len = alen - cle->c_len;
	}

	return (retval);
}
/*
 * Pull the next RDMA READ chunk from the client and splice its contents
 * into this XDR stream as the new inline decode buffer.
 *
 * Chunks are fetched on demand, one per call: a private copy of the
 * chunk at the current XDR offset is made, any 4-byte XDR roundup
 * padding preceding it is skipped, and xdrrdma_read_from_client() fills
 * a long buffer that then backs x_base/x_handy.  The copy is kept on
 * xp_rcl_xdr so xdrrdma_destroy() can free it later.  Returns FALSE if
 * no chunk sits at the current offset or the client read fails.
 */
static bool_t
xdrrdma_read_a_chunk(XDR *xdrs, CONN **conn)
{
	int status;
	int32_t len = 0;
	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct clist *cle = *(xdrp->xp_rcl_next);
	struct clist *rclp = xdrp->xp_rcl;
	struct clist *clp;

	/*
	 * len is used later to decide xdr offset in
	 * the chunk factoring any 4-byte XDR alignment
	 * (See read chunk example top of this file)
	 */
	while (rclp != cle) {
		len += rclp->c_len;
		rclp = rclp->c_next;
	}

	/* Bytes of roundup padding between prior chunks and this one. */
	len = RNDUP(len) - len;

	/* Caller must have exhausted the current inline buffer. */
	ASSERT(xdrs->x_handy <= 0);

	/*
	 * If this is the first chunk to contain the RPC
	 * message set xp_off to the xdr offset of the
	 * inline message.
	 */
	if (xdrp->xp_off == 0)
		xdrp->xp_off = (xdrp->xp_offp - xdrs->x_base);

	if (cle == NULL || (cle->c_xdroff != xdrp->xp_off))
		return (FALSE);

	/*
	 * Make a copy of the chunk to read from client.
	 * Chunks are read on demand, so read only one
	 * for now.
	 */
	rclp = clist_alloc();
	*rclp = *cle;
	rclp->c_next = NULL;

	/* Advance the pending-chunk cursor past the chunk we consumed. */
	xdrp->xp_rcl_next = &cle->c_next;

	/*
	 * If there is a roundup present, then skip those
	 * bytes when reading.
	 */
	if (len) {
		rclp->w.c_saddr =
		    (uint64)(uintptr_t)rclp->w.c_saddr + len;
		rclp->c_len = rclp->c_len - len;
	}

	status = xdrrdma_read_from_client(rclp, conn, rclp->c_len);

	if (status == FALSE) {
		clist_free(rclp);
		return (status);
	}

	/* Repoint the XDR stream at the freshly-read buffer. */
	xdrp->xp_offp = rclp->rb_longbuf.addr;
	xdrs->x_base = xdrp->xp_offp;
	xdrs->x_handy = rclp->c_len;

	/*
	 * This copy of read chunks containing the XDR
	 * message is freed later in xdrrdma_destroy()
	 */
	if (xdrp->xp_rcl_xdr) {
		/* Add the chunk to end of the list */
		clp = xdrp->xp_rcl_xdr;
		while (clp->c_next != NULL)
			clp = clp->c_next;
		clp->c_next = rclp;
	} else {
		xdrp->xp_rcl_xdr = rclp;
	}
	return (TRUE);
}
/*
 * RDMA WRITE the read-reply data described by the write chunk list wcl
 * back to the client, transferring only data_len bytes and eliding any
 * trailing roundup-only chunk.
 *
 * The whole buffer is registered once via the first chunk; every other
 * chunk reuses that memory handle.  If a trailing chunk holds exactly
 * the roundup bytes, a copy of the list without it is built so no RDMA
 * WRITE is issued for padding alone.  Returns TRUE on success (or when
 * the caller is merely sizing the stream), FALSE on register/sync/write
 * failure.
 */
bool_t
xdrrdma_send_read_data(XDR *xdrs, uint_t data_len, struct clist *wcl)
{
	int status;
	xrdma_private_t *xdrp = (xrdma_private_t *)(xdrs->x_private);
	struct xdr_ops *xops = xdrrdma_xops();
	struct clist *tcl, *wrcl, *cl;
	struct clist fcl;
	int rndup_present, rnduplen;

	rndup_present = 0;
	wrcl = NULL;

	/* caller is doing a sizeof */
	if (xdrs->x_ops != &xdrrdma_ops || xdrs->x_ops == xops)
		return (TRUE);

	/* copy of the first chunk */
	fcl = *wcl;
	fcl.c_next = NULL;

	/*
	 * The entire buffer is registered with the first chunk.
	 * Later chunks will use the same registered memory handle.
	 */
	status = clist_register(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
	if (status != RDMA_SUCCESS) {
		return (FALSE);
	}

	/* Propagate the registration back into the caller's list head. */
	wcl->c_regtype = CLIST_REG_SOURCE;
	wcl->c_smemhandle = fcl.c_smemhandle;
	wcl->c_ssynchandle = fcl.c_ssynchandle;

	/*
	 * Only transfer the read data ignoring any trailing
	 * roundup chunks. A bit of work, but it saves an
	 * unnecessary extra RDMA_WRITE containing only
	 * roundup bytes.
	 */
	rnduplen = clist_len(wcl) - data_len;
	if (rnduplen) {
		tcl = wcl->c_next;
		/*
		 * Check if there is a trailing roundup chunk
		 */
		while (tcl) {
			if ((tcl->c_next == NULL) &&
			    (tcl->c_len == rnduplen)) {
				rndup_present = 1;
				break;
			}
			tcl = tcl->c_next;
		}

		/*
		 * Make a copy chunk list skipping the last chunk.
		 * NOTE(review): the cl->c_next dereference after the
		 * advance is safe here because rndup_present implies the
		 * list has at least two chunks, so the break fires
		 * before cl can become NULL.
		 */
		if (rndup_present) {
			cl = wcl;
			tcl = NULL;
			while (cl) {
				if (tcl == NULL) {
					tcl = clist_alloc();
					wrcl = tcl;
				} else {
					tcl->c_next = clist_alloc();
					tcl = tcl->c_next;
				}

				*tcl = *cl;
				cl = cl->c_next;
				/* last chunk */
				if (cl->c_next == NULL)
					break;
			}
			tcl->c_next = NULL;
		}
	}

	if (wrcl == NULL) {
		/* No roundup chunks */
		wrcl = wcl;
	}

	/*
	 * Set the registered memory handles for the
	 * rest of the chunks same as the first chunk.
	 */
	tcl = wrcl->c_next;
	while (tcl) {
		tcl->c_smemhandle = fcl.c_smemhandle;
		tcl->c_ssynchandle = fcl.c_ssynchandle;
		tcl = tcl->c_next;
	}

	/*
	 * Sync the total len beginning from the first chunk.
	 */
	fcl.c_len = clist_len(wrcl);
	status = clist_syncmem(xdrp->xp_conn, &fcl, CLIST_REG_SOURCE);
	if (status != RDMA_SUCCESS) {
		return (FALSE);
	}

	status = RDMA_WRITE(xdrp->xp_conn, wrcl, WAIT);

	/* The trimmed copy (if built) is no longer needed. */
	if (rndup_present)
		clist_free(wrcl);

	if (status != RDMA_SUCCESS) {
		return (FALSE);
	}
	return (TRUE);
}
/*
 * Server side RDMA WRITE list decode.
 * XDR context is memory ops.
 *
 * Decodes the client's optional write chunk list, clamping each segment
 * to MAX_SVC_XFER_SIZE, then allocates one long buffer big enough for
 * the whole list (at least WCL_BUF_LEN) on `conn` and carves per-chunk
 * source addresses out of it.  Outputs: *wclp = decoded list (NULL when
 * absent), *wwl = whether a list was present, *total_length = summed
 * (clamped) segment lengths.  Returns FALSE on decode or allocation
 * failure, freeing any partially-built list.
 */
bool_t
xdr_decode_wlist_svc(XDR *xdrs, struct clist **wclp, bool_t *wwl,
    uint32_t *total_length, CONN *conn)
{
	struct clist *first, *ncl;
	char *memp;
	uint32_t num_wclist;
	uint32_t wcl_length = 0;
	uint32_t i;
	bool_t more = FALSE;

	*wclp = NULL;
	*wwl = FALSE;
	*total_length = 0;

	/* Leading boolean: is a write list present? */
	if (!xdr_bool(xdrs, &more)) {
		return (FALSE);
	}

	if (more == FALSE) {
		return (TRUE);
	}

	*wwl = TRUE;

	if (!xdr_uint32(xdrs, &num_wclist)) {
		DTRACE_PROBE(krpc__e__xdrrdma__wlistsvc__listlength);
		return (FALSE);
	}

	/*
	 * NOTE(review): num_wclist == 0 is not rejected here; in that
	 * case `first` is returned with its segment fields never
	 * decoded — confirm callers tolerate this.
	 */
	first = ncl = clist_alloc();

	/* Pass 1: decode <rkey, length, remote address> per segment. */
	for (i = 0; i < num_wclist; i++) {
		if (!xdr_uint32(xdrs, &ncl->c_dmemhandle.mrc_rmr))
			goto err_out;
		if (!xdr_uint32(xdrs, &ncl->c_len))
			goto err_out;
		if (!xdr_uint64(xdrs, &ncl->u.c_daddr))
			goto err_out;

		/* Clamp oversized segments to the server transfer limit. */
		if (ncl->c_len > MAX_SVC_XFER_SIZE) {
			DTRACE_PROBE(
			    krpc__e__xdrrdma__wlistsvc__chunklist_toobig);
			ncl->c_len = MAX_SVC_XFER_SIZE;
		}

		DTRACE_PROBE1(krpc__i__xdr_decode_wlist_svc_len,
		    uint_t, ncl->c_len);

		wcl_length += ncl->c_len;

		if (i < num_wclist - 1) {
			ncl->c_next = clist_alloc();
			ncl = ncl->c_next;
		}
	}

	/* Consume the list terminator boolean. */
	if (!xdr_bool(xdrs, &more))
		goto err_out;

	/* One backing buffer services the entire list. */
	first->rb_longbuf.type = RDMA_LONG_BUFFER;
	first->rb_longbuf.len =
	    wcl_length > WCL_BUF_LEN ? wcl_length : WCL_BUF_LEN;

	if (rdma_buf_alloc(conn, &first->rb_longbuf)) {
		clist_free(first);
		return (FALSE);
	}

	/* Pass 2: carve per-chunk source addresses out of the buffer. */
	memp = first->rb_longbuf.addr;

	ncl = first;
	for (i = 0; i < num_wclist; i++) {
		ncl->w.c_saddr3 = (caddr_t)memp;
		memp += ncl->c_len;
		ncl = ncl->c_next;
	}

	*wclp = first;
	*total_length = wcl_length;

	return (TRUE);

err_out:
	clist_free(first);
	return (FALSE);
}