/** * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses * and returns the length of resulting physical address array (may be less than * the original due to possible compaction). * * we build a "page vec" under the assumption that the SG meets the RDMA * alignment requirements. Other then the first and last SG elements, all * the "internal" elements can be compacted into a list whose elements are * dma addresses of physical pages. The code supports also the weird case * where --few fragments of the same page-- are present in the SG as * consecutive elements. Also, it handles one entry SG. */ static int iser_sg_to_page_vec(struct iser_data_buf *data, struct iser_page_vec *page_vec, struct ib_device *ibdev) { struct scatterlist *sg = (struct scatterlist *)data->buf; u64 first_addr, last_addr, page; int end_aligned; unsigned int cur_page = 0; unsigned long total_sz = 0; int i; /* compute the offset of first element */ page_vec->offset = (u64) sg[0].offset & ~MASK_4K; for (i = 0; i < data->dma_nents; i++) { unsigned int dma_len = ib_sg_dma_len(ibdev, &sg[i]); total_sz += dma_len; first_addr = ib_sg_dma_address(ibdev, &sg[i]); last_addr = first_addr + dma_len; end_aligned = !(last_addr & ~MASK_4K); /* continue to collect page fragments till aligned or SG ends */ while (!end_aligned && (i + 1 < data->dma_nents)) { i++; dma_len = ib_sg_dma_len(ibdev, &sg[i]); total_sz += dma_len; last_addr = ib_sg_dma_address(ibdev, &sg[i]) + dma_len; end_aligned = !(last_addr & ~MASK_4K); } /* handle the 1st page in the 1st DMA element */ if (cur_page == 0) { page = first_addr & MASK_4K; page_vec->pages[cur_page] = page; cur_page++; page += SIZE_4K; } else page = first_addr; for (; page < last_addr; page += SIZE_4K) { page_vec->pages[cur_page] = page; cur_page++; } } page_vec->data_size = total_sz; iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); return cur_page; }
/*
 * iser_reg_dma - describe a single DMA-mapped SG entry without registration
 *
 * Fills @reg from the first scatterlist entry of @mem: the SGE gets the
 * entry's DMA address and length together with the PD's local DMA lkey.
 * The rkey is the PD's unsafe global rkey when the PD carries
 * IB_PD_UNSAFE_GLOBAL_RKEY, and 0 otherwise.
 *
 * Always returns 0.
 */
static int iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
			struct iser_mem_reg *reg)
{
	struct scatterlist *first = &mem->sg[0];

	reg->sge.addr = ib_sg_dma_address(device->ib_device, first);
	reg->sge.length = ib_sg_dma_len(device->ib_device, first);
	reg->sge.lkey = device->pd->local_dma_lkey;

	/*
	 * FIXME: rework the registration code path to differentiate
	 * rkey/lkey use cases
	 */
	reg->rkey = (device->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) ?
			device->pd->unsafe_global_rkey : 0;

	iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
		 " length=0x%x\n", reg->sge.lkey, reg->rkey,
		 reg->sge.addr, reg->sge.length);

	return 0;
}
/*
 * rdma_rw_init_single_wr - set up one RDMA READ/WRITE WR for a single SG entry
 *
 * The embedded SGE covers the DMA range of @sg starting @offset bytes in and
 * uses the PD's local DMA lkey. The work request is zeroed and then pointed
 * at that SGE; the opcode is RDMA WRITE for DMA_TO_DEVICE and RDMA READ for
 * any other direction.
 *
 * Marks the context as RDMA_RW_SINGLE_WR and returns 1, which is also the
 * value stored in ctx->nr_ops.
 */
static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey,
		enum dma_data_direction dir)
{
	struct ib_device *ibdev = qp->pd->device;
	struct ib_rdma_wr *wr = &ctx->single.wr;

	ctx->nr_ops = 1;

	/* local side: the single SG entry, skipping the first @offset bytes */
	ctx->single.sge.lkey = qp->pd->local_dma_lkey;
	ctx->single.sge.addr = ib_sg_dma_address(ibdev, sg) + offset;
	ctx->single.sge.length = ib_sg_dma_len(ibdev, sg) - offset;

	memset(wr, 0, sizeof(*wr));
	wr->wr.opcode = (dir == DMA_TO_DEVICE) ?
			IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
	wr->wr.num_sge = 1;
	wr->wr.sg_list = &ctx->single.sge;

	/* remote side */
	wr->rkey = rkey;
	wr->remote_addr = remote_addr;

	ctx->type = RDMA_RW_SINGLE_WR;
	return 1;
}
/*
 * rdma_rw_init_map_wrs - build a chain of RDMA READ/WRITE WRs over an SG list
 *
 * Allocates one ib_sge per SG entry and one WR per group of at most
 * rdma_rw_max_sge() entries. Each WR targets @remote_addr plus the number of
 * bytes covered by the preceding WRs, and the WRs are linked through
 * wr.next. @offset is applied to the first SG entry only.
 *
 * Returns the number of WRs (ctx->nr_ops) on success and marks the context
 * RDMA_RW_MULTI_WR, or returns -ENOMEM if either allocation fails.
 */
static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
		struct scatterlist *sg, u32 sg_cnt, u32 offset,
		u64 remote_addr, u32 rkey, enum dma_data_direction dir)
{
	struct ib_device *ibdev = qp->pd->device;
	u32 sge_limit = rdma_rw_max_sge(ibdev, dir);
	struct ib_sge *cur;
	u32 bytes_done = 0, op, n;

	ctx->nr_ops = DIV_ROUND_UP(sg_cnt, sge_limit);

	cur = kcalloc(sg_cnt, sizeof(*cur), GFP_KERNEL);
	ctx->map.sges = cur;
	if (!ctx->map.sges)
		return -ENOMEM;

	ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL);
	if (!ctx->map.wrs) {
		kfree(ctx->map.sges);
		return -ENOMEM;
	}

	for (op = 0; op < ctx->nr_ops; op++) {
		struct ib_rdma_wr *wr = &ctx->map.wrs[op];
		/* number of SG entries carried by this WR */
		u32 batch = min(sg_cnt, sge_limit);

		wr->wr.opcode = (dir == DMA_TO_DEVICE) ?
				IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
		wr->remote_addr = remote_addr + bytes_done;
		wr->rkey = rkey;
		wr->wr.sg_list = cur;

		n = 0;
		while (n < batch) {
			wr->wr.num_sge++;

			cur->addr = ib_sg_dma_address(ibdev, sg) + offset;
			cur->length = ib_sg_dma_len(ibdev, sg) - offset;
			cur->lkey = qp->pd->local_dma_lkey;

			bytes_done += cur->length;
			cur++;
			sg_cnt--;
			/* only the very first entry is offset */
			offset = 0;

			n++;
			sg = sg_next(sg);
		}

		if (op + 1 < ctx->nr_ops)
			wr->wr.next = &ctx->map.wrs[op + 1].wr;
	}

	ctx->type = RDMA_RW_MULTI_WR;
	return ctx->nr_ops;
}
static void iser_data_buf_dump(struct iser_data_buf *data, struct ib_device *ibdev) { struct scatterlist *sg; int i; for_each_sg(data->sg, sg, data->dma_nents, i) iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " "off:0x%x sz:0x%x dma_len:0x%x\n", i, (unsigned long)ib_sg_dma_address(ibdev, sg), sg_page(sg), sg->offset, sg->length, ib_sg_dma_len(ibdev, sg)); }
/*
 * iser_data_buf_dump - log every DMA-mapped entry of a data buffer
 *
 * Treats data->buf as an array of scatterlist entries and reports, through
 * iser_err(), the index, DMA address, page, offset, length and DMA length
 * of each of the first data->dma_nents entries.
 */
static void iser_data_buf_dump(struct iser_data_buf *data,
			       struct ib_device *ibdev)
{
	struct scatterlist *sgl = (struct scatterlist *)data->buf;
	int idx = 0;

	while (idx < data->dma_nents) {
		struct scatterlist *ent = &sgl[idx];

		iser_err("sg[%d] dma_addr:0x%lX page:0x%p "
			 "off:0x%x sz:0x%x dma_len:0x%x\n",
			 idx, (unsigned long)ib_sg_dma_address(ibdev, ent),
			 ent->page, ent->offset,
			 ent->length, ib_sg_dma_len(ibdev, ent));
		idx++;
	}
}
/** * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned * for RDMA sub-list of a scatter-gather list of memory buffers, and returns * the number of entries which are aligned correctly. Supports the case where * consecutive SG elements are actually fragments of the same physcial page. */ static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev) { struct scatterlist *sg; u64 end_addr, next_addr; int i, cnt; unsigned int ret_len = 0; sg = (struct scatterlist *)data->buf; for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) { /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " "offset: %ld sz: %ld\n", i, (unsigned long)page_to_phys(sg[i].page), (unsigned long)sg[i].offset, (unsigned long)sg[i].length); */ end_addr = ib_sg_dma_address(ibdev, &sg[i]) + ib_sg_dma_len(ibdev, &sg[i]); /* iser_dbg("Checking sg iobuf end address " "0x%08lX\n", end_addr); */ if (i + 1 < data->dma_nents) { next_addr = ib_sg_dma_address(ibdev, &sg[i+1]); /* are i, i+1 fragments of the same page? */ if (end_addr == next_addr) continue; else if (!IS_4K_ALIGNED(end_addr)) { ret_len = cnt + 1; break; } } } if (i == data->dma_nents) ret_len = cnt; /* loop ended */ iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", ret_len, data->dma_nents, data); return ret_len; }
int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, struct scatterlist *sg, unsigned int nents) { struct ib_device *dev = rds_ibdev->dev; struct rds_ib_fmr *fmr = &ibmr->u.fmr; struct scatterlist *scat = sg; u64 io_addr = 0; u64 *dma_pages; u32 len; int page_cnt, sg_dma_len; int i, j; int ret; sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); if (unlikely(!sg_dma_len)) { pr_warn("RDS/IB: %s failed!\n", __func__); return -EBUSY; } len = 0; page_cnt = 0; for (i = 0; i < sg_dma_len; ++i) { unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); if (dma_addr & ~PAGE_MASK) { if (i > 0) return -EINVAL; else ++page_cnt; } if ((dma_addr + dma_len) & ~PAGE_MASK) { if (i < sg_dma_len - 1) return -EINVAL; else ++page_cnt; } len += dma_len; } page_cnt += len >> PAGE_SHIFT; if (page_cnt > ibmr->pool->fmr_attr.max_pages) return -EINVAL; dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, rdsibdev_to_node(rds_ibdev)); if (!dma_pages) return -ENOMEM; page_cnt = 0; for (i = 0; i < sg_dma_len; ++i) { unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]); u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); for (j = 0; j < dma_len; j += PAGE_SIZE) dma_pages[page_cnt++] = (dma_addr & PAGE_MASK) + j; } ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); if (ret) goto out; /* Success - we successfully remapped the MR, so we can * safely tear down the old mapping. */ rds_ib_teardown_mr(ibmr); ibmr->sg = scat; ibmr->sg_len = nents; ibmr->sg_dma_len = sg_dma_len; ibmr->remap_count++; if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_used); else rds_ib_stats_inc(s_ib_rdma_mr_1m_used); ret = 0; out: kfree(dma_pages); return ret; }
static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr_pool *pool, struct rds_ib_mr *ibmr, struct scatterlist *sg, unsigned int sg_len) { struct ib_device *dev = rds_ibdev->dev; struct rds_ib_frmr *frmr = &ibmr->u.frmr; int i; u32 len; int ret = 0; /* We want to teardown old ibmr values here and fill it up with * new sg values */ rds_ib_teardown_mr(ibmr); ibmr->sg = sg; ibmr->sg_len = sg_len; ibmr->sg_dma_len = 0; frmr->sg_byte_len = 0; WARN_ON(ibmr->sg_dma_len); ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); if (unlikely(!ibmr->sg_dma_len)) { pr_warn("RDS/IB: %s failed!\n", __func__); return -EBUSY; } frmr->sg_byte_len = 0; frmr->dma_npages = 0; len = 0; ret = -EINVAL; for (i = 0; i < ibmr->sg_dma_len; ++i) { unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]); u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]); frmr->sg_byte_len += dma_len; if (dma_addr & ~PAGE_MASK) { if (i > 0) goto out_unmap; else ++frmr->dma_npages; } if ((dma_addr + dma_len) & ~PAGE_MASK) { if (i < ibmr->sg_dma_len - 1) goto out_unmap; else ++frmr->dma_npages; } len += dma_len; } frmr->dma_npages += len >> PAGE_SHIFT; if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { ret = -EMSGSIZE; goto out_unmap; } ret = rds_ib_post_reg_frmr(ibmr); if (ret) goto out_unmap; if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_used); else rds_ib_stats_inc(s_ib_rdma_mr_1m_used); return ret; out_unmap: ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); ibmr->sg_dma_len = 0; return ret; }
/** * iser_reg_rdma_mem - Registers memory intended for RDMA, * obtaining rkey and va * * returns 0 on success, errno code on failure */ int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask, enum iser_data_dir cmd_dir) { struct iser_conn *ib_conn = iser_ctask->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_ctask->data[cmd_dir]; struct iser_regd_buf *regd_buf; int aligned_len; int err; int i; struct scatterlist *sg; regd_buf = &iser_ctask->rdma_regd[cmd_dir]; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { iser_err("rdma alignment violation %d/%d aligned\n", aligned_len, mem->size); iser_data_buf_dump(mem, ibdev); /* unmap the command data before accessing it */ iser_dma_unmap_task_data(iser_ctask); /* allocate copy buf, if we are writing, copy the */ /* unaligned scatterlist, dma map the copy */ if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0) return -ENOMEM; mem = &iser_ctask->data_copy[cmd_dir]; } /* if there a single dma entry, FMR is not needed */ if (mem->dma_nents == 1) { sg = (struct scatterlist *)mem->buf; regd_buf->reg.lkey = device->mr->lkey; regd_buf->reg.rkey = device->mr->rkey; regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); regd_buf->reg.is_fmr = 0; iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " "va: 0x%08lX sz: %ld]\n", (unsigned int)regd_buf->reg.lkey, (unsigned int)regd_buf->reg.rkey, (unsigned long)regd_buf->reg.va, (unsigned long)regd_buf->reg.len); } else { /* use FMR for multiple dma entries */ iser_page_vec_build(mem, ib_conn->page_vec, ibdev); err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, ®d_buf->reg); if (err) { iser_data_buf_dump(mem, ibdev); iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", mem->dma_nents, ntoh24(iser_ctask->desc.iscsi_header.dlength)); iser_err("page_vec: data_size = 0x%x, length = %d, 
offset = 0x%x\n", ib_conn->page_vec->data_size, ib_conn->page_vec->length, ib_conn->page_vec->offset); for (i=0 ; i<ib_conn->page_vec->length ; i++) iser_err("page_vec[%d] = 0x%llx\n", i, (unsigned long long) ib_conn->page_vec->pages[i]); return err; } } /* take a reference on this regd buf such that it will not be released * * (eg in send dto completion) before we get the scsi response */ atomic_inc(®d_buf->ref_count); return 0; }