/*
 * Copies iovecs from src to the end of dst. It starts copying after skipping
 * the given number of bytes in src and copies until src is completely copied
 * or the total size of the copied iovec reaches size. The size of the last
 * copied iovec is changed in order to fit the specified total size if it
 * isn't a perfect fit already.
 */
void qemu_iovec_copy(QEMUIOVector *dst, QEMUIOVector *src, uint64_t skip,
    size_t size)
{
    int i;
    size_t done;
    void *iov_base;
    uint64_t iov_len;

    assert(dst->nalloc != -1);

    done = 0;
    for (i = 0; (i < src->niov) && (done != size); i++) {
        if (skip >= src->iov[i].iov_len) {
            /* Skip the whole iov */
            skip -= src->iov[i].iov_len;
            continue;
        } else {
            /* Skip only part (or nothing) of the iov */
            iov_base = (uint8_t *)src->iov[i].iov_base + skip;
            iov_len = src->iov[i].iov_len - skip;
            skip = 0;
        }

        if (done + iov_len > size) {
            qemu_iovec_add(dst, iov_base, size - done);
            break;
        } else {
            qemu_iovec_add(dst, iov_base, iov_len);
        }
        done += iov_len;
    }
}
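A hedged usage sketch (not taken from the QEMU tree) of the skip/size semantics described above; the buffers and byte counts are made-up example values:

/*
 * Illustrative only: copy bytes [100, 612) of a two-element source vector
 * into a fresh destination vector using the helper above.
 */
static void example_iovec_copy(void)
{
    QEMUIOVector src, dst;
    char a[512], b[512];

    qemu_iovec_init(&src, 2);
    qemu_iovec_add(&src, a, sizeof(a));
    qemu_iovec_add(&src, b, sizeof(b));

    qemu_iovec_init(&dst, 2);
    /* skip the first 100 bytes of src, then copy at most 512 bytes */
    qemu_iovec_copy(&dst, &src, 100, 512);
    /* dst now references a[100..511] (412 bytes) and b[0..99] (100 bytes) */

    qemu_iovec_destroy(&dst);
    qemu_iovec_destroy(&src);
}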
/**
 * Copy contents of I/O vector
 *
 * The relative relationships of overlapping iovecs are preserved.  This is
 * necessary to ensure identical semantics in the cloned I/O vector.
 */
void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf)
{
    IOVectorSortElem sortelems[src->niov];
    void *last_end;
    int i;

    /* Sort the source iovecs by base address */
    for (i = 0; i < src->niov; i++) {
        sortelems[i].src_index = i;
        sortelems[i].src_iov = &src->iov[i];
    }
    qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);

    /* Allocate buffer space taking into account overlapping iovecs */
    last_end = NULL;
    for (i = 0; i < src->niov; i++) {
        struct iovec *cur = sortelems[i].src_iov;
        ptrdiff_t rewind = 0;

        /* Detect overlap */
        if (last_end && last_end > cur->iov_base) {
            rewind = last_end - cur->iov_base;
        }

        sortelems[i].dest_base = buf - rewind;
        buf += cur->iov_len - MIN(rewind, cur->iov_len);
        last_end = MAX(cur->iov_base + cur->iov_len, last_end);
    }

    /* Sort by source iovec index and build destination iovec */
    qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
    for (i = 0; i < src->niov; i++) {
        qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
    }
}
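A hedged usage sketch (not from the QEMU tree): the caller supplies the backing storage, and a buffer of src->size bytes is always sufficient because overlapping source iovecs also share space in the clone:

/* Illustrative only: clone the layout of an existing vector into
 * caller-provided storage, then use dst in place of src. */
static void example_iovec_clone(const QEMUIOVector *src)
{
    QEMUIOVector dst;
    void *buf = g_malloc(src->size);

    qemu_iovec_init(&dst, src->niov);
    qemu_iovec_clone(&dst, src, buf);

    /* ... read into / write from dst instead of src ... */

    qemu_iovec_destroy(&dst);
    g_free(buf);
}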
int usb_packet_map(USBPacket *p, QEMUSGList *sgl)
{
    DMADirection dir = (p->pid == USB_TOKEN_IN) ?
        DMA_DIRECTION_FROM_DEVICE : DMA_DIRECTION_TO_DEVICE;
    dma_addr_t len;
    void *mem;
    int i;

    for (i = 0; i < sgl->nsg; i++) {
        len = sgl->sg[i].len;
        mem = dma_memory_map(sgl->dma, sgl->sg[i].base, &len, dir);
        if (!mem) {
            goto err;
        }
        qemu_iovec_add(&p->iov, mem, len);
        if (len != sgl->sg[i].len) {
            goto err;
        }
    }
    return 0;

err:
    usb_packet_unmap(p, sgl);
    return -1;
}
static int coroutine_fn
raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
                    QEMUIOVector *qiov, int flags)
{
    void *buf = NULL;
    BlockDriver *drv;
    QEMUIOVector local_qiov;
    int ret;

    if (bs->probed && sector_num == 0) {
        /* As long as these conditions are true, we can't get partial writes
         * to the probe buffer and can just directly check the request. */
        QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512);
        QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512);

        if (nb_sectors == 0) {
            /* qemu_iovec_to_buf() would fail, but we want to return success
             * instead of -EINVAL in this case. */
            return 0;
        }

        buf = qemu_try_blockalign(bs->file->bs, 512);
        if (!buf) {
            ret = -ENOMEM;
            goto fail;
        }

        ret = qemu_iovec_to_buf(qiov, 0, buf, 512);
        if (ret != 512) {
            ret = -EINVAL;
            goto fail;
        }

        drv = bdrv_probe_all(buf, 512, NULL);
        if (drv != bs->drv) {
            ret = -EPERM;
            goto fail;
        }

        /* Use the checked buffer, a malicious guest might be overwriting its
         * original buffer in the background. */
        qemu_iovec_init(&local_qiov, qiov->niov + 1);
        qemu_iovec_add(&local_qiov, buf, 512);
        qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512);
        qiov = &local_qiov;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
    ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE,
                             nb_sectors * BDRV_SECTOR_SIZE, qiov, flags);

fail:
    if (qiov == &local_qiov) {
        qemu_iovec_destroy(&local_qiov);
    }
    qemu_vfree(buf);

    return ret;
}
/*
 * Copies iovecs from src to the end of dst until src is completely copied or
 * the total size of the copied iovec reaches size. The size of the last
 * copied iovec is changed in order to fit the specified total size if it
 * isn't a perfect fit already.
 */
void qemu_iovec_concat(QEMUIOVector *dst, QEMUIOVector *src, size_t size)
{
    int i;
    size_t done;

    assert(dst->nalloc != -1);

    done = 0;
    for (i = 0; (i < src->niov) && (done != size); i++) {
        if (done + src->iov[i].iov_len > size) {
            qemu_iovec_add(dst, src->iov[i].iov_base, size - done);
            break;
        } else {
            qemu_iovec_add(dst, src->iov[i].iov_base, src->iov[i].iov_len);
        }
        done += src->iov[i].iov_len;
    }
}
static void dma_bdrv_cb(void *opaque, int ret)
{
    DMAAIOCB *dbs = (DMAAIOCB *)opaque;
    target_phys_addr_t cur_addr, cur_len;
    void *mem;

    dbs->acb = NULL;
    dbs->sector_num += dbs->iov.size / 512;
    dma_bdrv_unmap(dbs);
    qemu_iovec_reset(&dbs->iov);

    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dbs->common.cb(dbs->common.opaque, ret);
        qemu_iovec_destroy(&dbs->iov);
        qemu_aio_release(dbs);
        return;
    }

    while (dbs->sg_cur_index < dbs->sg->nsg) {
        cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
        cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
        if (!mem) {
            break;
        }
        qemu_iovec_add(&dbs->iov, mem, cur_len);
        dbs->sg_cur_byte += cur_len;
        if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
            dbs->sg_cur_byte = 0;
            ++dbs->sg_cur_index;
        }
    }

    if (dbs->iov.size == 0) {
        cpu_register_map_client(dbs, continue_after_map_failure);
        return;
    }

    if (dbs->is_write) {
        dbs->acb = bdrv_aio_writev(dbs->bs, dbs->sector_num, &dbs->iov,
                                   dbs->iov.size / 512, dma_bdrv_cb, dbs);
    } else {
        dbs->acb = bdrv_aio_readv(dbs->bs, dbs->sector_num, &dbs->iov,
                                  dbs->iov.size / 512, dma_bdrv_cb, dbs);
    }
    if (!dbs->acb) {
        dma_bdrv_unmap(dbs);
        qemu_iovec_destroy(&dbs->iov);
        return;
    }
}
static inline void flash_sync_area(Flash *s, int64_t off, int64_t len)
{
    QEMUIOVector *iov;

    if (!s->blk || blk_is_read_only(s->blk)) {
        return;
    }

    assert(!(len % BDRV_SECTOR_SIZE));
    iov = g_new(QEMUIOVector, 1);
    qemu_iovec_init(iov, 1);
    qemu_iovec_add(iov, s->storage + off, len);
    blk_aio_pwritev(s->blk, off, iov, 0, blk_sync_complete, iov);
}
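The vector here is heap-allocated because the write completes asynchronously; a hedged sketch of what a completion callback such as blk_sync_complete needs to do with the opaque pointer (not necessarily the exact body in the QEMU tree):

/* Illustrative only: the heap-allocated QEMUIOVector travels as the opaque
 * pointer and is released once the asynchronous write has completed. */
static void blk_sync_complete(void *opaque, int ret)
{
    QEMUIOVector *iov = opaque;

    qemu_iovec_destroy(iov);
    g_free(iov);
}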
static void flash_sync_page(Flash *s, int page)
{
    if (s->bdrv) {
        int bdrv_sector, nb_sectors;
        QEMUIOVector iov;

        bdrv_sector = (page * s->pi->page_size) / BDRV_SECTOR_SIZE;
        nb_sectors = DIV_ROUND_UP(s->pi->page_size, BDRV_SECTOR_SIZE);
        qemu_iovec_init(&iov, 1);
        qemu_iovec_add(&iov, s->storage + bdrv_sector * BDRV_SECTOR_SIZE,
                       nb_sectors * BDRV_SECTOR_SIZE);
        bdrv_aio_writev(s->bdrv, bdrv_sector, &iov, nb_sectors,
                        bdrv_sync_complete, NULL);
    }
}
static void flash_sync_page(Flash *s, int page)
{
    QEMUIOVector *iov;

    if (!s->blk || blk_is_read_only(s->blk)) {
        return;
    }

    iov = g_new(QEMUIOVector, 1);
    qemu_iovec_init(iov, 1);
    qemu_iovec_add(iov, s->storage + page * s->pi->page_size,
                   s->pi->page_size);
    blk_aio_pwritev(s->blk, page * s->pi->page_size, iov, 0,
                    blk_sync_complete, iov);
}
/*
 * Parse multiple length statements for vectored I/O, and construct an I/O
 * vector matching it.
 */
static void *
create_iovec(QEMUIOVector *qiov, char **argv, int nr_iov, int pattern)
{
    size_t *sizes = g_new0(size_t, nr_iov);
    size_t count = 0;
    void *buf = NULL;
    void *p;
    int i;

    for (i = 0; i < nr_iov; i++) {
        char *arg = argv[i];
        int64_t len;

        len = cvtnum(arg);
        if (len < 0) {
            printf("non-numeric length argument -- %s\n", arg);
            goto fail;
        }

        /* should be SIZE_T_MAX, but that doesn't exist */
        if (len > INT_MAX) {
            printf("too large length argument -- %s\n", arg);
            goto fail;
        }

        if (len & 0x1ff) {
            printf("length argument %" PRId64
                   " is not sector aligned\n", len);
            goto fail;
        }

        sizes[i] = len;
        count += len;
    }

    qemu_iovec_init(qiov, nr_iov);

    buf = p = qemu_io_alloc(count, pattern);

    for (i = 0; i < nr_iov; i++) {
        qemu_iovec_add(qiov, p, sizes[i]);
        p += sizes[i];
    }

fail:
    g_free(sizes);
    return buf;
}
static void dma_blk_cb(void *opaque, int ret)
{
    DMAAIOCB *dbs = (DMAAIOCB *)opaque;
    dma_addr_t cur_addr, cur_len;
    void *mem;

    trace_dma_blk_cb(dbs, ret);

    dbs->acb = NULL;
    dbs->sector_num += dbs->iov.size / 512;

    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dma_complete(dbs, ret);
        return;
    }
    dma_blk_unmap(dbs);

    while (dbs->sg_cur_index < dbs->sg->nsg) {
        cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
        cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
        mem = dma_memory_map(dbs->sg->as, cur_addr, &cur_len, dbs->dir);
        if (!mem) {
            break;
        }
        qemu_iovec_add(&dbs->iov, mem, cur_len);
        dbs->sg_cur_byte += cur_len;
        if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
            dbs->sg_cur_byte = 0;
            ++dbs->sg_cur_index;
        }
    }

    if (dbs->iov.size == 0) {
        trace_dma_map_wait(dbs);
        dbs->bh = aio_bh_new(blk_get_aio_context(dbs->blk),
                             reschedule_dma, dbs);
        cpu_register_map_client(dbs->bh);
        return;
    }

    if (dbs->iov.size & ~BDRV_SECTOR_MASK) {
        qemu_iovec_discard_back(&dbs->iov, dbs->iov.size & ~BDRV_SECTOR_MASK);
    }

    dbs->acb = dbs->io_func(dbs->blk, dbs->sector_num, &dbs->iov,
                            dbs->iov.size / 512, dma_blk_cb, dbs);
    assert(dbs->acb);
}
static void flash_sync_page(Flash *s, int page)
{
    int blk_sector, nb_sectors;
    QEMUIOVector iov;

    if (!s->blk || blk_is_read_only(s->blk)) {
        return;
    }

    blk_sector = (page * s->pi->page_size) / BDRV_SECTOR_SIZE;
    nb_sectors = DIV_ROUND_UP(s->pi->page_size, BDRV_SECTOR_SIZE);
    qemu_iovec_init(&iov, 1);
    qemu_iovec_add(&iov, s->storage + blk_sector * BDRV_SECTOR_SIZE,
                   nb_sectors * BDRV_SECTOR_SIZE);
    blk_aio_writev(s->blk, blk_sector, &iov, nb_sectors, blk_sync_complete,
                   NULL);
}
static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
                                   int start, int num_reqs, int niov)
{
    QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
    int64_t sector_num = mrb->reqs[start]->sector_num;
    int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE;
    bool is_write = mrb->is_write;

    if (num_reqs > 1) {
        int i;
        struct iovec *tmp_iov = qiov->iov;
        int tmp_niov = qiov->niov;

        /* mrb->reqs[start]->qiov was initialized from external so we can't
         * modify it here. We need to initialize it locally and then add the
         * external iovecs. */
        qemu_iovec_init(qiov, niov);

        for (i = 0; i < tmp_niov; i++) {
            qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
        }

        for (i = start + 1; i < start + num_reqs; i++) {
            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
                              mrb->reqs[i]->qiov.size);
            mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
            nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE;
        }
        assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE);

        trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num,
                                         nb_sectors, is_write);
        block_acct_merge_done(blk_get_stats(blk),
                              is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
                              num_reqs - 1);
    }

    if (is_write) {
        blk_aio_writev(blk, sector_num, qiov, nb_sectors,
                       virtio_blk_rw_complete, mrb->reqs[start]);
    } else {
        blk_aio_readv(blk, sector_num, qiov, nb_sectors,
                      virtio_blk_rw_complete, mrb->reqs[start]);
    }
}
static inline void flash_sync_area(Flash *s, int64_t off, int64_t len)
{
    int64_t start, end, nb_sectors;
    QEMUIOVector iov;

    if (!s->blk || blk_is_read_only(s->blk)) {
        return;
    }

    assert(!(len % BDRV_SECTOR_SIZE));
    start = off / BDRV_SECTOR_SIZE;
    end = (off + len) / BDRV_SECTOR_SIZE;
    nb_sectors = end - start;

    qemu_iovec_init(&iov, 1);
    qemu_iovec_add(&iov, s->storage + (start * BDRV_SECTOR_SIZE),
                   nb_sectors * BDRV_SECTOR_SIZE);
    blk_aio_writev(s->blk, start, &iov, nb_sectors, blk_sync_complete, NULL);
}
/*
 * Concatenates (partial) iovecs from src_iov to the end of dst.
 * It starts copying after skipping `soffset' bytes at the
 * beginning of src_iov and adds individual vectors from src_iov
 * to dst, copying up to `sbytes' bytes total, or up to the end
 * of src_iov, whichever comes first. This way, it is okay to
 * specify a very large value for `sbytes' to indicate "up to the
 * end of src".
 * Only vector pointers are processed, not the actual data buffers.
 */
void qemu_iovec_concat_iov(QEMUIOVector *dst,
                           struct iovec *src_iov, unsigned int src_cnt,
                           size_t soffset, size_t sbytes)
{
    int i;
    size_t done;

    assert(dst->nalloc != -1);
    for (i = 0, done = 0; done < sbytes && i < src_cnt; i++) {
        if (soffset < src_iov[i].iov_len) {
            size_t len = MIN(src_iov[i].iov_len - soffset, sbytes - done);
            qemu_iovec_add(dst, src_iov[i].iov_base + soffset, len);
            done += len;
            soffset = 0;
        } else {
            soffset -= src_iov[i].iov_len;
        }
    }
    assert(soffset == 0); /* offset beyond end of src */
}
/*
 * Concatenates (partial) iovecs from src to the end of dst.
 * It starts copying after skipping `soffset' bytes at the
 * beginning of src and adds individual vectors from src to dst,
 * copying up to `sbytes' bytes total, or up to the end of src,
 * whichever comes first. This way, it is okay to specify a very
 * large value for `sbytes' to indicate "up to the end of src".
 * Only vector pointers are processed, not the actual data buffers.
 */
void qemu_iovec_concat(QEMUIOVector *dst,
                       QEMUIOVector *src, size_t soffset, size_t sbytes)
{
    int i;
    size_t done;
    struct iovec *siov = src->iov;

    assert(dst->nalloc != -1);
    assert(src->size >= soffset);
    for (i = 0, done = 0; done < sbytes && i < src->niov; i++) {
        if (soffset < siov[i].iov_len) {
            size_t len = MIN(siov[i].iov_len - soffset, sbytes - done);
            qemu_iovec_add(dst, siov[i].iov_base + soffset, len);
            done += len;
            soffset = 0;
        } else {
            soffset -= siov[i].iov_len;
        }
    }
    /* return done; */
}
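A hedged usage sketch (not from the QEMU tree) of carving a byte sub-range out of an existing request without copying data, the pattern used by callers such as raw_co_writev_flags and submit_requests above:

/* Illustrative only: make dst describe bytes [skip, skip + bytes) of src. */
static void example_concat_subrange(QEMUIOVector *dst, QEMUIOVector *src,
                                    size_t skip, size_t bytes)
{
    qemu_iovec_init(dst, src->niov);
    qemu_iovec_concat(dst, src, skip, bytes);
    /* dst->size == MIN(bytes, src->size - skip) */
}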
static void dma_bdrv_cb(void *opaque, int ret)
{
    DMAAIOCB *dbs = (DMAAIOCB *)opaque;
    dma_addr_t cur_addr, cur_len;
    void *mem;

    trace_dma_bdrv_cb(dbs, ret);

    dbs->acb = NULL;
    dbs->sector_num += dbs->iov.size / 512;
    dma_bdrv_unmap(dbs);

    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dma_complete(dbs, ret);
        return;
    }

    while (dbs->sg_cur_index < dbs->sg->nsg) {
        cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
        cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
        mem = dma_memory_map(dbs->sg->dma, cur_addr, &cur_len, dbs->dir);
        if (!mem) {
            break;
        }
        qemu_iovec_add(&dbs->iov, mem, cur_len);
        dbs->sg_cur_byte += cur_len;
        if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
            dbs->sg_cur_byte = 0;
            ++dbs->sg_cur_index;
        }
    }

    if (dbs->iov.size == 0) {
        trace_dma_map_wait(dbs);
        cpu_register_map_client(dbs, continue_after_map_failure);
        return;
    }

    dbs->acb = dbs->io_func(dbs->bs, dbs->sector_num, &dbs->iov,
                            dbs->iov.size / 512, dma_bdrv_cb, dbs);
    assert(dbs->acb);
}
static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t prp1,
                             uint64_t prp2, uint32_t len, NvmeCtrl *n)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;

    if (unlikely(!prp1)) {
        trace_nvme_err_invalid_prp();
        return NVME_INVALID_FIELD | NVME_DNR;
    } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
               prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
        qsg->nsg = 0;
        qemu_iovec_init(iov, num_prps);
        qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr],
                       trans_len);
    } else {
        pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
        qemu_sglist_add(qsg, prp1, trans_len);
    }
    len -= trans_len;
    if (len) {
        if (unlikely(!prp2)) {
            trace_nvme_err_invalid_prp2_missing();
            goto unmap;
        }
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            nents = (len + n->page_size - 1) >> n->page_bits;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == n->max_prp_ents - 1 && len > n->page_size) {
                    if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
                        trace_nvme_err_invalid_prplist_ent(prp_ent);
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
                    nvme_addr_read(n, prp_ent, (void *)prp_list, prp_trans);
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
                    trace_nvme_err_invalid_prplist_ent(prp_ent);
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                if (qsg->nsg) {
                    qemu_sglist_add(qsg, prp_ent, trans_len);
                } else {
                    qemu_iovec_add(iov,
                                   (void *)&n->cmbuf[prp_ent - n->ctrl_mem.addr],
                                   trans_len);
                }
                len -= trans_len;
                i++;
            }
        } else {
            if (unlikely(prp2 & (n->page_size - 1))) {
/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    uintptr_t mem;
    size_t len;
    int i;

    xen_be_printf(&blkdev->xendev, 3,
                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                  ioreq->req.operation, ioreq->req.nr_segments,
                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        ioreq->prot = PROT_WRITE; /* to memory */
        break;
    case BLKIF_OP_WRITE_BARRIER:
        if (!ioreq->req.nr_segments) {
            ioreq->presync = 1;
            return 0;
        }
        if (!syncwrite) {
            ioreq->presync = ioreq->postsync = 1;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        ioreq->prot = PROT_READ; /* from memory */
        if (syncwrite) {
            ioreq->postsync = 1;
        }
        break;
    default:
        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
                      ioreq->req.operation);
        goto err;
    };

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
            goto err;
        }

        ioreq->domids[i] = blkdev->xendev.dom;
        ioreq->refs[i]   = ioreq->req.seg[i].gref;

        mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1)
              * blkdev->file_blk;
        qemu_iovec_add(&ioreq->v, (void *)mem, len);
    }
    if (ioreq->start + ioreq->v.size > blkdev->file_size) {
        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}
void usb_packet_addbuf(USBPacket *p, void *ptr, size_t len)
{
    qemu_iovec_add(&p->iov, ptr, len);
}
/* Submit async read while handling COW.
 * Returns: The number of bytes copied after and including offset,
 *          excluding any bytes copied prior to offset due to alignment.
 *          This will be @bytes if no alignment is necessary, or
 *          (new_end - offset) if tail is rounded up or down due to
 *          alignment or buffer limit.
 */
static uint64_t mirror_do_read(MirrorBlockJob *s, int64_t offset,
                               uint64_t bytes)
{
    BlockBackend *source = s->common.blk;
    int nb_chunks;
    uint64_t ret;
    MirrorOp *op;
    uint64_t max_bytes;

    max_bytes = s->granularity * s->max_iov;

    /* We can only handle as much as buf_size at a time. */
    bytes = MIN(s->buf_size, MIN(max_bytes, bytes));
    assert(bytes);
    assert(bytes < BDRV_REQUEST_MAX_BYTES);
    ret = bytes;

    if (s->cow_bitmap) {
        ret += mirror_cow_align(s, &offset, &bytes);
    }
    assert(bytes <= s->buf_size);
    /* The offset is granularity-aligned because:
     * 1) Caller passes in aligned values;
     * 2) mirror_cow_align is used only when target cluster is larger. */
    assert(QEMU_IS_ALIGNED(offset, s->granularity));
    /* The range is sector-aligned, since bdrv_getlength() rounds up. */
    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
    nb_chunks = DIV_ROUND_UP(bytes, s->granularity);

    while (s->buf_free_count < nb_chunks) {
        trace_mirror_yield_in_flight(s, offset, s->in_flight);
        mirror_wait_for_io(s);
    }

    /* Allocate a MirrorOp that is used as an AIO callback. */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->offset = offset;
    op->bytes = bytes;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = bytes - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
    }

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->bytes_in_flight += bytes;
    trace_mirror_one_iteration(s, offset, bytes);

    blk_aio_preadv(source, offset, &op->qiov, 0, mirror_read_complete, op);
    return ret;
}
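The chunk loop above trims the final buffer with MIN(s->granularity, remaining) so the vector's total size matches the request exactly. A generic, hedged sketch of the same pattern (helper name and parameters are illustrative, not from the QEMU tree):

/* Illustrative only: fill a QEMUIOVector from fixed-size chunks, trimming
 * the last chunk so qiov->size ends up exactly equal to bytes. */
static void example_build_chunked_qiov(QEMUIOVector *qiov, void **chunks,
                                       size_t chunk_size, size_t bytes)
{
    int nb_chunks = DIV_ROUND_UP(bytes, chunk_size);
    int i;

    qemu_iovec_init(qiov, nb_chunks);
    for (i = 0; i < nb_chunks; i++) {
        size_t remaining = bytes - qiov->size;

        qemu_iovec_add(qiov, chunks[i], MIN(chunk_size, remaining));
    }
}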
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    ioreq->buf = qemu_memalign(XC_PAGE_SIZE, ioreq->size);
    if (ioreq->req.nr_segments &&
        (ioreq->req.operation == BLKIF_OP_WRITE ||
         ioreq->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
        ioreq_grant_copy(ioreq)) {
        qemu_vfree(ioreq->buf);
        goto err;
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        blk_aio_flush(ioreq->blkdev->blk, qemu_aio_complete, ioreq);
        return 0;
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size, BLOCK_ACCT_READ);
        ioreq->aio_inflight++;
        blk_aio_preadv(blkdev->blk, ioreq->start, &ioreq->v, 0,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!ioreq->req.nr_segments) {
            break;
        }

        qemu_iovec_add(&ioreq->v, ioreq->buf, ioreq->size);
        block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
                         ioreq->v.size,
                         ioreq->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        ioreq->aio_inflight++;
        blk_aio_pwritev(blkdev->blk, ioreq->start, &ioreq->v, 0,
                        qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_DISCARD:
    {
        struct blkif_request_discard *req = (void *)&ioreq->req;
        if (!blk_split_discard(ioreq, req->sector_number, req->nr_sectors)) {
            goto err;
        }
        break;
    }
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}
static coroutine_fn int
block_crypto_co_writev(BlockDriverState *bs, int64_t sector_num,
                       int remaining_sectors, QEMUIOVector *qiov)
{
    BlockCrypto *crypto = bs->opaque;
    int cur_nr_sectors; /* number of sectors in current iteration */
    uint64_t bytes_done = 0;
    uint8_t *cipher_data = NULL;
    QEMUIOVector hd_qiov;
    int ret = 0;
    size_t payload_offset =
        qcrypto_block_get_payload_offset(crypto->block) / 512;

    qemu_iovec_init(&hd_qiov, qiov->niov);

    /* Bounce buffer so we have a linear mem region for
     * entire sector. XXX optimize so we avoid bounce
     * buffer in case that qiov->niov == 1
     */
    cipher_data =
        qemu_try_blockalign(bs->file->bs, MIN(BLOCK_CRYPTO_MAX_SECTORS * 512,
                                              qiov->size));
    if (cipher_data == NULL) {
        ret = -ENOMEM;
        goto cleanup;
    }

    while (remaining_sectors) {
        cur_nr_sectors = remaining_sectors;

        if (cur_nr_sectors > BLOCK_CRYPTO_MAX_SECTORS) {
            cur_nr_sectors = BLOCK_CRYPTO_MAX_SECTORS;
        }

        qemu_iovec_to_buf(qiov, bytes_done,
                          cipher_data, cur_nr_sectors * 512);

        if (qcrypto_block_encrypt(crypto->block, sector_num, cipher_data,
                                  cur_nr_sectors * 512, NULL) < 0) {
            ret = -EIO;
            goto cleanup;
        }

        qemu_iovec_reset(&hd_qiov);
        qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512);

        ret = bdrv_co_writev(bs->file,
                             payload_offset + sector_num,
                             cur_nr_sectors, &hd_qiov);
        if (ret < 0) {
            goto cleanup;
        }

        remaining_sectors -= cur_nr_sectors;
        sector_num += cur_nr_sectors;
        bytes_done += cur_nr_sectors * 512;
    }

cleanup:
    qemu_iovec_destroy(&hd_qiov);
    qemu_vfree(cipher_data);

    return ret;
}
static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
    MirrorOp *op;

    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
        bdrv_dirty_iter_init(source, &s->hbi);
        s->sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(source));
        assert(s->sector_num >= 0);
    }

    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    end = s->common.len >> BDRV_SECTOR_BITS;

    /* Extend the QEMUIOVector to include all adjacent blocks that will
     * be copied in this operation.
     *
     * We have to do this if we have no backing file yet in the destination,
     * and the cluster size is very large.  Then we need to do COW ourselves.
     * The first time a cluster is copied, copy it entirely.  Note that,
     * because both the granularity and the cluster size are powers of two,
     * the number of sectors to copy cannot exceed one cluster.
     *
     * We also want to extend the QEMUIOVector to include more adjacent
     * dirty blocks if possible, to limit the number of I/O operations and
     * run efficiently even with a small granularity.
     */
    nb_chunks = 0;
    nb_sectors = 0;
    next_sector = sector_num;
    next_chunk = sector_num / sectors_per_chunk;

    /* Wait for I/O to this cluster (from a previous iteration) to be done. */
    while (test_bit(next_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        qemu_coroutine_yield();
    }

    do {
        int added_sectors, added_chunks;

        if (!bdrv_get_dirty(source, next_sector) ||
            test_bit(next_chunk, s->in_flight_bitmap)) {
            assert(nb_sectors > 0);
            break;
        }

        added_sectors = sectors_per_chunk;
        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
            bdrv_round_to_clusters(s->target,
                                   next_sector, added_sectors,
                                   &next_sector, &added_sectors);

            /* On the first iteration, the rounding may make us copy
             * sectors before the first dirty one.
             */
            if (next_sector < sector_num) {
                assert(nb_sectors == 0);
                sector_num = next_sector;
                next_chunk = next_sector / sectors_per_chunk;
            }
        }

        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;

        /* When doing COW, it may happen that there is not enough space for
         * a full cluster.  Wait if that is the case.
         */
        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
            qemu_coroutine_yield();
        }
        if (s->buf_free_count < nb_chunks + added_chunks) {
            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
            break;
        }

        /* We have enough free space to copy these sectors.  */
        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);

        nb_sectors += added_sectors;
        nb_chunks += added_chunks;
        next_sector += added_sectors;
        next_chunk += added_chunks;
    } while (next_sector < end);

    /* Allocate a MirrorOp that is used as an AIO callback. */
    op = g_slice_new(MirrorOp);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    next_sector = sector_num;
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, s->granularity);

        /* Advance the HBitmapIter in parallel, so that we do not examine
         * the same sector twice.
         */
        if (next_sector > hbitmap_next_sector &&
            bdrv_get_dirty(source, next_sector)) {
            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
        }

        next_sector += sectors_per_chunk;
    }

    bdrv_reset_dirty(source, sector_num, nb_sectors);

    /* Copy the dirty cluster.  */
    s->in_flight++;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);
    bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                   mirror_read_complete, op);
}
static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
{
    BlockDriverState *source = s->common.bs;
    int nb_sectors, sectors_per_chunk, nb_chunks;
    int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector;
    uint64_t delay_ns = 0;
    MirrorOp *op;
    int pnum;
    int64_t ret;

    s->sector_num = hbitmap_iter_next(&s->hbi);
    if (s->sector_num < 0) {
        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
        s->sector_num = hbitmap_iter_next(&s->hbi);
        trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
        assert(s->sector_num >= 0);
    }

    hbitmap_next_sector = s->sector_num;
    sector_num = s->sector_num;
    sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS;
    end = s->bdev_length / BDRV_SECTOR_SIZE;

    /* Extend the QEMUIOVector to include all adjacent blocks that will
     * be copied in this operation.
     *
     * We have to do this if we have no backing file yet in the destination,
     * and the cluster size is very large.  Then we need to do COW ourselves.
     * The first time a cluster is copied, copy it entirely.  Note that,
     * because both the granularity and the cluster size are powers of two,
     * the number of sectors to copy cannot exceed one cluster.
     *
     * We also want to extend the QEMUIOVector to include more adjacent
     * dirty blocks if possible, to limit the number of I/O operations and
     * run efficiently even with a small granularity.
     */
    nb_chunks = 0;
    nb_sectors = 0;
    next_sector = sector_num;
    next_chunk = sector_num / sectors_per_chunk;

    /* Wait for I/O to this cluster (from a previous iteration) to be done. */
    while (test_bit(next_chunk, s->in_flight_bitmap)) {
        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
        s->waiting_for_io = true;
        qemu_coroutine_yield();
        s->waiting_for_io = false;
    }

    do {
        int added_sectors, added_chunks;

        if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) ||
            test_bit(next_chunk, s->in_flight_bitmap)) {
            assert(nb_sectors > 0);
            break;
        }

        added_sectors = sectors_per_chunk;
        if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) {
            bdrv_round_to_clusters(s->target,
                                   next_sector, added_sectors,
                                   &next_sector, &added_sectors);

            /* On the first iteration, the rounding may make us copy
             * sectors before the first dirty one.
             */
            if (next_sector < sector_num) {
                assert(nb_sectors == 0);
                sector_num = next_sector;
                next_chunk = next_sector / sectors_per_chunk;
            }
        }

        added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors));
        added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk;

        /* When doing COW, it may happen that there is not enough space for
         * a full cluster.  Wait if that is the case.
         */
        while (nb_chunks == 0 && s->buf_free_count < added_chunks) {
            trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
            s->waiting_for_io = true;
            qemu_coroutine_yield();
            s->waiting_for_io = false;
        }
        if (s->buf_free_count < nb_chunks + added_chunks) {
            trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight);
            break;
        }
        if (IOV_MAX < nb_chunks + added_chunks) {
            trace_mirror_break_iov_max(s, nb_chunks, added_chunks);
            break;
        }

        /* We have enough free space to copy these sectors.  */
        bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks);

        nb_sectors += added_sectors;
        nb_chunks += added_chunks;
        next_sector += added_sectors;
        next_chunk += added_chunks;
        if (!s->synced && s->common.speed) {
            delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors);
        }
    } while (delay_ns == 0 && next_sector < end);

    /* Allocate a MirrorOp that is used as an AIO callback.  */
    op = g_new(MirrorOp, 1);
    op->s = s;
    op->sector_num = sector_num;
    op->nb_sectors = nb_sectors;

    /* Now make a QEMUIOVector taking enough granularity-sized chunks
     * from s->buf_free.
     */
    qemu_iovec_init(&op->qiov, nb_chunks);
    next_sector = sector_num;
    while (nb_chunks-- > 0) {
        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
        size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size;

        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
        s->buf_free_count--;
        qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));

        /* Advance the HBitmapIter in parallel, so that we do not examine
         * the same sector twice.
         */
        if (next_sector > hbitmap_next_sector &&
            bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) {
            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
        }

        next_sector += sectors_per_chunk;
    }

    bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors);

    /* Copy the dirty cluster.  */
    s->in_flight++;
    s->sectors_in_flight += nb_sectors;
    trace_mirror_one_iteration(s, sector_num, nb_sectors);

    ret = bdrv_get_block_status_above(source, NULL, sector_num,
                                      nb_sectors, &pnum);
    if (ret < 0 || pnum < nb_sectors ||
            (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) {
        bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors,
                       mirror_read_complete, op);
    } else if (ret & BDRV_BLOCK_ZERO) {
        bdrv_aio_write_zeroes(s->target, sector_num,
                              op->nb_sectors, s->unmap ? BDRV_REQ_MAY_UNMAP : 0,
                              mirror_write_complete, op);
    } else {
        assert(!(ret & BDRV_BLOCK_DATA));
        bdrv_aio_discard(s->target, sector_num, op->nb_sectors,
                         mirror_write_complete, op);
    }
    return delay_ns;
}
static int xen_block_do_aio(XenBlockRequest *request)
{
    XenBlockDataPlane *dataplane = request->dataplane;

    if (request->req.nr_segments &&
        (request->req.operation == BLKIF_OP_WRITE ||
         request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
        xen_block_copy_request(request)) {
        goto err;
    }

    request->aio_inflight++;
    if (request->presync) {
        blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
                      request);
        return 0;
    }

    switch (request->req.operation) {
    case BLKIF_OP_READ:
        qemu_iovec_add(&request->v, request->buf, request->size);
        block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
                         request->v.size, BLOCK_ACCT_READ);
        request->aio_inflight++;
        blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
                       xen_block_complete_aio, request);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_FLUSH_DISKCACHE:
        if (!request->req.nr_segments) {
            break;
        }

        qemu_iovec_add(&request->v, request->buf, request->size);
        block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
                         request->v.size,
                         request->req.operation == BLKIF_OP_WRITE ?
                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
        request->aio_inflight++;
        blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
                        xen_block_complete_aio, request);
        break;
    case BLKIF_OP_DISCARD:
    {
        struct blkif_request_discard *req = (void *)&request->req;

        if (!xen_block_split_discard(request, req->sector_number,
                                     req->nr_sectors)) {
            goto err;
        }
        break;
    }
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    xen_block_complete_aio(request, 0);
    return 0;

err:
    xen_block_finish_request(request);
    request->status = BLKIF_RSP_ERROR;
    return -1;
}