static BlockDriverAIOCB *blkverify_aio_writev(BlockDriverState *bs,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BDRVBlkverifyState *s = bs->opaque;
    BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov,
                                            nb_sectors, cb, opaque);

    /* Mirror the write to both the test image and the raw image so their
     * contents stay identical and can be compared on read. */
    bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors,
                    blkverify_aio_cb, acb);
    bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors,
                    blkverify_aio_cb, acb);
    return &acb->common;
}
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
        goto err_no_map;
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
        ioreq->aio_inflight++;
        bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                       &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_WRITE_BARRIER:
        if (!ioreq->req.nr_segments) {
            break;
        }
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
        ioreq->aio_inflight++;
        bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                        qemu_aio_complete, ioreq);
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    if (ioreq->postsync) {
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
    }
    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq_unmap(ioreq);
err_no_map:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}
static void dma_bdrv_cb(void *opaque, int ret)
{
    DMAAIOCB *dbs = (DMAAIOCB *)opaque;
    target_phys_addr_t cur_addr, cur_len;
    void *mem;

    dbs->acb = NULL;
    dbs->sector_num += dbs->iov.size / 512;
    dma_bdrv_unmap(dbs);
    qemu_iovec_reset(&dbs->iov);

    if (dbs->sg_cur_index == dbs->sg->nsg || ret < 0) {
        dbs->common.cb(dbs->common.opaque, ret);
        qemu_iovec_destroy(&dbs->iov);
        qemu_aio_release(dbs);
        return;
    }

    while (dbs->sg_cur_index < dbs->sg->nsg) {
        cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
        cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->is_write);
        if (!mem)
            break;
        qemu_iovec_add(&dbs->iov, mem, cur_len);
        dbs->sg_cur_byte += cur_len;
        if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
            dbs->sg_cur_byte = 0;
            ++dbs->sg_cur_index;
        }
    }

    if (dbs->iov.size == 0) {
        cpu_register_map_client(dbs, continue_after_map_failure);
        return;
    }

    if (dbs->is_write) {
        dbs->acb = bdrv_aio_writev(dbs->bs, dbs->sector_num, &dbs->iov,
                                   dbs->iov.size / 512, dma_bdrv_cb, dbs);
    } else {
        dbs->acb = bdrv_aio_readv(dbs->bs, dbs->sector_num, &dbs->iov,
                                  dbs->iov.size / 512, dma_bdrv_cb, dbs);
    }
    if (!dbs->acb) {
        dma_bdrv_unmap(dbs);
        qemu_iovec_destroy(&dbs->iov);
        return;
    }
}
static void flash_sync_page(Flash *s, int page)
{
    if (s->bdrv) {
        int bdrv_sector, nb_sectors;
        QEMUIOVector iov;

        bdrv_sector = (page * s->pi->page_size) / BDRV_SECTOR_SIZE;
        nb_sectors = DIV_ROUND_UP(s->pi->page_size, BDRV_SECTOR_SIZE);
        qemu_iovec_init(&iov, 1);
        qemu_iovec_add(&iov, s->storage + bdrv_sector * BDRV_SECTOR_SIZE,
                       nb_sectors * BDRV_SECTOR_SIZE);
        bdrv_aio_writev(s->bdrv, bdrv_sector, &iov, nb_sectors,
                        bdrv_sync_complete, NULL);
    }
}
static void qed_write_header_read_cb(void *opaque, int ret)
{
    QEDWriteHeaderCB *write_header_cb = opaque;
    BDRVQEDState *s = write_header_cb->s;

    if (ret) {
        qed_write_header_cb(write_header_cb, ret);
        return;
    }

    /* Update header */
    qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf);

    bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov,
                    write_header_cb->nsectors,
                    qed_write_header_cb, write_header_cb);
}
static inline void flash_sync_area(Flash *s, int64_t off, int64_t len)
{
    int64_t start, end, nb_sectors;
    QEMUIOVector iov;

    if (!s->bdrv) {
        return;
    }

    assert(!(len % BDRV_SECTOR_SIZE));
    start = off / BDRV_SECTOR_SIZE;
    end = (off + len) / BDRV_SECTOR_SIZE;
    nb_sectors = end - start;
    qemu_iovec_init(&iov, 1);
    qemu_iovec_add(&iov, s->storage + (start * BDRV_SECTOR_SIZE),
                   nb_sectors * BDRV_SECTOR_SIZE);
    bdrv_aio_writev(s->bdrv, start, &iov, nb_sectors, bdrv_sync_complete, NULL);
}
/**
 * Write out an updated part or all of a table
 *
 * @s:          QED state
 * @offset:     Offset of table in image file, in bytes
 * @table:      Table
 * @index:      Index of first element
 * @n:          Number of elements
 * @flush:      Whether or not to sync to disk
 * @cb:         Completion function
 * @opaque:     Argument for completion function
 */
static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table,
                            unsigned int index, unsigned int n, bool flush,
                            BlockDriverCompletionFunc *cb, void *opaque)
{
    QEDWriteTableCB *write_table_cb;
    BlockDriverAIOCB *aiocb;
    unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1;
    unsigned int start, end, i;
    size_t len_bytes;

    trace_qed_write_table(s, offset, table, index, n);

    /* Calculate indices of the first and one after last elements */
    start = index & ~sector_mask;
    end = (index + n + sector_mask) & ~sector_mask;

    len_bytes = (end - start) * sizeof(uint64_t);

    write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque);
    write_table_cb->s = s;
    write_table_cb->orig_table = table;
    write_table_cb->flush = flush;
    write_table_cb->table = qemu_blockalign(s->bs, len_bytes);
    write_table_cb->iov.iov_base = write_table_cb->table->offsets;
    write_table_cb->iov.iov_len = len_bytes;
    qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1);

    /* Byteswap table */
    for (i = start; i < end; i++) {
        uint64_t le_offset = cpu_to_le64(table->offsets[i]);
        write_table_cb->table->offsets[i - start] = le_offset;
    }

    /* Adjust for offset into table */
    offset += start * sizeof(uint64_t);

    aiocb = bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE,
                            &write_table_cb->qiov,
                            write_table_cb->qiov.size / BDRV_SECTOR_SIZE,
                            qed_write_table_cb, write_table_cb);
    if (!aiocb) {
        qed_write_table_cb(write_table_cb, -EIO);
    }
}
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;

    if (ret < 0) {
        BlockErrorAction action;

        bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
                    mirror_write_complete, op);
}
static void mirror_read_complete(void *opaque, int ret)
{
    MirrorOp *op = opaque;
    MirrorBlockJob *s = op->s;

    if (ret < 0) {
        BlockDriverState *source = s->common.bs;
        BlockErrorAction action;

        bdrv_set_dirty(source, op->sector_num, op->nb_sectors);
        action = mirror_error_action(s, true, -ret);
        if (action == BDRV_ACTION_REPORT && s->ret >= 0) {
            s->ret = ret;
        }

        mirror_iteration_done(op, ret);
        return;
    }
    bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors,
                    mirror_write_complete, op);
}
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    if (ioreq_map(ioreq) == -1)
        goto err;

    ioreq->aio_inflight++;
    if (ioreq->presync)
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        ioreq->aio_inflight++;
        bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                       &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_WRITE_BARRIER:
        ioreq->aio_inflight++;
        bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                        qemu_aio_complete, ioreq);
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    if (ioreq->postsync)
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
    qemu_aio_complete(ioreq, 0);

    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}
static inline BlockDriverAIOCB *store_data (int soft_write,
                                            FvdAIOCB * parent_acb,
                                            BlockDriverState * bs,
                                            int64_t sector_num,
                                            QEMUIOVector * orig_qiov,
                                            int nb_sectors,
                                            BlockDriverCompletionFunc * cb,
                                            void *opaque)
{
    BDRVFvdState *s = bs->opaque;

    TRACE_STORE_IN_FVD ("store_data", sector_num, nb_sectors);

    if (!s->table) {
        /* Write directly since it is not a compact image. */
        return bdrv_aio_writev (s->fvd_data, s->data_offset + sector_num,
                                orig_qiov, nb_sectors, cb, opaque);
    } else {
        return store_data_in_compact_image (NULL, soft_write, parent_acb, bs,
                                            sector_num, orig_qiov, nb_sectors,
                                            cb, opaque);
    }
}
static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
    int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
    BlockDriverCompletionFunc *cb, void *opaque)
{
    /* The raw format adds no translation; forward the request to bs->file. */
    return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
}
/* Return FALSE if the submitted request is cancelled. */
static int submit_rand_io (RandomIO * r)
{
    BlockDriverAIOCB *acb = NULL;

    QDEBUG ("TESTER %03d: %s test%" PRIX64 " sector_num=%" PRId64
            " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
            r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
    printf ("TESTER %03d: %s sector_num=%" PRId64 " nb_sectors=%d niov=%d\n",
            r->tester, op_type_str[r->type], r->sector_num, r->nb_sectors,
            r->qiov.niov);

    int ret;
    if (fail_prob <= 0) {
        ret = 0;
    } else if (random () / (double) RAND_MAX <= fail_prob) {
        ret = -EIO;
    } else {
        ret = 0;
    }

    /* This affects whether this request will fail or not. */
    sim_set_disk_io_return_code (ret);

    switch (r->type) {
    case OP_READ:
        if (!(acb = bdrv_aio_readv (bs, r->sector_num, &r->qiov, r->nb_sectors,
                                    rand_io_cb, r))) {
            die ("bdrv_aio_readv\n");
        }
        break;
    case OP_WRITE:
        if (!(acb = bdrv_aio_writev (bs, r->sector_num, &r->qiov,
                                     r->nb_sectors, rand_io_cb, r))) {
            die ("bdrv_aio_writev\n");
        }
        break;
    case OP_FLUSH:
        if (!(acb = bdrv_aio_flush (bs, rand_io_cb, r))) {
            die ("bdrv_aio_flush\n");
        }
        break;
    case OP_NULL:
        die ("OP_NULL");
        break;
    }

    sim_set_disk_io_return_code (0);    /* Reset to no failure state. */

    if (r->allow_cancel && cancel_prob > 0 &&
        random () / (double) RAND_MAX <= cancel_prob) {
        QDEBUG ("TESTER %03d: cancel %s test%" PRIX64 " sector_num=%" PRId64
                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
                r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
        printf ("TESTER %03d: cancel %s sector_num=%" PRId64
                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
                r->sector_num, r->nb_sectors, r->qiov.niov);
        bdrv_aio_cancel (acb);
        return FALSE;
    } else {
        return TRUE;
    }
}
/* Store data in the compact image. The argument 'soft_write' means
 * the store was caused by copy-on-read or prefetching, which need not
 * update metadata immediately. */
static BlockDriverAIOCB *store_data_in_compact_image (FvdAIOCB * acb,
                                                      int soft_write,
                                                      FvdAIOCB * parent_acb,
                                                      BlockDriverState * bs,
                                                      int64_t sector_num,
                                                      QEMUIOVector * orig_qiov,
                                                      const int nb_sectors,
                                                      BlockDriverCompletionFunc
                                                      * cb, void *opaque)
{
    BDRVFvdState *s = bs->opaque;

    const uint32_t first_chunk = sector_num / s->chunk_size;
    const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
    int table_dirty = FALSE;
    uint32_t chunk;
    int64_t start_sec;

    /* Check if storage space is allocated. */
    for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
        if (IS_EMPTY (s->table[chunk])) {
            uint32_t id = allocate_chunk (bs);
            if (IS_EMPTY (id)) {
                return NULL;
            }
            id |= DIRTY_TABLE;
            WRITE_TABLE (s->table[chunk], id);

            table_dirty = TRUE;
        } else if (IS_DIRTY (s->table[chunk])) {
            /* This is possible if a previous soft-write allocated the storage
             * space but did not flush the table entry change to the journal
             * and hence did not clean the dirty bit. This is also possible
             * with two concurrent hard-writes. The first hard-write allocated
             * the storage space but has not flushed the table entry change to
             * the journal yet and hence the table entry remains dirty. In
             * this case, the second hard-write will also try to flush this
             * dirty table entry to the journal. The outcome is correct since
             * they store the same metadata change in the journal (although
             * twice). For this race condition, we prefer to have two writes
             * to the journal rather than introducing a locking mechanism,
             * because this happens rarely and those two writes to the journal
             * are likely to be merged by the kernel into a single write since
             * they are likely to update back-to-back sectors in the journal.
             * A locking mechanism would be less efficient, because the large
             * size of chunks would cause unnecessary locking due to ``false
             * sharing'' of a chunk by two writes. */
            table_dirty = TRUE;
        }
    }

    const int update_table = (!soft_write && table_dirty);
    size_t iov_left;
    uint8_t *iov_buf;
    int nb, iov_index, nqiov, niov;
    uint32_t prev;

    if (first_chunk == last_chunk) {
        goto handle_one_continuous_region;
    }

    /* Count the number of qiov and iov needed to cover the continuous regions
     * of the compact image. */
    iov_left = orig_qiov->iov[0].iov_len;
    iov_buf = orig_qiov->iov[0].iov_base;
    iov_index = 0;
    nqiov = 0;
    niov = 0;
    prev = READ_TABLE (s->table[first_chunk]);

    /* Data in the first chunk. */
    nb = s->chunk_size - (sector_num % s->chunk_size);

    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
        uint32_t current = READ_TABLE (s->table[chunk]);
        int64_t data_size;
        if (chunk < last_chunk) {
            data_size = s->chunk_size;
        } else {
            data_size = (sector_num + nb_sectors) % s->chunk_size;
            if (data_size == 0) {
                data_size = s->chunk_size;
            }
        }

        if (current == prev + 1) {
            nb += data_size;    /* Continue the previous region. */
        } else {
            /* Terminate the previous region. */
            niov += count_iov (orig_qiov->iov, &iov_index, &iov_buf,
                               &iov_left, nb * 512);
            nqiov++;
            nb = data_size;     /* Data in the new region. */
        }
        prev = current;
    }

    if (nqiov == 0) {
      handle_one_continuous_region:
        /* A simple case. All data can be written out in one qiov and no new
         * chunks are allocated. */
        start_sec = READ_TABLE (s->table[first_chunk]) * s->chunk_size +
                                        (sector_num % s->chunk_size);

        if (!update_table && !acb) {
            if (parent_acb) {
                QDEBUG ("STORE: acb%llu-%p "
                        "store_directly_without_table_update\n",
                        parent_acb->uuid, parent_acb);
            }
            return bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec,
                                    orig_qiov, nb_sectors, cb, opaque);
        }

        if (!acb && !(acb = init_store_acb (soft_write, orig_qiov, bs,
                                            sector_num, nb_sectors, parent_acb,
                                            cb, opaque))) {
            return NULL;
        }

        QDEBUG ("STORE: acb%llu-%p store_directly sector_num=%" PRId64
                " nb_sectors=%d\n", acb->uuid, acb, acb->sector_num,
                acb->nb_sectors);

        acb->store.update_table = update_table;
        acb->store.num_children = 1;
        acb->store.one_child.hd_acb =
            bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec,
                             orig_qiov, nb_sectors,
                             finish_store_data_in_compact_image,
                             &acb->store.one_child);
        if (acb->store.one_child.hd_acb) {
            acb->store.one_child.acb = acb;
            return &acb->common;
        } else {
            my_qemu_aio_unref (acb);
            return NULL;
        }
    }

    /* qiov for the last continuous region. */
    niov += count_iov (orig_qiov->iov, &iov_index, &iov_buf,
                       &iov_left, nb * 512);
    nqiov++;
    ASSERT (iov_index == orig_qiov->niov - 1 && iov_left == 0);

    /* Need to submit multiple requests to the lower layer. */
    if (!acb && !(acb = init_store_acb (soft_write, orig_qiov, bs, sector_num,
                                        nb_sectors, parent_acb, cb, opaque))) {
        return NULL;
    }
    acb->store.update_table = update_table;
    acb->store.num_children = nqiov;

    if (!parent_acb) {
        QDEBUG ("STORE: acb%llu-%p start sector_num=%" PRId64
                " nb_sectors=%d\n", acb->uuid, acb, acb->sector_num,
                acb->nb_sectors);
    }

    /* Allocate memory and create multiple requests. */
    const size_t metadata_size = nqiov * (sizeof (CompactChildCB) +
                                          sizeof (QEMUIOVector)) +
                                 niov * sizeof (struct iovec);
    acb->store.children = (CompactChildCB *) my_qemu_malloc (metadata_size);
    QEMUIOVector *q = (QEMUIOVector *) (acb->store.children + nqiov);
    struct iovec *v = (struct iovec *) (q + nqiov);

    start_sec = READ_TABLE (s->table[first_chunk]) * s->chunk_size +
                                        (sector_num % s->chunk_size);
    nqiov = 0;
    iov_index = 0;
    iov_left = orig_qiov->iov[0].iov_len;
    iov_buf = orig_qiov->iov[0].iov_base;
    prev = READ_TABLE (s->table[first_chunk]);

    /* Data in the first chunk. */
    if (first_chunk == last_chunk) {
        nb = nb_sectors;
    } else {
        nb = s->chunk_size - (sector_num % s->chunk_size);
    }

    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
        uint32_t current = READ_TABLE (s->table[chunk]);
        int64_t data_size;
        if (chunk < last_chunk) {
            data_size = s->chunk_size;
        } else {
            data_size = (sector_num + nb_sectors) % s->chunk_size;
            if (data_size == 0) {
                data_size = s->chunk_size;
            }
        }

        if (current == prev + 1) {
            nb += data_size;    /* Continue the previous region. */
        } else {
            /* Terminate the previous continuous region. */
            niov = setup_iov (orig_qiov->iov, v, &iov_index,
                              &iov_buf, &iov_left, nb * 512);
            qemu_iovec_init_external (q, v, niov);
            QDEBUG ("STORE: acb%llu-%p create_child %d sector_num=%" PRId64
                    " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov,
                    start_sec, q->size / 512, q->niov);
            acb->store.children[nqiov].hd_acb =
                bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec, q,
                                 q->size / 512,
                                 finish_store_data_in_compact_image,
                                 &acb->store.children[nqiov]);
            if (!acb->store.children[nqiov].hd_acb) {
                goto fail;
            }
            acb->store.children[nqiov].acb = acb;
            v += niov;
            q++;
            nqiov++;
            start_sec = current * s->chunk_size; /* Begin of the new region. */
            nb = data_size;     /* Data in the new region. */
        }
        prev = current;
    }

    /* Request for the last chunk. */
    niov = setup_iov (orig_qiov->iov, v, &iov_index, &iov_buf,
                      &iov_left, nb * 512);
    ASSERT (iov_index == orig_qiov->niov - 1 && iov_left == 0);
    qemu_iovec_init_external (q, v, niov);

    QDEBUG ("STORE: acb%llu-%p create_child_last %d sector_num=%" PRId64
            " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov, start_sec,
            q->size / 512, q->niov);
    acb->store.children[nqiov].hd_acb =
        bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec, q,
                         q->size / 512, finish_store_data_in_compact_image,
                         &acb->store.children[nqiov]);

    if (acb->store.children[nqiov].hd_acb) {
        acb->store.children[nqiov].acb = acb;
        return &acb->common;
    }

    int i;
  fail:
    QDEBUG ("STORE: acb%llu-%p failed\n", acb->uuid, acb);
    for (i = 0; i < nqiov; i++) {
        bdrv_aio_cancel (acb->store.children[i].hd_acb);
    }
    my_qemu_free (acb->store.children);
    my_qemu_aio_unref (acb);
    return NULL;
}