/*
 * Allocate an entry in the cache.  At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT(ve->ve_hits != 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = lbolt;
	ve->ve_data = zio_buf_alloc(VCBS);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}
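/*
 * Fill in the znode's cached xattr nvlist (zp->z_xattr_cached) by reading
 * the packed DXATTR system attribute from disk.  If the attribute does not
 * exist yet, an empty nvlist is allocated instead.
 */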
int
zfs_sa_get_xattr(znode_t *zp)
{
	zfs_sb_t *zsb = ZTOZSB(zp);
	char *obj;
	int size;
	int error;

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
	ASSERT(!zp->z_xattr_cached);
	ASSERT(zp->z_is_sa);

	error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &size);
	if (error) {
		if (error == ENOENT)
			return nvlist_alloc(&zp->z_xattr_cached,
			    NV_UNIQUE_NAME, KM_SLEEP);
		else
			return (error);
	}

	obj = zio_buf_alloc(size);

	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size);
	if (error == 0)
		error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);

	zio_buf_free(obj, size);

	return (error);
}
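/*
 * Pack the znode's cached xattr nvlist and write it back to the DXATTR
 * system attribute in its own transaction.
 */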
int
zfs_sa_set_xattr(znode_t *zp)
{
	zfs_sb_t *zsb = ZTOZSB(zp);
	dmu_tx_t *tx;
	char *obj;
	size_t size;
	int error;

	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
	ASSERT(zp->z_xattr_cached);
	ASSERT(zp->z_is_sa);

	error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
	if (error)
		goto out;

	obj = zio_buf_alloc(size);

	error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
	    NV_ENCODE_XDR, KM_SLEEP);
	if (error)
		goto out_free;

	tx = dmu_tx_create(zsb->z_os);
	dmu_tx_hold_sa_create(tx, size);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb),
		    obj, size, tx);
		if (error)
			dmu_tx_abort(tx);
		else
			dmu_tx_commit(tx);
	}
out_free:
	zio_buf_free(obj, size);
out:
	return (error);
}
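/*
 * Select the next I/O to issue from this vdev's queue, aggregating
 * physically adjacent I/Os of the same flavor into a single delegated
 * parent I/O where possible.
 */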
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;
	int stretch;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are sufficiently adjacent and of
		 * the same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter requirement is necessary so that certain
		 * attributes of the I/O, such as whether it's a normal I/O
		 * or a scrub/resilver, can be preserved in the aggregate.
		 * We can include optional I/Os, but don't allow them
		 * to begin a range as they add no benefit in that situation.
		 */

		/*
		 * We keep track of the last non-optional I/O.
		 */
		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;

		/*
		 * Walk backwards through sufficiently contiguous I/Os
		 * recording the last non-optional I/O.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan &&
		    IO_GAP(dio, fio) <= maxgap) {
			fio = dio;
			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = fio;
		}

		/*
		 * Skip any initial optional I/Os.
		 */
		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
			fio = AVL_NEXT(t, fio);
			ASSERT(fio != NULL);
		}

		/*
		 * Walk forward through sufficiently contiguous I/Os.
		 */
		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan &&
		    IO_GAP(lio, dio) <= maxgap) {
			lio = dio;
			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = lio;
		}

		/*
		 * Now that we've established the range of the I/O aggregation
		 * we must decide what to do with trailing optional I/Os.
		 * For reads, there's nothing to do.  While we are unable to
		 * aggregate further, it's possible that a trailing optional
		 * I/O would allow the underlying device to aggregate with
		 * subsequent I/Os.  We must therefore determine if the next
		 * non-optional I/O is close enough to make aggregation
		 * worthwhile.
		 */
		stretch = B_FALSE;
		if (t != &vq->vq_read_tree && mio != NULL) {
			nio = lio;
			while ((dio = AVL_NEXT(t, nio)) != NULL &&
			    IO_GAP(nio, dio) == 0 &&
			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
				nio = dio;
				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
					stretch = B_TRUE;
					break;
				}
			}
		}

		if (stretch) {
			/* This may be a no-op. */
			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
		} else {
			while (lio != mio && lio != fio) {
				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
				lio = AVL_PREV(t, lio);
				ASSERT(lio != NULL);
			}
		}
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);
		aio->io_timestamp = fio->io_timestamp;

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_flags & ZIO_FLAG_NODATA) {
				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
				bzero((char *)aio->io_data + (dio->io_offset -
				    aio->io_offset), dio->io_size);
			} else if (dio->io_type == ZIO_TYPE_WRITE) {
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);
			}

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		vdev_queue_pending_add(vq, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it.  We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (fio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(fio);
		zio_execute(fio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	vdev_queue_pending_add(vq, fio);

	return (fio);
}
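/*
 * Try to build an aggregate I/O from queued I/Os that are physically
 * adjacent to the given zio and share its AGG_INHERIT flags.  Returns the
 * aggregate I/O, or NULL if no aggregation was possible.
 */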
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch;
	avl_tree_t *t;
	enum zio_flag flags;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
		return (NULL);

	/*
	 * The synchronous i/o queues are not sorted by LBA, so we can't
	 * find adjacent i/os.  These i/os tend to not be tightly clustered,
	 * or too large to aggregate, so this has little impact on performance.
	 */
	if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
		return (NULL);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-optional I/O.
	 */
	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
	t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
	while ((dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(dio, first) <= maxgap) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}

	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(last, dio) <= maxgap) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do.  While we are unable to
	 * aggregate further, it's possible that a trailing optional
	 * I/O would allow the underlying device to aggregate with
	 * subsequent I/Os.  We must therefore determine if the next
	 * non-optional I/O is close enough to make aggregation
	 * worthwhile.
	 */
	stretch = B_FALSE;
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/* This may be a no-op. */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			bzero((char *)aio->io_data + (dio->io_offset -
			    aio->io_offset), dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			bcopy(dio->io_data, (char *)aio->io_data +
			    (dio->io_offset - aio->io_offset),
			    dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	} while (dio != last);

	return (aio);
}
/*
 * Load the space map from disk into the specified range tree. Segments of
 * maptype are added to the range tree, other segment types are removed.
 *
 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 * The caller must be OK with this.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset, end, space;
	int error = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	end = space_map_length(sm);
	space = space_map_allocated(sm);

	VERIFY0(range_tree_space(rt));

	if (maptype == SM_FREE) {
		range_tree_add(rt, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize) {
		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
	}
	mutex_enter(sm->sm_lock);

	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);
		ASSERT3U(sm->sm_blksz, !=, 0);

		dprintf("object=%llu offset=%llx size=%llx\n",
		    space_map_object(sm), offset, size);

		mutex_exit(sm->sm_lock);
		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
		    entry_map, DMU_READ_PREFETCH);
		mutex_enter(sm->sm_lock);
		if (error != 0)
			break;

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;
			uint64_t offset, size;

			if (SM_DEBUG_DECODE(e))	/* Skip debug entries */
				continue;

			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
			    sm->sm_start;
			size = SM_RUN_DECODE(e) << sm->sm_shift;

			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
			VERIFY3U(offset, >=, sm->sm_start);
			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
			if (SM_TYPE_DECODE(e) == maptype) {
				VERIFY3U(range_tree_space(rt) + size, <=,
				    sm->sm_size);
				range_tree_add(rt, offset, size);
			} else {
				range_tree_remove(rt, offset, size);
			}
		}
	}

	if (error == 0)
		VERIFY3U(range_tree_space(rt), ==, space);
	else
		range_tree_vacate(rt, NULL, NULL);

	zio_buf_free(entry_map, bufsize);
	return (error);
}
int
zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
    uint64_t *destsizep, uint64_t *destbufsizep)
{
	uint64_t *word, *word_end;
	uint64_t ciosize, gapsize, destbufsize;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	char *dest;
	uint_t allzero;

	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
	ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);

	/*
	 * If the data is all zeroes, we don't even need to allocate
	 * a block for it.  We indicate this by setting *destsizep = 0.
	 */
	allzero = 1;
	word = src;
	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
	while (word < word_end) {
		if (*word++ != 0) {
			allzero = 0;
			break;
		}
	}
	if (allzero) {
		*destp = NULL;
		*destsizep = 0;
		*destbufsizep = 0;
		return (1);
	}

	if (cpfunc == ZIO_COMPRESS_EMPTY)
		return (0);

	/* Compress at least 12.5% */
	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
	if (destbufsize == 0)
		return (0);
	dest = zio_buf_alloc(destbufsize);

	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
	    (size_t)destbufsize, ci->ci_level);
	if (ciosize > destbufsize) {
		zio_buf_free(dest, destbufsize);
		return (0);
	}

	/* Cool.  We compressed at least as much as we were hoping to. */

	/* For security, make sure we don't write random heap crap to disk */
	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
	if (gapsize != 0) {
		bzero(dest + ciosize, gapsize);
		ciosize += gapsize;
	}

	ASSERT3U(ciosize, <=, destbufsize);
	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
	*destp = dest;
	*destsizep = ciosize;
	*destbufsizep = destbufsize;

	return (1);
}
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp = &ztp->zit_next_blk;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize.  We request a size that is the
	 * maximum of the previous used size, the current used size and
	 * the amount waiting in the queue.
	 */
	zil_blksz = MAX(zilog->zl_prev_used,
	    zilog->zl_cur_used + sizeof (*ztp));
	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
	if (error) {
		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);

		/*
		 * We dirty the dataset to ensure that zil_sync() will
		 * be called to remove this lwb from our zl_lwb_list.
		 * Failing to do so may leave an lwb with a NULL lwb_buf
		 * hanging around on the zl_lwb_list.
		 */
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		dmu_tx_commit(tx);

		/*
		 * We've just experienced an allocation failure, so we
		 * terminate the current lwb and send it on its way.
		 */
		ztp->zit_pad = 0;
		ztp->zit_nused = lwb->lwb_nused;
		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
		zio_nowait(lwb->lwb_zio);

		/*
		 * By returning NULL the caller will call txg_wait_synced().
		 */
		return (NULL);
	}

	ASSERT3U(bp->blk_birth, ==, txg);
	ztp->zit_pad = 0;
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = *bp;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_zio = NULL;

	/*
	 * Put new lwb at the end of the log chain.
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	mutex_exit(&zilog->zl_lock);

	/* Record the block for later vdev flushing */
	zil_add_block(zilog, &lwb->lwb_blk);

	/*
	 * Kick off the write for the old log block.
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	ASSERT(lwb->lwb_zio);
	zio_nowait(lwb->lwb_zio);

	return (nlwb);
}
/*
 * Create an on-disk intent log.
 */
static void
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * If we don't already have an initial log block or we have one
	 * but it's the wrong endianness then allocate one.
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_blk(zilog->zl_spa, &blk, txg);
			BP_ZERO(&blk);
		}

		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ,
		    &blk, NULL, txg);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0) {
		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = blk;
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
		lwb->lwb_max_txg = txg;
		lwb->lwb_zio = NULL;

		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
}
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are adjacent and of the
		 * same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter is necessary so that certain attributes
		 * of the I/O, such as whether it's a normal I/O or a
		 * scrub/resilver, can be preserved in the aggregate.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
			fio = dio;

		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
			lio = dio;
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio;
	avl_tree_t *tree;
	uint64_t size;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	tree = fio->io_vdev_tree;
	size = fio->io_size;

	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		dio->io_delegate_next = fio;
		fio = dio;
		size += dio->io_size;
	}

	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		lio->io_delegate_next = dio;
		lio = dio;
		size += dio->io_size;
	}

	if (fio != lio) {
		char *buf = zio_buf_alloc(size);
		uint64_t offset = 0;
		int nagg = 0;

		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
		    fio->io_offset, buf, size, fio->io_type,
		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_NOBOOKMARK, vdev_queue_agg_io_done, NULL);

		aio->io_delegate_list = fio;

		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == tree);

			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, buf + offset, dio->io_size);
			offset += dio->io_size;

			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);

			nagg++;
		}

		ASSERT(offset == size);

		dprintf("%5s T=%llu off=%8llx agg=%3d "
		    "old=%5llx new=%5llx\n",
		    zio_type_name[fio->io_type], fio->io_deadline,
		    fio->io_offset, nagg, fio->io_size, size);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == tree);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}