Пример #1
0
static int
vdev_file_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_file_t *vf = vd->vdev_tsd;
#ifdef LINUX_AIO
	struct iocb *iocbp = &zio->io_aio;
#endif

	ssize_t resid;
	int error;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		zio_vdev_io_bypass(zio);

		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = ENXIO;
			return (ZIO_PIPELINE_CONTINUE);
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:
			if (zfs_nocacheflush)
				break;

			/* This doesn't actually do much with O_DIRECT... */
			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
			    kcred, NULL);
			dprintf("fsync(%s) = %d\n", vdev_description(vd),
			    zio->io_error);

			if (vd->vdev_nowritecache) {
				zio->io_error = ENOTSUP;
				break;
			}

			/* Flush the write cache */
			error = flushwc(vf->vf_vnode);
			dprintf("flushwc(%s) = %d\n", vdev_description(vd),
			    error);

			if (error) {
#ifdef _KERNEL
				cmn_err(CE_WARN, "Failed to flush write cache "
				    "on device '%s'. Data on pool '%s' may be lost "
				    "if power fails. No further warnings will "
				    "be given.", vdev_description(vd),
				    spa_name(vd->vdev_spa));
#endif

				vd->vdev_nowritecache = B_TRUE;
				zio->io_error = error;
			}

			break;
		default:
			zio->io_error = ENOTSUP;
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * In the kernel, don't bother double-caching, but in userland,
	 * we want to test the vdev_cache code.
	 */
	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
		return (ZIO_PIPELINE_STOP);

	if ((zio = vdev_queue_io(zio)) == NULL)
		return (ZIO_PIPELINE_STOP);

	/* XXPOLICY */
	if (zio->io_type == ZIO_TYPE_WRITE)
		error = vdev_writeable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
	else
		error = vdev_readable(vd) ? vdev_error_inject(vd, zio) : ENXIO;
	error = (vd->vdev_remove_wanted || vd->vdev_is_failing) ? ENXIO : error;
	if (error) {
		zio->io_error = error;
		zio_interrupt(zio);
		return (ZIO_PIPELINE_STOP);
	}

#ifdef LINUX_AIO
	if (zio->io_aio_ctx && zio->io_aio_ctx->zac_enabled) {
		if (zio->io_type == ZIO_TYPE_READ)
			io_prep_pread(&zio->io_aio, vf->vf_vnode->v_fd,
			    zio->io_data, zio->io_size, zio->io_offset);
		else
			io_prep_pwrite(&zio->io_aio, vf->vf_vnode->v_fd,
			    zio->io_data, zio->io_size, zio->io_offset);

		zio->io_aio.data = zio;

		do {
			error = io_submit(zio->io_aio_ctx->zac_ctx, 1, &iocbp);
		} while (error == -EINTR);

		if (error < 0) {
			zio->io_error = -error;
			zio_interrupt(zio);
		} else
			VERIFY(error == 1);

		return (ZIO_PIPELINE_STOP);
	}
#endif

	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);

	if (resid != 0 && zio->io_error == 0)
		zio->io_error = ENOSPC;

	zio_interrupt(zio);

	return (ZIO_PIPELINE_STOP);
}
Пример #2
0
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch;
	avl_tree_t *t;
	enum zio_flag flags;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
		return (NULL);

	/*
	 * The synchronous i/o queues are not sorted by LBA, so we can't
	 * find adjacent i/os.  These i/os tend to not be tightly clustered,
	 * or too large to aggregate, so this has little impact on performance.
	 */
	if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
		return (NULL);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-option I/O.
	 */
	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
	t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
	while ((dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(dio, first) <= maxgap) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}

	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(last, dio) <= maxgap) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do. While we are unable to
	 * aggregate further, it's possible that a trailing optional
	 * I/O would allow the underlying device to aggregate with
	 * subsequent I/Os. We must therefore determine if the next
	 * non-optional I/O is close enough to make aggregation
	 * worthwhile.
	 */
	stretch = B_FALSE;
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/* This may be a no-op. */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			bzero((char *)aio->io_data + (dio->io_offset -
			    aio->io_offset), dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			bcopy(dio->io_data, (char *)aio->io_data +
			    (dio->io_offset - aio->io_offset),
			    dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	} while (dio != last);
Пример #3
0
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;
	int stretch;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are sufficiently adjacent and of
		 * the same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter requirement is necessary so that certain
		 * attributes of the I/O, such as whether it's a normal I/O
		 * or a scrub/resilver, can be preserved in the aggregate.
		 * We can include optional I/Os, but don't allow them
		 * to begin a range as they add no benefit in that situation.
		 */

		/*
		 * We keep track of the last non-optional I/O.
		 */
		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;

		/*
		 * Walk backwards through sufficiently contiguous I/Os
		 * recording the last non-option I/O.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan &&
		    IO_GAP(dio, fio) <= maxgap) {
			fio = dio;
			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = fio;
		}

		/*
		 * Skip any initial optional I/Os.
		 */
		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
			fio = AVL_NEXT(t, fio);
			ASSERT(fio != NULL);
		}

		/*
		 * Walk forward through sufficiently contiguous I/Os.
		 */
		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan &&
		    IO_GAP(lio, dio) <= maxgap) {
			lio = dio;
			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = lio;
		}

		/*
		 * Now that we've established the range of the I/O aggregation
		 * we must decide what to do with trailing optional I/Os.
		 * For reads, there's nothing to do. While we are unable to
		 * aggregate further, it's possible that a trailing optional
		 * I/O would allow the underlying device to aggregate with
		 * subsequent I/Os. We must therefore determine if the next
		 * non-optional I/O is close enough to make aggregation
		 * worthwhile.
		 */
		stretch = B_FALSE;
		if (t != &vq->vq_read_tree && mio != NULL) {
			nio = lio;
			while ((dio = AVL_NEXT(t, nio)) != NULL &&
			    IO_GAP(nio, dio) == 0 &&
			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
				nio = dio;
				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
					stretch = B_TRUE;
					break;
				}
			}
		}

		if (stretch) {
			/* This may be a no-op. */
			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
		} else {
			while (lio != mio && lio != fio) {
				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
				lio = AVL_PREV(t, lio);
				ASSERT(lio != NULL);
			}
		}
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);
		aio->io_timestamp = fio->io_timestamp;

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_flags & ZIO_FLAG_NODATA) {
				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
				bzero((char *)aio->io_data + (dio->io_offset -
				    aio->io_offset), dio->io_size);
			} else if (dio->io_type == ZIO_TYPE_WRITE) {
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);
			}

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		vdev_queue_pending_add(vq, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it. We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (fio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(fio);
		zio_execute(fio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	vdev_queue_pending_add(vq, fio);

	return (fio);
}
Пример #4
0
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (0);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (0);
}
Пример #5
0
static int
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	struct buf *bp;
	vfs_context_t context;
	int flags, error = 0;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		zio_vdev_io_bypass(zio);

		/* XXPOLICY */
		if (vdev_is_dead(vd)) {
			zio->io_error = ENXIO;
			//zio_next_stage_async(zio);
			return (ZIO_PIPELINE_CONTINUE);
            //return;
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			context = vfs_context_create((vfs_context_t)0);
			error = VNOP_IOCTL(dvd->vd_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
			(void) vfs_context_rele(context);

			if (error == 0)
				vdev_disk_ioctl_done(zio, error);
			else
				error = ENOTSUP;

			if (error == 0) {
				/*
				 * The ioctl will be done asychronously,
				 * and will call vdev_disk_ioctl_done()
				 * upon completion.
				 */
				return ZIO_PIPELINE_STOP;
			} else if (error == ENOTSUP || error == ENOTTY) {
				/*
				 * If we get ENOTSUP or ENOTTY, we know that
				 * no future attempts will ever succeed.
				 * In this case we set a persistent bit so
				 * that we don't bother with the ioctl in the
				 * future.
				 */
				vd->vdev_nowritecache = B_TRUE;
			}
			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		//zio_next_stage_async(zio);
        return (ZIO_PIPELINE_CONTINUE);
	}

	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
        return (ZIO_PIPELINE_STOP);
    //		return;

	if ((zio = vdev_queue_io(zio)) == NULL)
        return (ZIO_PIPELINE_CONTINUE);
    //		return;

	flags = (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	//flags |= B_NOCACHE;

	if (zio->io_flags & ZIO_FLAG_FAILFAST)
		flags |= B_FAILFAST;

	/*
	 * Check the state of this device to see if it has been offlined or
	 * is in an error state.  If the device was offlined or closed,
	 * dvd will be NULL and buf_alloc below will fail
	 */
	//error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
	if (vdev_is_dead(vd)) {
        error = ENXIO;
    }

	if (error) {
		zio->io_error = error;
		//zio_next_stage_async(zio);
		return (ZIO_PIPELINE_CONTINUE);
	}

	bp = buf_alloc(dvd->vd_devvp);

	ASSERT(bp != NULL);
	ASSERT(zio->io_data != NULL);
	ASSERT(zio->io_size != 0);

	buf_setflags(bp, flags);
	buf_setcount(bp, zio->io_size);
	buf_setdataptr(bp, (uintptr_t)zio->io_data);
    if (dvd->vd_ashift) {
        buf_setlblkno(bp, zio->io_offset>>dvd->vd_ashift);
        buf_setblkno(bp,  zio->io_offset>>dvd->vd_ashift);
    } else {
Пример #6
0
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio->io_delegate_next = fio->io_delegate_list;
			fio->io_delegate_list = zio;
			zio_vdev_io_bypass(zio);
			mutex_exit(&vc->vc_lock);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		zio_next_stage(zio);
		return (0);
	}

	if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
		mutex_exit(&vc->vc_lock);
		return (EINVAL);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
	    vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	fio->io_delegate_list = zio;
	zio_vdev_io_bypass(zio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);

	return (0);
}
Пример #7
0
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch = B_FALSE;
	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
		return (NULL);

	/*
	 * Prevent users from setting the zfs_vdev_aggregation_limit
	 * tuning larger than SPA_MAXBLOCKSIZE.
	 */
	zfs_vdev_aggregation_limit =
	    MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-optional I/O.
	 */
	while ((dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(dio, first) <= maxgap &&
	    dio->io_type == zio->io_type) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}


	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 * The aggregation limit does not apply to optional i/os, so that
	 * we can issue contiguous writes even if they are larger than the
	 * aggregation limit.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    (IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit ||
	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
	    IO_GAP(last, dio) <= maxgap &&
	    dio->io_type == zio->io_type) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do. While we are unable to
	 * aggregate further, it's possible that a trailing optional
	 * I/O would allow the underlying device to aggregate with
	 * subsequent I/Os. We must therefore determine if the next
	 * non-optional I/O is close enough to make aggregation
	 * worthwhile.
	 */
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/*
		 * We are going to include an optional io in our aggregated
		 * span, thus closing the write gap.  Only mandatory i/os can
		 * start aggregated spans, so make sure that the next i/o
		 * after our span is mandatory.
		 */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		/* do not include the optional i/o */
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    abd_alloc_for_io(size, B_TRUE), size, first->io_type,
	    zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			abd_zero_off(aio->io_abd,
			    dio->io_offset - aio->io_offset, dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			abd_copy_off(aio->io_abd, dio->io_abd,
			    dio->io_offset - aio->io_offset, 0, dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	} while (dio != last);
Пример #8
0
static void
vdev_file_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_file_t *vf = vd->vdev_tsd;
	ssize_t resid;
	int error;

	if (zio->io_type == ZIO_TYPE_IOCTL) {
		zio_vdev_io_bypass(zio);

		/* XXPOLICY */
		if (vdev_is_dead(vd)) {
			zio->io_error = ENXIO;
			zio_next_stage_async(zio);
			return;
		}

		switch (zio->io_cmd) {
		case DKIOCFLUSHWRITECACHE:
			zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
			    kcred);
			dprintf("fsync(%s) = %d\n", vdev_description(vd),
			    zio->io_error);
			break;
		default:
			zio->io_error = ENOTSUP;
		}

		zio_next_stage_async(zio);
		return;
	}

	/*
	 * In the kernel, don't bother double-caching, but in userland,
	 * we want to test the vdev_cache code.
	 */
#ifndef _KERNEL
	if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
		return;
#endif

	if ((zio = vdev_queue_io(zio)) == NULL)
		return;

	/* XXPOLICY */
	error = vdev_is_dead(vd) ? ENXIO : vdev_error_inject(vd, zio);
	if (error) {
		zio->io_error = error;
		zio_next_stage_async(zio);
		return;
	}

#ifdef ZFS_READONLY_KEXT
	if (zio->io_type == ZIO_TYPE_WRITE) {
		zio->io_error = 0;
		zio_next_stage_async(zio);
		return;
	}
#endif /* ZFS_READONLY_KEXT */

	zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
	    UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data,
	    zio->io_size, zio->io_offset, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, kcred, &resid);

	if (resid != 0 && zio->io_error == 0)
		zio->io_error = ENOSPC;

	zio_next_stage_async(zio);
}
Пример #9
0
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are adjacent and of the
		 * same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter is necessary so that certain attributes
		 * of the I/O, such as whether it's a normal I/O or a
		 * scrub/resilver, can be preserved in the aggregate.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
			fio = dio;

		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
			lio = dio;
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}
Пример #10
0
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio;
	avl_tree_t *tree;
	uint64_t size;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	tree = fio->io_vdev_tree;
	size = fio->io_size;

	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		dio->io_delegate_next = fio;
		fio = dio;
		size += dio->io_size;
	}

	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		lio->io_delegate_next = dio;
		lio = dio;
		size += dio->io_size;
	}

	if (fio != lio) {
		char *buf = zio_buf_alloc(size);
		uint64_t offset = 0;
		int nagg = 0;

		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
		    fio->io_offset, buf, size, fio->io_type,
		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_NOBOOKMARK,
		    vdev_queue_agg_io_done, NULL);

		aio->io_delegate_list = fio;

		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == tree);
			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, buf + offset, dio->io_size);
			offset += dio->io_size;
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			nagg++;
		}

		ASSERT(offset == size);

		dprintf("%5s  T=%llu  off=%8llx  agg=%3d  "
		    "old=%5llx  new=%5llx\n",
		    zio_type_name[fio->io_type],
		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == tree);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}