Example #1
void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
	vq->vq_vdev = vd;

	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
	    vdev_queue_offset_compare, sizeof (zio_t),
	    offsetof(struct zio, io_offset_node));
	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
	    vdev_queue_offset_compare, sizeof (zio_t),
	    offsetof(struct zio, io_offset_node));

	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		int (*compfn) (const void *, const void *);

		/*
		 * The synchronous i/o queues are dispatched in FIFO rather
		 * than LBA order.  This provides more consistent latency for
		 * these i/os.
		 */
		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
			compfn = vdev_queue_timestamp_compare;
		else
			compfn = vdev_queue_offset_compare;

		avl_create(vdev_queue_class_tree(vq, p), compfn,
		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
	}
}
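
For reference, the two comparators wired up above are what give each queue its ordering: offset order for the LBA-sorted queues, timestamp order for the FIFO sync queues. A minimal sketch of their shape, assuming the zio_t fields io_offset and io_timestamp used elsewhere in these examples (upstream versions differ in details such as the tie-breaking helper):

/* Sketch: order zios by starting offset; break ties by address. */
static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);
	return (0);
}

/* Sketch: order zios by queue-entry timestamp for FIFO dispatch. */
static int
vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_timestamp < z2->io_timestamp)
		return (-1);
	if (z1->io_timestamp > z2->io_timestamp)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);
	return (0);
}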
Example #2
void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
		avl_destroy(vdev_queue_class_tree(vq, p));
	avl_destroy(&vq->vq_active_tree);
	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));

	mutex_destroy(&vq->vq_lock);
}
Example #3
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);

	mutex_enter(&spa->spa_iokstat_lock);
	spa->spa_queue_stats[zio->io_priority].spa_queued++;
	if (spa->spa_iokstat != NULL)
		kstat_waitq_enter(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
}
Example #4
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	spa_stats_history_t *ssh = &spa->spa_stats.io_history;

	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);

	if (ssh->kstat != NULL) {
		mutex_enter(&ssh->lock);
		kstat_waitq_exit(ssh->kstat->ks_data);
		mutex_exit(&ssh->lock);
	}
}
Example #5
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	avl_tree_t *qtt;
	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
	qtt = vdev_queue_type_tree(vq, zio->io_type);
	if (qtt)
		avl_add(qtt, zio);

#ifdef illumos
	mutex_enter(&spa->spa_iokstat_lock);
	spa->spa_queue_stats[zio->io_priority].spa_queued++;
	if (spa->spa_iokstat != NULL)
		kstat_waitq_enter(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
#endif
}
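
Examples #6 and #7 below lean on the IO_SPAN and IO_GAP helpers to decide whether neighboring I/Os are close enough to aggregate. A sketch of those macros, assuming the definitions commonly found in vdev_queue.c: IO_SPAN measures from the start of the first I/O to the end of the last, and IO_GAP measures the hole between the end of one I/O and the start of the next (zero when they are adjacent):

/*
 * Bytes covered from the start of fio to the end of lio, and the
 * distance between the end of fio and the start of lio.
 */
#define	IO_SPAN(fio, lio)	((lio)->io_offset + (lio)->io_size - \
	(fio)->io_offset)
#define	IO_GAP(fio, lio)	(-IO_SPAN(lio, fio))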
Example #6
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch = B_FALSE;
	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
		return (NULL);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-optional I/O.
	 */
	while ((dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(dio, first) <= maxgap) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}

	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(last, dio) <= maxgap) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do. While we are unable to
	 * aggregate further, it's possible that a trailing optional
	 * I/O would allow the underlying device to aggregate with
	 * subsequent I/Os. We must therefore determine if the next
	 * non-optional I/O is close enough to make aggregation
	 * worthwhile.
	 */
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/* This may be a no-op. */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			bzero((char *)aio->io_data + (dio->io_offset -
			    aio->io_offset), dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			bcopy(dio->io_data, (char *)aio->io_data +
			    (dio->io_offset - aio->io_offset),
			    dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	} while (dio != last);

	return (aio);
}
Example #7
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	zio_link_t *zl = NULL;
	uint64_t maxgap = 0;
	uint64_t size;
	uint64_t limit;
	int maxblocksize;
	boolean_t stretch;
	avl_tree_t *t;
	enum zio_flag flags;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
	if (vq->vq_vdev->vdev_nonrot)
		limit = zfs_vdev_aggregation_limit_non_rotating;
	else
		limit = zfs_vdev_aggregation_limit;
	limit = MAX(MIN(limit, maxblocksize), 0);

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
		return (NULL);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-optional I/O.
	 */
	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
	t = vdev_queue_type_tree(vq, zio->io_type);
	while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= limit &&
	    IO_GAP(dio, first) <= maxgap &&
	    dio->io_type == zio->io_type) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}

	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 * The aggregation limit does not apply to optional i/os, so that
	 * we can issue contiguous writes even if they are larger than the
	 * aggregation limit.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    (IO_SPAN(first, dio) <= limit ||
	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
	    IO_SPAN(first, dio) <= maxblocksize &&
	    IO_GAP(last, dio) <= maxgap &&
	    dio->io_type == zio->io_type) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do. While we are unable to
	 * aggregate further, it's possible that a trailing optional
	 * I/O would allow the underlying device to aggregate with
	 * subsequent I/Os. We must therefore determine if the next
	 * non-optional I/O is close enough to make aggregation
	 * worthwhile.
	 */
	stretch = B_FALSE;
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/*
		 * We are going to include an optional io in our aggregated
		 * span, thus closing the write gap.  Only mandatory i/os can
		 * start aggregated spans, so make sure that the next i/o
		 * after our span is mandatory.
		 */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		/* do not include the optional i/o */
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, maxblocksize);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    abd_alloc_for_io(size, B_TRUE), size, first->io_type,
	    zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			abd_zero_off(aio->io_abd,
			    dio->io_offset - aio->io_offset, dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			abd_copy_off(aio->io_abd, dio->io_abd,
			    dio->io_offset - aio->io_offset, 0, dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
	} while (dio != last);

	/*
	 * We need to drop the vdev queue's lock during zio_execute() to
	 * avoid a deadlock that we could encounter due to lock order
	 * reversal between vq_lock and io_lock in zio_change_priority().
	 */
	mutex_exit(&vq->vq_lock);
	while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
		ASSERT3U(dio->io_type, ==, aio->io_type);

		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	}
	mutex_enter(&vq->vq_lock);

	return (aio);
}
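
Both aggregation variants register vdev_queue_agg_io_done as the completion callback for the delegated parent I/O. A hedged sketch of that callback for the ABD-based code in Example #7 (the buffer-based variant in Example #6 would do the same with bcopy() and zio_buf_free()): on read completion it fans the aggregate buffer back out to each delegated child, then releases the scratch buffer.

static void
vdev_queue_agg_io_done(zio_t *aio)
{
	/* Sketch: push read data back out to the delegated children. */
	if (aio->io_type == ZIO_TYPE_READ) {
		zio_t *pio;
		zio_link_t *zl = NULL;

		while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
			abd_copy_off(pio->io_abd, aio->io_abd, 0,
			    pio->io_offset - aio->io_offset, pio->io_size);
		}
	}

	/* Free the scratch buffer allocated in vdev_queue_aggregate(). */
	abd_free(aio->io_abd);
}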