Code Example #1
/*
 * Allocate an entry in the cache.  At this point we don't have the data,
 * we're just creating a placeholder so that multiple threads don't all
 * go off and read the same blocks.
 */
static vdev_cache_entry_t *
vdev_cache_allocate(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
	vdev_cache_entry_t *ve;

	ASSERT(MUTEX_HELD(&vc->vc_lock));

	if (zfs_vdev_cache_size == 0)
		return (NULL);

	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT(ve->ve_hits != 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = lbolt;
	ve->ve_data = zio_buf_alloc(VCBS);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}
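
The eviction check works in bytes: the AVL node count is shifted left by zfs_vdev_cache_bshift, since each entry caches one VCBS-sized block (VCBS is 1 << zfs_vdev_cache_bshift). Below is a minimal userland sketch of that arithmetic; the VDEV_CACHE_BSHIFT / VDEV_CACHE_SIZE macros and their 64 KB / 10 MB values are hypothetical stand-ins for the real tunables, not values guaranteed by any release.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the zfs_vdev_cache_* tunables. */
#define	VDEV_CACHE_BSHIFT	16			/* 64 KB per cache entry */
#define	VDEV_CACHE_SIZE		(10ULL << 20)		/* 10 MB total cache */

int
main(void)
{
	uint64_t nentries = 170;

	/* Same test as vdev_cache_allocate(): entry count << bshift = bytes used. */
	uint64_t used = nentries << VDEV_CACHE_BSHIFT;

	if (used > VDEV_CACHE_SIZE)
		printf("cache full (%llu > %llu): evict the LRU entry first\n",
		    (unsigned long long)used, (unsigned long long)VDEV_CACHE_SIZE);
	else
		printf("room for another entry (%llu <= %llu)\n",
		    (unsigned long long)used, (unsigned long long)VDEV_CACHE_SIZE);
	return (0);
}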
Code Example #2
File: zfs_sa.c  Project: Alyseo/zfs
int
zfs_sa_get_xattr(znode_t *zp)
{
	zfs_sb_t *zsb = ZTOZSB(zp);
	char *obj;
	int size;
	int error;

	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
	ASSERT(!zp->z_xattr_cached);
	ASSERT(zp->z_is_sa);

	error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), &size);
	if (error) {
		if (error == ENOENT)
			return nvlist_alloc(&zp->z_xattr_cached,
			    NV_UNIQUE_NAME, KM_SLEEP);
		else
			return (error);
	}

	obj = zio_buf_alloc(size);

	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb), obj, size);
	if (error == 0)
		error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);

	zio_buf_free(obj, size);

	return (error);
}
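
The whole xattr set is stored in a single SA attribute as a packed nvlist: sa_lookup() fetches the packed bytes and nvlist_unpack() rebuilds the in-memory list that z_xattr_cached points at. The same round trip can be exercised from userland with libnvpair; the sketch below is illustrative only (the "user.example" attribute name is made up, and assert() stands in for real error handling). Build with -lnvpair.

#include <libnvpair.h>
#include <stdlib.h>
#include <assert.h>

int
main(void)
{
	nvlist_t *nvl, *copy;
	char *buf;
	size_t size;

	/* Build a small list, the in-memory analogue of z_xattr_cached. */
	assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
	assert(nvlist_add_string(nvl, "user.example", "hello") == 0);

	/* Size it and pack it into a contiguous buffer, as zfs_sa_set_xattr() does. */
	assert(nvlist_size(nvl, &size, NV_ENCODE_XDR) == 0);
	buf = malloc(size);
	assert(nvlist_pack(nvl, &buf, &size, NV_ENCODE_XDR, 0) == 0);

	/* Unpack it again, as zfs_sa_get_xattr() does after sa_lookup(). */
	assert(nvlist_unpack(buf, size, &copy, 0) == 0);

	nvlist_free(nvl);
	nvlist_free(copy);
	free(buf);
	return (0);
}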
Code Example #3
File: zfs_sa.c  Project: avg-I/zfs
int
zfs_sa_set_xattr(znode_t *zp)
{
    zfs_sb_t *zsb = ZTOZSB(zp);
    dmu_tx_t *tx;
    char *obj;
    size_t size;
    int error;

    ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
    ASSERT(zp->z_xattr_cached);
    ASSERT(zp->z_is_sa);

    error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
    if (error)
        goto out;

    obj = zio_buf_alloc(size);

    error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
                        NV_ENCODE_XDR, KM_SLEEP);
    if (error)
        goto out_free;

    tx = dmu_tx_create(zsb->z_os);
    dmu_tx_hold_sa_create(tx, size);
    dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);

    error = dmu_tx_assign(tx, TXG_WAIT);
    if (error) {
        dmu_tx_abort(tx);
    } else {
        error = sa_update(zp->z_sa_hdl, SA_ZPL_DXATTR(zsb),
                          obj, size, tx);
        if (error)
            dmu_tx_abort(tx);
        else
            dmu_tx_commit(tx);
    }
out_free:
    zio_buf_free(obj, size);
out:
    return (error);
}
Code Example #4
File: vdev_queue.c  Project: mcarpenter/illumos-gate
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;
	int stretch;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are sufficiently adjacent and of
		 * the same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter requirement is necessary so that certain
		 * attributes of the I/O, such as whether it's a normal I/O
		 * or a scrub/resilver, can be preserved in the aggregate.
		 * We can include optional I/Os, but don't allow them
		 * to begin a range as they add no benefit in that situation.
		 */

		/*
		 * We keep track of the last non-optional I/O.
		 */
		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;

		/*
		 * Walk backwards through sufficiently contiguous I/Os
		 * recording the last non-optional I/O.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan &&
		    IO_GAP(dio, fio) <= maxgap) {
			fio = dio;
			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = fio;
		}

		/*
		 * Skip any initial optional I/Os.
		 */
		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
			fio = AVL_NEXT(t, fio);
			ASSERT(fio != NULL);
		}

		/*
		 * Walk forward through sufficiently contiguous I/Os.
		 */
		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan &&
		    IO_GAP(lio, dio) <= maxgap) {
			lio = dio;
			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
				mio = lio;
		}

		/*
		 * Now that we've established the range of the I/O aggregation
		 * we must decide what to do with trailing optional I/Os.
		 * For reads, there's nothing to do. For writes, while we are
		 * unable to aggregate further, it's possible that a trailing
		 * optional I/O would allow the underlying device to aggregate
		 * with subsequent I/Os. We must therefore determine whether
		 * the next non-optional I/O is close enough to make
		 * aggregation worthwhile.
		 */
		stretch = B_FALSE;
		if (t != &vq->vq_read_tree && mio != NULL) {
			nio = lio;
			while ((dio = AVL_NEXT(t, nio)) != NULL &&
			    IO_GAP(nio, dio) == 0 &&
			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
				nio = dio;
				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
					stretch = B_TRUE;
					break;
				}
			}
		}

		if (stretch) {
			/* This may be a no-op. */
			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
		} else {
			while (lio != mio && lio != fio) {
				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
				lio = AVL_PREV(t, lio);
				ASSERT(lio != NULL);
			}
		}
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);
		aio->io_timestamp = fio->io_timestamp;

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_flags & ZIO_FLAG_NODATA) {
				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
				bzero((char *)aio->io_data + (dio->io_offset -
				    aio->io_offset), dio->io_size);
			} else if (dio->io_type == ZIO_TYPE_WRITE) {
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);
			}

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		vdev_queue_pending_add(vq, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it. We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (fio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(fio);
		zio_execute(fio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	vdev_queue_pending_add(vq, fio);

	return (fio);
}
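
The aggregation walk above leans on the IO_SPAN() and IO_GAP() helpers defined near the top of vdev_queue.c: IO_SPAN(a, b) is the distance from the start of a to the end of b, and IO_GAP(a, b) is the hole between the end of a and the start of b. A self-contained sketch with equivalent macros and a stripped-down stand-in for zio_t:

#include <stdint.h>
#include <stdio.h>

/* Stripped-down stand-in for zio_t; only the fields the macros touch. */
typedef struct {
	uint64_t io_offset;
	uint64_t io_size;
} io_t;

/* Equivalent in effect to the vdev_queue.c helpers. */
#define	IO_SPAN(f, l)	((l)->io_offset + (l)->io_size - (f)->io_offset)
#define	IO_GAP(f, l)	((l)->io_offset - ((f)->io_offset + (f)->io_size))

int
main(void)
{
	io_t a = { .io_offset = 0,     .io_size = 8192 };
	io_t b = { .io_offset = 12288, .io_size = 4096 };

	/* span = 16384 bytes end to end; gap = 4096 bytes of hole between them */
	printf("span=%llu gap=%llu\n",
	    (unsigned long long)IO_SPAN(&a, &b),
	    (unsigned long long)IO_GAP(&a, &b));
	return (0);
}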
Code Example #5
File: vdev_queue.c  Project: adonis1104/freebsd
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch;
	avl_tree_t *t;
	enum zio_flag flags;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
		return (NULL);

	/*
	 * The synchronous i/o queues are not sorted by LBA, so we can't
	 * find adjacent i/os.  These i/os tend to not be tightly clustered,
	 * or too large to aggregate, so this has little impact on performance.
	 */
	if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
	    zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
		return (NULL);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-optional I/O.
	 */
	flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
	t = &vq->vq_class[zio->io_priority].vqc_queued_tree;
	while ((dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(dio, first) <= maxgap) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}

	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(last, dio) <= maxgap) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do. For writes, while we are
	 * unable to aggregate further, it's possible that a trailing
	 * optional I/O would allow the underlying device to aggregate
	 * with subsequent I/Os. We must therefore determine whether
	 * the next non-optional I/O is close enough to make
	 * aggregation worthwhile.
	 */
	stretch = B_FALSE;
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/* This may be a no-op. */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			bzero((char *)aio->io_data + (dio->io_offset -
			    aio->io_offset), dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			bcopy(dio->io_data, (char *)aio->io_data +
			    (dio->io_offset - aio->io_offset),
			    dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	} while (dio != last);

	return (aio);
}
Code Example #6
/*
 * Load the space map from disk into the specified range tree. Segments of maptype
 * are added to the range tree, other segment types are removed.
 *
 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 * The caller must be OK with this.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset, end, space;
	int error = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	end = space_map_length(sm);
	space = space_map_allocated(sm);

	VERIFY0(range_tree_space(rt));

	if (maptype == SM_FREE) {
		range_tree_add(rt, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize) {
		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
	}
	mutex_enter(sm->sm_lock);

	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);
		ASSERT3U(sm->sm_blksz, !=, 0);

		dprintf("object=%llu  offset=%llx  size=%llx\n",
		    space_map_object(sm), offset, size);

		mutex_exit(sm->sm_lock);
		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
		    entry_map, DMU_READ_PREFETCH);
		mutex_enter(sm->sm_lock);
		if (error != 0)
			break;

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;
			uint64_t offset, size;

			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
				continue;

			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
			    sm->sm_start;
			size = SM_RUN_DECODE(e) << sm->sm_shift;

			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
			VERIFY3U(offset, >=, sm->sm_start);
			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
			if (SM_TYPE_DECODE(e) == maptype) {
				VERIFY3U(range_tree_space(rt) + size, <=,
				    sm->sm_size);
				range_tree_add(rt, offset, size);
			} else {
				range_tree_remove(rt, offset, size);
			}
		}
	}

	if (error == 0)
		VERIFY3U(range_tree_space(rt), ==, space);

	zio_buf_free(entry_map, bufsize);

	return (error);
}
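
As the header comment warns, sm_lock is dropped around every dmu_read() so that other space-map users are not stuck waiting behind disk I/O, and re-taken before the shared range tree is touched. Below is a minimal pthread sketch of that drop-the-lock-around-blocking-work pattern; read_chunk() and the other names are hypothetical stand-ins, not ZFS interfaces.

#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t sm_lock = PTHREAD_MUTEX_INITIALIZER;

/* Hypothetical stand-in for dmu_read(): slow, blocking, needs no lock. */
static int
read_chunk(char *buf, size_t len)
{
	(void) buf;
	(void) len;
	sleep(1);		/* pretend this is disk I/O */
	return (0);
}

/* Caller holds sm_lock, as space_map_load() asserts on entry. */
static int
load_locked(char *buf, size_t total, size_t bufsize)
{
	int error = 0;

	for (size_t off = 0; off < total && error == 0; off += bufsize) {
		size_t len = (total - off < bufsize) ? total - off : bufsize;

		pthread_mutex_unlock(&sm_lock);	/* don't hold it across I/O */
		error = read_chunk(buf, len);
		pthread_mutex_lock(&sm_lock);	/* re-take before shared state */

		/* ... process the chunk under the lock, as the real loop does ... */
	}
	return (error);
}

int
main(void)
{
	char buf[4096];

	pthread_mutex_lock(&sm_lock);
	(void) load_locked(buf, sizeof (buf) * 4, sizeof (buf));
	pthread_mutex_unlock(&sm_lock);
	return (0);
}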
Code Example #7
int
zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp,
    uint64_t *destsizep, uint64_t *destbufsizep)
{
	uint64_t *word, *word_end;
	uint64_t ciosize, gapsize, destbufsize;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	char *dest;
	uint_t allzero;

	ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS);
	ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);

	/*
	 * If the data is all zeroes, we don't even need to allocate
	 * a block for it.  We indicate this by setting *destsizep = 0.
	 */
	allzero = 1;
	word = src;
	word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize);
	while (word < word_end) {
		if (*word++ != 0) {
			allzero = 0;
			break;
		}
	}
	if (allzero) {
		*destp = NULL;
		*destsizep = 0;
		*destbufsizep = 0;
		return (1);
	}

	if (cpfunc == ZIO_COMPRESS_EMPTY)
		return (0);

	/* Compress at least 12.5% */
	destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);
	if (destbufsize == 0)
		return (0);
	dest = zio_buf_alloc(destbufsize);

	ciosize = ci->ci_compress(src, dest, (size_t)srcsize,
	    (size_t)destbufsize, ci->ci_level);
	if (ciosize > destbufsize) {
		zio_buf_free(dest, destbufsize);
		return (0);
	}

	/* Cool.  We compressed at least as much as we were hoping to. */

	/* For security, make sure we don't write random heap crap to disk */
	gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;
	if (gapsize != 0) {
		bzero(dest + ciosize, gapsize);
		ciosize += gapsize;
	}

	ASSERT3U(ciosize, <=, destbufsize);
	ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0);
	*destp = dest;
	*destsizep = ciosize;
	*destbufsizep = destbufsize;

	return (1);
}
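
The "compress at least 12.5%" target and the tail padding are both power-of-two arithmetic: P2ALIGN() rounds the size budget down to a SPA_MINBLOCKSIZE multiple and P2ROUNDUP() rounds the compressed length up to one. A standalone sketch of the same calculation, using local equivalents of those macros and sample numbers chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Local equivalents of the sysmacros.h power-of-two helpers. */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)
#define	SPA_MINBLOCKSIZE	512ULL

int
main(void)
{
	uint64_t srcsize = 131072;		/* 128 KB source block */

	/* Budget: save at least 1/8th, rounded down to a 512-byte multiple. */
	uint64_t destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE);

	/* Suppose the compressor produced 53000 bytes. */
	uint64_t ciosize = 53000;
	uint64_t gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize;

	printf("destbufsize=%llu  padded ciosize=%llu (gap=%llu)\n",
	    (unsigned long long)destbufsize,
	    (unsigned long long)(ciosize + gapsize),
	    (unsigned long long)gapsize);
	return (0);
}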
Code Example #8
File: zil.c  Project: harshada/zfs
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp = &ztp->zit_next_blk;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize. We request a size that is the
	 * maximum of the previous used size, the current used size and
	 * the amount waiting in the queue.
	 */
	zil_blksz = MAX(zilog->zl_prev_used,
	    zilog->zl_cur_used + sizeof (*ztp));
	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
	if (error) {
		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);

		/*
		 * We dirty the dataset to ensure that zil_sync() will
		 * be called to remove this lwb from our zl_lwb_list.
		 * Failing to do so, may leave an lwb with a NULL lwb_buf
		 * hanging around on the zl_lwb_list.
		 */
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		dmu_tx_commit(tx);

		/*
		 * We've just experienced an allocation failure, so we
		 * terminate the current lwb and send it on its way.
		 */
		ztp->zit_pad = 0;
		ztp->zit_nused = lwb->lwb_nused;
		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
		zio_nowait(lwb->lwb_zio);

		/*
		 * Returning NULL tells the caller to call txg_wait_synced().
		 */
		return (NULL);
	}

	ASSERT3U(bp->blk_birth, ==, txg);
	ztp->zit_pad = 0;
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);

	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = *bp;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_zio = NULL;

	/*
	 * Put new lwb at the end of the log chain
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	mutex_exit(&zilog->zl_lock);

	/* Record the block for later vdev flushing */
	zil_add_block(zilog, &lwb->lwb_blk);

	/*
	 * kick off the write for the old log block
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	ASSERT(lwb->lwb_zio);
	zio_nowait(lwb->lwb_zio);

	return (nlwb);
}
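
The block-size heuristic in the middle of the function is small enough to pull out on its own: take the largest of the previous block's usage, the current usage plus the trailer, and the queued itx backlog plus the trailer, round up to ZIL_MIN_BLKSZ, and clamp at ZIL_MAX_BLKSZ. A standalone sketch of that policy; the constants below are stand-in values, not the ones defined in sys/zil.h.

#include <stdint.h>
#include <stdio.h>

/* Stand-in values; the real constants live in sys/zil.h. */
#define	ZIL_MIN_BLKSZ	4096ULL
#define	ZIL_MAX_BLKSZ	(128ULL << 10)

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	P2ROUNDUP(x, a)	((((x) - 1) | ((a) - 1)) + 1)

static uint64_t
zil_pick_blksz(uint64_t prev_used, uint64_t cur_used, uint64_t itx_list_sz,
    uint64_t trailer_sz)
{
	uint64_t blksz;

	/* Largest of: last block's usage, current usage, queued backlog. */
	blksz = MAX(prev_used, cur_used + trailer_sz);
	blksz = MAX(blksz, itx_list_sz + trailer_sz);

	/* Round to the minimum block size, then cap at the maximum. */
	blksz = P2ROUNDUP(blksz, ZIL_MIN_BLKSZ);
	if (blksz > ZIL_MAX_BLKSZ)
		blksz = ZIL_MAX_BLKSZ;
	return (blksz);
}

int
main(void)
{
	/* Hypothetical inputs: 8 KB previous, 20 KB current, 150 KB queued. */
	printf("%llu\n", (unsigned long long)
	    zil_pick_blksz(8192, 20000, 150000, 40));
	return (0);
}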
Code Example #9
File: zil.c  Project: harshada/zfs
/*
 * Create an on-disk intent log.
 */
static void
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * If we don't already have an initial log block, or we have one
	 * but it's the wrong endianness, then allocate one.
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_blk(zilog->zl_spa, &blk, txg);
			BP_ZERO(&blk);
		}

		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
		    NULL, txg);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0) {
		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = blk;
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
		lwb->lwb_max_txg = txg;
		lwb->lwb_zio = NULL;

		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
}
Code Example #10
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio, *nio;
	avl_tree_t *t;
	int flags;
	uint64_t maxspan = zfs_vdev_aggregation_limit;
	uint64_t maxgap;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	t = fio->io_vdev_tree;
	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
		/*
		 * We can aggregate I/Os that are adjacent and of the
		 * same flavor, as expressed by the AGG_INHERIT flags.
		 * The latter is necessary so that certain attributes
		 * of the I/O, such as whether it's a normal I/O or a
		 * scrub/resilver, can be preserved in the aggregate.
		 */
		while ((dio = AVL_PREV(t, fio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
			fio = dio;

		while ((dio = AVL_NEXT(t, lio)) != NULL &&
		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
		    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
			lio = dio;
	}

	if (fio != lio) {
		uint64_t size = IO_SPAN(fio, lio);
		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
		    vdev_queue_agg_io_done, NULL);

		nio = fio;
		do {
			dio = nio;
			nio = AVL_NEXT(t, dio);
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == t);

			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, (char *)aio->io_data +
				    (dio->io_offset - aio->io_offset),
				    dio->io_size);

			zio_add_child(dio, aio);
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			zio_execute(dio);
		} while (dio != lio);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == t);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}
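
On the write side, each child I/O's payload lands in the aggregate buffer at its offset relative to the aggregate's starting offset. A small self-contained sketch of that layout arithmetic, with memcpy() standing in for bcopy() and a stripped-down io_t standing in for zio_t:

#include <stdint.h>
#include <string.h>

typedef struct {
	uint64_t io_offset;	/* device offset */
	uint64_t io_size;	/* payload length */
	void *io_data;		/* payload */
} io_t;

/*
 * Copy each child write into one aggregate buffer that begins at device
 * offset agg_offset, the same placement the bcopy() above performs.
 */
static void
fill_aggregate(char *agg_buf, uint64_t agg_offset,
    const io_t *children, int nchildren)
{
	for (int i = 0; i < nchildren; i++) {
		const io_t *dio = &children[i];
		memcpy(agg_buf + (dio->io_offset - agg_offset),
		    dio->io_data, dio->io_size);
	}
}

int
main(void)
{
	static char a[4096], b[4096], agg[12288];
	io_t kids[2] = {
		{ .io_offset = 4096,  .io_size = sizeof (a), .io_data = a },
		{ .io_offset = 12288, .io_size = sizeof (b), .io_data = b },
	};

	/* Aggregate covers [4096, 16384); children land at relative 0 and 8192. */
	fill_aggregate(agg, 4096, kids, 2);
	return (0);
}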
Code Example #11
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
	zio_t *fio, *lio, *aio, *dio;
	avl_tree_t *tree;
	uint64_t size;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	tree = fio->io_vdev_tree;
	size = fio->io_size;

	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		dio->io_delegate_next = fio;
		fio = dio;
		size += dio->io_size;
	}

	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		lio->io_delegate_next = dio;
		lio = dio;
		size += dio->io_size;
	}

	if (fio != lio) {
		char *buf = zio_buf_alloc(size);
		uint64_t offset = 0;
		int nagg = 0;

		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
		    fio->io_offset, buf, size, fio->io_type,
		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_NOBOOKMARK,
		    vdev_queue_agg_io_done, NULL);

		aio->io_delegate_list = fio;

		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == tree);
			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, buf + offset, dio->io_size);
			offset += dio->io_size;
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			nagg++;
		}

		ASSERT(offset == size);

		dprintf("%5s  T=%llu  off=%8llx  agg=%3d  "
		    "old=%5llx  new=%5llx\n",
		    zio_type_name[fio->io_type],
		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);

		avl_add(&vq->vq_pending_tree, aio);

		return (aio);
	}

	ASSERT(fio->io_vdev_tree == tree);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	return (fio);
}
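
This older variant chains adjacent I/Os through io_delegate_next and gates aggregation on IS_ADJACENT() rather than the later IO_GAP()-based test: two I/Os qualify only when one ends exactly where the next begins, with no gap tolerated. The sketch below approximates that macro on a stripped-down stand-in for zio_t (the exact historical definition may differ slightly).

#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint64_t io_offset;
	uint64_t io_size;
} io_t;

/* Adjacent means: the first I/O ends exactly where the second begins. */
#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)

int
main(void)
{
	io_t a = { .io_offset = 0,    .io_size = 8192 };
	io_t b = { .io_offset = 8192, .io_size = 4096 };

	/* Prints 1: a ends where b begins, so the two can be aggregated. */
	printf("%d\n", IS_ADJACENT(&a, &b));
	return (0);
}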