Exemplo n.º 1
0
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t start, size, txglimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	txglimit = MIN(spa->spa_syncing_txg, spa_freeze_txg(spa)) -
	    trim_txg_limit;

	mutex_enter(&tm->tm_lock);
	/*
	 * Loop until we send all frees up to the txglimit.
	 */
	while ((ts = trim_map_first(tm, txglimit)) != NULL) {
		list_remove(&tm->tm_head, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
		    ts->ts_end - ts->ts_start));
	}
	mutex_exit(&tm->tm_lock);
}
Exemplo n.º 2
0
void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;

	if (zio_injection_enabled)
		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));

	mutex_enter(&vq->vq_lock);

	vdev_queue_pending_remove(vq, zio);

	vq->vq_io_complete_ts = gethrtime();

	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
		if (nio == NULL)
			break;
		mutex_exit(&vq->vq_lock);
		if (nio->io_done == vdev_queue_agg_io_done) {
			zio_nowait(nio);
		} else {
			zio_vdev_io_reissue(nio);
			zio_execute(nio);
		}
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}
Exemplo n.º 3
0
void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;

	mutex_enter(&vq->vq_lock);

	avl_remove(&vq->vq_pending_tree, zio);

	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
		if (nio == NULL)
			break;
		mutex_exit(&vq->vq_lock);
		if (nio->io_done == vdev_queue_agg_io_done) {
			zio_nowait(nio);
		} else {
			zio_vdev_io_reissue(nio);
			zio_execute(nio);
		}
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}
Exemplo n.º 4
0
/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
	spa_t *spa = vd->vdev_spa;

	/* Limit inflight initializing I/Os */
	mutex_enter(&vd->vdev_initialize_io_lock);
	while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
		cv_wait(&vd->vdev_initialize_io_cv,
		    &vd->vdev_initialize_io_lock);
	}
	vd->vdev_initialize_inflight++;
	mutex_exit(&vd->vdev_initialize_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_initialize_lock);

	if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
		uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
		*guid = vd->vdev_guid;

		/* This is the first write of this txg. */
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_initialize_zap_update_sync, guid, 2,
		    ZFS_SPACE_CHECK_RESERVED, tx);
	}

	/*
	 * We know the vdev struct will still be around since all
	 * consumers of vdev_free must stop the initialization first.
	 */
	if (vdev_initialize_should_stop(vd)) {
		mutex_enter(&vd->vdev_initialize_io_lock);
		ASSERT3U(vd->vdev_initialize_inflight, >, 0);
		vd->vdev_initialize_inflight--;
		mutex_exit(&vd->vdev_initialize_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_initialize_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_initialize_lock);

	vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
	zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
	    size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
	    ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
	/* vdev_initialize_cb releases SCL_STATE_ALL */

	dmu_tx_commit(tx);

	return (0);
}
Exemplo n.º 5
0
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t size, offset, txgtarget, txgsafe;
	int64_t hard, soft;
	hrtime_t timelimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
	if (vd->vdev_isl2cache) {
		txgsafe = UINT64_MAX;
		txgtarget = UINT64_MAX;
	} else {
		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
		if (txgsafe > trim_txg_delay)
			txgtarget = txgsafe - trim_txg_delay;
		else
			txgtarget = 0;
	}

	mutex_enter(&tm->tm_lock);
	hard = 0;
	if (tm->tm_pending > trim_vdev_max_pending)
		hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
	soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
	/* Loop until we have sent all outstanding free's */
	while (soft > 0 &&
	    (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
	    != NULL) {
		TRIM_MAP_REM(tm, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		size = ts->ts_end - ts->ts_start;
		offset = ts->ts_start;
		/*
		 * We drop the lock while we call zio_nowait as the IO
		 * scheduler can result in a different IO being run e.g.
		 * a write which would result in a recursive lock.
		 */
		mutex_exit(&tm->tm_lock);

		zio_nowait(zio_trim(zio, spa, vd, offset, size));

		soft -= TRIM_MAP_SEGS(size);
		hard -= TRIM_MAP_SEGS(size);
		mutex_enter(&tm->tm_lock);
	}
	mutex_exit(&tm->tm_lock);
}
Exemplo n.º 6
0
Arquivo: mmp.c Projeto: LLNL/zfs
/*
 * Choose a random vdev, label, and MMP block, and write over it
 * with a copy of the last-synced uberblock, whose timestamp
 * has been updated to reflect that the pool is in use.
 */
static void
mmp_write_uberblock(spa_t *spa)
{
	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
	mmp_thread_t *mmp = &spa->spa_mmp;
	uberblock_t *ub;
	vdev_t *vd;
	int label;
	uint64_t offset;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vd = mmp_random_leaf(spa->spa_root_vdev);
	if (vd == NULL) {
		spa_config_exit(spa, SCL_STATE, FTAG);
		return;
	}

	mutex_enter(&mmp->mmp_io_lock);

	if (mmp->mmp_zio_root == NULL)
		mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
		    flags | ZIO_FLAG_GODFATHER);

	ub = &mmp->mmp_ub;
	ub->ub_timestamp = gethrestime_sec();
	ub->ub_mmp_magic = MMP_MAGIC;
	ub->ub_mmp_delay = mmp->mmp_delay;
	vd->vdev_mmp_pending = gethrtime();

	zio_t *zio  = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
	abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
	abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
	abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));

	mutex_exit(&mmp->mmp_io_lock);

	offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
	    MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));

	label = spa_get_random(VDEV_LABELS);
	vdev_label_write(zio, vd, label, ub_abd, offset,
	    VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
	    flags | ZIO_FLAG_DONT_PROPAGATE);

	spa_mmp_history_add(ub->ub_txg, ub->ub_timestamp, ub->ub_mmp_delay, vd,
	    label);

	zio_nowait(zio);
}
Exemplo n.º 7
0
zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	if (zio->io_type == ZIO_TYPE_READ)
		zio->io_vdev_tree = &vq->vq_read_tree;
	else
		zio->io_vdev_tree = &vq->vq_write_tree;

	mutex_enter(&vq->vq_lock);

	zio->io_timestamp = gethrtime();
	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
	    zio->io_priority;

	vdev_queue_io_add(vq, zio);

	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);

	mutex_exit(&vq->vq_lock);

	if (nio == NULL)
		return (NULL);

	if (nio->io_done == vdev_queue_agg_io_done) {
		zio_nowait(nio);
		return (NULL);
	}

	return (nio);
}
Exemplo n.º 8
0
static void
trim_map_vdev_commit(spa_t *spa, zio_t *zio, vdev_t *vd)
{
	trim_map_t *tm = vd->vdev_trimmap;
	trim_seg_t *ts;
	uint64_t size, txgtarget, txgsafe;
	hrtime_t timelimit;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	if (tm == NULL)
		return;

	timelimit = gethrtime() - trim_timeout * NANOSEC;
	if (vd->vdev_isl2cache) {
		txgsafe = UINT64_MAX;
		txgtarget = UINT64_MAX;
	} else {
		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
		if (txgsafe > trim_txg_delay)
			txgtarget = txgsafe - trim_txg_delay;
		else
			txgtarget = 0;
	}

	mutex_enter(&tm->tm_lock);
	/* Loop until we have sent all outstanding free's */
	while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
	    != NULL) {
		list_remove(&tm->tm_head, ts);
		avl_remove(&tm->tm_queued_frees, ts);
		avl_add(&tm->tm_inflight_frees, ts);
		size = ts->ts_end - ts->ts_start;
		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
		TRIM_MAP_SDEC(tm, size);
		TRIM_MAP_QDEC(tm);
	}
	mutex_exit(&tm->tm_lock);
}
Exemplo n.º 9
0
void
dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
{
	ASSERT(dsl_pool_sync_context(dp));
	zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
}
Exemplo n.º 10
0
/* called from dsl */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_t zb;
	zio_prop_t zp;
	zio_t *zio;
	list_t *list;
	list_t *newlist = NULL;
	dbuf_dirty_record_t *dr;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	arc_release(os->os_phys_buf, &os->os_phys_buf);

	dmu_write_policy(os, NULL, 0, 0, &zp);

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
	    dmu_objset_write_ready, dmu_objset_write_done, os,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	if (dmu_objset_userused_enabled(os)) {
		newlist = &os->os_synced_dnodes;
		/*
		 * We must create the list here because it uses the
		 * dn_dirty_link[] of this txg.
		 */
		list_create(newlist, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while (dr = list_head(list)) {
		ASSERT0(dr->dr_dbuf->db_level);
		list_remove(list, dr);
		if (dr->dr_zio)
			zio_nowait(dr->dr_zio);
	}
	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}
Exemplo n.º 11
0
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio_vdev_io_bypass(zio);
			zio_add_child(zio, fio);
			mutex_exit(&vc->vc_lock);
			VDCSTAT_BUMP(vdc_stat_delegations);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		VDCSTAT_BUMP(vdc_stat_hits);
		return (0);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	zio_vdev_io_bypass(zio);
	zio_add_child(zio, fio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);
	VDCSTAT_BUMP(vdc_stat_misses);

	return (0);
}
Exemplo n.º 12
0
/* called from dsl */
void
dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_t zb;
	writeprops_t wp = { 0 };
	zio_t *zio;
	list_t *list;
	dbuf_dirty_record_t *dr;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = 0;
	zb.zb_level = -1;
	zb.zb_blkid = 0;
	if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
		(void) dsl_dataset_block_kill(os->os_dsl_dataset,
		    os->os_rootbp, pio, tx);
	}
	wp.wp_type = DMU_OT_OBJSET;
	wp.wp_copies = os->os_copies;
	wp.wp_level = (uint8_t)-1;
	wp.wp_oschecksum = os->os_checksum;
	wp.wp_oscompress = os->os_compress;
	arc_release(os->os_phys_buf, &os->os_phys_buf);
	zio = arc_write(pio, os->os_spa, &wp,
	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_METADATA,
	    &zb);

	/*
	 * Sync meta-dnode - the parent IO for the sync is the root block
	 */
	os->os_meta_dnode->dn_zio = zio;
	dnode_sync(os->os_meta_dnode, tx);

	txgoff = tx->tx_txg & TXG_MASK;

	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);

	list = &os->os_meta_dnode->dn_dirty_records[txgoff];
	while (dr = list_head(list)) {
		ASSERT(dr->dr_dbuf->db_level == 0);
		list_remove(list, dr);
		if (dr->dr_zio)
			zio_nowait(dr->dr_zio);
	}
	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}
Exemplo n.º 13
0
/*
 * Read data from the cache.  Returns 0 on cache hit, errno on a miss.
 */
int
vdev_cache_read(zio_t *zio)
{
	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
	vdev_cache_entry_t *ve, ve_search;
	uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
	zio_t *fio;

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
		return (EINVAL);

	if (zio->io_size > zfs_vdev_cache_max)
		return (EOVERFLOW);

	/*
	 * If the I/O straddles two or more cache blocks, don't cache it.
	 */
	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
		return (EXDEV);

	ASSERT(cache_phase + zio->io_size <= VCBS);

	mutex_enter(&vc->vc_lock);

	ve_search.ve_offset = cache_offset;
	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);

	if (ve != NULL) {
		if (ve->ve_missed_update) {
			mutex_exit(&vc->vc_lock);
			return (ESTALE);
		}

		if ((fio = ve->ve_fill_io) != NULL) {
			zio->io_delegate_next = fio->io_delegate_list;
			fio->io_delegate_list = zio;
			zio_vdev_io_bypass(zio);
			mutex_exit(&vc->vc_lock);
			return (0);
		}

		vdev_cache_hit(vc, ve, zio);
		zio_vdev_io_bypass(zio);

		mutex_exit(&vc->vc_lock);
		zio_next_stage(zio);
		return (0);
	}

	if (!(zio->io_flags & ZIO_FLAG_METADATA)) {
		mutex_exit(&vc->vc_lock);
		return (EINVAL);
	}

	ve = vdev_cache_allocate(zio);

	if (ve == NULL) {
		mutex_exit(&vc->vc_lock);
		return (ENOMEM);
	}

	fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
	    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
	    ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
	    vdev_cache_fill, ve);

	ve->ve_fill_io = fio;
	fio->io_delegate_list = zio;
	zio_vdev_io_bypass(zio);

	mutex_exit(&vc->vc_lock);
	zio_nowait(fio);

	return (0);
}
Exemplo n.º 14
0
/* called from dsl */
void
dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_t zb;
	writeprops_t wp = { 0 };
	zio_t *zio;
	list_t *list;
	list_t *newlist = NULL;
	dbuf_dirty_record_t *dr;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = 0;
	zb.zb_level = -1;	/* for block ordering; it's level 0 on disk */
	zb.zb_blkid = 0;

	wp.wp_type = DMU_OT_OBJSET;
	wp.wp_level = 0;	/* on-disk BP level; see above */
	wp.wp_copies = os->os_copies;
	wp.wp_oschecksum = os->os_checksum;
	wp.wp_oscompress = os->os_compress;

	if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) {
		(void) dsl_dataset_block_kill(os->os_dsl_dataset,
		    os->os_rootbp, pio, tx);
	}

	arc_release(os->os_phys_buf, &os->os_phys_buf);

	zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os),
	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	os->os_meta_dnode->dn_zio = zio;
	dnode_sync(os->os_meta_dnode, tx);

	os->os_phys->os_flags = os->os_flags;

	if (os->os_userused_dnode &&
	    os->os_userused_dnode->dn_type != DMU_OT_NONE) {
		os->os_userused_dnode->dn_zio = zio;
		dnode_sync(os->os_userused_dnode, tx);
		os->os_groupused_dnode->dn_zio = zio;
		dnode_sync(os->os_groupused_dnode, tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	if (dmu_objset_userused_enabled(os)) {
		newlist = &os->os_synced_dnodes;
		/*
		 * We must create the list here because it uses the
		 * dn_dirty_link[] of this txg.
		 */
		list_create(newlist, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);

	list = &os->os_meta_dnode->dn_dirty_records[txgoff];
	while ((dr = list_head(list))) {
		ASSERT(dr->dr_dbuf->db_level == 0);
		list_remove(list, dr);
		if (dr->dr_zio)
			zio_nowait(dr->dr_zio);
	}
	/*
	 * Free intent log blocks up to this tx.
	 */
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	zio_nowait(zio);
}
Exemplo n.º 15
0
Arquivo: zil.c Projeto: harshada/zfs
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp = &ztp->zit_next_blk;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize. We request a size that is the
	 * maximum of the previous used size, the current used size and
	 * the amount waiting in the queue.
	 */
	zil_blksz = MAX(zilog->zl_prev_used,
	    zilog->zl_cur_used + sizeof (*ztp));
	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
	if (error) {
		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);

		/*
		 * We dirty the dataset to ensure that zil_sync() will
		 * be called to remove this lwb from our zl_lwb_list.
		 * Failing to do so, may leave an lwb with a NULL lwb_buf
		 * hanging around on the zl_lwb_list.
		 */
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		dmu_tx_commit(tx);

		/*
		 * Since we've just experienced an allocation failure so we
		 * terminate the current lwb and send it on its way.
		 */
		ztp->zit_pad = 0;
		ztp->zit_nused = lwb->lwb_nused;
		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
		zio_nowait(lwb->lwb_zio);

		/*
		 * By returning NULL the caller will call tx_wait_synced()
		 */
		return (NULL);
	}

	ASSERT3U(bp->blk_birth, ==, txg);
	ztp->zit_pad = 0;
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);

	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = *bp;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_zio = NULL;

	/*
	 * Put new lwb at the end of the log chain
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	mutex_exit(&zilog->zl_lock);

	/* Record the block for later vdev flushing */
	zil_add_block(zilog, &lwb->lwb_blk);

	/*
	 * kick off the write for the old log block
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	ASSERT(lwb->lwb_zio);
	zio_nowait(lwb->lwb_zio);

	return (nlwb);
}