Example #1
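/*
 * TRIM worker thread for a pool.  Each pass waits up to trim_max_interval
 * seconds (or until signalled), then commits the accumulated TRIM requests
 * for the vdev tree under a CANFAIL root zio.  The thread exits once it
 * finds spa_trim_thread cleared to NULL.
 */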
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

#ifdef _KERNEL
	(void) snprintf(curthread->td_name, sizeof(curthread->td_name),
	    "trim %s", spa_name(spa));
#endif

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}

		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
		    hz * trim_max_interval);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}
Example #2
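/*
 * Snapshot destroy with an old-format deadlist: iterate the next snapshot's
 * deadlist asynchronously under a MUSTSUCCEED root zio, adjust the
 * DD_USED_SNAP space accounting, and then swap the deadlist objects of this
 * dataset and the next one.
 */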
static void
process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
{
	struct process_old_arg poa = { 0 };
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t deadlist_obj;

	ASSERT(ds->ds_deadlist.dl_oldfmt);
	ASSERT(ds_next->ds_deadlist.dl_oldfmt);

	poa.ds = ds;
	poa.ds_prev = ds_prev;
	poa.after_branch_point = after_branch_point;
	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
	    process_old_cb, &poa, tx));
	VERIFY0(zio_wait(poa.pio));
	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);

	/* change snapused */
	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
	    -poa.used, -poa.comp, -poa.uncomp, tx);

	/* swap next's deadlist to our deadlist */
	dsl_deadlist_close(&ds->ds_deadlist);
	dsl_deadlist_close(&ds_next->ds_deadlist);
	deadlist_obj = ds->ds_phys->ds_deadlist_obj;
	ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
	ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
	    ds_next->ds_phys->ds_deadlist_obj);
}
Example #3
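/*
 * Variant of the TRIM thread in Example #1: it blocks in cv_wait() until it
 * is explicitly signalled instead of waking up on a timeout.
 */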
static void
trim_thread(void *arg)
{
	spa_t *spa = arg;
	zio_t *zio;

	for (;;) {
		mutex_enter(&spa->spa_trim_lock);
		if (spa->spa_trim_thread == NULL) {
			spa->spa_trim_thread = curthread;
			cv_signal(&spa->spa_trim_cv);
			mutex_exit(&spa->spa_trim_lock);
			thread_exit();
		}
		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
		mutex_exit(&spa->spa_trim_lock);

		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		trim_map_commit(spa, zio, spa->spa_root_vdev);
		(void) zio_wait(zio);
		trim_map_commit_done(spa, spa->spa_root_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);
	}
}
Example #4
/* ARGSUSED */
static void
zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg)
{
	spa_t *spa = zilog->zl_spa;
	int err;

	/*
	 * Claim log block if not already committed and not already claimed.
	 */
	if (bp->blk_birth >= first_txg &&
	    zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) {
		err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL,
		    ZIO_FLAG_MUSTSUCCEED));
		ASSERT(err == 0);
	}
}
Example #5
/*
 * Read in the data for the dmu_sync()ed block, and change the log
 * record to write this whole block.
 */
void
zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
{
	blkptr_t *wbp = &lr->lr_blkptr;
	char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
	uint64_t blksz;

	if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
		/*
		 * If the blksz is zero then we must be replaying a log
		 * from a version prior to setting the blksize of null blocks.
		 * So we just zero the actual write size requested.
		 */
		if (blksz == 0) {
			bzero(wbuf, lr->lr_length);
			return;
		}
		bzero(wbuf, blksz);
	} else {
		/*
		 * A subsequent write may have overwritten this block, in which
		 * case wbp may have been freed and reallocated, and our
		 * read of wbp may fail with a checksum error.  We can safely
		 * ignore this because the later write will provide the
		 * correct data.
		 */
		zbookmark_t zb;

		zb.zb_objset = dmu_objset_id(zilog->zl_os);
		zb.zb_object = lr->lr_foid;
		zb.zb_level = 0;
		zb.zb_blkid = -1; /* unknown */

		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
		(void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
		    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
	}
	lr->lr_offset -= lr->lr_offset % blksz;
	lr->lr_length = blksz;
}
Example #6
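/*
 * Issue a write-cache flush to every vdev touched by this ZIL writer and
 * wait for all of the flushes under a single CANFAIL root zio.
 */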
void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete.  Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}
Example #7
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;

	if (dn) {
		uint64_t history[DN_MAX_LEVELS];
		int nlvls = dn->dn_nlevels;
		int delta;

		/*
		 * For i/o error checking, read the first and last level-0
		 * blocks (if they are not aligned), and all the level-1 blocks.
		 */
		if (dn->dn_maxblkid == 0) {
			delta = dn->dn_datablksz;
			start = (off < dn->dn_datablksz) ? 0 : 1;
			end = (off+len <= dn->dn_datablksz) ? 0 : 1;
			if (start == 0 && (off > 0 || len < dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
				if (err)
					goto out;
				delta -= off;
			}
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start && end <= dn->dn_maxblkid &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (nlvls > 1) {
				int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = (start>>shft)+1; i < end>>shft; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
			delta = P2NPHASE(off, dn->dn_datablksz);
		}

		min_ibs = max_ibs = dn->dn_indblkshift;
		if (dn->dn_maxblkid > 0) {
			/*
			 * The blocksize can't change,
			 * so we can make a more precise estimate.
			 */
			ASSERT(dn->dn_datablkshift != 0);
			min_bs = max_bs = dn->dn_datablkshift;
		}

		/*
		 * If this write is not off the end of the file
		 * we need to account for overwrites/unref.
		 */
		if (start <= dn->dn_maxblkid) {
			for (int l = 0; l < DN_MAX_LEVELS; l++)
				history[l] = -1ULL;
		}
		while (start <= dn->dn_maxblkid) {
			dmu_buf_impl_t *db;

			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
			rw_exit(&dn->dn_struct_rwlock);

			if (err) {
				txh->txh_tx->tx_err = err;
				return;
			}

			dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE,
			    history);
			dbuf_rele(db, FTAG);
			if (++start > end) {
				/*
				 * Account for new indirects appearing
				 * before this IO gets assigned into a txg.
				 */
				bits = 64 - min_bs;
				epbs = min_ibs - SPA_BLKPTRSHIFT;
				for (bits -= epbs * (nlvls - 1);
				    bits >= 0; bits -= epbs)
					txh->txh_fudge += 1ULL << max_ibs;
				goto out;
			}
			off += delta;
			if (len >= delta)
				len -= delta;
			delta = dn->dn_datablksz;
		}
	}
Example #8
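/*
 * zdb helper that prints a TX_WRITE log record.  When the record carries a
 * block pointer, the data is read back with a CANFAIL zio before being
 * dumped.
 */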
/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
	char *data, *dlimit;
	blkptr_t *bp = &lr->lr_blkptr;
	zbookmark_phys_t zb;
	char buf[SPA_MAXBLOCKSIZE];
	int verbose = MAX(dump_opt['d'], dump_opt['i']);
	int error;

	(void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
	    (u_longlong_t)lr->lr_length);

	if (txtype == TX_WRITE2 || verbose < 5)
		return;

	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		(void) printf("%shas blkptr, %s\n", prefix,
		    !BP_IS_HOLE(bp) &&
		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
		    "will claim" : "won't claim");
		print_log_bp(bp, prefix);

		if (BP_IS_HOLE(bp)) {
			(void) printf("\t\t\tLSIZE 0x%llx\n",
			    (u_longlong_t)BP_GET_LSIZE(bp));
			bzero(buf, sizeof (buf));
			(void) printf("%s<hole>\n", prefix);
			return;
		}
		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
			(void) printf("%s<block already committed>\n", prefix);
			return;
		}

		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		error = zio_wait(zio_read(NULL, zilog->zl_spa,
		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
		if (error)
			return;
		data = buf;
	} else {
		data = (char *)(lr + 1);
	}

	dlimit = data + MIN(lr->lr_length,
	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));

	(void) printf("%s", prefix);
	while (data < dlimit) {
		if (isprint(*data))
			(void) printf("%c ", *data);
		else
			(void) printf("%2hhX", *data);
		data++;
	}
	(void) printf("\n");
}
Example #9
File: mmp.c Project: LLNL/zfs
static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with fake, but reasonable, default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t start, next_time;

		start = gethrtime();
		if (multihost) {
			next_time = start + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		} else {
			next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
		}

		/*
		 * When MMP goes off => on, or spa goes suspended =>
		 * !suspended, we know no writes occurred recently.  We
		 * update mmp_last_write to give us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		} else if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool being
		 * immediately suspended before writes can occur at the new
		 * higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (start - mmp->mmp_last_write) > max_fail_ns) {
			zio_suspend(spa, NULL);
		}

		if (multihost)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, ddi_get_lbolt() +
		    ((next_time - gethrtime()) / (NANOSEC / hz)));
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}
Example #10
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	uint64_t start, end, i;
	int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
	int err = 0;

	if (len == 0)
		return;

	min_bs = SPA_MINBLOCKSHIFT;
	max_bs = SPA_MAXBLOCKSHIFT;
	min_ibs = DN_MIN_INDBLKSHIFT;
	max_ibs = DN_MAX_INDBLKSHIFT;


	/*
	 * For i/o error checking, read the first and last level-0
	 * blocks (if they are not aligned), and all the level-1 blocks.
	 */

	if (dn) {
		if (dn->dn_maxblkid == 0) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err)
				goto out;
		} else {
			zio_t *zio = zio_root(dn->dn_objset->os_spa,
			    NULL, NULL, ZIO_FLAG_CANFAIL);

			/* first level-0 block */
			start = off >> dn->dn_datablkshift;
			if (P2PHASE(off, dn->dn_datablksz) ||
			    len < dn->dn_datablksz) {
				err = dmu_tx_check_ioerr(zio, dn, 0, start);
				if (err)
					goto out;
			}

			/* last level-0 block */
			end = (off+len-1) >> dn->dn_datablkshift;
			if (end != start &&
			    P2PHASE(off+len, dn->dn_datablksz)) {
				err = dmu_tx_check_ioerr(zio, dn, 0, end);
				if (err)
					goto out;
			}

			/* level-1 blocks */
			if (dn->dn_nlevels > 1) {
				start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
				for (i = start+1; i < end; i++) {
					err = dmu_tx_check_ioerr(zio, dn, 1, i);
					if (err)
						goto out;
				}
			}

			err = zio_wait(zio);
			if (err)
				goto out;
		}
	}
Example #11
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	const zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	char *name;
	int pass, error;

	if (!zilog->zl_replay)			/* giving up */
		return;

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return;

	/* Strip case-insensitive bit, still present in log record */
	txtype &= ~TX_CI;

	if (txtype == 0 || txtype >= TX_MAX_TYPE) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lrbuf, reclen);

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different data types, and only the
	 * replay vectors know how to byteswap their records.  Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		lr_write_t *lrw = (lr_write_t *)lr;
		blkptr_t *wbp = &lrw->lr_blkptr;
		uint64_t wlen = lrw->lr_length;
		char *wbuf = zr->zr_lrbuf + reclen;

		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
			bzero(wbuf, wlen);
		} else {
			/*
			 * A subsequent write may have overwritten this block,
			 * in which case wbp may have been freed and
			 * reallocated, and our read of wbp may fail with a
			 * checksum error.  We can safely ignore this because
			 * the later write will provide the correct data.
			 */
			zbookmark_t zb;

			zb.zb_objset = dmu_objset_id(zilog->zl_os);
			zb.zb_object = lrw->lr_foid;
			zb.zb_level = -1;
			zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);

			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
		}
	}

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header sequence number to reflect the fact that
	 * we did so. At the end of each replay function the sequence number
	 * is updated if we are in replay mode.
	 */
	for (pass = 1; pass <= 2; pass++) {
		zilog->zl_replaying_seq = lr->lrc_seq;
		/* Only byteswap (if needed) on the 1st pass.  */
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
		    zr->zr_byteswap && pass == 1);

		if (!error)
			return;

		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST. So if we receive any error we try syncing out
		 * any removes then retry the transaction.
		 */
		if (pass == 1)
			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
	}

bad:
	ASSERT(error);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dmu_objset_name(zr->zr_os, name);
	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
	zilog->zl_replay = B_FALSE;
	kmem_free(name, MAXNAMELEN);
}
Example #12
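/*
 * Drain the in-memory itx list into log write blocks (lwbs), start the last
 * block, and wait on zl_root_zio so the committed records are on stable
 * storage, flushing the written vdevs before returning.
 */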
static void
zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	uint64_t txg;
	uint64_t commit_seq = 0;
	itx_t *itx, *itx_next = (itx_t *)-1;
	lwb_t *lwb;
	spa_t *spa;

	zilog->zl_writer = B_TRUE;
	ASSERT(zilog->zl_root_zio == NULL);
	spa = zilog->zl_spa;

	if (zilog->zl_suspend) {
		lwb = NULL;
	} else {
		lwb = list_tail(&zilog->zl_lwb_list);
		if (lwb == NULL) {
			/*
			 * Return if there's nothing to flush before we
			 * dirty the fs by calling zil_create()
			 */
			if (list_is_empty(&zilog->zl_itx_list)) {
				zilog->zl_writer = B_FALSE;
				return;
			}
			mutex_exit(&zilog->zl_lock);
			zil_create(zilog);
			mutex_enter(&zilog->zl_lock);
			lwb = list_tail(&zilog->zl_lwb_list);
		}
	}

	/* Loop through in-memory log transactions filling log blocks. */
	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
	for (;;) {
		/*
		 * Find the next itx to push:
		 * Push all transactions related to specified foid and all
		 * other transactions except TX_WRITE, TX_TRUNCATE,
		 * TX_SETATTR and TX_ACL for all other files.
		 */
		if (itx_next != (itx_t *)-1)
			itx = itx_next;
		else
			itx = list_head(&zilog->zl_itx_list);
		for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
			if (foid == 0) /* push all foids? */
				break;
			if (itx->itx_sync) /* push all O_[D]SYNC */
				break;
			switch (itx->itx_lr.lrc_txtype) {
			case TX_SETATTR:
			case TX_WRITE:
			case TX_TRUNCATE:
			case TX_ACL:
				/* lr_foid is same offset for these records */
				if (((lr_write_t *)&itx->itx_lr)->lr_foid
				    != foid) {
					continue; /* skip this record */
				}
			}
			break;
		}
		if (itx == NULL)
			break;

		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
		    (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) {
			break;
		}

		/*
		 * Save the next pointer.  Even though we soon drop
		 * zl_lock all threads that may change the list
		 * (another writer or zil_itx_clean) can't do so until
		 * they have zl_writer.
		 */
		itx_next = list_next(&zilog->zl_itx_list, itx);
		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;
		mutex_exit(&zilog->zl_lock);
		txg = itx->itx_lr.lrc_txg;
		ASSERT(txg);

		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);
		kmem_free(itx, offsetof(itx_t, itx_lr)
		    + itx->itx_lr.lrc_reclen);
		mutex_enter(&zilog->zl_lock);
	}
	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
	/* determine commit sequence number */
	itx = list_head(&zilog->zl_itx_list);
	if (itx)
		commit_seq = itx->itx_lr.lrc_seq;
	else
		commit_seq = zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_zio != NULL)
		lwb = zil_lwb_write_start(zilog, lwb);

	zilog->zl_prev_used = zilog->zl_cur_used;
	zilog->zl_cur_used = 0;

	/*
	 * Wait if necessary for the log blocks to be on stable storage.
	 */
	if (zilog->zl_root_zio) {
		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
		(void) zio_wait(zilog->zl_root_zio);
		zilog->zl_root_zio = NULL;
		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
		zil_flush_vdevs(zilog);
	}

	if (zilog->zl_log_error || lwb == NULL) {
		zilog->zl_log_error = 0;
		txg_wait_synced(zilog->zl_dmu_pool, 0);
	}

	mutex_enter(&zilog->zl_lock);
	zilog->zl_writer = B_FALSE;

	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
	zilog->zl_commit_seq = commit_seq;
}
Example #13
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
{
	dnode_t *dn = txh->txh_dnode;
	int err = 0;

	if (len == 0)
		return;

	(void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);

	if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
		err = SET_ERROR(EFBIG);

	if (dn == NULL)
		return;

	/*
	 * For i/o error checking, read the blocks that will be needed
	 * to perform the write: the first and last level-0 blocks (if
	 * they are not aligned, i.e. if they are partial-block writes),
	 * and all the level-1 blocks.
	 */
	if (dn->dn_maxblkid == 0) {
		if (off < dn->dn_datablksz &&
		    (off > 0 || len < dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}
	} else {
		zio_t *zio = zio_root(dn->dn_objset->os_spa,
		    NULL, NULL, ZIO_FLAG_CANFAIL);

		/* first level-0 block */
		uint64_t start = off >> dn->dn_datablkshift;
		if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
			err = dmu_tx_check_ioerr(zio, dn, 0, start);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* last level-0 block */
		uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
		if (end != start && end <= dn->dn_maxblkid &&
		    P2PHASE(off + len, dn->dn_datablksz)) {
			err = dmu_tx_check_ioerr(zio, dn, 0, end);
			if (err != 0) {
				txh->txh_tx->tx_err = err;
			}
		}

		/* level-1 blocks */
		if (dn->dn_nlevels > 1) {
			int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
			for (uint64_t i = (start >> shft) + 1;
			    i < end >> shft; i++) {
				err = dmu_tx_check_ioerr(zio, dn, 1, i);
				if (err != 0) {
					txh->txh_tx->tx_err = err;
				}
			}
		}

		err = zio_wait(zio);
		if (err != 0) {
			txh->txh_tx->tx_err = err;
		}
	}