Example #1
static int
zfs_replay_write(zfsvfs_t *zsb, lr_write_t *lr, boolean_t byteswap)
{
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	znode_t	*zp;
	int error;
	uint64_t eod, offset, length;
	ssize_t resid;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	if ((error = zfs_zget(zsb, lr->lr_foid, &zp)) != 0) {
		/*
		 * As we can log writes out of order, it's possible the
		 * file has been removed. In this case just drop the write
		 * and return success.
		 */
		if (error == ENOENT)
			error = 0;
		return (error);
	}
	zfs_znode_wait_vnode(zp);

	offset = lr->lr_offset;
	length = lr->lr_length;
	eod = offset + length;	/* end of data for this write */

	/*
	 * This may be a write from a dmu_sync() for a whole block,
	 * and may extend beyond the current end of the file.
	 * We can't just replay what was written for this TX_WRITE as
	 * a future TX_WRITE2 may extend the eof and the data for that
	 * write needs to be there. So we write the whole block and
	 * reduce the eof. This needs to be done within the single dmu
	 * transaction created within vn_rdwr -> zfs_write. So a possible
	 * new end of file is passed through in zsb->z_replay_eof
	 */

	zsb->z_replay_eof = 0; /* 0 means don't change end of file */

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
		if (zp->z_size < eod)
			zsb->z_replay_eof = eod;
	}

	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	VN_RELE(ZTOV(zp));
	zsb->z_replay_eof = 0;	/* safety */

	return (error);
}
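
For reference, the following standalone sketch (hypothetical names, not part of ZFS) illustrates the block-alignment arithmetic the dmu_sync() replay path above relies on: a short logged write is widened to the containing block by rounding the offset down to a block boundary and replaying the whole block.

/*
 * Standalone sketch (hypothetical names, not ZFS code) of the alignment
 * math used above for dmu_sync() blocks.
 */
#include <stdio.h>
#include <stdint.h>

static void
widen_to_block(uint64_t *offset, uint64_t *length, uint64_t blocksize)
{
	if (*length < blocksize) {
		*offset -= *offset % blocksize;	/* round down to block start */
		*length = blocksize;		/* replay the whole block */
	}
}

int
main(void)
{
	uint64_t off = 70000, len = 512, bs = 131072;	/* 128K block */

	widen_to_block(&off, &len, bs);
	/* prints: offset=0 length=131072 */
	(void) printf("offset=%llu length=%llu\n",
	    (unsigned long long)off, (unsigned long long)len);
	return (0);
}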
Example #2
/* ARGSUSED */
static int
dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	if (lrc->lrc_txtype == TX_WRITE) {
		zil_scan_arg_t *zsa = arg;
		dsl_pool_t *dp = zsa->zsa_dp;
		dsl_scan_t *scn = dp->dp_scan;
		zil_header_t *zh = zsa->zsa_zh;
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
			return (0);

		/*
		 * birth can be < claim_txg if this record's txg is
		 * already txg sync'ed (but this log block contains
		 * other records that are not synced)
		 */
		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
	}
	return (0);
}
Example #3
static int
zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap)
{
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	znode_t	*zp;
	int error;
	ssize_t resid;
	uint64_t orig_eof, eod, offset, length;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
		/*
		 * As we can log writes out of order, it's possible the
		 * file has been removed. In this case just drop the write
		 * and return success.
		 */
		if (error == ENOENT)
			error = 0;
		return (error);
	}

	offset = lr->lr_offset;
	length = lr->lr_length;
	eod = offset + length;		/* end of data for this write */

	orig_eof = zp->z_phys->zp_size;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);

	/*
	 * This may be a write from a dmu_sync() for a whole block,
	 * and may extend beyond the current end of the file.
	 * We can't just replay what was written for this TX_WRITE as
	 * a future TX_WRITE2 may extend the eof and the data for that
	 * write needs to be there. So we write the whole block and
	 * reduce the eof.
	 */
	if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */
		zp->z_phys->zp_size = eod;

	VN_RELE(ZTOV(zp));

	return (error);
}
Example #4
/*
 * Read a log block, make sure it's valid, and byteswap it if necessary.
 */
static int
zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp)
{
	blkptr_t blk = *bp;
	zbookmark_t zb;
	uint32_t aflags = ARC_WAIT;
	int error;

	zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET];
	zb.zb_object = 0;
	zb.zb_level = -1;
	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];

	*abufpp = NULL;

	/*
	 * We shouldn't be doing any scrubbing while we're doing log
	 * replay, it's OK to not lock.
	 */
	error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);

	if (error == 0) {
		char *data = (*abufpp)->b_data;
		uint64_t blksz = BP_GET_LSIZE(bp);
		zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1;
		zio_cksum_t cksum = bp->blk_cksum;

		/*
		 * Validate the checksummed log block.
		 *
		 * Sequence numbers should be... sequential.  The checksum
		 * verifier for the next block should be bp's checksum plus 1.
		 *
		 * Also check the log chain linkage and size used.
		 */
		cksum.zc_word[ZIL_ZC_SEQ]++;

		if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum,
		    sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) ||
		    (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) {
			error = ECKSUM;
		}

		if (error) {
			VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1);
			*abufpp = NULL;
		}
	}

	dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid);

	return (error);
}
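
As an aside, the chain check above can be sketched in isolation with toy types (the names below are hypothetical, not the real ZIL structures): each log block's trailer carries the block pointer of the next block, and that block pointer's checksum must equal this block's checksum with the sequence word incremented by one.

/*
 * Standalone sketch with toy types of the chain validation performed
 * above: verify the next-block link and that the used size fits the
 * block payload.
 */
#include <stdint.h>
#include <string.h>

#define	TOY_SEQ_WORD	3	/* assumed index of the sequence word */

typedef struct toy_cksum { uint64_t word[4]; } toy_cksum_t;
typedef struct toy_blkptr { toy_cksum_t cksum; } toy_blkptr_t;
typedef struct toy_trailer { toy_blkptr_t next_blk; uint64_t nused; } toy_trailer_t;

static int
toy_chain_ok(const toy_blkptr_t *cur, const toy_trailer_t *ztp, uint64_t blksz)
{
	toy_cksum_t expect = cur->cksum;

	expect.word[TOY_SEQ_WORD]++;	/* next block's seq is ours + 1 */

	if (memcmp(&expect, &ztp->next_blk.cksum, sizeof (expect)) != 0)
		return (0);		/* broken link or end of chain */
	if (ztp->nused > blksz - sizeof (toy_trailer_t))
		return (0);		/* used bytes can't exceed the payload */
	return (1);
}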
Example #5
/*
 * Read in the data for the dmu_sync()ed block, and change the log
 * record to write this whole block.
 */
void
zil_get_replay_data(zilog_t *zilog, lr_write_t *lr)
{
	blkptr_t *wbp = &lr->lr_blkptr;
	char *wbuf = (char *)(lr + 1); /* data follows lr_write_t */
	uint64_t blksz;

	if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
		/*
		 * If the blksz is zero then we must be replaying a log
		 * from a version prior to setting the blksize of null blocks.
		 * So we just zero the actual write size requested.
		 */
		if (blksz == 0) {
			bzero(wbuf, lr->lr_length);
			return;
		}
		bzero(wbuf, blksz);
	} else {
		/*
		 * A subsequent write may have overwritten this block, in which
		 * case wbp may have been freed and reallocated, and our
		 * read of wbp may fail with a checksum error.  We can safely
		 * ignore this because the later write will provide the
		 * correct data.
		 */
		zbookmark_t zb;

		zb.zb_objset = dmu_objset_id(zilog->zl_os);
		zb.zb_object = lr->lr_foid;
		zb.zb_level = 0;
		zb.zb_blkid = -1; /* unknown */

		blksz = BP_GET_LSIZE(&lr->lr_blkptr);
		(void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, blksz,
		    NULL, NULL, ZIO_PRIORITY_SYNC_READ,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
	}
	lr->lr_offset -= lr->lr_offset % blksz;
	lr->lr_length = blksz;
}
Example #6
static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
	uint64_t bytesfreed = 0;
	int i;

	dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);

	for (i = 0; i < num; i++, bp++) {
		uint64_t lsize, lvl;
		dmu_object_type_t type;

		if (BP_IS_HOLE(bp))
			continue;

		bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));

		/*
		 * Save some useful information on the holes being
		 * punched, including logical size, type, and indirection
		 * level. Retaining birth time enables detection of when
		 * holes are punched for reducing the number of free
		 * records transmitted during a zfs send.
		 */

		lsize = BP_GET_LSIZE(bp);
		type = BP_GET_TYPE(bp);
		lvl = BP_GET_LEVEL(bp);

		bzero(bp, sizeof (blkptr_t));

		if (spa_feature_is_active(dn->dn_objset->os_spa,
		    SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, type);
			BP_SET_LEVEL(bp, lvl);
			BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
		}
	}
	dnode_diduse_space(dn, -bytesfreed);
}
Example #7
static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(dsp, data, blksz) != 0)
		return (EINTR);
	return (0);
}
Example #8
/* ARGSUSED */
int
zil_check_log_chain(char *osname, void *txarg)
{
	zilog_t *zilog;
	zil_header_t *zh;
	blkptr_t blk;
	arc_buf_t *abuf;
	objset_t *os;
	char *lrbuf;
	zil_trailer_t *ztp;
	int error;

	error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os);
	if (error) {
		cmn_err(CE_WARN, "can't open objset for %s", osname);
		return (0);
	}

	zilog = dmu_objset_zil(os);
	zh = zil_header_in_syncing_context(zilog);
	blk = zh->zh_log;
	if (BP_IS_HOLE(&blk)) {
		dmu_objset_close(os);
		return (0); /* no chain */
	}

	for (;;) {
		error = zil_read_log_block(zilog, &blk, &abuf);
		if (error)
			break;
		lrbuf = abuf->b_data;
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;
		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}
	dmu_objset_close(os);
	if (error == ECKSUM)
		return (0); /* normal end of chain */
	return (error);
}
Example #9
static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth == 0)
			return (0);

		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}
Example #10
static void
zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
{
	zil_replay_arg_t *zr = zra;
	const zil_header_t *zh = zilog->zl_header;
	uint64_t reclen = lr->lrc_reclen;
	uint64_t txtype = lr->lrc_txtype;
	char *name;
	int pass, error;

	if (!zilog->zl_replay)			/* giving up */
		return;

	if (lr->lrc_txg < claim_txg)		/* already committed */
		return;

	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
		return;

	/* Strip case-insensitive bit, still present in log record */
	txtype &= ~TX_CI;

	if (txtype == 0 || txtype >= TX_MAX_TYPE) {
		error = EINVAL;
		goto bad;
	}

	/*
	 * Make a copy of the data so we can revise and extend it.
	 */
	bcopy(lr, zr->zr_lrbuf, reclen);

	/*
	 * The log block containing this lr may have been byteswapped
	 * so that we can easily examine common fields like lrc_txtype.
	 * However, the log is a mix of different data types, and only the
	 * replay vectors know how to byteswap their records.  Therefore, if
	 * the lr was byteswapped, undo it before invoking the replay vector.
	 */
	if (zr->zr_byteswap)
		byteswap_uint64_array(zr->zr_lrbuf, reclen);

	/*
	 * If this is a TX_WRITE with a blkptr, suck in the data.
	 */
	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
		lr_write_t *lrw = (lr_write_t *)lr;
		blkptr_t *wbp = &lrw->lr_blkptr;
		uint64_t wlen = lrw->lr_length;
		char *wbuf = zr->zr_lrbuf + reclen;

		if (BP_IS_HOLE(wbp)) {	/* compressed to a hole */
			bzero(wbuf, wlen);
		} else {
			/*
			 * A subsequent write may have overwritten this block,
			 * in which case wbp may have been freed and
			 * reallocated, and our read of wbp may fail with a
			 * checksum error.  We can safely ignore this because
			 * the later write will provide the correct data.
			 */
			zbookmark_t zb;

			zb.zb_objset = dmu_objset_id(zilog->zl_os);
			zb.zb_object = lrw->lr_foid;
			zb.zb_level = -1;
			zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);

			(void) zio_wait(zio_read(NULL, zilog->zl_spa,
			    wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
			    ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
			(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
		}
	}

	/*
	 * We must now do two things atomically: replay this log record,
	 * and update the log header sequence number to reflect the fact that
	 * we did so. At the end of each replay function the sequence number
	 * is updated if we are in replay mode.
	 */
	for (pass = 1; pass <= 2; pass++) {
		zilog->zl_replaying_seq = lr->lrc_seq;
		/* Only byteswap (if needed) on the 1st pass.  */
		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf,
		    zr->zr_byteswap && pass == 1);

		if (!error)
			return;

		/*
		 * The DMU's dnode layer doesn't see removes until the txg
		 * commits, so a subsequent claim can spuriously fail with
		 * EEXIST. So if we receive any error we try syncing out
		 * any removes then retry the transaction.
		 */
		if (pass == 1)
			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
	}

bad:
	ASSERT(error);
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dmu_objset_name(zr->zr_os, name);
	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
	    "dataset %s, seq 0x%llx, txtype %llu %s\n",
	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype,
	    (lr->lrc_txtype & TX_CI) ? "CI" : "");
	zilog->zl_replay = B_FALSE;
	kmem_free(name, MAXNAMELEN);
}
Example #11
	if (zb->zb_object != DMU_META_DNODE_OBJECT)
		return (0);

	if (BP_IS_HOLE(bp)) {
		uint64_t span = DBP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;

		err = report_free_dnode_range(da, dnobj,
		    dnobj + (span >> DNODE_SHIFT) - 1);
		if (err)
			return (err);
	} else if (zb->zb_level == 0) {
		dnode_phys_t *blk;
		arc_buf_t *abuf;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		int blksz = BP_GET_LSIZE(bp);
		int i;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			blk = abd_array(abuf->b_data, i, dnode_phys_t);
			err = report_dnode(da, dnobj, blk);
			if (err)
				break;
		}
Example #12
/* ARGSUSED */
static void
zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr)
{
	char *data, *dlimit;
	blkptr_t *bp = &lr->lr_blkptr;
	zbookmark_phys_t zb;
	char buf[SPA_MAXBLOCKSIZE];
	int verbose = MAX(dump_opt['d'], dump_opt['i']);
	int error;

	(void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix,
	    (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
	    (u_longlong_t)lr->lr_length);

	if (txtype == TX_WRITE2 || verbose < 5)
		return;

	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		(void) printf("%shas blkptr, %s\n", prefix,
		    !BP_IS_HOLE(bp) &&
		    bp->blk_birth >= spa_first_txg(zilog->zl_spa) ?
		    "will claim" : "won't claim");
		print_log_bp(bp, prefix);

		if (BP_IS_HOLE(bp)) {
			(void) printf("\t\t\tLSIZE 0x%llx\n",
			    (u_longlong_t)BP_GET_LSIZE(bp));
			bzero(buf, sizeof (buf));
			(void) printf("%s<hole>\n", prefix);
			return;
		}
		if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
			(void) printf("%s<block already committed>\n", prefix);
			return;
		}

		SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os),
		    lr->lr_foid, ZB_ZIL_LEVEL,
		    lr->lr_offset / BP_GET_LSIZE(bp));

		error = zio_wait(zio_read(NULL, zilog->zl_spa,
		    bp, buf, BP_GET_LSIZE(bp), NULL, NULL,
		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));
		if (error)
			return;
		data = buf;
	} else {
		data = (char *)(lr + 1);
	}

	dlimit = data + MIN(lr->lr_length,
	    (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE));

	(void) printf("%s", prefix);
	while (data < dlimit) {
		if (isprint(*data))
			(void) printf("%c ", *data);
		else
			(void) printf("%2hhX", *data);
		data++;
	}
	(void) printf("\n");
}
Example #13
static int
zfs_replay_write(void *arg1, char *arg2, boolean_t byteswap)
{
	zfsvfs_t *zfsvfs = (zfsvfs_t *)arg1;
	lr_write_t *lr = (lr_write_t *)arg2;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	znode_t	*zp;
	int error;
#ifndef LINUX_PORT
	uint64_t eod, offset, length;
	ssize_t resid;
#else
	uio_t uio;
	int vflg = 0;
	struct iovec iov;
#endif
	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));
	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
		/*
		 * As we can log writes out of order, it's possible the
		 * file has been removed. In this case just drop the write
		 * and return success.
		 */
		if (error == ENOENT)
			error = 0;
		return (error);
	}
#ifndef LINUX_PORT
	offset = lr->lr_offset;
	length = lr->lr_length;
	eod = offset + length;	/* end of data for this write */

	/*
	 * This may be a write from a dmu_sync() for a whole block,
	 * and may extend beyond the current end of the file.
	 * We can't just replay what was written for this TX_WRITE as
	 * a future TX_WRITE2 may extend the eof and the data for that
	 * write needs to be there. So we write the whole block and
	 * reduce the eof. This needs to be done within the single dmu
	 * transaction created within vn_rdwr -> zfs_write. So a possible
	 * new end of file is passed through in zfsvfs->z_replay_eof
	 */

	zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
		if (zp->z_size < eod)
			zfsvfs->z_replay_eof = eod;
	}

	error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
#else
	iov.iov_base = (void *) data;
	iov.iov_len = lr->lr_length;

	uio.uio_iov = &iov;
	uio.uio_resid = lr->lr_length;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = (offset_t)lr->lr_offset;
	uio.uio_limit = MAXOFFSET_T;
	uio.uio_segflg = UIO_SYSSPACE;
	error = VOP_WRITE(ZTOV(zp), &uio, vflg, NULL, NULL);
#endif
	VN_RELE(ZTOV(zp));
	zfsvfs->z_replay_eof = 0;	/* safety */

	return (error);
}
Example #14
static int
zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
{
	zfsvfs_t *zfsvfs = arg1;
	lr_write_t *lr = arg2;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	znode_t	*zp;
	int error, written;
	uint64_t eod, offset, length;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
		/*
		 * As we can log writes out of order, it's possible the
		 * file has been removed. In this case just drop the write
		 * and return success.
		 */
		if (error == ENOENT)
			error = 0;
		return (error);
	}

	offset = lr->lr_offset;
	length = lr->lr_length;
	eod = offset + length;	/* end of data for this write */

	/*
	 * This may be a write from a dmu_sync() for a whole block,
	 * and may extend beyond the current end of the file.
	 * We can't just replay what was written for this TX_WRITE as
	 * a future TX_WRITE2 may extend the eof and the data for that
	 * write needs to be there. So we write the whole block and
	 * reduce the eof. This needs to be done within the single dmu
	 * transaction created within vn_rdwr -> zfs_write. So a possible
	 * new end of file is passed through in zfsvfs->z_replay_eof
	 */

	zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
		if (zp->z_size < eod)
			zfsvfs->z_replay_eof = eod;
	}

	written = zpl_write_common(ZTOI(zp), data, length, &offset,
	    UIO_SYSSPACE, 0, kcred);
	if (written < 0)
		error = -written;
	else if (written < length)
		error = SET_ERROR(EIO); /* short write */

	iput(ZTOI(zp));
	zfsvfs->z_replay_eof = 0;	/* safety */

	return (error);
}
Example #15
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    const blkptr_t *bp, const zbookmark_phys_t *zb)
{
	zbookmark_phys_t czb;
	int err = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	switch (resume_skip_check(td, dnp, zb)) {
	case RESUME_SKIP_ALL:
		return (0);
	case RESUME_SKIP_CHILDREN:
		goto post;
	case RESUME_SKIP_NONE:
		break;
	default:
		ASSERT(0);
	}

	if (bp->blk_birth == 0) {
		/*
		 * Since this block has a birth time of 0 it must be one of
		 * two things: a hole created before the
		 * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
		 * which has always been a hole in an object.
		 *
		 * If a file is written sparsely, then the unwritten parts of
		 * the file were "always holes" -- that is, they have been
		 * holes since this object was allocated.  However, we (and
		 * our callers) can not necessarily tell when an object was
		 * allocated.  Therefore, if it's possible that this object
		 * was freed and then its object number reused, we need to
		 * visit all the holes with birth==0.
		 *
		 * If it isn't possible that the object number was reused,
		 * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
		 * all the blocks we will visit as part of this traversal,
		 * then this hole must have always existed, so we can skip
		 * it.  We visit blocks born after (exclusive) td_min_txg.
		 *
		 * Note that the meta-dnode cannot be reallocated.
		 */
		if (!send_holes_without_birth_time &&
		    (!td->td_realloc_possible ||
		    zb->zb_object == DMU_META_DNODE_OBJECT) &&
		    td->td_hole_birth_enabled_txg <= td->td_min_txg)
			return (0);
	} else if (bp->blk_birth <= td->td_min_txg) {
		return (0);
	}

	if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
		uint64_t size = BP_GET_LSIZE(bp);
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_bytes_fetched >= 0);
		while (pd->pd_bytes_fetched < size && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_bytes_fetched -= size;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (BP_IS_HOLE(bp)) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
		if (err != 0)
			goto post;
		return (0);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err != 0)
			goto post;
	}

	if (BP_GET_LEVEL(bp) > 0) {
		arc_flags_t flags = ARC_FLAG_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err != 0)
			goto post;
		cbp = buf->b_data;

		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			traverse_prefetch_metadata(td, &cbp[i], &czb);
		}

		/* recursively visitbp() blocks below this */
		for (i = 0; i < epb; i++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, &cbp[i], &czb);
			if (err != 0)
				break;
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
Example #16
/*
 * Parse the intent log, and call parse_func for each valid record within.
 * Return the highest sequence number.
 */
uint64_t
zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
    zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg)
{
	const zil_header_t *zh = zilog->zl_header;
	uint64_t claim_seq = zh->zh_claim_seq;
	uint64_t seq = 0;
	uint64_t max_seq = 0;
	blkptr_t blk = zh->zh_log;
	arc_buf_t *abuf;
	char *lrbuf, *lrp;
	zil_trailer_t *ztp;
	int reclen, error;

	if (BP_IS_HOLE(&blk))
		return (max_seq);

	/*
	 * Starting at the block pointed to by zh_log we read the log chain.
	 * For each block in the chain we strongly check that block to
	 * ensure its validity.  We stop when an invalid block is found.
	 * For each block pointer in the chain we call parse_blk_func().
	 * For each record in each valid block we call parse_lr_func().
	 * If the log has been claimed, stop if we encounter a sequence
	 * number greater than the highest claimed sequence number.
	 */
	zil_dva_tree_init(&zilog->zl_dva_tree);
	for (;;) {
		seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];

		if (claim_seq != 0 && seq > claim_seq)
			break;

		ASSERT(max_seq < seq);
		max_seq = seq;

		error = zil_read_log_block(zilog, &blk, &abuf);

		if (parse_blk_func != NULL)
			parse_blk_func(zilog, &blk, arg, txg);

		if (error)
			break;

		lrbuf = abuf->b_data;
		ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1;
		blk = ztp->zit_next_blk;

		if (parse_lr_func == NULL) {
			VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
			continue;
		}

		for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) {
			lr_t *lr = (lr_t *)lrp;
			reclen = lr->lrc_reclen;
			ASSERT3U(reclen, >=, sizeof (lr_t));
			parse_lr_func(zilog, lr, arg, txg);
		}
		VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
	}
	zil_dva_tree_fini(&zilog->zl_dva_tree);

	return (max_seq);
}
Example #17
/*
 * Start a log block write and advance to the next log block.
 * Calls are serialized.
 */
static lwb_t *
zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
{
	lwb_t *nlwb;
	zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
	spa_t *spa = zilog->zl_spa;
	blkptr_t *bp = &ztp->zit_next_blk;
	uint64_t txg;
	uint64_t zil_blksz;
	int error;

	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));

	/*
	 * Allocate the next block and save its address in this block
	 * before writing it in order to establish the log chain.
	 * Note that if the allocation of nlwb synced before we wrote
	 * the block that points at it (lwb), we'd leak it if we crashed.
	 * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done().
	 */
	txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh);
	txg_rele_to_quiesce(&lwb->lwb_txgh);

	/*
	 * Pick a ZIL blocksize. We request a size that is the
	 * maximum of the previous used size, the current used size and
	 * the amount waiting in the queue.
	 */
	zil_blksz = MAX(zilog->zl_prev_used,
	    zilog->zl_cur_used + sizeof (*ztp));
	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
	if (zil_blksz > ZIL_MAX_BLKSZ)
		zil_blksz = ZIL_MAX_BLKSZ;

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
	if (error) {
		dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg);

		/*
		 * We dirty the dataset to ensure that zil_sync() will
		 * be called to remove this lwb from our zl_lwb_list.
		 * Failing to do so, may leave an lwb with a NULL lwb_buf
		 * hanging around on the zl_lwb_list.
		 */
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		dmu_tx_commit(tx);

		/*
		 * We've just experienced an allocation failure, so we
		 * terminate the current lwb and send it on its way.
		 */
		ztp->zit_pad = 0;
		ztp->zit_nused = lwb->lwb_nused;
		ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
		zio_nowait(lwb->lwb_zio);

		/*
		 * By returning NULL the caller will call tx_wait_synced()
		 */
		return (NULL);
	}

	ASSERT3U(bp->blk_birth, ==, txg);
	ztp->zit_pad = 0;
	ztp->zit_nused = lwb->lwb_nused;
	ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum = lwb->lwb_blk.blk_cksum;
	bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

	/*
	 * Allocate a new log write buffer (lwb).
	 */
	nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);

	nlwb->lwb_zilog = zilog;
	nlwb->lwb_blk = *bp;
	nlwb->lwb_nused = 0;
	nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk);
	nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz);
	nlwb->lwb_max_txg = txg;
	nlwb->lwb_zio = NULL;

	/*
	 * Put new lwb at the end of the log chain
	 */
	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, nlwb);
	mutex_exit(&zilog->zl_lock);

	/* Record the block for later vdev flushing */
	zil_add_block(zilog, &lwb->lwb_blk);

	/*
	 * kick off the write for the old log block
	 */
	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
	ASSERT(lwb->lwb_zio);
	zio_nowait(lwb->lwb_zio);

	return (nlwb);
}
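
The block-size policy above (take the maximum of the previous use, the current use, and the queued bytes, round up to ZIL_MIN_BLKSZ, cap at ZIL_MAX_BLKSZ) can be sketched on its own; the macro and function names below are hypothetical, not ZFS identifiers.

/*
 * Standalone sketch (hypothetical names) of the ZIL block-size selection
 * used above.
 */
#include <stdint.h>

#define	TOY_ROUNDUP(x, align) \
	((((x) + (align) - 1) / (align)) * (align))

static uint64_t
toy_pick_zil_blksz(uint64_t prev_used, uint64_t cur_used, uint64_t queued,
    uint64_t min_blksz, uint64_t max_blksz)
{
	uint64_t sz = prev_used;

	if (cur_used > sz)
		sz = cur_used;
	if (queued > sz)
		sz = queued;
	sz = TOY_ROUNDUP(sz, min_blksz);	/* round up to a block multiple */
	return (sz > max_blksz ? max_blksz : sz);
}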
Example #18
/*
 * Create an on-disk intent log.
 */
static void
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * If we don't already have an initial log block or we have one
	 * but it's the wrong endianness then allocate one.
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		(void) dmu_tx_assign(tx, TXG_WAIT);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_blk(zilog->zl_spa, &blk, txg);
			BP_ZERO(&blk);
		}

		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
		    NULL, txg);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0) {
		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
		lwb->lwb_zilog = zilog;
		lwb->lwb_blk = blk;
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
		lwb->lwb_max_txg = txg;
		lwb->lwb_zio = NULL;

		mutex_enter(&zilog->zl_lock);
		list_insert_tail(&zilog->zl_lwb_list, lwb);
		mutex_exit(&zilog->zl_lock);
	}

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
}