Example #1
static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
			struct lu_buf *buf, loff_t *pos)
{
	struct osd_object *obj  = osd_dt_obj(dt);
	struct osd_device *osd = osd_obj2dev(obj);
	uint64_t	   old_size;
	int		   size = buf->lb_len;
	int		   rc;
	unsigned long	   start;

	LASSERT(dt_object_exists(dt));
	LASSERT(obj->oo_db);

	start = cfs_time_current();

	read_lock(&obj->oo_attr_lock);
	old_size = obj->oo_attr.la_size;
	read_unlock(&obj->oo_attr_lock);

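	/* clip the read so it does not extend past the current end of file */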
	if (*pos + size > old_size) {
		if (old_size < *pos)
			return 0;
		else
			size = old_size - *pos;
	}

	record_start_io(osd, READ, 0);

	rc = -dmu_read(osd->od_os, obj->oo_db->db_object, *pos, size,
			buf->lb_buf, DMU_READ_PREFETCH);

	record_end_io(osd, READ, cfs_time_current() - start, size,
		      size >> PAGE_CACHE_SHIFT);
	if (rc == 0) {
		rc = size;
		*pos += size;
	}
	return rc;
}
Example #2
vdev_indirect_births_t *
vdev_indirect_births_open(objset_t *os, uint64_t births_object)
{
	vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);

	vib->vib_objset = os;
	vib->vib_object = births_object;

	VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
	vib->vib_phys = vib->vib_dbuf->db_data;

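	/* if any entries have been recorded, read the entire array into memory */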
	if (vib->vib_phys->vib_count > 0) {
		uint64_t births_size = vdev_indirect_births_size_impl(vib);
		vib->vib_entries = kmem_alloc(births_size, KM_SLEEP);
		VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
		    births_size, vib->vib_entries, DMU_READ_PREFETCH));
	}

	ASSERT(vdev_indirect_births_verify(vib));

	return (vib);
}
Example #3
static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
			struct lu_buf *buf, loff_t *pos,
			struct lustre_capa *capa)
{
	struct osd_object *obj  = osd_dt_obj(dt);
	struct osd_device *osd = osd_obj2dev(obj);
	uint64_t	   old_size;
	int		   size = buf->lb_len;
	int		   rc;

	LASSERT(dt_object_exists(dt));
	LASSERT(obj->oo_db);

	read_lock(&obj->oo_attr_lock);
	old_size = obj->oo_attr.la_size;
	read_unlock(&obj->oo_attr_lock);

	if (*pos + size > old_size) {
		if (old_size < *pos)
			return 0;
		else
			size = old_size - *pos;
	}

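	/* dmu_read() returns a positive errno; negate for the Linux convention */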
	rc = -dmu_read(osd->od_objset.os, obj->oo_db->db_object, *pos, size,
			buf->lb_buf, DMU_READ_PREFETCH);
	if (rc == 0) {
		rc = size;
		*pos += size;

		/* XXX: workaround for a bug in HEAD: fsfilt_ldiskfs_read()
		 * returns the requested number of bytes, not the number
		 * actually read */
		if (S_ISLNK(obj->oo_dt.do_lu.lo_header->loh_attr))
			rc = buf->lb_len;
	}
	return rc;
}
Example #4
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
	znode_t *zp, offset_t off, ssize_t resid, int ioflag,
	zil_callback_t callback, void *callback_data)
{
	itx_wr_state_t write_state;
	boolean_t slogging;
	uintptr_t fsync_cnt;
	ssize_t immediate_write_sz;

	if (zil_replaying(zilog, tx) || zp->z_unlinked) {
		if (callback != NULL)
			callback(callback_data);
		return;
	}

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : (ssize_t)zfs_immediate_write_sz;

	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
		write_state = WR_INDIRECT;
	else if (ioflag & (FSYNC | FDSYNC))
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;

	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
	}

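	/* chop the write into chunks and queue an itx for each one */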
	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;

		/*
		 * If the write would overflow the largest block then split it.
		 */
		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
			len = SPA_MAXBLOCKSIZE >> 1;
		else
			len = resid;

		itx = zil_itx_create(txtype, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os,
		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(txtype, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = ZTOZSB(zp);

		if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
		    (fsync_cnt == 0))
			itx->itx_sync = B_FALSE;

		itx->itx_callback = callback;
		itx->itx_callback_data = callback_data;
		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
Example #5
/*
 * Read out the command history.
 */
int
spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
{
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *dbp;
	uint64_t read_len, phys_read_off, phys_eof;
	uint64_t leftover = 0;
	spa_history_phys_t *shpp;
	int err;

	/*
	 * If the command history doesn't exist (older pool),
	 * that's OK, just return ENOENT.
	 */
	if (!spa->spa_history)
		return (ENOENT);

	/*
	 * The history is logged asynchronously, so when the consumer
	 * requests the first chunk of history, make sure everything
	 * has been synced to disk so that we get it.
	 */
	if (*offp == 0 && spa_writeable(spa))
		txg_wait_synced(spa_get_dsl(spa), 0);

	if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
		return (err);
	shpp = dbp->db_data;

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(dbp, &doi);
		ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
	}
#endif

	mutex_enter(&spa->spa_history_lock);
	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);

	if (*offp < shpp->sh_pool_create_len) {
		/* read in just the zpool create history */
		phys_read_off = *offp;
		read_len = MIN(*len, shpp->sh_pool_create_len -
		    phys_read_off);
	} else {
		/*
		 * Need to reset passed in offset to BOF if the passed in
		 * offset has since been overwritten.
		 */
		*offp = MAX(*offp, shpp->sh_bof);
		phys_read_off = spa_history_log_to_phys(*offp, shpp);

		/*
		 * Read up to the minimum of what the user passed down or
		 * the EOF (physical or logical).  If we hit physical EOF,
		 * use 'leftover' to read from the physical BOF.
		 */
		if (phys_read_off <= phys_eof) {
			read_len = MIN(*len, phys_eof - phys_read_off);
		} else {
			read_len = MIN(*len,
			    shpp->sh_phys_max_off - phys_read_off);
			if (phys_read_off + *len > shpp->sh_phys_max_off) {
				leftover = MIN(*len - read_len,
				    phys_eof - shpp->sh_pool_create_len);
			}
		}
	}

	/* offset for consumer to use next */
	*offp += read_len + leftover;

	/* tell the consumer how much was actually read */
	*len = read_len + leftover;

	if (read_len == 0) {
		mutex_exit(&spa->spa_history_lock);
		dmu_buf_rele(dbp, FTAG);
		return (0);
	}

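	/* copy the requested history range out of the MOS object */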
	err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
	    DMU_READ_PREFETCH);
	if (leftover && err == 0) {
		err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
		    leftover, buf + read_len, DMU_READ_PREFETCH);
	}
	mutex_exit(&spa->spa_history_lock);

	dmu_buf_rele(dbp, FTAG);
	return (err);
}
Example #6
/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

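	/* allocate the get-data handle and range-lock the region being logged */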
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size,
	    RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		size = zv->zv_volblocksize;
		offset = P2ALIGN_TYPED(offset, size, uint64_t);
		error = dmu_buf_hold(os, object, offset, zgd, &db,
		    DMU_READ_NO_PREFETCH);
		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = &lr->lr_blkptr;

			ASSERT(db != NULL);
			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (SET_ERROR(error));
}
Example #7
static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
    uint64_t size, int sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	if (zil_replaying(zilog, tx))
		return;

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		? 0 : zvol_immediate_write_sz;
	slogging = spa_has_slogs(zilog->zl_spa) &&
		(zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

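	/* process the volume write in chunks, picking a log strategy per chunk */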
	while (size) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write() we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    size >= blocksize && offset % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, size);
		} else {
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, size);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = offset;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		(void) zil_itx_assign(zilog, itx, tx);

		offset += len;
		size -= len;
	}
}
Example #8
int __osd_xattr_get(const struct lu_env *env, struct osd_object *obj,
		struct lu_buf *buf, const char *name, int *sizep)
{
	struct osd_device *osd = osd_obj2dev(obj);
	udmu_objset_t     *uos = &osd->od_objset;
	uint64_t           xa_data_obj;
	dmu_buf_t         *xa_data_db;
	sa_handle_t       *sa_hdl = NULL;
	uint64_t           size;
	int                rc;

	/* check SA_ZPL_DXATTR first then fallback to directory xattr */
	rc = __osd_sa_xattr_get(env, obj, buf, name, sizep);
	if (rc != -ENOENT)
		return rc;

	/* are there any extended attributes? */
	if (obj->oo_xattr == ZFS_NO_OBJECT)
		return -ENOENT;

	/* Lookup the object number containing the xattr data */
	rc = -zap_lookup(uos->os, obj->oo_xattr, name, sizeof(uint64_t), 1,
			&xa_data_obj);
	if (rc)
		return rc;

	rc = __osd_obj2dbuf(env, uos->os, xa_data_obj, &xa_data_db, FTAG);
	if (rc)
		return rc;

	rc = -sa_handle_get(uos->os, xa_data_obj, NULL, SA_HDL_PRIVATE,
			&sa_hdl);
	if (rc)
		goto out_rele;

	/* Get the xattr value length / object size */
	rc = -sa_lookup(sa_hdl, SA_ZPL_SIZE(uos), &size, 8);
	if (rc)
		goto out;

	if (size > INT_MAX) {
		rc = -EOVERFLOW;
		goto out;
	}

	*sizep = (int)size;

	if (buf == NULL || buf->lb_buf == NULL) {
		/* We only need to return the required size */
		goto out;
	}
	if (*sizep > buf->lb_len) {
		rc = -ERANGE; /* match ldiskfs error */
		goto out;
	}

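	/* copy the xattr value out of its dedicated data object */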
	rc = -dmu_read(uos->os, xa_data_db->db_object, 0,
			size, buf->lb_buf, DMU_READ_PREFETCH);

out:
	sa_handle_destroy(sa_hdl);
out_rele:
	dmu_buf_rele(xa_data_db, FTAG);
	return rc;
}
Example #9
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
	uint32_t blocksize = zp->z_blksz;
	itx_wr_state_t write_state;
	uintptr_t fsync_cnt;

	if (zil_replaying(zilog, tx) || zp->z_unlinked)
		return;

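	/* decide up front how the data will be captured in the log record */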
	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		write_state = WR_INDIRECT;
	else if (!spa_has_slogs(zilog->zl_spa) &&
	    resid >= zfs_immediate_write_sz)
		write_state = WR_INDIRECT;
	else if (ioflag & (FSYNC | FDSYNC))
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;

	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
	}

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		itx_wr_state_t wr_state = write_state;
		ssize_t len = resid;

		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
			wr_state = WR_NEED_COPY;
		else if (wr_state == WR_INDIRECT)
			len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		itx = zil_itx_create(txtype, sizeof (*lr) +
		    (wr_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(txtype, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			wr_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = wr_state;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zp->z_zfsvfs;

		if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
		    (fsync_cnt == 0))
			itx->itx_sync = B_FALSE;

		zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
Example #10
/*
 * Load the space map from disk into the specified range tree. Segments of
 * maptype
 * are added to the range tree, other segment types are removed.
 *
 * Note: space_map_load() will drop sm_lock across dmu_read() calls.
 * The caller must be OK with this.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
	uint64_t *entry, *entry_map, *entry_map_end;
	uint64_t bufsize, size, offset, end, space;
	int error = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));

	end = space_map_length(sm);
	space = space_map_allocated(sm);

	VERIFY0(range_tree_space(rt));

	if (maptype == SM_FREE) {
		range_tree_add(rt, sm->sm_start, sm->sm_size);
		space = sm->sm_size - space;
	}

	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
	entry_map = zio_buf_alloc(bufsize);

	mutex_exit(sm->sm_lock);
	if (end > bufsize) {
		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
	}
	mutex_enter(sm->sm_lock);

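	/* replay the on-disk entries one buffer-sized chunk at a time */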
	for (offset = 0; offset < end; offset += bufsize) {
		size = MIN(end - offset, bufsize);
		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
		VERIFY(size != 0);
		ASSERT3U(sm->sm_blksz, !=, 0);

		dprintf("object=%llu  offset=%llx  size=%llx\n",
		    space_map_object(sm), offset, size);

		mutex_exit(sm->sm_lock);
		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
		    entry_map, DMU_READ_PREFETCH);
		mutex_enter(sm->sm_lock);
		if (error != 0)
			break;

		entry_map_end = entry_map + (size / sizeof (uint64_t));
		for (entry = entry_map; entry < entry_map_end; entry++) {
			uint64_t e = *entry;
			uint64_t offset, size;

			if (SM_DEBUG_DECODE(e))		/* Skip debug entries */
				continue;

			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
			    sm->sm_start;
			size = SM_RUN_DECODE(e) << sm->sm_shift;

			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
			VERIFY3U(offset, >=, sm->sm_start);
			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
			if (SM_TYPE_DECODE(e) == maptype) {
				VERIFY3U(range_tree_space(rt) + size, <=,
				    sm->sm_size);
				range_tree_add(rt, offset, size);
			} else {
				range_tree_remove(rt, offset, size);
			}
		}
	}

	/* on success, the replayed segments must account for exactly 'space' */
	if (error == 0)
		VERIFY3U(range_tree_space(rt), ==, space);
	else
		range_tree_vacate(rt, NULL, NULL);

	zio_buf_free(entry_map, bufsize);
	return (error);
}
Example #11
int
zvol_strategy(buf_t *bp)
{
	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
	uint64_t off, volsize;
	size_t size, resid;
	char *addr;
	objset_t *os;
	int error = 0;
	int sync;
	int reading;
	int txg_sync_needed = B_FALSE;

	if (zv == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	if (getminor(bp->b_edev) == 0) {
		bioerror(bp, EINVAL);
		biodone(bp);
		return (0);
	}

	if (zv->zv_readonly && !(bp->b_flags & B_READ)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

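	/* convert the 512-byte block number into a byte offset */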
	off = ldbtob(bp->b_blkno);
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);
	sync = !(bp->b_flags & B_ASYNC) && !(zil_disable);

	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 * A better approach than a per zvol rwlock would be to lock ranges.
	 */
	reading = bp->b_flags & B_READ;
	if (reading || resid <= zvol_immediate_write_sz)
		rw_enter(&zv->zv_dslock, RW_READER);
	else
		rw_enter(&zv->zv_dslock, RW_WRITER);

	while (resid != 0 && off < volsize) {

		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */

		if (size > volsize - off)	/* don't write past the end */
			size = volsize - off;

		if (reading) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				if (sync) {
					/* use the ZIL to commit this write */
					if (zvol_log_write(zv, tx, off, size,
					    addr) != 0) {
						txg_sync_needed = B_TRUE;
					}
				}
				dmu_tx_commit(tx);
			}
		}
		if (error)
			break;
		off += size;
		addr += size;
		resid -= size;
	}
	rw_exit(&zv->zv_dslock);

	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	biodone(bp);

	if (sync) {
		if (txg_sync_needed)
			txg_wait_synced(dmu_objset_pool(os), 0);
		else
			zil_commit(zv->zv_zilog, UINT64_MAX, 0);
	}

	return (0);
}
Example #12
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
	znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
	itx_wr_state_t write_state;
	boolean_t slogging;
	uintptr_t fsync_cnt = 0;	/* stays 0 when the TSD code is compiled out */

	if (zilog == NULL || zp->z_unlinked)
		return;

	/*
	 * Writes are handled in three different ways:
	 *
	 * WR_INDIRECT:
	 *    If the write is greater than zfs_immediate_write_sz and there
	 *    are no separate logs in this pool, then later, *if* we need to
	 *    log the write, dmu_sync() is used to immediately write the
	 *    block and its block pointer is put in the log record.
	 * WR_COPIED:
	 *    If we know we'll immediately be committing the
	 *    transaction (FDSYNC (O_DSYNC)), then we allocate a larger
	 *    log record here for the data and copy the data in.
	 * WR_NEED_COPY:
	 *    Otherwise we don't allocate a buffer, and *if* we need to
	 *    flush the write later then a buffer is allocated and
	 *    we retrieve the data using the dmu.
	 */
	slogging = spa_has_slogs(zilog->zl_spa);
	if (resid > zfs_immediate_write_sz && !slogging)
		write_state = WR_INDIRECT;
	else if (ioflag & FDSYNC)
		write_state = WR_COPIED;
	else
		write_state = WR_NEED_COPY;
#ifndef __APPLE__
	if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
		(void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
	}
#endif
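	/* queue one itx per chunk of the write */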
	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;

		/*
		 * If there are slogs and the write would overflow the largest
		 * block, then because we don't want to use the main pool
		 * to dmu_sync, we have to split the write.
		 */
		if (slogging && resid > ZIL_MAX_LOG_DATA)
			len = SPA_MAXBLOCKSIZE >> 1;
		else
			len = resid;

		itx = zil_itx_create(txtype, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
		    zp->z_id, off, len, lr + 1) != 0) {
			kmem_free(itx, offsetof(itx_t, itx_lr) +
			    itx->itx_lr.lrc_reclen);
			itx = zil_itx_create(txtype, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		lr->lr_foid = zp->z_id;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zp->z_zfsvfs;

		if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0))
			itx->itx_sync = B_TRUE;
		else
			itx->itx_sync = B_FALSE;

		zp->z_last_itx = zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}
Example #13
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
              znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
    itx_wr_state_t write_state;
    boolean_t slogging;
    uintptr_t fsync_cnt;

    if (zilog == NULL || zp->z_unlinked)
        return;

    ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */

    slogging = spa_has_slogs(zilog->zl_spa);
    if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz)
        write_state = WR_INDIRECT;
    else if (ioflag & (FSYNC | FDSYNC))
        write_state = WR_COPIED;
    else
        write_state = WR_NEED_COPY;

    if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
        (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
    }

    while (resid) {
        itx_t *itx;
        lr_write_t *lr;
        ssize_t len;

        /*
         * If the write would overflow the largest block then split it.
         */
        if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
            len = SPA_MAXBLOCKSIZE >> 1;
        else
            len = resid;

        itx = zil_itx_create(txtype, sizeof (*lr) +
                             (write_state == WR_COPIED ? len : 0));
        lr = (lr_write_t *)&itx->itx_lr;
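        /*
         * If copying the data into the log record fails, fall back to
         * WR_NEED_COPY so the data is read again at commit time.
         */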
        if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
                zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
            kmem_free(itx, offsetof(itx_t, itx_lr) +
                      itx->itx_lr.lrc_reclen);
            itx = zil_itx_create(txtype, sizeof (*lr));
            lr = (lr_write_t *)&itx->itx_lr;
            write_state = WR_NEED_COPY;
        }

        itx->itx_wr_state = write_state;
        if (write_state == WR_NEED_COPY)
            itx->itx_sod += len;
        lr->lr_foid = zp->z_id;
        lr->lr_offset = off;
        lr->lr_length = len;
        lr->lr_blkoff = 0;
        BP_ZERO(&lr->lr_blkptr);

        itx->itx_private = zp->z_zfsvfs;

        if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) ||
                (ioflag & (FSYNC | FDSYNC)))
            itx->itx_sync = B_TRUE;
        else
            itx->itx_sync = B_FALSE;

        zp->z_last_itx = zil_itx_assign(zilog, itx, tx);

        off += len;
        resid -= len;
    }
}