static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, struct lu_buf *buf, loff_t *pos) { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); uint64_t old_size; int size = buf->lb_len; int rc; unsigned long start; LASSERT(dt_object_exists(dt)); LASSERT(obj->oo_db); start = cfs_time_current(); read_lock(&obj->oo_attr_lock); old_size = obj->oo_attr.la_size; read_unlock(&obj->oo_attr_lock); if (*pos + size > old_size) { if (old_size < *pos) return 0; else size = old_size - *pos; } record_start_io(osd, READ, 0); rc = -dmu_read(osd->od_os, obj->oo_db->db_object, *pos, size, buf->lb_buf, DMU_READ_PREFETCH); record_end_io(osd, READ, cfs_time_current() - start, size, size >> PAGE_CACHE_SHIFT); if (rc == 0) { rc = size; *pos += size; } return rc; }
vdev_indirect_births_t * vdev_indirect_births_open(objset_t *os, uint64_t births_object) { vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); vib->vib_objset = os; vib->vib_object = births_object; VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); vib->vib_phys = vib->vib_dbuf->db_data; if (vib->vib_phys->vib_count > 0) { uint64_t births_size = vdev_indirect_births_size_impl(vib); vib->vib_entries = kmem_alloc(births_size, KM_SLEEP); VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, births_size, vib->vib_entries, DMU_READ_PREFETCH)); } ASSERT(vdev_indirect_births_verify(vib)); return (vib); }
static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, struct lu_buf *buf, loff_t *pos, struct lustre_capa *capa) { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); uint64_t old_size; int size = buf->lb_len; int rc; LASSERT(dt_object_exists(dt)); LASSERT(obj->oo_db); read_lock(&obj->oo_attr_lock); old_size = obj->oo_attr.la_size; read_unlock(&obj->oo_attr_lock); if (*pos + size > old_size) { if (old_size < *pos) return 0; else size = old_size - *pos; } rc = -dmu_read(osd->od_objset.os, obj->oo_db->db_object, *pos, size, buf->lb_buf, DMU_READ_PREFETCH); if (rc == 0) { rc = size; *pos += size; /* XXX: workaround for bug in HEAD: fsfilt_ldiskfs_read() returns * requested number of bytes, not actually read ones */ if (S_ISLNK(obj->oo_dt.do_lu.lo_header->loh_attr)) rc = buf->lb_len; } return rc; }
void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag, zil_callback_t callback, void *callback_data) { itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; ssize_t immediate_write_sz; if (zil_replaying(zilog, tx) || zp->z_unlinked) { if (callback != NULL) callback(callback_data); return; } immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ? 0 : (ssize_t)zfs_immediate_write_sz; slogging = spa_has_slogs(zilog->zl_spa) && (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); } while (resid) { itx_t *itx; lr_write_t *lr; ssize_t len; /* * If the write would overflow the largest block then split it. */ if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) len = SPA_MAXBLOCKSIZE >> 1; else len = resid; itx = zil_itx_create(txtype, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os, zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; if (write_state == WR_NEED_COPY) itx->itx_sod += len; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = ZTOZSB(zp); if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && (fsync_cnt == 0)) itx->itx_sync = B_FALSE; itx->itx_callback = callback; itx->itx_callback_data = callback_data; zil_itx_assign(zilog, itx, tx); off += len; resid -= len; }
/* * Read out the command history. */ int spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) { objset_t *mos = spa->spa_meta_objset; dmu_buf_t *dbp; uint64_t read_len, phys_read_off, phys_eof; uint64_t leftover = 0; spa_history_phys_t *shpp; int err; /* * If the command history doesn't exist (older pool), * that's ok, just return ENOENT. */ if (!spa->spa_history) return (ENOENT); /* * The history is logged asynchronously, so when they request * the first chunk of history, make sure everything has been * synced to disk so that we get it. */ if (*offp == 0 && spa_writeable(spa)) txg_wait_synced(spa_get_dsl(spa), 0); if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); shpp = dbp->db_data; #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbp, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); } #endif mutex_enter(&spa->spa_history_lock); phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); if (*offp < shpp->sh_pool_create_len) { /* read in just the zpool create history */ phys_read_off = *offp; read_len = MIN(*len, shpp->sh_pool_create_len - phys_read_off); } else { /* * Need to reset passed in offset to BOF if the passed in * offset has since been overwritten. */ *offp = MAX(*offp, shpp->sh_bof); phys_read_off = spa_history_log_to_phys(*offp, shpp); /* * Read up to the minimum of what the user passed down or * the EOF (physical or logical). If we hit physical EOF, * use 'leftover' to read from the physical BOF. */ if (phys_read_off <= phys_eof) { read_len = MIN(*len, phys_eof - phys_read_off); } else { read_len = MIN(*len, shpp->sh_phys_max_off - phys_read_off); if (phys_read_off + *len > shpp->sh_phys_max_off) { leftover = MIN(*len - read_len, phys_eof - shpp->sh_pool_create_len); } } } /* offset for consumer to use next */ *offp += read_len + leftover; /* tell the consumer how much you actually read */ *len = read_len + leftover; if (read_len == 0) { mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (0); } err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); dmu_buf_rele(dbp, FTAG); return (err); }
/* * Get data to generate a TX_WRITE intent log record. */ static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; uint64_t object = ZVOL_OBJ; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT(zio != NULL); ASSERT(size != 0); zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_zilog = zv->zv_zilog; zgd->zgd_rl = zfs_range_lock(&zv->zv_range_lock, offset, size, RL_READER); /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } else { size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *obp = dmu_buf_get_blkptr(db); if (obp) { ASSERT(BP_IS_HOLE(bp)); *bp = *obp; } zgd->zgd_db = db; zgd->zgd_bp = &lr->lr_blkptr; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); }
static void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, int sync) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; boolean_t slogging; ssize_t immediate_write_sz; if (zil_replaying(zilog, tx)) return; immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ? 0 : zvol_immediate_write_sz; slogging = spa_has_slogs(zilog->zl_spa) && (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); while (size) { itx_t *itx; lr_write_t *lr; ssize_t len; itx_wr_state_t write_state; /* * Unlike zfs_log_write() we can be called with * up to DMU_MAX_ACCESS/2 (5MB) writes. */ if (blocksize > immediate_write_sz && !slogging && size >= blocksize && offset % blocksize == 0) { write_state = WR_INDIRECT; /* uses dmu_sync */ len = blocksize; } else if (sync) { write_state = WR_COPIED; len = MIN(ZIL_MAX_LOG_DATA, size); } else { write_state = WR_NEED_COPY; len = MIN(ZIL_MAX_LOG_DATA, size); } itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zv->zv_objset, ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; if (write_state == WR_NEED_COPY) itx->itx_sod += len; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; itx->itx_sync = sync; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } }
int __osd_xattr_get(const struct lu_env *env, struct osd_object *obj, struct lu_buf *buf, const char *name, int *sizep) { struct osd_device *osd = osd_obj2dev(obj); udmu_objset_t *uos = &osd->od_objset; uint64_t xa_data_obj; dmu_buf_t *xa_data_db; sa_handle_t *sa_hdl = NULL; uint64_t size; int rc; /* check SA_ZPL_DXATTR first then fallback to directory xattr */ rc = __osd_sa_xattr_get(env, obj, buf, name, sizep); if (rc != -ENOENT) return rc; /* are there any extended attributes? */ if (obj->oo_xattr == ZFS_NO_OBJECT) return -ENOENT; /* Lookup the object number containing the xattr data */ rc = -zap_lookup(uos->os, obj->oo_xattr, name, sizeof(uint64_t), 1, &xa_data_obj); if (rc) return rc; rc = __osd_obj2dbuf(env, uos->os, xa_data_obj, &xa_data_db, FTAG); if (rc) return rc; rc = -sa_handle_get(uos->os, xa_data_obj, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) goto out_rele; /* Get the xattr value length / object size */ rc = -sa_lookup(sa_hdl, SA_ZPL_SIZE(uos), &size, 8); if (rc) goto out; if (size > INT_MAX) { rc = -EOVERFLOW; goto out; } *sizep = (int)size; if (buf == NULL || buf->lb_buf == NULL) { /* We only need to return the required size */ goto out; } if (*sizep > buf->lb_len) { rc = -ERANGE; /* match ldiskfs error */ goto out; } rc = -dmu_read(uos->os, xa_data_db->db_object, 0, size, buf->lb_buf, DMU_READ_PREFETCH); out: sa_handle_destroy(sa_hdl); out_rele: dmu_buf_rele(xa_data_db, FTAG); return rc; }
void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) { uint32_t blocksize = zp->z_blksz; itx_wr_state_t write_state; uintptr_t fsync_cnt; if (zil_replaying(zilog, tx) || zp->z_unlinked) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); } while (resid) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = resid; if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(off, blocksize), resid); itx = zil_itx_create(txtype, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zp->z_zfsvfs; if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) && (fsync_cnt == 0)) itx->itx_sync = B_FALSE; zil_itx_assign(zilog, itx, tx); off += len; resid -= len; } }
/* * Load the space map disk into the specified range tree. Segments of maptype * are added to the range tree, other segment types are removed. * * Note: space_map_load() will drop sm_lock across dmu_read() calls. * The caller must be OK with this. */ int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) { uint64_t *entry, *entry_map, *entry_map_end; uint64_t bufsize, size, offset, end, space; int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); end = space_map_length(sm); space = space_map_allocated(sm); VERIFY0(range_tree_space(rt)); if (maptype == SM_FREE) { range_tree_add(rt, sm->sm_start, sm->sm_size); space = sm->sm_size - space; } bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); entry_map = zio_buf_alloc(bufsize); mutex_exit(sm->sm_lock); if (end > bufsize) { dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, end - bufsize, ZIO_PRIORITY_SYNC_READ); } mutex_enter(sm->sm_lock); for (offset = 0; offset < end; offset += bufsize) { size = MIN(end - offset, bufsize); VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); VERIFY(size != 0); ASSERT3U(sm->sm_blksz, !=, 0); dprintf("object=%llu offset=%llx size=%llx\n", space_map_object(sm), offset, size); mutex_exit(sm->sm_lock); error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, entry_map, DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { uint64_t e = *entry; uint64_t offset, size; if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ continue; offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + sm->sm_start; size = SM_RUN_DECODE(e) << sm->sm_shift; VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift)); VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); VERIFY3U(offset, >=, sm->sm_start); VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); if (SM_TYPE_DECODE(e) == maptype) { VERIFY3U(range_tree_space(rt) + size, <=, sm->sm_size); range_tree_add(rt, offset, size); } else { range_tree_remove(rt, offset, size); } }
int zvol_strategy(buf_t *bp) { zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); uint64_t off, volsize; size_t size, resid; char *addr; objset_t *os; int error = 0; int sync; int reading; int txg_sync_needed = B_FALSE; if (zv == NULL) { bioerror(bp, ENXIO); biodone(bp); return (0); } if (getminor(bp->b_edev) == 0) { bioerror(bp, EINVAL); biodone(bp); return (0); } if (zv->zv_readonly && !(bp->b_flags & B_READ)) { bioerror(bp, EROFS); biodone(bp); return (0); } off = ldbtob(bp->b_blkno); volsize = zv->zv_volsize; os = zv->zv_objset; ASSERT(os != NULL); sync = !(bp->b_flags & B_ASYNC) && !(zil_disable); bp_mapin(bp); addr = bp->b_un.b_addr; resid = bp->b_bcount; /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. * A better approach than a per zvol rwlock would be to lock ranges. */ reading = bp->b_flags & B_READ; if (reading || resid <= zvol_immediate_write_sz) rw_enter(&zv->zv_dslock, RW_READER); else rw_enter(&zv->zv_dslock, RW_WRITER); while (resid != 0 && off < volsize) { size = MIN(resid, 1UL << 20); /* cap at 1MB per tx */ if (size > volsize - off) /* don't write past the end */ size = volsize - off; if (reading) { error = dmu_read(os, ZVOL_OBJ, off, size, addr); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); if (sync) { /* use the ZIL to commit this write */ if (zvol_log_write(zv, tx, off, size, addr) != 0) { txg_sync_needed = B_TRUE; } } dmu_tx_commit(tx); } } if (error) break; off += size; addr += size; resid -= size; } rw_exit(&zv->zv_dslock); if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); biodone(bp); if (sync) { if (txg_sync_needed) txg_wait_synced(dmu_objset_pool(os), 0); else zil_commit(zv->zv_zilog, UINT64_MAX, 0); } return (0); }
void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) { itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; if (zilog == NULL || zp->z_unlinked) return; /* * Writes are handled in three different ways: * * WR_INDIRECT: * If the write is greater than zfs_immediate_write_sz and there are * no separate logs in this pool then later *if* we need to log the * write then dmu_sync() is used to immediately write the block and * its block pointer is put in the log record. * WR_COPIED: * If we know we'll immediately be committing the * transaction (FDSYNC (O_DSYNC)), the we allocate a larger * log record here for the data and copy the data in. * WR_NEED_COPY: * Otherwise we don't allocate a buffer, and *if* we need to * flush the write later then a buffer is allocated and * we retrieve the data using the dmu. */ slogging = spa_has_slogs(zilog->zl_spa); if (resid > zfs_immediate_write_sz && !slogging) write_state = WR_INDIRECT; else if (ioflag & FDSYNC) write_state = WR_COPIED; else write_state = WR_NEED_COPY; #ifndef __APPLE__ if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); } #endif while (resid) { itx_t *itx; lr_write_t *lr; ssize_t len; /* * If there are slogs and the write would overflow the largest * block, then because we don't want to use the main pool * to dmu_sync, we have to split the write. */ if (slogging && resid > ZIL_MAX_LOG_DATA) len = SPA_MAXBLOCKSIZE >> 1; else len = resid; itx = zil_itx_create(txtype, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1) != 0) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zp->z_zfsvfs; if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0)) itx->itx_sync = B_TRUE; else itx->itx_sync = B_FALSE; zp->z_last_itx = zil_itx_assign(zilog, itx, tx); off += len; resid -= len; }
void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) { itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; if (zilog == NULL || zp->z_unlinked) return; ZFS_HANDLE_REPLAY(zilog, tx); /* exits if replay */ slogging = spa_has_slogs(zilog->zl_spa); if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; else write_state = WR_NEED_COPY; if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); } while (resid) { itx_t *itx; lr_write_t *lr; ssize_t len; /* * If the write would overflow the largest block then split it. */ if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA) len = SPA_MAXBLOCKSIZE >> 1; else len = resid; itx = zil_itx_create(txtype, sizeof (*lr) + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; } itx->itx_wr_state = write_state; if (write_state == WR_NEED_COPY) itx->itx_sod += len; lr->lr_foid = zp->z_id; lr->lr_offset = off; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zp->z_zfsvfs; if ((zp->z_sync_cnt != 0) || (fsync_cnt != 0) || (ioflag & (FSYNC | FDSYNC))) itx->itx_sync = B_TRUE; else itx->itx_sync = B_FALSE; zp->z_last_itx = zil_itx_assign(zilog, itx, tx); off += len; resid -= len; }