static int zvol_write(struct bio *bio) { zvol_state_t *zv = bio->bi_bdev->bd_disk->private_data; uint64_t offset = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); int error = 0; dmu_tx_t *tx; rl_t *rl; uio_t uio; if (bio->bi_rw & VDEV_REQ_FLUSH) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* * Some requests are just for flush and nothing else. */ if (size == 0) goto out; uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; uio.uio_skip = BIO_BI_SKIP(bio); uio.uio_resid = size; uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio.uio_loffset = offset; uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = UIO_BVEC; rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); goto out; } error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, &uio, size, tx); if (error == 0) zvol_log_write(zv, tx, offset, size, !!(bio->bi_rw & VDEV_REQ_FUA)); dmu_tx_commit(tx); zfs_range_unlock(rl); if ((bio->bi_rw & VDEV_REQ_FUA) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zv->zv_zilog, ZVOL_OBJ); out: return (error); }
/* * Common write path running under the zvol taskq context. This function * is responsible for copying the request structure data in to the DMU and * signaling the request queue with the result of the copy. */ static void zvol_write(void *arg) { struct request *req = (struct request *)arg; struct request_queue *q = req->q; zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = blk_rq_bytes(req); int error = 0; dmu_tx_t *tx; rl_t *rl; if (req->cmd_flags & VDEV_REQ_FLUSH) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* * Some requests are just for flush and nothing else. */ if (size == 0) { error = 0; goto out; } rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); goto out; } error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); if (error == 0) zvol_log_write(zv, tx, offset, size, req->cmd_flags & VDEV_REQ_FUA); dmu_tx_commit(tx); zfs_range_unlock(rl); if ((req->cmd_flags & VDEV_REQ_FUA) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zv->zv_zilog, ZVOL_OBJ); out: blk_end_request(req, -error, size); spl_fstrans_unmark(cookie); }
/*
 * Block-device ioctl handler for a zvol.
 *
 *   BLKFLSBUF  commit the volume's intent log.
 *   BLKZNAME   copy MAXNAMELEN bytes of the volume name to the user
 *              buffer addressed by 'arg'.
 *
 * Any other command is rejected with -ENOTTY.
 *
 * Returns 0 on success or a negative errno (-EFAULT when the user buffer
 * for BLKZNAME cannot be written).
 */
static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT(zv && zv->zv_open_count > 0);

	switch (cmd) {
	case BLKFLSBUF:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
		break;
	case BLKZNAME:
		/*
		 * copy_to_user() returns the number of bytes it could NOT
		 * copy, not an errno.  Map any shortfall to -EFAULT rather
		 * than handing a positive byte count back as the ioctl
		 * result.
		 */
		if (copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN) != 0)
			error = -EFAULT;
		break;
	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}
/*
 * Cache-synchronize entry point: commit the zvol's intent log when both
 * the zvol state and its log are present.  Always reports success.
 */
IOReturn
net_lundman_zfs_zvol_device::doSynchronizeCache(void)
{
	dprintf("doSync\n");

	/* Nothing to commit without a volume or without a log. */
	if (!zv || !zv->zv_zilog)
		return kIOReturnSuccess;

	zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return kIOReturnSuccess;
}
/*
 * Sync one filesystem (sb->s_fs_info != NULL) or every pool.
 * Always returns 0.
 */
/*ARGSUSED*/
int
zfs_sync(struct super_block *sb, int wait, cred_t *cr)
{
	zfs_sb_t *zsb = sb->s_fs_info;
	dsl_pool_t *dp;

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (unlikely(oops_in_progress))
		return (0);

	/*
	 * Semantically, the only requirement is that the sync be initiated.
	 * The DMU syncs out txgs frequently, so there's nothing to do.
	 */
	if (!wait)
		return (0);

	if (zsb == NULL) {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(1M).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
		return (0);
	}

	/*
	 * Sync a specific filesystem.
	 */
	ZFS_ENTER(zsb);
	dp = dmu_objset_pool(zsb->z_os);

	/*
	 * If the system is shutting down, then skip any
	 * filesystems which may exist on a suspended pool.
	 */
	if (spa_suspended(dp->dp_spa)) {
		ZFS_EXIT(zsb);
		return (0);
	}

	if (zsb->z_log != NULL)
		zil_commit(zsb->z_log, 0);

	ZFS_EXIT(zsb);
	return (0);
}
/*
 * VFS sync entry point.  Pushes out a pending change to the file system
 * modify-time attribute (if one is tracked) and then commits the intent
 * log, or waits for the txg to sync when the filesystem has no log.
 * Always returns 0.
 */
static int
zfs_vfs_sync(struct mount *mp, __unused int waitfor,
    __unused vfs_context_t context)
{
	zfsvfs_t *zfsvfs = vfs_fsprivate(mp);

	ZFS_ENTER(zfsvfs);

	/*
	 * Mac OS X needs a file system modify time
	 *
	 * We use the mtime of the "com.apple.system.mtime"
	 * extended attribute, which is associated with the
	 * file system root directory.
	 *
	 * Here we sync any mtime changes to this attribute.
	 */
	if (zfsvfs->z_mtime_vp != NULL) {
		/* Each pass re-reads the mtime; only ERESTART loops. */
		for (;;) {
			timestruc_t mtime;
			znode_t *zp;
			dmu_tx_t *tx;
			int error;

			zp = VTOZ(zfsvfs->z_mtime_vp);
			ZFS_TIME_DECODE(&mtime, zp->z_phys->zp_mtime);

			/* Nothing newer than what was last synced. */
			if (!(zfsvfs->z_last_mtime_synced < mtime.tv_sec))
				break;

			tx = dmu_tx_create(zfsvfs->z_os);
			dmu_tx_hold_bonus(tx, zp->z_id);
			error = dmu_tx_assign(tx, zfsvfs->z_assign);

			if (!error) {
				dmu_buf_will_dirty(zp->z_dbuf, tx);
				dmu_tx_commit(tx);
				zfsvfs->z_last_mtime_synced = mtime.tv_sec;
				break;
			}

			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				/* Wait, drop this tx, and try again. */
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}

			/* Any other failure: give up on the mtime update. */
			dmu_tx_abort(tx);
			break;
		}
	}

	if (zfsvfs->z_log == NULL)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	else
		zil_commit(zfsvfs->z_log, UINT64_MAX, 0);

	ZFS_EXIT(zfsvfs);

	return (0);
}
/*
 * Full-fsync ioctl helper: fsync the vnode, then commit the filesystem's
 * intent log (or wait for the txg to sync when there is no log).
 *
 * Returns the zfs_fsync() error if that step fails, otherwise 0.
 */
int
zfs_vnop_ioctl_fullfsync(struct vnode *vp, vfs_context_t ct,
    zfsvfs_t *zfsvfs)
{
	int error = zfs_fsync(vp, /*syncflag*/0, NULL, (caller_context_t *)ct);

	if (error != 0)
		return (error);

	if (zfsvfs->z_log == NULL)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	else
		zil_commit(zfsvfs->z_log, 0);

	return (0);
}
static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfs_sb_t *zsb = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; ZFS_ENTER(zsb); if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; ZFS_EXIT(zsb); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { ZFS_ENTER(zsb); ZFS_VERIFY_ZP(zp); if (zsb->z_log != NULL) zil_commit(zsb->z_log, zp->z_id); ZFS_EXIT(zsb); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); } return (result); }
/*
 * Sync one filesystem (vfsp != NULL) or every pool (vfsp == NULL).
 * Always returns 0.
 */
/*ARGSUSED*/
int
zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
{
	zfsvfs_t *zfsvfs;

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	/*
	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
	 * to sync metadata, which they would otherwise cache indefinitely.
	 * Semantically, the only requirement is that the sync be initiated.
	 * The DMU syncs out txgs frequently, so there's nothing to do.
	 */
	if (flag & SYNC_ATTR)
		return (0);

	if (vfsp == NULL) {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(1M).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
		return (0);
	}

	/*
	 * Sync a specific filesystem.
	 */
	zfsvfs = vfsp->vfs_data;

	ZFS_ENTER(zfsvfs);
	if (zfsvfs->z_log == NULL)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	else
		zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
	ZFS_EXIT(zfsvfs);

	return (0);
}
/*
 * Suspend an intent log.  While in suspended mode, we still honor
 * synchronous semantics, but we rely on txg_wait_synced() to do it.
 * We suspend the log briefly when taking a snapshot so that the snapshot
 * contains all the data it's supposed to, and has an empty intent log.
 *
 * Returns EBUSY when the log header has ZIL_REPLAY_NEEDED set, and 0
 * otherwise.
 */
int
zil_suspend(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;

	mutex_enter(&zilog->zl_lock);

	/* An unplayed log cannot be suspended. */
	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {
		mutex_exit(&zilog->zl_lock);
		return (EBUSY);
	}

	if (zilog->zl_suspend++ != 0) {
		/*
		 * Someone else already began a suspend.
		 * Just wait for them to finish.
		 */
		while (zilog->zl_suspending) {
			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
		}
		mutex_exit(&zilog->zl_lock);
		return (0);
	}

	/* First suspender: flag the suspend in progress, then drop the lock. */
	zilog->zl_suspending = B_TRUE;
	mutex_exit(&zilog->zl_lock);

	zil_commit(zilog, UINT64_MAX, 0);

	/*
	 * Wait for any in-flight log writes to complete.
	 */
	mutex_enter(&zilog->zl_lock);
	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
	}
	mutex_exit(&zilog->zl_lock);

	zil_destroy(zilog, B_FALSE);

	/* Suspend complete: wake everyone waiting on zl_cv_suspend. */
	mutex_enter(&zilog->zl_lock);
	zilog->zl_suspending = B_FALSE;
	cv_broadcast(&zilog->zl_cv_suspend);
	mutex_exit(&zilog->zl_lock);

	return (0);
}
/*
 * Copy the data described by 'uio' into the zvol, one DMU transaction
 * per chunk of at most DMU_MAX_ACCESS / 2 bytes, never writing past the
 * end of the volume.  The whole requested range is held under a writer
 * range lock for the duration.  When 'sync' is set the intent log is
 * committed before returning.
 *
 * Returns 0 on success, otherwise the first error from dmu_tx_assign()
 * or dmu_write_uio_dbuf().
 */
static int
zvol_write(zvol_state_t *zv, uio_t *uio, boolean_t sync)
{
	uint64_t volsize = zv->zv_volsize;
	rl_t *rl;
	int error = 0;

	ASSERT(zv && zv->zv_open_count > 0);

	rl = zfs_range_lock(&zv->zv_range_lock, uio->uio_loffset,
	    uio->uio_resid, RL_WRITER);

	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t chunk = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t start = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		/* don't write past the end */
		if (chunk > volsize - start)
			chunk = volsize - start;

		dmu_tx_hold_write(tx, ZVOL_OBJ, start, chunk);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
			break;
		}

		error = dmu_write_uio_dbuf(zv->zv_dbuf, uio, chunk, tx);
		if (error == 0)
			zvol_log_write(zv, tx, start, chunk, sync);

		/* Commit the tx whether or not the copy succeeded. */
		dmu_tx_commit(tx);

		if (error != 0)
			break;
	}

	zfs_range_unlock(rl);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	return (error);
}
/*
 * Sync one filesystem (vfsp != NULL) or every pool (vfsp == NULL).
 * Returns the vfs_stdsync() error if that step fails, otherwise 0.
 */
/*ARGSUSED*/
static int
zfs_sync(vfs_t *vfsp, int waitfor)
{
	zfsvfs_t *zfsvfs;
	int error;

	/*
	 * Data integrity is job one.  We don't want a compromised kernel
	 * writing to the storage pool, so we never sync during panic.
	 */
	if (panicstr)
		return (0);

	if (vfsp == NULL) {
		/*
		 * Sync all ZFS filesystems.  This is what happens when you
		 * run sync(1M).  Unlike other filesystems, ZFS honors the
		 * request by waiting for all pools to commit all dirty data.
		 */
		spa_sync_allpools();
		return (0);
	}

	/*
	 * Sync a specific filesystem.
	 */
	zfsvfs = vfsp->vfs_data;

	/* Run the generic VFS sync first; bail out if it fails. */
	error = vfs_stdsync(vfsp, waitfor);
	if (error != 0)
		return (error);

	ZFS_ENTER(zfsvfs);
	if (zfsvfs->z_log == NULL)
		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
	else
		zil_commit(zfsvfs->z_log, UINT64_MAX, 0);
	ZFS_EXIT(zfsvfs);

	return (0);
}
/* * Common write path running under the zvol taskq context. This function * is responsible for copying the request structure data in to the DMU and * signaling the request queue with the result of the copy. */ static void zvol_write(void *arg) { struct request *req = (struct request *)arg; struct request_queue *q = req->q; zvol_state_t *zv = q->queuedata; uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = blk_rq_bytes(req); int error = 0; dmu_tx_t *tx; rl_t *rl; rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); blk_end_request(req, -error, size); return; } error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); if (error == 0) zvol_log_write(zv, tx, offset, size, rq_is_sync(req)); dmu_tx_commit(tx); zfs_range_unlock(rl); if (rq_is_sync(req)) zil_commit(zv->zv_zilog, ZVOL_OBJ); blk_end_request(req, -error, size); }
/*
 * Block-device ioctl handler for a zvol.
 *
 * Only BLKFLSBUF is supported; it commits the volume's intent log.
 * Returns -ENXIO when the disk has no zvol state attached, -ENOTTY for
 * any other command, and 0 on success.
 */
static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;

	if (zv == NULL)
		return (-ENXIO);

	if (cmd != BLKFLSBUF)
		return (-ENOTTY);

	zil_commit(zv->zv_zilog, ZVOL_OBJ);
	return (0);
}
/* * Common write path running under the zvol taskq context. This function * is responsible for copying the request structure data in to the DMU and * signaling the request queue with the result of the copy. */ static void zvol_write(void *arg) { struct request *req = (struct request *)arg; struct request_queue *q = req->q; zvol_state_t *zv = q->queuedata; uint64_t offset = blk_rq_pos(req) << 9; uint64_t size = blk_rq_bytes(req); int error = 0; dmu_tx_t *tx; rl_t *rl; /* * Annotate this call path with a flag that indicates that it is * unsafe to use KM_SLEEP during memory allocations due to the * potential for a deadlock. KM_PUSHPAGE should be used instead. */ ASSERT(!(current->flags & PF_NOFS)); current->flags |= PF_NOFS; if (req->cmd_flags & VDEV_REQ_FLUSH) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* * Some requests are just for flush and nothing else. */ if (size == 0) { blk_end_request(req, 0, size); goto out; } rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, size); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_range_unlock(rl); blk_end_request(req, -error, size); goto out; } error = dmu_write_req(zv->zv_objset, ZVOL_OBJ, req, tx); if (error == 0) zvol_log_write(zv, tx, offset, size, req->cmd_flags & VDEV_REQ_FUA); dmu_tx_commit(tx); zfs_range_unlock(rl); if ((req->cmd_flags & VDEV_REQ_FUA) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zv->zv_zilog, ZVOL_OBJ); blk_end_request(req, -error, size); out: current->flags &= ~PF_NOFS; }
static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) { uio_t uio; zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); int rw = bio_data_dir(bio); #ifdef HAVE_GENERIC_IO_ACCT unsigned long start = jiffies; #endif int error = 0; uio.uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; uio.uio_skip = BIO_BI_SKIP(bio); uio.uio_resid = BIO_BI_SIZE(bio); uio.uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); uio.uio_loffset = BIO_BI_SECTOR(bio) << 9; uio.uio_limit = MAXOFFSET_T; uio.uio_segflg = UIO_BVEC; if (bio_has_data(bio) && uio.uio_loffset + uio.uio_resid > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_disk->disk_name, (long long unsigned)uio.uio_loffset, (long unsigned)uio.uio_resid); error = SET_ERROR(EIO); goto out1; } generic_start_io_acct(rw, bio_sectors(bio), &zv->zv_disk->part0); if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { error = SET_ERROR(EROFS); goto out2; } if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { error = zvol_discard(bio); goto out2; } /* * Some requests are just for flush and nothing else. */ if (uio.uio_resid == 0) { if (bio_is_flush(bio)) zil_commit(zv->zv_zilog, ZVOL_OBJ); goto out2; } error = zvol_write(zv, &uio, bio_is_flush(bio) || bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); } else error = zvol_read(zv, &uio); out2: generic_end_io_acct(rw, &zv->zv_disk->part0, start); out1: BIO_END_IO(bio, -error); spl_fstrans_unmark(cookie); #ifdef HAVE_MAKE_REQUEST_FN_RET_INT return (0); #elif defined(HAVE_MAKE_REQUEST_FN_RET_QC) return (BLK_QC_T_NONE); #endif }
/*
 * Strategy routine for a zvol: service the read or write described by
 * the buf 'bp'.
 *
 * The transfer is split into chunks of at most 1MB, each write chunk in
 * its own DMU transaction, and never extends past the end of the volume.
 * bp->b_resid is set to the number of bytes left untransferred; an error
 * is only posted on the buf when nothing at all was transferred.
 *
 * A request is treated as synchronous when B_ASYNC is clear and
 * zil_disable is not set.  Such a request is made stable (via the ZIL,
 * or via txg_wait_synced() when logging a write failed) BEFORE
 * biodone() is called, so the waiter is never told the I/O finished
 * while the data is still only in memory.
 *
 * Always returns 0; failures are reported through bioerror().
 */
int
zvol_strategy(buf_t *bp)
{
	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
	uint64_t off, volsize;
	size_t size, resid;
	char *addr;
	objset_t *os;
	int error = 0;
	int sync;
	int reading;
	int txg_sync_needed = B_FALSE;

	if (zv == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	if (getminor(bp->b_edev) == 0) {
		bioerror(bp, EINVAL);
		biodone(bp);
		return (0);
	}

	if (zv->zv_readonly && !(bp->b_flags & B_READ)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	off = ldbtob(bp->b_blkno);
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);
	sync = !(bp->b_flags & B_ASYNC) && !(zil_disable);

	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 * A better approach than a per zvol rwlock would be to lock ranges.
	 */
	reading = bp->b_flags & B_READ;
	if (reading || resid <= zvol_immediate_write_sz)
		rw_enter(&zv->zv_dslock, RW_READER);
	else
		rw_enter(&zv->zv_dslock, RW_WRITER);

	while (resid != 0 && off < volsize) {

		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */

		if (size > volsize - off)	/* don't write past the end */
			size = volsize - off;

		if (reading) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				if (sync) {
					/* use the ZIL to commit this write */
					if (zvol_log_write(zv, tx, off, size,
					    addr) != 0) {
						/*
						 * Logging failed; fall back
						 * to a full txg sync below.
						 */
						txg_sync_needed = B_TRUE;
					}
				}
				dmu_tx_commit(tx);
			}
		}
		if (error)
			break;
		off += size;
		addr += size;
		resid -= size;
	}
	rw_exit(&zv->zv_dslock);

	/* Only report an error when no data at all was transferred. */
	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	/*
	 * Make a synchronous request stable before signalling completion.
	 * Calling biodone() first would let the waiter proceed while the
	 * data is still only in memory.
	 */
	if (sync) {
		if (txg_sync_needed)
			txg_wait_synced(dmu_objset_pool(os), 0);
		else
			zil_commit(zv->zv_zilog, UINT64_MAX, 0);
	}

	biodone(bp);

	return (0);
}