static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
			     loff_t offset)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = file->f_mapping->host;
	size_t count = iov_iter_count(iter);
	ssize_t ret;

	ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block);

	/*
	 * In case of error extending write may have instantiated a few
	 * blocks outside i_size. Trim these off again.
	 */
	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
		loff_t isize = i_size_read(inode);
		loff_t end = offset + count;

		if (end > isize)
			jfs_write_failed(mapping, end);
	}

	return ret;
}
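The trim step above relies on a small helper. A minimal sketch of what such a jfs_write_failed() helper typically looks like follows; the exact body here is an assumption for illustration, not quoted from the tree.

/*
 * Illustrative sketch (assumed body): after a failed extending direct
 * write, drop any page cache and on-disk blocks instantiated past the
 * current i_size.
 */
static void jfs_write_failed(struct address_space *mapping, loff_t to)
{
	struct inode *inode = mapping->host;

	if (to > inode->i_size) {
		truncate_pagecache(inode, inode->i_size);
		jfs_truncate(inode);
	}
}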
/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @pos: The file offset where the I/O starts
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes. For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;

		inode_lock(inode);
		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
		if (retval) {
			inode_unlock(inode);
			goto out;
		}
	}

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
 out:
	return retval;
}
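For context, a filesystem's ->direct_IO method of this era dispatches between dax_do_io() and blockdev_direct_IO() depending on whether the inode is DAX-capable, and trims blocks instantiated past i_size when an extending write fails (the same pattern as jfs_direct_IO above). The sketch below is illustrative only; example_get_block() and example_write_failed() are placeholders for the filesystem's real helpers (e.g. ext2's ext2_get_block()/ext2_write_failed()).

/*
 * Illustrative sketch of a ->direct_IO caller of dax_do_io().
 * example_get_block() and example_write_failed() are assumed placeholders.
 */
static ssize_t
example_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	size_t count = iov_iter_count(iter);
	ssize_t ret;

	if (IS_DAX(inode))
		ret = dax_do_io(iocb, inode, iter, offset, example_get_block,
				NULL, DIO_LOCKING);
	else
		ret = blockdev_direct_IO(iocb, inode, iter, offset,
					 example_get_block);
	if (ret < 0 && iov_iter_rw(iter) == WRITE)
		example_write_failed(mapping, offset + count);
	return ret;
}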
static ssize_t
nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iov_iter_rw(iter) == WRITE)
		return 0;

	/* Needs synchronization with the cleaner */
	return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block);
}
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
			      loff_t offset)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	size_t count = iov_iter_count(iter);
	int err;

	/* we don't need to use inline_data strictly */
	if (f2fs_has_inline_data(inode)) {
		err = f2fs_convert_inline_inode(inode);
		if (err)
			return err;
	}

	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
		return 0;

	err = check_direct_IO(inode, iter, offset);
	if (err)
		return err;

	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));

	if (iov_iter_rw(iter) == WRITE)
		__allocate_data_blocks(inode, offset, count);

	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
	if (err < 0 && iov_iter_rw(iter) == WRITE)
		f2fs_write_failed(mapping, offset + count);

	trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err);

	return err;
}
/**
 * dax_do_io - Perform I/O to a DAX file
 * @iocb: The control block for this I/O
 * @inode: The file which the I/O is directed at
 * @iter: The addresses to do I/O from or to
 * @get_block: The filesystem method used to translate file offsets to blocks
 * @end_io: A filesystem callback for I/O completion
 * @flags: See below
 *
 * This function uses the same locking scheme as do_blockdev_direct_IO:
 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
 * caller for writes. For reads, we take and release the i_mutex ourselves.
 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
 * is in progress.
 */
ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
		  struct iov_iter *iter, get_block_t get_block,
		  dio_iodone_t end_io, int flags)
{
	struct buffer_head bh;
	ssize_t retval = -EINVAL;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + iov_iter_count(iter);

	memset(&bh, 0, sizeof(bh));
	bh.b_bdev = inode->i_sb->s_bdev;

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_lock(inode);

	/* Protects against truncate */
	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_begin(inode);

	retval = dax_io(inode, iter, pos, end, get_block, &bh);

	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (end_io) {
		int err;

		err = end_io(iocb, pos, retval, bh.b_private);
		if (err)
			retval = err;
	}

	if (!(flags & DIO_SKIP_DIO_COUNT))
		inode_dio_end(inode);
	return retval;
}
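In this later form the starting offset is taken from iocb->ki_pos, so callers no longer pass it explicitly; the ->direct_IO method itself also drops its offset argument, as seen in nilfs_direct_IO above. A minimal, illustrative call site under that convention could look like the sketch below; example_get_block() is an assumed placeholder, and a real filesystem would also provide a non-DAX fallback.

/*
 * Illustrative sketch of a caller using the newer dax_do_io() prototype.
 * example_get_block() is an assumed placeholder for the fs get_block callback.
 */
static ssize_t
example_dax_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (!IS_DAX(inode))
		return -EINVAL;	/* a real fs would fall back to blockdev DIO */

	return dax_do_io(iocb, inode, iter, example_get_block,
			 NULL, DIO_LOCKING);
}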
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			unsigned blkbits = inode->i_blkbits;
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				if (buffer_unwritten(bh) || buffer_new(bh)) {
					dax_new_buf(dax.addr, map_len, first,
						    pos, end);
					need_wmb = true;
				}
				dax.addr += first;
				size = map_len - first;
			}
			max = min(pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr,
					   max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
static ssize_t ll_direct_IO(
# ifndef HAVE_IOV_ITER_RW
			    int rw,
# endif
			    struct kiocb *iocb, struct iov_iter *iter,
			    loff_t file_offset)
{
	struct ll_cl_context *lcc;
	const struct lu_env *env;
	struct cl_io *io;
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t count = iov_iter_count(iter);
	ssize_t tot_bytes = 0, result = 0;
	size_t size = MAX_DIO_SIZE;

	/* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */
	if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK))
		return -EINVAL;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), size=%zd (max %lu), "
	       "offset=%lld=%llx, pages %zd (max %lu)\n",
	       PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE,
	       file_offset, file_offset, count >> PAGE_SHIFT,
	       MAX_DIO_SIZE >> PAGE_SHIFT);

	/* Check that all user buffers are aligned as well */
	if (iov_iter_alignment(iter) & ~PAGE_MASK)
		return -EINVAL;

	lcc = ll_cl_find(file);
	if (lcc == NULL)
		RETURN(-EIO);

	env = lcc->lcc_env;
	LASSERT(!IS_ERR(env));
	io = lcc->lcc_io;
	LASSERT(io != NULL);

	/* 0. Need locking between buffered and direct access, and protection
	 *    against races with size changes by concurrent truncates and
	 *    writes.
	 * 1. Need inode mutex to operate transient pages.
	 */
	if (iov_iter_rw(iter) == READ)
		inode_lock(inode);

	while (iov_iter_count(iter)) {
		struct page **pages;
		size_t offs;

		count = min_t(size_t, iov_iter_count(iter), size);
		if (iov_iter_rw(iter) == READ) {
			if (file_offset >= i_size_read(inode))
				break;
			if (file_offset + count > i_size_read(inode))
				count = i_size_read(inode) - file_offset;
		}

		result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
		if (likely(result > 0)) {
			int n = DIV_ROUND_UP(result + offs, PAGE_SIZE);

			result = ll_direct_IO_seg(env, io, iov_iter_rw(iter),
						  inode, result, file_offset,
						  pages, n);
			ll_free_user_pages(pages, n,
					   iov_iter_rw(iter) == READ);
		}
		if (unlikely(result <= 0)) {
			/* If we can't allocate a large enough buffer
			 * for the request, shrink it to a smaller
			 * PAGE_SIZE multiple and try again.
			 * We should always be able to kmalloc for a
			 * page worth of page pointers = 4MB on i386. */
			if (result == -ENOMEM &&
			    size > (PAGE_SIZE / sizeof(*pages)) * PAGE_SIZE) {
				size = ((((size / 2) - 1) |
					 ~PAGE_MASK) + 1) & PAGE_MASK;
				CDEBUG(D_VFSTRACE, "DIO size now %zu\n", size);
				continue;
			}

			GOTO(out, result);
		}

		iov_iter_advance(iter, result);
		tot_bytes += result;
		file_offset += result;
	}
out:
	if (iov_iter_rw(iter) == READ)
		inode_unlock(inode);

	if (tot_bytes > 0) {
		struct vvp_io *vio = vvp_env_io(env);

		/* no commit async for direct IO */
		vio->u.write.vui_written += tot_bytes;
	}

	return tot_bytes ? : result;
}
static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
		      loff_t start, loff_t end, get_block_t get_block,
		      struct buffer_head *bh)
{
	loff_t pos = start, max = start, bh_max = start;
	bool hole = false, need_wmb = false;
	struct block_device *bdev = NULL;
	int rw = iov_iter_rw(iter), rc;
	long map_len = 0;
	struct blk_dax_ctl dax = {
		.addr = (void __pmem *) ERR_PTR(-EIO),
	};
	unsigned blkbits = inode->i_blkbits;
	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
								>> blkbits;

	if (rw == READ)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		size_t len;
		if (pos == max) {
			long page = pos >> PAGE_SHIFT;
			sector_t block = page << (PAGE_SHIFT - blkbits);
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				bh->b_size = PAGE_ALIGN(end - pos);
				bh->b_state = 0;
				rc = get_block(inode, block, bh, rw == WRITE);
				if (rc)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
				bdev = bh->b_bdev;
				/*
				 * We allow uninitialized buffers for writes
				 * beyond EOF as those cannot race with faults
				 */
				WARN_ON_ONCE(
					(buffer_new(bh) && block < file_blks) ||
					(rw == WRITE && buffer_unwritten(bh)));
			} else {
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			hole = rw == READ && !buffer_written(bh);
			if (hole) {
				size = bh->b_size - first;
			} else {
				dax_unmap_atomic(bdev, &dax);
				dax.sector = to_sector(bh, inode);
				dax.size = bh->b_size;
				map_len = dax_map_atomic(bdev, &dax);
				if (map_len < 0) {
					rc = map_len;
					break;
				}
				dax.addr += first;
				size = map_len - first;
			}
			/*
			 * pos + size is one past the last offset for IO,
			 * so pos + size can overflow loff_t at extreme offsets.
			 * Cast to u64 to catch this and get the true minimum.
			 */
			max = min_t(u64, pos + size, end);
		}

		if (iov_iter_rw(iter) == WRITE) {
			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
			need_wmb = true;
		} else if (!hole)
			len = copy_to_iter((void __force *) dax.addr,
					   max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		if (!len) {
			rc = -EFAULT;
			break;
		}

		pos += len;
		if (!IS_ERR(dax.addr))
			dax.addr += len;
	}

	if (need_wmb)
		wmb_pmem();
	dax_unmap_atomic(bdev, &dax);

	return (pos == start) ? rc : pos - start;
}
static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
			      loff_t offset)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct address_space *mapping = inode->i_mapping;
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_holder gh;
	int rv;

	/*
	 * Deferred lock, even if it's a write, since we do no allocation on
	 * this path. All we need to change is atime, and this lock mode
	 * ensures that other nodes have flushed their buffered read caches
	 * (i.e. their page cache entries for this inode). We do not,
	 * unfortunately, have the option of only flushing a range like the
	 * VFS does.
	 */
	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
	rv = gfs2_glock_nq(&gh);
	if (rv)
		return rv;
	rv = gfs2_ok_for_dio(ip, offset);
	if (rv != 1)
		goto out; /* dio not valid, fall back to buffered i/o */

	/*
	 * Now since we are holding a deferred (CW) lock at this point, you
	 * might be wondering why this is ever needed. There is a case however
	 * where we've granted a deferred local lock against a cached exclusive
	 * glock. That is ok provided all granted local locks are deferred, but
	 * it also means that it is possible to encounter pages which are
	 * cached and possibly also mapped. So here we check for that and sort
	 * them out ahead of the dio. The glock state machine will take care of
	 * everything else.
	 *
	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
	 * the first place, mapping->nrpages will always be zero.
	 */
	if (mapping->nrpages) {
		loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
		loff_t len = iov_iter_count(iter);
		loff_t end = PAGE_ALIGN(offset + len) - 1;

		rv = 0;
		if (len == 0)
			goto out;
		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
			unmap_shared_mapping_range(ip->i_inode.i_mapping,
						   offset, len);
		rv = filemap_write_and_wait_range(mapping, lstart, end);
		if (rv)
			goto out;
		if (iov_iter_rw(iter) == WRITE)
			truncate_inode_pages_range(mapping, lstart, end);
	}

	rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
				  offset, gfs2_get_block_direct, NULL, NULL, 0);
out:
	gfs2_glock_dq(&gh);
	gfs2_holder_uninit(&gh);
	return rv;
}