static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
                              const struct iovec *iov, loff_t offset,
                              unsigned long nr_segs)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder gh;
        int rv;

        /*
         * Deferred lock, even if it's a write, since we do no allocation on
         * this path. All we need to change is atime, and this lock mode
         * ensures that other nodes have flushed their buffered read caches
         * (i.e. their page cache entries for this inode). We do not,
         * unfortunately, have the option of only flushing a range like the
         * VFS does.
         */
        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
        rv = gfs2_glock_nq(&gh);
        if (rv)
                return rv;
        rv = gfs2_ok_for_dio(ip, rw, offset);
        if (rv != 1)
                goto out; /* dio not valid, fall back to buffered i/o */

        rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
                                  offset, nr_segs, gfs2_get_block_direct,
                                  NULL, NULL, 0);
out:
        gfs2_glock_dq_m(1, &gh);
        gfs2_holder_uninit(&gh);
        return rv;
}
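For context, a minimal sketch of how a ->direct_IO implementation with this older (int rw, struct kiocb *, const struct iovec *, loff_t, unsigned long) prototype is exposed to the VFS. The struct name and the omitted fields below are illustrative, not taken from the GFS2 source; in kernels of this era, generic_file_aio_read() and generic_file_direct_write() call this address_space_operations hook for O_DIRECT requests, which is what ultimately reaches the function above.

/* Illustrative hookup only; gfs2_example_aops is a hypothetical name. */
static const struct address_space_operations gfs2_example_aops = {
        /* readpage, writepage, and the other methods omitted for brevity */
        .direct_IO      = gfs2_direct_IO,
};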
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the
 * tricky-to-follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the
 * cached pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED,
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing
 * in the dio layer. To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is
 * completed before we try to map the overlapping block. This is currently
 * implemented by hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
        struct kiocb            *iocb,
        struct iov_iter         *from)
{
        struct file             *file = iocb->ki_filp;
        struct address_space    *mapping = file->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 ret = 0;
        int                     unaligned_io = 0;
        int                     iolock;
        size_t                  count = iov_iter_count(from);
        loff_t                  end;
        struct iov_iter         data;
        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
                                        mp->m_rtdev_targp : mp->m_ddev_targp;

        /* DIO must be aligned to device logical sector size */
        if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
                return -EINVAL;

        /* "unaligned" here means not aligned to a filesystem block */
        if ((iocb->ki_pos & mp->m_blockmask) ||
            ((iocb->ki_pos + count) & mp->m_blockmask))
                unaligned_io = 1;

        /*
         * We don't need to take an exclusive lock unless the page cache needs
         * to be invalidated or unaligned IO is being executed. We don't need
         * to consider the EOF extension case here because
         * xfs_file_aio_write_checks() will relock the inode as necessary for
         * EOF zeroing cases and fill out the new inode size as appropriate.
         */
        if (unaligned_io || mapping->nrpages)
                iolock = XFS_IOLOCK_EXCL;
        else
                iolock = XFS_IOLOCK_SHARED;
        xfs_rw_ilock(ip, iolock);

        /*
         * Recheck if there are cached pages that need invalidating after we
         * got the iolock, to protect against other threads adding new pages
         * while we were waiting for the iolock.
         */
        if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
                xfs_rw_iunlock(ip, iolock);
                iolock = XFS_IOLOCK_EXCL;
                xfs_rw_ilock(ip, iolock);
        }

        ret = xfs_file_aio_write_checks(iocb, from, &iolock);
        if (ret)
                goto out;
        count = iov_iter_count(from);
        end = iocb->ki_pos + count - 1;

        /*
         * See xfs_file_dio_aio_read() for why we do a full-file flush here.
         */
        if (mapping->nrpages) {
                ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
                if (ret)
                        goto out;
                /*
                 * Invalidate whole pages. This can return an error if we fail
                 * to invalidate a page, but this should never happen on XFS.
                 * Warn if it does fail.
                 */
                ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
                WARN_ON_ONCE(ret);
                ret = 0;
        }

        /*
         * If we are doing unaligned IO, wait for all other IO to drain;
         * otherwise demote the lock if we had to flush cached pages.
         */
        if (unaligned_io)
                inode_dio_wait(inode);
        else if (iolock == XFS_IOLOCK_EXCL) {
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
                iolock = XFS_IOLOCK_SHARED;
        }

        trace_xfs_file_direct_write(ip, count, iocb->ki_pos);

        /* If this is a block-aligned directio CoW, remap immediately. */
        if (xfs_is_reflink_inode(ip) && !unaligned_io) {
                ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
                if (ret)
                        goto out;
        }

        data = *from;
        ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                        xfs_get_blocks_direct, xfs_end_io_direct_write,
                        NULL, DIO_ASYNC_EXTEND);

        /* see generic_file_direct_write() for why this is necessary */
        if (mapping->nrpages) {
                invalidate_inode_pages2_range(mapping,
                                              iocb->ki_pos >> PAGE_SHIFT,
                                              end >> PAGE_SHIFT);
        }
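The xfs_file_dio_aio_write() listing above stops after the post-IO invalidation and never reaches the out: label that the earlier goto statements target. The original tail is not shown, so the following is only a hedged reconstruction of the usual exit path in this kernel era: advance the iocb on success, drop the iolock, and return.

        /* Hedged reconstruction of the missing tail, not the original listing. */
        if (ret > 0) {
                iocb->ki_pos += ret;
                iov_iter_advance(from, ret);
        }
out:
        xfs_rw_iunlock(ip, iolock);
        return ret;
}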
static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
                              const struct iovec *iov, loff_t offset,
                              unsigned long nr_segs)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        struct address_space *mapping = inode->i_mapping;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_holder gh;
        int rv;

        /*
         * Deferred lock, even if it's a write, since we do no allocation on
         * this path. All we need to change is atime, and this lock mode
         * ensures that other nodes have flushed their buffered read caches
         * (i.e. their page cache entries for this inode). We do not,
         * unfortunately, have the option of only flushing a range like the
         * VFS does.
         */
        gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
        rv = gfs2_glock_nq(&gh);
        if (rv)
                return rv;
        rv = gfs2_ok_for_dio(ip, rw, offset);
        if (rv != 1)
                goto out; /* dio not valid, fall back to buffered i/o */

        /*
         * Since we are holding a deferred (CW) lock at this point, you might
         * be wondering why this is ever needed. There is, however, a case
         * where we've granted a deferred local lock against a cached
         * exclusive glock. That is ok provided all granted local locks are
         * deferred, but it also means that it is possible to encounter pages
         * which are cached and possibly also mapped. So here we check for
         * that and sort them out ahead of the dio. The glock state machine
         * will take care of everything else.
         *
         * If in fact the cached glock state (gl->gl_state) is deferred (CW)
         * in the first place, mapping->nrpages will always be zero.
         */
        if (mapping->nrpages) {
                /* round the start down and the end up to page boundaries */
                loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
                loff_t len = iov_length(iov, nr_segs);
                loff_t end = PAGE_ALIGN(offset + len) - 1;

                rv = 0;
                if (len == 0)
                        goto out;
                if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
                        unmap_shared_mapping_range(ip->i_inode.i_mapping,
                                                   offset, len);
                rv = filemap_write_and_wait_range(mapping, lstart, end);
                if (rv)
                        goto out;
                if (rw == WRITE)
                        truncate_inode_pages_range(mapping, lstart, end);
        }

        rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
                                  offset, nr_segs, gfs2_get_block_direct,
                                  NULL, NULL, 0);
out:
        gfs2_glock_dq(&gh);
        gfs2_holder_uninit(&gh);
        return rv;
}
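To make the flush-range rounding in the mapping->nrpages branch concrete, here is a standalone user-space illustration with hypothetical offset and length values; EXAMPLE_PAGE_SIZE stands in for PAGE_CACHE_SIZE. lstart rounds the I/O start down to a page boundary, and end rounds the I/O end up to a page boundary minus one byte so the range is inclusive.

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096UL        /* stand-in for PAGE_CACHE_SIZE */

int main(void)
{
        unsigned long offset = 5000;    /* hypothetical DIO start, mid-page */
        unsigned long len = 3000;       /* hypothetical DIO length */
        /* round the start down to the containing page boundary */
        unsigned long lstart = offset & ~(EXAMPLE_PAGE_SIZE - 1);
        /* round the end up to a page boundary, then make it inclusive */
        unsigned long end = ((offset + len + EXAMPLE_PAGE_SIZE - 1) &
                             ~(EXAMPLE_PAGE_SIZE - 1)) - 1;

        printf("flush range: [%lu, %lu]\n", lstart, end);  /* [4096, 8191] */
        return 0;
}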
STATIC ssize_t
xfs_file_dio_aio_read(
        struct kiocb            *iocb,
        struct iov_iter         *to)
{
        struct address_space    *mapping = iocb->ki_filp->f_mapping;
        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
        loff_t                  isize = i_size_read(inode);
        size_t                  count = iov_iter_count(to);
        struct iov_iter         data;
        struct xfs_buftarg      *target;
        ssize_t                 ret = 0;

        trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

        if (!count)
                return 0; /* skip atime */

        if (XFS_IS_REALTIME_INODE(ip))
                target = ip->i_mount->m_rtdev_targp;
        else
                target = ip->i_mount->m_ddev_targp;

        /* DIO must be aligned to device logical sector size */
        if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
                if (iocb->ki_pos == isize)
                        return 0;
                return -EINVAL;
        }

        file_accessed(iocb->ki_filp);

        /*
         * Locking is a bit tricky here. If we take an exclusive lock for
         * direct IO, we effectively serialise all new concurrent read IO to
         * this file and block it behind IO that is currently in progress,
         * because IO in progress holds the IO lock shared. We only need to
         * hold the lock exclusive to blow away the page cache, so only take
         * the lock exclusively if the page cache needs invalidation. This
         * allows the normal direct IO case of no page cache pages to proceed
         * concurrently without serialisation.
         */
        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        if (mapping->nrpages) {
                xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

                /*
                 * The generic dio code only flushes the range of the
                 * particular I/O. Because we take an exclusive lock here,
                 * this whole sequence is considerably more expensive for us.
                 * This has a noticeable performance impact for any file with
                 * cached pages, even when outside of the range of the
                 * particular I/O.
                 *
                 * Hence, amortize the cost of the lock against a full file
                 * flush and reduce the chances of repeated iolock cycles
                 * going forward.
                 */
                if (mapping->nrpages) {
                        ret = filemap_write_and_wait(mapping);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
                        }

                        /*
                         * Invalidate whole pages. This can return an error
                         * if we fail to invalidate a page, but this should
                         * never happen on XFS. Warn if it does fail.
                         */
                        ret = invalidate_inode_pages2(mapping);
                        WARN_ON_ONCE(ret);
                        ret = 0;
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }

        data = *to;
        ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
                        xfs_get_blocks_direct, NULL, NULL, 0);
        if (ret >= 0) {
                iocb->ki_pos += ret;
                iov_iter_advance(to, ret);
        }
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);

        return ret;
}
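The read path's locking dance above (take the iolock shared, switch to exclusive only when cached pages must be invalidated, recheck the condition after the reacquire, then demote back to shared for the actual I/O) is a generic pattern. The sketch below shows the same sequence on a plain rw_semaphore; it is purely illustrative and does not use the XFS iolock wrappers, and the helper name is hypothetical.

#include <linux/rwsem.h>
#include <linux/types.h>

/* Illustrative only: mirrors the shared -> exclusive -> demote sequence above. */
static void dio_read_lock_pattern(struct rw_semaphore *sem, bool *need_invalidate)
{
        down_read(sem);                         /* optimistic shared lock */
        if (*need_invalidate) {
                up_read(sem);                   /* rwsems cannot upgrade in place */
                down_write(sem);                /* exclusive for the invalidation */
                if (*need_invalidate) {         /* recheck: state may have changed
                                                   while the lock was dropped */
                        /* flush and invalidate the page cache here */
                        *need_invalidate = false;
                }
                downgrade_write(sem);           /* back to shared for the IO */
        }
        /* issue the direct IO here under the shared lock */
        up_read(sem);
}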