/*
 * xfs_file_dio_aio_write - issue a direct IO write
 *
 * Direct IO writes normally run under IOLOCK_SHARED so that they can proceed
 * in parallel with reads and with other direct IO writes.  IOLOCK_EXCL is
 * taken instead when:
 *
 *  - the mapping has cached pages; these are flushed and invalidated from the
 *    page containing @pos onwards before the IO is issued, after which the
 *    lock is demoted again; or
 *  - the IO is not aligned to filesystem blocks.  The direct IO layer then
 *    has to do sub-block zeroing, which must be serialised against other
 *    direct IO to the same block.  inode_dio_wait() is the big hammer used to
 *    wait for outstanding IO (and hence unwritten extent conversion) to
 *    complete before the overlapping block is mapped.
 *
 * EOF extension is not considered when picking the lock mode:
 * xfs_file_aio_write_checks() is handed &lock_mode and relocks the inode as
 * necessary for the EOF zeroing cases.
 *
 * The iolock is both taken and released inside this function.  Errors are
 * indicated by negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos,
	size_t			ocount)
{
	struct file		*filp = iocb->ki_filp;
	struct address_space	*cache = filp->f_mapping;
	struct inode		*vfs_inode = cache->host;
	struct xfs_inode	*xip = XFS_I(vfs_inode);
	struct xfs_mount	*mount = xip->i_mount;
	struct xfs_buftarg	*btp;
	size_t			len = ocount;
	ssize_t			status = 0;
	int			sub_block;
	int			lock_mode;

	btp = XFS_IS_REALTIME_INODE(xip) ? mount->m_rtdev_targp :
					   mount->m_ddev_targp;

	/* Reject IO that is not aligned to the buffer target's sector mask. */
	if ((pos & btp->bt_smask) || (len & btp->bt_smask))
		return -XFS_ERROR(EINVAL);

	/* Either end of the IO falling inside a filesystem block counts. */
	sub_block = (pos & mount->m_blockmask) ||
		    ((pos + len) & mount->m_blockmask);

	/*
	 * Exclusive is only needed when the page cache has to be invalidated
	 * or when sub-block IO has to be serialised.
	 */
	lock_mode = (sub_block || cache->nrpages) ? XFS_IOLOCK_EXCL :
						    XFS_IOLOCK_SHARED;
	xfs_rw_ilock(xip, lock_mode);

	/*
	 * Other threads may have added pages while we slept on the iolock;
	 * if so, trade the shared lock for an exclusive one.
	 */
	if (cache->nrpages && lock_mode == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(xip, lock_mode);
		lock_mode = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(xip, lock_mode);
	}

	status = xfs_file_aio_write_checks(filp, &pos, &len, &lock_mode);
	if (status)
		goto out_unlock;

	if (cache->nrpages) {
		status = -xfs_flushinval_pages(xip, (pos & PAGE_CACHE_MASK), -1,
					       FI_REMAPF_LOCKED);
		if (status)
			goto out_unlock;
	}

	/*
	 * Sub-block IO keeps the exclusive lock and drains all other IO;
	 * anything else only needed exclusive for the flush above.
	 */
	if (sub_block) {
		inode_dio_wait(vfs_inode);
	} else if (lock_mode == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(xip, XFS_IOLOCK_EXCL);
		lock_mode = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(xip, len, iocb->ki_pos, 0);
	status = generic_file_direct_write(iocb, iovp, &nr_segs, pos,
					   &iocb->ki_pos, len, ocount);

out_unlock:
	xfs_rw_iunlock(xip, lock_mode);

	/* XFS never falls back to buffered IO when direct IO fails. */
	ASSERT(status < 0 || status == len);
	return status;
}
/*
 * xfs_file_dio_aio_write - prepare for and issue a direct IO write
 *
 * The lock mode chosen here is reported back through @iolock and the lock is
 * still held on return; this function never unlocks it.  When the sector
 * alignment check fails no lock has been taken and *@iolock is left at 0.
 *
 * IOLOCK_SHARED is the normal mode, allowing direct IO writes to run in
 * parallel with reads and other direct IO writes.  IOLOCK_EXCL is chosen when
 * there are cached pages, when @pos lies beyond the current inode size, or
 * when the IO is not aligned to filesystem blocks.  In the unaligned case the
 * direct IO layer has to do sub-block zeroing, which must be serialised
 * against other direct IO to the same block, and xfs_ioend_wait() is the big
 * hammer used to wait for outstanding IO so that unwritten extent conversion
 * has completed before the overlapping block is mapped.
 *
 * XFS_ILOCK_EXCL is taken together with the iolock.
 * NOTE(review): nothing in this function drops the ILOCK again; presumably
 * xfs_file_aio_write_checks() does - confirm against that function.
 *
 * Errors are indicated by negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos,
	size_t			ocount,
	int			*iolock)
{
	struct file		*filp = iocb->ki_filp;
	struct address_space	*cache = filp->f_mapping;
	struct inode		*vfs_inode = cache->host;
	struct xfs_inode	*xip = XFS_I(vfs_inode);
	struct xfs_mount	*mount = xip->i_mount;
	struct xfs_buftarg	*btp;
	size_t			len = ocount;
	ssize_t			status = 0;
	int			sub_block;

	btp = XFS_IS_REALTIME_INODE(xip) ? mount->m_rtdev_targp :
					   mount->m_ddev_targp;

	/* Nothing is locked yet; tell the caller so on the early exit. */
	*iolock = 0;
	if ((pos & btp->bt_smask) || (len & btp->bt_smask))
		return -XFS_ERROR(EINVAL);

	/* Either end of the IO falling inside a filesystem block counts. */
	sub_block = (pos & mount->m_blockmask) ||
		    ((pos + len) & mount->m_blockmask);

	*iolock = (sub_block || cache->nrpages || pos > xip->i_size) ?
			XFS_IOLOCK_EXCL : XFS_IOLOCK_SHARED;
	xfs_rw_ilock(xip, XFS_ILOCK_EXCL | *iolock);

	status = xfs_file_aio_write_checks(filp, &pos, &len, iolock);
	if (status)
		return status;

	if (cache->nrpages) {
		/* Cached pages are only expected under the exclusive lock. */
		WARN_ON(*iolock != XFS_IOLOCK_EXCL);
		status = -xfs_flushinval_pages(xip, (pos & PAGE_CACHE_MASK), -1,
					       FI_REMAPF_LOCKED);
		if (status)
			return status;
	}

	/*
	 * Sub-block IO keeps the exclusive lock and waits for all other IO
	 * to drain; otherwise demote the lock if it was taken exclusive.
	 */
	if (sub_block) {
		xfs_ioend_wait(xip);
	} else if (*iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(xip, XFS_IOLOCK_EXCL);
		*iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(xip, len, iocb->ki_pos, 0);
	status = generic_file_direct_write(iocb, iovp, &nr_segs, pos,
					   &iocb->ki_pos, len, ocount);

	/* XFS never falls back to buffered IO when direct IO fails. */
	ASSERT(status < 0 || status == len);
	return status;
}
/*
 * xfs_file_aio_read - read from a file, buffered or direct
 *
 * Validates the iovec segments, applies the direct IO sector alignment
 * rules, clamps the request against s_maxbytes and then hands the IO to
 * generic_file_aio_read() with the iolock held shared.
 *
 * A misaligned direct IO read that starts exactly at the inode size returns
 * 0; any other misaligned direct IO read fails with -EINVAL.  Returns -EIO if
 * the filesystem has been shut down.
 */
STATIC ssize_t
xfs_file_aio_read(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*filp = iocb->ki_filp;
	struct inode		*vfs_inode = filp->f_mapping->host;
	struct xfs_inode	*xip = XFS_I(vfs_inode);
	struct xfs_mount	*mount = xip->i_mount;
	size_t			len = 0;
	ssize_t			status = 0;
	int			flags = 0;
	xfs_fsize_t		room;

	XFS_STATS_INC(xs_read_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(filp->f_flags & O_DIRECT))
		flags |= IO_ISDIRECT;
	if (filp->f_mode & FMODE_NOCMTIME)
		flags |= IO_INVIS;

	status = generic_segment_checks(iovp, &nr_segs, &len, VERIFY_WRITE);
	if (status < 0)
		return status;

	if (unlikely(flags & IO_ISDIRECT)) {
		xfs_buftarg_t	*btp = XFS_IS_REALTIME_INODE(xip) ?
					mount->m_rtdev_targp :
					mount->m_ddev_targp;

		if ((iocb->ki_pos & btp->bt_smask) || (len & btp->bt_smask)) {
			/* A misaligned read starting at EOF just returns 0. */
			if (iocb->ki_pos == i_size_read(vfs_inode))
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}

	/* Clamp the request to what fits below s_maxbytes. */
	room = mount->m_super->s_maxbytes - iocb->ki_pos;
	if (room <= 0 || len == 0)
		return 0;
	if (room < len)
		len = room;

	if (XFS_FORCED_SHUTDOWN(mount))
		return -EIO;

	/*
	 * Taking the iolock exclusive for every direct IO read would
	 * serialise all new concurrent reads behind IO that is already in
	 * progress, because in-progress IO holds the lock shared.  Exclusive
	 * is only needed to blow away the page cache, so start shared and
	 * upgrade only when there are cached pages to invalidate.  The common
	 * direct IO case of an empty page cache then runs without
	 * serialisation.
	 */
	xfs_rw_ilock(xip, XFS_IOLOCK_SHARED);
	if ((flags & IO_ISDIRECT) && vfs_inode->i_mapping->nrpages) {
		xfs_rw_iunlock(xip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(xip, XFS_IOLOCK_EXCL);

		/* Recheck: the lock was dropped while upgrading. */
		if (vfs_inode->i_mapping->nrpages) {
			status = -xfs_flushinval_pages(xip,
					(iocb->ki_pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (status) {
				xfs_rw_iunlock(xip, XFS_IOLOCK_EXCL);
				return status;
			}
		}
		xfs_rw_ilock_demote(xip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(xip, len, iocb->ki_pos, flags);

	status = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
	if (status > 0)
		XFS_STATS_ADD(xs_read_bytes, status);

	xfs_rw_iunlock(xip, XFS_IOLOCK_SHARED);
	return status;
}
/*
 * xfs_file_dio_aio_write - issue a direct IO write
 *
 * The iolock is taken shared unless the mapping has cached pages or the IO
 * is not aligned to filesystem blocks, in which case it is taken exclusive.
 * Shared mode lets direct IO writes run in parallel with reads and other
 * direct IO writes.
 *
 * Cached pages are flushed and invalidated from the page containing @pos
 * onwards before the IO is issued.  Block-unaligned IO needs sub-block
 * zeroing in the direct IO layer, which must be serialised against other
 * direct IO to the same block; inode_dio_wait() is the big hammer used to
 * wait for outstanding IO (and hence unwritten extent conversion) to finish
 * before the overlapping block is mapped.
 *
 * The iolock is both taken and released inside this function.  Errors are
 * indicated by negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos,
	size_t			ocount)
{
	struct file		*filp = iocb->ki_filp;
	struct address_space	*cache = filp->f_mapping;
	struct inode		*vfs_inode = cache->host;
	struct xfs_inode	*xip = XFS_I(vfs_inode);
	struct xfs_mount	*mount = xip->i_mount;
	struct xfs_buftarg	*btp;
	size_t			len = ocount;
	ssize_t			error = 0;
	int			sub_block;
	int			lock_mode;

	btp = XFS_IS_REALTIME_INODE(xip) ? mount->m_rtdev_targp :
					   mount->m_ddev_targp;

	/* Reject IO that is not aligned to the buffer target's sector mask. */
	if ((pos & btp->bt_smask) || (len & btp->bt_smask))
		return -XFS_ERROR(EINVAL);

	/* Either end of the IO falling inside a filesystem block counts. */
	sub_block = (pos & mount->m_blockmask) ||
		    ((pos + len) & mount->m_blockmask);

	lock_mode = (sub_block || cache->nrpages) ? XFS_IOLOCK_EXCL :
						    XFS_IOLOCK_SHARED;
	xfs_rw_ilock(xip, lock_mode);

	/*
	 * Pages may have been added while we waited for the iolock; if we
	 * only hold it shared, cycle it to exclusive.
	 */
	if (cache->nrpages && lock_mode == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(xip, lock_mode);
		lock_mode = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(xip, lock_mode);
	}

	/* The flush is only attempted once the write checks have passed. */
	error = xfs_file_aio_write_checks(filp, &pos, &len, &lock_mode);
	if (!error && cache->nrpages)
		error = -xfs_flushinval_pages(xip, (pos & PAGE_CACHE_MASK), -1,
					      FI_REMAPF_LOCKED);
	if (error)
		goto out_unlock;

	/*
	 * Sub-block IO keeps the exclusive lock and drains all other IO;
	 * anything else only needed exclusive for the flush above.
	 */
	if (sub_block) {
		inode_dio_wait(vfs_inode);
	} else if (lock_mode == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(xip, XFS_IOLOCK_EXCL);
		lock_mode = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(xip, len, iocb->ki_pos, 0);
	error = generic_file_direct_write(iocb, iovp, &nr_segs, pos,
					  &iocb->ki_pos, len, ocount);

out_unlock:
	xfs_rw_iunlock(xip, lock_mode);

	/* A direct write either fails or transfers the whole request. */
	ASSERT(error < 0 || error == len);
	return error;
}
/*
 * xfs_file_aio_read - read from a file, buffered or direct
 *
 * Sums the iovec segment lengths (rejecting negative or wrapping totals),
 * applies the direct IO sector alignment rules, clamps the request against
 * XFS_MAXIOFFSET() and then hands the IO to generic_file_aio_read() with the
 * iolock held shared.
 *
 * For direct IO the iolock is first taken exclusive so that any cached pages
 * from the page containing the file position onwards can be flushed and
 * invalidated, and is then demoted to shared for the IO itself.
 *
 * A misaligned direct IO read that starts exactly at ip->i_size returns 0;
 * any other misaligned direct IO read fails with -EINVAL.  Returns -EIO if
 * the filesystem has been shut down.
 */
STATIC ssize_t
xfs_file_aio_read(
	struct kiocb		*iocb,
	const struct iovec	*iovp,
	unsigned long		nr_segs,
	loff_t			pos)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = 0;
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	unsigned long		seg;

	XFS_STATS_INC(xs_read_calls);

	BUG_ON(iocb->ki_pos != pos);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= IO_INVIS;

	/* START copy & waste from filemap.c */
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iovp[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 *
		 * XFS_ERROR() is given the positive errno and the result is
		 * negated, the same way as every other error return in this
		 * function.
		 */
		size += iv->iov_len;
		if (unlikely((ssize_t)(size|iv->iov_len) < 0))
			return -XFS_ERROR(EINVAL);
	}
	/* END copy & waste from filemap.c */

	if (unlikely(ioflags & IO_ISDIRECT)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;

		if ((iocb->ki_pos & target->bt_smask) ||
		    (size & target->bt_smask)) {
			/* A misaligned read starting at EOF just returns 0. */
			if (iocb->ki_pos == ip->i_size)
				return 0;
			return -XFS_ERROR(EINVAL);
		}
	}

	/* Clamp the request to what fits below the maximum file offset. */
	n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (unlikely(ioflags & IO_ISDIRECT)) {
		/* Exclusive while the page cache is flushed and invalidated. */
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		if (inode->i_mapping->nrpages) {
			ret = -xfs_flushinval_pages(ip,
					(iocb->ki_pos & PAGE_CACHE_MASK),
					-1, FI_REMAPF_LOCKED);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	} else
		xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);

	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);

	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	/* Both paths above leave the iolock held shared. */
	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the tricky to
 * follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer. To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 *
 * NOTE(review): iolock is a local variable here, not a parameter, and the
 * tail of this definition (the "out" label, the unlock and the return) is not
 * visible in this view, so the statement above about locks held on return
 * should be confirmed against the complete function.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	loff_t			end;
	struct iov_iter		data;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if (!IS_DAX(inode) &&
	    ((iocb->ki_pos | count) & target->bt_logical_sectormask))
		return -EINVAL;

	/* "unaligned" here means not aligned to a filesystem block */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask))
		unaligned_io = 1;

	/*
	 * We don't need to take an exclusive lock unless there page cache needs
	 * to be invalidated or unaligned IO is being executed. We don't need to
	 * consider the EOF extension case here because
	 * xfs_file_aio_write_checks() will relock the inode as necessary for
	 * EOF zeroing cases and fill out the new inode size as appropriate.
	 */
	if (unaligned_io || mapping->nrpages)
		iolock = XFS_IOLOCK_EXCL;
	else
		iolock = XFS_IOLOCK_SHARED;
	xfs_rw_ilock(ip, iolock);

	/*
	 * Recheck if there are cached pages that need invalidate after we got
	 * the iolock to protect against other threads adding new pages while
	 * we were waiting for the iolock.
	 */
	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(ip, iolock);
		iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/*
	 * Re-read the byte count after the write checks and record the last
	 * byte offset of the IO for the post-write invalidation below.
	 */
	count = iov_iter_count(from);
	end = iocb->ki_pos + count - 1;

	/*
	 * See xfs_file_read_iter() for why we do a full-file flush here.
	 */
	if (mapping->nrpages) {
		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
		if (ret)
			goto out;
		/*
		 * Invalidate whole pages. This can return an error if we fail
		 * to invalidate a page, but this should never happen on XFS.
		 * Warn if it does fail.
		 */
		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to flush cached pages
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);

	/* The IO is issued on a copy of the iterator; @from is not passed on. */
	data = *from;
	ret = mapping->a_ops->direct_IO(iocb, &data);

	/* see generic_file_direct_write() for why this is necessary */
	if (mapping->nrpages) {
		invalidate_inode_pages2_range(mapping,
					      iocb->ki_pos >> PAGE_SHIFT,
					      end >> PAGE_SHIFT);
	}
/*
 * xfs_file_read_iter - read from a file, buffered or direct
 *
 * Applies the direct IO alignment rules (skipped for DAX inodes), clamps the
 * request against s_maxbytes and then hands the IO to
 * generic_file_read_iter() with the iolock held shared.
 *
 * A misaligned direct IO read that starts exactly at the inode size returns
 * 0; any other misaligned direct IO read fails with -EINVAL.  Returns -EIO if
 * the filesystem has been shut down.
 */
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*filp = iocb->ki_filp;
	struct inode		*vfs_inode = filp->f_mapping->host;
	struct xfs_inode	*xip = XFS_I(vfs_inode);
	struct xfs_mount	*mount = xip->i_mount;
	size_t			len = iov_iter_count(to);
	ssize_t			status = 0;
	int			flags = 0;
	xfs_fsize_t		room;
	loff_t			pos = iocb->ki_pos;

	XFS_STATS_INC(mount, xs_read_calls);

	if (unlikely(iocb->ki_flags & IOCB_DIRECT))
		flags |= XFS_IO_ISDIRECT;
	if (filp->f_mode & FMODE_NOCMTIME)
		flags |= XFS_IO_INVIS;

	if ((flags & XFS_IO_ISDIRECT) && !IS_DAX(vfs_inode)) {
		xfs_buftarg_t	*btp = XFS_IS_REALTIME_INODE(xip) ?
					mount->m_rtdev_targp :
					mount->m_ddev_targp;

		/* Direct IO has to be device logical sector aligned. */
		if ((pos | len) & btp->bt_logical_sectormask) {
			if (pos == i_size_read(vfs_inode))
				return 0;
			return -EINVAL;
		}
	}

	/* Clamp the request to what fits below s_maxbytes. */
	room = mount->m_super->s_maxbytes - pos;
	if (room <= 0 || len == 0)
		return 0;
	if (room < len)
		len = room;

	if (XFS_FORCED_SHUTDOWN(mount))
		return -EIO;

	/*
	 * Taking the iolock exclusive for every direct IO read would
	 * serialise all new concurrent reads behind IO that is already in
	 * progress, because in-progress IO holds the lock shared.  Exclusive
	 * is only needed to blow away the page cache, so start shared and
	 * upgrade only when there are cached pages to invalidate.  The common
	 * direct IO case of an empty page cache then runs without
	 * serialisation.
	 */
	xfs_rw_ilock(xip, XFS_IOLOCK_SHARED);
	if ((flags & XFS_IO_ISDIRECT) && vfs_inode->i_mapping->nrpages) {
		xfs_rw_iunlock(xip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(xip, XFS_IOLOCK_EXCL);

		/*
		 * The generic dio code only flushes the range of the IO
		 * itself.  Since we pay for an exclusive lock cycle here, and
		 * that cost is noticeable for any file with cached pages even
		 * outside the IO range, amortise it over a full-file flush so
		 * that later IO is less likely to need another iolock cycle.
		 */
		if (vfs_inode->i_mapping->nrpages) {
			status = filemap_write_and_wait(VFS_I(xip)->i_mapping);
			if (status) {
				xfs_rw_iunlock(xip, XFS_IOLOCK_EXCL);
				return status;
			}

			/*
			 * Whole pages are invalidated.  A failure to
			 * invalidate a page is reported as an error, but
			 * that should never happen on XFS, so just warn.
			 */
			status = invalidate_inode_pages2(VFS_I(xip)->i_mapping);
			WARN_ON_ONCE(status);
			status = 0;
		}
		xfs_rw_ilock_demote(xip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(xip, len, pos, flags);

	status = generic_file_read_iter(iocb, to);
	if (status > 0)
		XFS_STATS_ADD(mount, xs_read_bytes, status);

	xfs_rw_iunlock(xip, XFS_IOLOCK_SHARED);
	return status;
}
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the tricky to
 * follow locking changes and looping.
 *
 * If there are cached pages we need IOLOCK_EXCL until the cached pages are
 * flushed out and invalidated.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer. To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * The iolock is taken and released inside this function.  Errors are
 * indicated by negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	loff_t			pos = iocb->ki_pos;
	struct xfs_buftarg	*target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/* "unaligned" here means not aligned to a filesystem block */
	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
		unaligned_io = 1;

	/*
	 * We don't need to take an exclusive lock unless there page cache needs
	 * to be invalidated or unaligned IO is being executed. We don't need to
	 * consider the EOF extension case here because
	 * xfs_file_aio_write_checks() will relock the inode as necessary for
	 * EOF zeroing cases and fill out the new inode size as appropriate.
	 */
	if (unaligned_io || mapping->nrpages)
		iolock = XFS_IOLOCK_EXCL;
	else
		iolock = XFS_IOLOCK_SHARED;
	xfs_rw_ilock(ip, iolock);

	/*
	 * Recheck if there are cached pages that need invalidate after we got
	 * the iolock to protect against other threads adding new pages while
	 * we were waiting for the iolock.
	 */
	if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
		xfs_rw_iunlock(ip, iolock);
		iolock = XFS_IOLOCK_EXCL;
		xfs_rw_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
	if (ret)
		goto out;
	/* The write checks may have shortened the IO; trim the iterator too. */
	iov_iter_truncate(from, count);

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						    pos, -1);
		if (ret)
			goto out;
		/*
		 * Invalidate whole pages. pos is only guaranteed to be sector
		 * aligned, and a truncate-style removal of the byte range
		 * would zero the part of a cached page that straddles pos,
		 * leaving stale zeroes in the page cache. This can return an
		 * error if we fail to invalidate a page, but this should never
		 * happen on XFS. Warn if it does fail.
		 */
		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
						pos >> PAGE_CACHE_SHIFT, -1);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to flush cached pages
	 */
	if (unaligned_io)
		inode_dio_wait(inode);
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
	ret = generic_file_direct_write(iocb, from, pos);

out:
	xfs_rw_iunlock(ip, iolock);

	/* No fallback to buffered IO on errors for XFS. */
	ASSERT(ret < 0 || ret == count);
	return ret;
}
/*
 * xfs_file_read_iter - read from a file, buffered or direct
 *
 * Applies the direct IO alignment rules, clamps the request against
 * s_maxbytes and then hands the IO to generic_file_read_iter() with the
 * iolock held shared.
 *
 * A misaligned direct IO read that starts exactly at the inode size returns
 * 0; any other misaligned direct IO read fails with -EINVAL.  Returns -EIO if
 * the filesystem has been shut down.
 */
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			size = iov_iter_count(to);
	ssize_t			ret = 0;
	int			ioflags = 0;
	xfs_fsize_t		n;
	loff_t			pos = iocb->ki_pos;

	XFS_STATS_INC(xs_read_calls);

	if (unlikely(file->f_flags & O_DIRECT))
		ioflags |= XFS_IO_ISDIRECT;
	if (file->f_mode & FMODE_NOCMTIME)
		ioflags |= XFS_IO_INVIS;

	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
		xfs_buftarg_t	*target =
			XFS_IS_REALTIME_INODE(ip) ?
				mp->m_rtdev_targp : mp->m_ddev_targp;
		/* DIO must be aligned to device logical sector size */
		if ((pos | size) & target->bt_logical_sectormask) {
			if (pos == i_size_read(inode))
				return 0;
			return -EINVAL;
		}
	}

	/* Clamp the request to what fits below s_maxbytes. */
	n = mp->m_super->s_maxbytes - pos;
	if (n <= 0 || size == 0)
		return 0;

	if (n < size)
		size = n;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * Locking is a bit tricky here. If we take an exclusive lock
	 * for direct IO, we effectively serialise all new concurrent
	 * read IO to this file and block it behind IO that is currently in
	 * progress because IO in progress holds the IO lock shared. We only
	 * need to hold the lock exclusive to blow away the page cache, so
	 * only take lock exclusively if the page cache needs invalidation.
	 * This allows the normal direct IO case of no page cache pages to
	 * proceed concurrently without serialisation.
	 */
	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);

		/* Recheck: the lock was dropped while upgrading. */
		if (inode->i_mapping->nrpages) {
			ret = filemap_write_and_wait_range(
							VFS_I(ip)->i_mapping,
							pos, -1);
			if (ret) {
				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
				return ret;
			}

			/*
			 * Invalidate whole pages. pos is only guaranteed to
			 * be sector aligned, and a truncate-style removal of
			 * the byte range would zero the part of a cached page
			 * that straddles pos, so that a later buffered read
			 * could see zeroes instead of the data on disk. This
			 * can return an error if we fail to invalidate a page,
			 * but this should never happen on XFS. Warn if it does
			 * fail.
			 */
			ret = invalidate_inode_pages2_range(
						VFS_I(ip)->i_mapping,
						pos >> PAGE_CACHE_SHIFT, -1);
			WARN_ON_ONCE(ret);
			ret = 0;
		}
		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
	}

	trace_xfs_file_read(ip, size, pos, ioflags);

	ret = generic_file_read_iter(iocb, to);
	if (ret > 0)
		XFS_STATS_ADD(xs_read_bytes, ret);

	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
	return ret;
}
/*
 * xfs_file_dio_aio_read - issue a direct IO read
 *
 * Checks that the IO is aligned to the device logical sector size, flushes
 * and invalidates the page cache if it holds any pages, and then issues the
 * read through __blockdev_direct_IO() with the iolock held shared.  On
 * success the file position in @iocb and the iterator @to are advanced by
 * the number of bytes returned.
 *
 * A zero-length read returns 0 before file_accessed() is called.  A
 * misaligned read that starts exactly at the inode size returns 0; any other
 * misaligned read fails with -EINVAL.
 */
STATIC ssize_t
xfs_file_dio_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct address_space	*cache = iocb->ki_filp->f_mapping;
	struct inode		*vfs_inode = cache->host;
	struct xfs_inode	*xip = XFS_I(vfs_inode);
	loff_t			eof = i_size_read(vfs_inode);
	size_t			len = iov_iter_count(to);
	struct iov_iter		iter_copy;
	struct xfs_buftarg	*btp;
	ssize_t			status = 0;

	trace_xfs_file_direct_read(xip, len, iocb->ki_pos);

	if (!len)
		return 0; /* skip atime */

	btp = XFS_IS_REALTIME_INODE(xip) ? xip->i_mount->m_rtdev_targp :
					   xip->i_mount->m_ddev_targp;

	/* Direct IO has to be device logical sector aligned. */
	if ((iocb->ki_pos | len) & btp->bt_logical_sectormask) {
		if (iocb->ki_pos == eof)
			return 0;
		return -EINVAL;
	}

	file_accessed(iocb->ki_filp);

	/*
	 * Taking the iolock exclusive for every direct IO read would
	 * serialise all new concurrent reads behind IO that is already in
	 * progress, because in-progress IO holds the lock shared.  Exclusive
	 * is only needed to blow away the page cache, so start shared and
	 * upgrade only when there are cached pages to invalidate.  The common
	 * direct IO case of an empty page cache then runs without
	 * serialisation.
	 */
	xfs_rw_ilock(xip, XFS_IOLOCK_SHARED);
	if (cache->nrpages) {
		xfs_rw_iunlock(xip, XFS_IOLOCK_SHARED);
		xfs_rw_ilock(xip, XFS_IOLOCK_EXCL);

		/*
		 * The generic dio code only flushes the range of the IO
		 * itself.  Since we pay for an exclusive lock cycle here, and
		 * that cost is noticeable for any file with cached pages even
		 * outside the IO range, amortise it over a full-file flush so
		 * that later IO is less likely to need another iolock cycle.
		 */
		if (cache->nrpages) {
			status = filemap_write_and_wait(cache);
			if (status) {
				xfs_rw_iunlock(xip, XFS_IOLOCK_EXCL);
				return status;
			}

			/*
			 * Whole pages are invalidated.  A failure to
			 * invalidate a page is reported as an error, but
			 * that should never happen on XFS, so just warn.
			 */
			status = invalidate_inode_pages2(cache);
			WARN_ON_ONCE(status);
			status = 0;
		}
		xfs_rw_ilock_demote(xip, XFS_IOLOCK_EXCL);
	}

	/* Issue the IO on a copy; @to is advanced only by what was read. */
	iter_copy = *to;
	status = __blockdev_direct_IO(iocb, vfs_inode, btp->bt_bdev,
				      &iter_copy, xfs_get_blocks_direct,
				      NULL, NULL, 0);
	if (status >= 0) {
		iocb->ki_pos += status;
		iov_iter_advance(to, status);
	}
	xfs_rw_iunlock(xip, XFS_IOLOCK_SHARED);

	return status;
}