/* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static int ext4_unaligned_aio(struct inode *inode, struct iov_iter *iter, loff_t pos) { struct super_block *sb = inode->i_sb; int blockmask = sb->s_blocksize - 1; size_t count = iov_iter_count(iter); loff_t final_size = pos + count; if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) return 1; return 0; }
static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct p9_fid *fid = iocb->ki_filp->private_data; int ret, err = 0; p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", iov_iter_count(to), iocb->ki_pos); ret = p9_client_read(fid, iocb->ki_pos, to, &err); if (!ret) return err; iocb->ki_pos += ret; return ret; }
/** * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); size_t bytes; /* Get the pages we're interested in */ pages = get_user_pages_unlocked(task, mm, pa, pages, vm_write, 0, process_pages); if (pages <= 0) return -EFAULT; bytes = pages * PAGE_SIZE - start_offset; if (bytes > len) bytes = len; rc = process_vm_rw_pages(process_pages, start_offset, bytes, iter, vm_write); len -= bytes; start_offset = 0; nr_pages -= pages; pa += pages * PAGE_SIZE; while (pages) put_page(process_pages[--pages]); } return rc; }
static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t retval; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct udf_inode_info *iinfo = UDF_I(inode); int err; mutex_lock(&inode->i_mutex); retval = generic_write_checks(iocb, from); if (retval <= 0) goto out; down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { loff_t end = iocb->ki_pos + iov_iter_count(from); if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { mutex_unlock(&inode->i_mutex); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } } else { iinfo->i_lenAlloc = max(end, inode->i_size); up_write(&iinfo->i_data_sem); } } else up_write(&iinfo->i_data_sem); retval = __generic_file_write_iter(iocb, from); out: mutex_unlock(&inode->i_mutex); if (retval > 0) { mark_inode_dirty(inode); err = generic_write_sync(file, iocb->ki_pos - retval, retval); if (err < 0) retval = err; } return retval; }
STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); XFS_STATS_INC(ip->i_mount, xs_write_calls); if (ocount == 0) return 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; if (IS_DAX(inode)) ret = xfs_file_dax_write(iocb, from); else if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink * CoW. In all other directio scenarios we do not * allow an operation to fall back to buffered mode. */ ret = xfs_file_dio_aio_write(iocb, from); if (ret == -EREMCHG) goto buffered; } else { buffered: ret = xfs_file_buffered_aio_write(iocb, from); } if (ret > 0) { XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ ret = generic_write_sync(iocb, ret); } return ret; }
static noinline ssize_t xfs_file_dax_write( struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int iolock = XFS_IOLOCK_EXCL; ssize_t ret, error = 0; size_t count; loff_t pos; if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; } else { xfs_ilock(ip, iolock); } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; pos = iocb->ki_pos; count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } out: xfs_iunlock(ip, iolock); if (error) return error; if (ret > 0) { XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ ret = generic_write_sync(iocb, ret); } return ret; }
/** * dax_do_io - Perform I/O to a DAX file * @iocb: The control block for this I/O * @inode: The file which the I/O is directed at * @iter: The addresses to do I/O from or to * @pos: The file offset where the I/O starts * @get_block: The filesystem method used to translate file offsets to blocks * @end_io: A filesystem callback for I/O completion * @flags: See below * * This function uses the same locking scheme as do_blockdev_direct_IO: * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the * caller for writes. For reads, we take and release the i_mutex ourselves. * If DIO_LOCKING is not set, the filesystem takes care of its own locking. * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O * is in progress. */ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, loff_t pos, get_block_t get_block, dio_iodone_t end_io, int flags) { struct buffer_head bh; ssize_t retval = -EINVAL; loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { inode_unlock(inode); goto out; } } /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_unlock(inode); if (end_io) { int err; err = end_io(iocb, pos, retval, bh.b_private); if (err) retval = err; } if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); out: return retval; }
static ssize_t hypfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; char *data = file->private_data; size_t available = strlen(data); loff_t pos = iocb->ki_pos; size_t count; if (pos < 0) return -EINVAL; if (pos >= available || !iov_iter_count(to)) return 0; count = copy_to_iter(data + pos, available - pos, to); if (!count) return -EFAULT; iocb->ki_pos = pos + count; file_accessed(file); return count; }
static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { size_t count; ssize_t ret; uio_seg_t seg = UIO_USERSPACE; #ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB struct file *file = kiocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *ip = mapping->host; int isblk = S_ISBLK(ip->i_mode); count = iov_iter_count(from); ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); if (ret) return (ret); #else /* * XXX - ideally this check should be in the same lock region with * write operations, so that there's no TOCTTOU race when doing * append and someone else grow the file. */ ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); count = ret; #endif if (from->type & ITER_KVEC) seg = UIO_SYSSPACE; if (from->type & ITER_BVEC) seg = UIO_BVEC; ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, count, seg, from->iov_offset); if (ret > 0) iov_iter_advance(from, ret); return (ret); }
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) { struct kiocb kiocb; ssize_t ret; if (!file->f_op->write_iter) return -EINVAL; init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); if (ret > 0) *ppos = kiocb.ki_pos; return ret; }
ssize_t nfs_file_read(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, iocb->ki_pos); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); if (result > 0) nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); } return result; }
STATIC ssize_t xfs_file_buffered_aio_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); ssize_t ret; trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) return -EAGAIN; } else { xfs_ilock(ip, XFS_IOLOCK_SHARED); } ret = generic_file_read_iter(iocb, to); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; }
static noinline ssize_t xfs_file_dax_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); size_t count = iov_iter_count(to); ssize_t ret = 0; trace_xfs_file_dax_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; }
/* * Read data from a specified offset in a file (referenced by inode). * Data may be placed either in a user or kernel buffer. */ ssize_t orangefs_inode_read(struct inode *inode, struct iov_iter *iter, loff_t *offset, loff_t readahead_size) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); size_t count = iov_iter_count(iter); size_t bufmap_size; ssize_t ret = -EINVAL; g_orangefs_stats.reads++; bufmap_size = orangefs_bufmap_size_query(); if (count > bufmap_size) { gossip_debug(GOSSIP_FILE_DEBUG, "%s: count is too large (%zd/%zd)!\n", __func__, count, bufmap_size); return -EINVAL; } gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU) %zd@%llu\n", __func__, &orangefs_inode->refn.khandle, count, llu(*offset)); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter, count, readahead_size); if (ret > 0) *offset += ret; gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): Value(%zd) returned.\n", __func__, &orangefs_inode->refn.khandle, ret); return ret; }
/** * process_vm_rw - check iovecs before calling core routine * @pid: PID of process to read/write from/to * @lvec: iovec array specifying where to copy to/from locally * @liovcnt: size of lvec array * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw(pid_t pid, const struct iovec __user *lvec, unsigned long liovcnt, const struct iovec __user *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct iovec iovstack_l[UIO_FASTIOV]; struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; struct iov_iter iter; ssize_t rc; int dir = vm_write ? WRITE : READ; if (flags != 0) return -EINVAL; /* Check iovecs */ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); if (rc < 0) return rc; if (!iov_iter_count(&iter)) goto free_iovecs; rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) kfree(iov_r); kfree(iov_l); return rc; }
STATIC ssize_t xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ file_accessed(iocb->ki_filp); xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; }
int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to) { struct rds_message *rm; struct scatterlist *sg; unsigned long to_copy; unsigned long vec_off; int copied; int ret; u32 len; rm = container_of(inc, struct rds_message, m_inc); len = be32_to_cpu(rm->m_inc.i_hdr.h_len); sg = rm->data.op_sg; vec_off = 0; copied = 0; while (iov_iter_count(to) && copied < len) { to_copy = min_t(unsigned long, iov_iter_count(to), sg->length - vec_off); to_copy = min_t(unsigned long, to_copy, len - copied); rds_stats_add(s_copy_to_user, to_copy); ret = copy_page_to_iter(sg_page(sg), sg->offset + vec_off, to_copy, to); if (ret != to_copy) return -EFAULT; vec_off += to_copy; copied += to_copy; if (vec_off == sg->length) { vec_off = 0; sg++; } } return copied; }
/** * v9fs_file_write - write to a file * @filp: file pointer to write * @data: data buffer to write data from * @count: size of buffer * @offset: offset at which to write data * */ static ssize_t v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; ssize_t retval = 0; loff_t origin = iocb->ki_pos; size_t count = iov_iter_count(from); int err = 0; retval = generic_write_checks(file, &origin, &count, 0); if (retval) return retval; iov_iter_truncate(from, count); if (!count) return 0; retval = p9_client_write(file->private_data, origin, from, &err); if (retval > 0) { struct inode *inode = file_inode(file); loff_t i_size; unsigned long pg_start, pg_end; pg_start = origin >> PAGE_CACHE_SHIFT; pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT; if (inode->i_mapping && inode->i_mapping->nrpages) invalidate_inode_pages2_range(inode->i_mapping, pg_start, pg_end); origin += retval; i_size = i_size_read(inode); iocb->ki_pos = origin; if (origin > i_size) { inode_add_bytes(inode, origin - i_size); i_size_write(inode, origin); } return retval; } return err; }
static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct fd real; const struct cred *old_cred; ssize_t ret; if (!iov_iter_count(iter)) return 0; inode_lock(inode); /* Update mode */ ovl_copyattr(ovl_inode_real(inode), inode); ret = file_remove_privs(file); if (ret) goto out_unlock; ret = ovl_real_fdget(file, &real); if (ret) goto out_unlock; old_cred = ovl_override_creds(file_inode(file)->i_sb); file_start_write(real.file); ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, ovl_iocb_to_rwf(iocb)); file_end_write(real.file); revert_creds(old_cred); /* Update size */ ovl_copyattr(ovl_inode_real(inode), inode); fdput(real); out_unlock: inode_unlock(inode); return ret; }
static ssize_t hypfs_write_iter(struct kiocb *iocb, struct iov_iter *from) { int rc; struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; struct hypfs_sb_info *fs_info = sb->s_fs_info; size_t count = iov_iter_count(from); /* * Currently we only allow one update per second for two reasons: * 1. diag 204 is VERY expensive * 2. If several processes do updates in parallel and then read the * hypfs data, the likelihood of collisions is reduced, if we restrict * the minimum update interval. A collision occurs, if during the * data gathering of one process another process triggers an update * If the first process wants to ensure consistent data, it has * to restart data collection in this case. */ mutex_lock(&fs_info->lock); if (fs_info->last_update == get_seconds()) { rc = -EBUSY; goto out; } hypfs_delete_tree(sb->s_root); if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb->s_root); else rc = hypfs_diag_create_files(sb->s_root); if (rc) { pr_err("Updating the hypfs tree failed\n"); hypfs_delete_tree(sb->s_root); goto out; } hypfs_update_update(sb); rc = count; iov_iter_advance(from, count); out: mutex_unlock(&fs_info->lock); return rc; }
STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; size_t ocount = iov_iter_count(from); XFS_STATS_INC(xs_write_calls); if (ocount == 0) return 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; if (unlikely(file->f_flags & O_DIRECT)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); if (ret > 0) { ssize_t err; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ err = generic_write_sync(file, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } return ret; }
static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = iocb->ki_pos + count; if (end > isize) jfs_write_failed(mapping, end); } return ret; }
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); int err; /* we don't need to use inline_data strictly */ if (f2fs_has_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; err = check_direct_IO(inode, iter, offset); if (err) return err; trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == WRITE) __allocate_data_blocks(inode, offset, count); err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); return err; }
/** * dax_do_io - Perform I/O to a DAX file * @iocb: The control block for this I/O * @inode: The file which the I/O is directed at * @iter: The addresses to do I/O from or to * @get_block: The filesystem method used to translate file offsets to blocks * @end_io: A filesystem callback for I/O completion * @flags: See below * * This function uses the same locking scheme as do_blockdev_direct_IO: * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the * caller for writes. For reads, we take and release the i_mutex ourselves. * If DIO_LOCKING is not set, the filesystem takes care of its own locking. * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O * is in progress. */ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags) { struct buffer_head bh; ssize_t retval = -EINVAL; loff_t pos = iocb->ki_pos; loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_lock(inode); /* Protects against truncate */ if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) inode_unlock(inode); if (end_io) { int err; err = end_io(iocb, pos, retval, bh.b_private); if (err) retval = err; } if (!(flags & DIO_SKIP_DIO_COUNT)) inode_dio_end(inode); return retval; }
/* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, io_fn_t fn) { ssize_t ret = 0; while (iov_iter_count(iter)) { struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos); if (nr < 0) { if (!ret) ret = nr; break; } ret += nr; if (nr != iovec.iov_len) break; iov_iter_advance(iter, nr); } return ret; }
STATIC ssize_t xfs_file_read_iter( struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; size_t size = iov_iter_count(to); ssize_t ret = 0; int ioflags = 0; xfs_fsize_t n; loff_t pos = iocb->ki_pos; XFS_STATS_INC(xs_read_calls); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= XFS_IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; if (unlikely(ioflags & XFS_IO_ISDIRECT)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((pos | size) & target->bt_logical_sectormask) { if (pos == i_size_read(inode)) return 0; return -EINVAL; } } n = mp->m_super->s_maxbytes - pos; if (n <= 0 || size == 0) return 0; if (n < size) size = n; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; /* * Locking is a bit tricky here. If we take an exclusive lock * for direct IO, we effectively serialise all new concurrent * read IO to this file and block it behind IO that is currently in * progress because IO in progress holds the IO lock shared. We only * need to hold the lock exclusive to blow away the page cache, so * only take lock exclusively if the page cache needs invalidation. * This allows the normal direct IO case of no page cache pages to * proceeed concurrently without serialisation. */ xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, pos + size - 1); if (ret) { xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } /* * Invalidate whole pages. This can return an error if * we fail to invalidate a page, but this should never * happen on XFS. Warn if it does fail. */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, (pos + size - 1) >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; }
/* * Common pre-write limit and setup checks. * * Called with the iolocked held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct kiocb *iocb, struct iov_iter *from, int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t error = 0; size_t count = iov_iter_count(from); bool drained_dio = false; restart: error = generic_write_checks(iocb, from); if (error <= 0) return error; error = xfs_break_layouts(inode, iolock); if (error) return error; /* * For changing security info in file_remove_privs() we need i_rwsem * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); goto restart; } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. * * We need to serialise against EOF updates that occur in IO * completions here. We want to make sure that nobody is changing the * size while we do this check until we have placed an IO barrier (i.e. * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. * The spinlock effectively forms a memory barrier once we have the * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value * and hence be able to correctly determine if we need to run zeroing. */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* * We now have an IO submission barrier in place, but * AIO can do EOF updates during IO completion and hence * we now need to wait for all of them to drain. Non-AIO * DIO will have drained before we are given the * XFS_IOLOCK_EXCL, and so for most cases this wait is a * no-op. */ inode_dio_wait(inode); drained_dio = true; goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from * xfs_fs_dirty_inode, so we have to call it after dropping the * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ if (likely(!(file->f_mode & FMODE_NOCMTIME))) { error = file_update_time(file); if (error) return error; } /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ if (!IS_NOSEC(inode)) return file_remove_privs(file); return 0; }
/* * xfs_file_dio_aio_write - handle direct IO writes * * Lock the inode appropriately to prepare for and issue a direct IO write. * By separating it from the buffered write path we remove all the tricky to * follow locking changes and looping. * * If there are cached pages or we're extending the file, we need IOLOCK_EXCL * until we're sure the bytes at the new EOF have been zeroed and/or the cached * pages are flushed out. * * In most cases the direct IO writes will be done holding IOLOCK_SHARED * allowing them to be done in parallel with reads and other direct IO writes. * However, if the IO is not aligned to filesystem blocks, the direct IO layer * needs to do sub-block zeroing and that requires serialisation against other * direct IOs to the same block. In this case we need to serialise the * submission of the unaligned IOs so that we don't get racing block zeroing in * the dio layer. To avoid the problem with aio, we also need to wait for * outstanding IOs to complete so that unwritten extent conversion is completed * before we try to map the overlapping block. This is currently implemented by * hitting it with a big hammer (i.e. inode_dio_wait()). * * Returns with locks held indicated by @iolock and errors indicated by * negative return values. */ STATIC ssize_t xfs_file_dio_aio_write( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0; int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; /* * Don't take the exclusive iolock here unless the I/O is unaligned to * the file system block size. We don't need to consider the EOF * extension case here because xfs_file_aio_write_checks() will relock * the inode as necessary for EOF zeroing cases and fill out the new * inode size as appropriate. */ if ((iocb->ki_pos & mp->m_blockmask) || ((iocb->ki_pos + count) & mp->m_blockmask)) { unaligned_io = 1; /* * We can't properly handle unaligned direct I/O to reflink * files yet, as we can't unshare a partial block. */ if (xfs_is_reflink_inode(ip)) { trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count); return -EREMCHG; } iolock = XFS_IOLOCK_EXCL; } else { iolock = XFS_IOLOCK_SHARED; } if (iocb->ki_flags & IOCB_NOWAIT) { if (!xfs_ilock_nowait(ip, iolock)) return -EAGAIN; } else { xfs_ilock(ip, iolock); } ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); /* * If we are doing unaligned IO, wait for all other IO to drain, * otherwise demote the lock if we had to take the exclusive lock * for other reasons in xfs_file_aio_write_checks. */ if (unaligned_io) { /* If we are going to wait for other DIO to finish, bail */ if (iocb->ki_flags & IOCB_NOWAIT) { if (atomic_read(&inode->i_dio_count)) return -EAGAIN; } else { inode_dio_wait(inode); } } else if (iolock == XFS_IOLOCK_EXCL) { xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either * complete fully or fail. */ ASSERT(ret < 0 || ret == count); return ret; }
/** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct task_struct *task; struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; struct page **process_pages = pp_stack; struct mm_struct *mm; unsigned long i; ssize_t rc = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; ssize_t iov_len; size_t total_len = iov_iter_count(iter); return -ENOSYS; // PaX: until properly audited /* * Work out how many pages of struct pages we're going to need * when eventually calling get_user_pages */ for (i = 0; i < riovcnt; i++) { iov_len = rvec[i].iov_len; if (iov_len <= 0) continue; nr_pages_iov = ((unsigned long)rvec[i].iov_base + iov_len) / PAGE_SIZE - (unsigned long)rvec[i].iov_base / PAGE_SIZE + 1; nr_pages = max(nr_pages, nr_pages_iov); } if (nr_pages == 0) return 0; if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { /* For reliability don't try to kmalloc more than 2 pages worth */ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, sizeof(struct pages *)*nr_pages), GFP_KERNEL); if (!process_pages) return -ENOMEM; } /* Get process information */ rcu_read_lock(); task = find_task_by_vpid(pid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) { rc = -ESRCH; goto free_proc_pages; } if (gr_handle_ptrace(task, vm_write ? PTRACE_POKETEXT : PTRACE_ATTACH)) { rc = -EPERM; goto put_task_struct; } mm = mm_access(task, PTRACE_MODE_ATTACH); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* * Explicitly map EACCES to EPERM as EPERM is a more a * appropriate error code for process_vw_readv/writev */ if (rc == -EACCES) rc = -EPERM; goto put_task_struct; } for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, iter, process_pages, mm, task, vm_write); /* copied = space before - space after */ total_len -= iov_iter_count(iter); /* If we have managed to copy any data at all then we return the number of bytes copied. Otherwise we return the error code */ if (total_len) rc = total_len; mmput(mm); put_task_struct: put_task_struct(task); free_proc_pages: if (process_pages != pp_stack) kfree(process_pages); return rc; }
STATIC ssize_t xfs_file_buffered_aio_write( struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t ret; int enospc = 0; int iolock; if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; write_retry: iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos); ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated * space before returning an error. In the case of ENOSPC, first try to * write back all dirty inodes to free up some of the excess reserved * metadata space. This reduces the chances that the eofblocks scan * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this * also behaves as a filter to prevent too many eofblocks scans from * running at the same time. */ if (ret == -EDQUOT && !enospc) { xfs_iunlock(ip, iolock); enospc = xfs_inode_free_quota_eofblocks(ip); if (enospc) goto write_retry; enospc = xfs_inode_free_quota_cowblocks(ip); if (enospc) goto write_retry; iolock = 0; } else if (ret == -ENOSPC && !enospc) { struct xfs_eofblocks eofb = {0}; enospc = 1; xfs_flush_inodes(ip->i_mount); xfs_iunlock(ip, iolock); eofb.eof_flags = XFS_EOF_FLAGS_SYNC; xfs_icache_free_eofblocks(ip->i_mount, &eofb); xfs_icache_free_cowblocks(ip->i_mount, &eofb); goto write_retry; } current->backing_dev_info = NULL; out: if (iolock) xfs_iunlock(ip, iolock); return ret; }