static ssize_t aufs_aio_write_sp(struct kiocb *kio, const struct iovec *iov, unsigned long nv, loff_t pos) { ssize_t err; aufs_bindex_t bstart; unsigned char wbr; struct super_block *sb; struct file *file, *h_file; file = kio->ki_filp; sb = file->f_dentry->d_sb; si_read_lock(sb, AuLock_FLUSH); fi_read_lock(file); bstart = au_fbstart(file); h_file = au_hf_top(file); fi_read_unlock(file); wbr = !!au_br_writable(au_sbr(sb, bstart)->br_perm); si_read_unlock(sb); /* do not change the file in kio */ AuDebugOn(!h_file->f_op || !h_file->f_op->aio_write); err = h_file->f_op->aio_write(kio, iov, nv, pos); if (err > 0 && wbr) file_update_time(h_file); return err; }
/* * mmap()d file has taken write protection fault and is being made writable. We * can set the page state up correctly for a writable page, which means we can * do correct delalloc accounting (ENOSPC checking!) and unwritten extent * mapping. */ STATIC int xfs_filemap_page_mkwrite( struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); int ret; trace_xfs_filemap_page_mkwrite(XFS_I(inode)); sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; }
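The same bracket recurs in the simpler ->page_mkwrite handlers below: freeze protection, a timestamp update before the page lock, then a truncate-race check under the page lock. A minimal sketch of that common shape, assuming a filesystem that needs no block allocation at fault time (the example_page_mkwrite name is illustrative, not taken from any of the implementations here):

static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	int ret = VM_FAULT_LOCKED;

	sb_start_pagefault(inode->i_sb);	/* hold off filesystem freezing */
	file_update_time(vma->vm_file);		/* update c/mtime before taking the page lock */

	lock_page(page);
	if (page->mapping != inode->i_mapping ||
	    page_offset(page) >= i_size_read(inode)) {
		/* raced with truncate or invalidation; let the VM retry the fault */
		unlock_page(page);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	wait_for_stable_page(page);		/* writeback may require a stable page */
out:
	sb_end_pagefault(inode->i_sb);
	return ret;				/* VM_FAULT_LOCKED: page returned locked */
}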
static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; int err; f2fs_balance_fs(sbi); vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); /* block allocation */ f2fs_lock_op(sbi); set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_reserve_block(&dn, page->index); if (err) { f2fs_unlock_op(sbi); goto out; } f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; goto out; } /* * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) goto mapped; /* page is wholly or partially inside EOF */ if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) { unsigned offset; offset = i_size_read(inode) & ~PAGE_CACHE_MASK; zero_user_segment(page, offset, PAGE_CACHE_SIZE); } set_page_dirty(page); SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA); /* if gced page is attached, don't write to cold segment */ clear_cold_data(page); out: return block_page_mkwrite_return(err); }
static int v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct v9fs_inode *v9inode; struct page *page = vmf->page; struct file *filp = vma->vm_file; struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", page, (unsigned long)filp->private_data); /* Update file times before taking page lock */ file_update_time(filp); v9inode = V9FS_I(inode); /* make sure the cache has finished storing the page */ v9fs_fscache_wait_on_page_write(inode, page); BUG_ON(!v9inode->writeback_fid); lock_page(page); if (page->mapping != inode->i_mapping) goto out_unlock; wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: unlock_page(page); return VM_FAULT_NOPAGE; }
/* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: * * mmap_sem (MM) * sb_start_pagefault(vfs, freeze) * i_mmaplock (XFS - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ static int __xfs_filemap_fault( struct vm_fault *vmf, enum page_entry_size pe_size, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret; trace_xfs_filemap_fault(ip, pe_size, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); else ret = filemap_fault(vmf); } xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); return ret; }
static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out_unlock; ret = file_remove_privs(file); if (ret) goto out_unlock; ret = file_update_time(file); if (ret) goto out_unlock; ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); } out_unlock: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; }
/* * pfn_mkwrite was originally intended to ensure we capture time stamp * updates on write faults. In reality, it's needed to serialise against * truncate, similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED * to ensure the fault serialisation barrier is in place. */ static int xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret = VM_FAULT_NOPAGE; loff_t size; trace_xfs_filemap_pfn_mkwrite(ip); sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); /* check that the faulting page hasn't raced with truncate */ xfs_ilock(ip, XFS_MMAPLOCK_SHARED); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; }
/* * Similar to xfs_filemap_fault(), the DAX fault path can call into here on * both read and write faults. Hence we need to handle both cases. There is no * ->huge_mkwrite callout for huge pages, so we have a single function here to * handle both cases. @flags carries the information on the type of fault * occurring. */ STATIC int xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); int ret; if (!IS_DAX(inode)) return VM_FAULT_FALLBACK; trace_xfs_filemap_huge_fault(ip); if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; }
/* * Common pre-write limit and setup checks. * * Called with the iolock held either shared or exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int error = 0; restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) return error; /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before proceeding. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); if (error) return error; } /* * Updating the timestamps will grab the ilock again from * xfs_fs_dirty_inode, so we have to call it after dropping the * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ if (likely(!(file->f_mode & FMODE_NOCMTIME))) { error = file_update_time(file); if (error) return error; } /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
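Stripped of the XFS locking and EOF zeroing, the per-write setup above reduces to three generic steps: the VFS limit checks, the timestamp update, and setuid/setgid stripping. A hedged sketch of that skeleton using the same era's helpers; the myfs_write_checks name is hypothetical, and the relative order of the last two steps varies between the implementations in this collection:

static ssize_t myfs_write_checks(struct file *file, loff_t *pos, size_t *count)
{
	struct inode *inode = file->f_mapping->host;
	int error;

	/* may shrink *count or reject the write entirely (rlimits, s_maxbytes) */
	error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
	if (error || *count == 0)
		return error;

	/* dirty the inode's c/mtime now that the write is going ahead */
	error = file_update_time(file);
	if (error)
		return error;

	/* drop setuid/setgid so unprivileged writes can't keep them */
	return file_remove_suid(file);
}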
ssize_t hmfs_xip_file_write(struct file * filp, const char __user * buf, size_t len, loff_t * ppos) { struct address_space *mapping = filp->f_mapping; struct inode *inode = filp->f_inode; struct hmfs_sb_info *sbi = HMFS_SB(inode->i_sb); size_t count = 0; ssize_t ret; loff_t pos; int ilock; mutex_lock(&inode->i_mutex); if (!access_ok(VERIFY_READ, buf, len)) { ret = -EFAULT; goto out_up; } pos = *ppos; count = len; current->backing_dev_info = mapping->backing_dev_info; ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) goto out_backing; if (count == 0) goto out_backing; ret = file_remove_suid(filp); if (ret) goto out_backing; ret = file_update_time(filp); if (ret) goto out_backing; inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; ilock = mutex_lock_op(sbi); ret = __hmfs_xip_file_write(filp, buf, count, pos, ppos); mutex_unlock_op(sbi, ilock); mark_inode_dirty(inode); out_backing: current->backing_dev_info = NULL; out_up: mutex_unlock(&inode->i_mutex); return ret; }
/* * Common pre-write limit and setup checks. * * Called with the iolock held either shared or exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which involves * dropping all locks and relocking to maintain correct locking order. * If we do this, restart the function to ensure all checks and values * are still valid. */ if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
static int ll_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { int count = 0; bool printed = false; bool retry; int result; ll_stats_ops_tally(ll_i2sbi(file_inode(vma->vm_file)), LPROC_LL_MKWRITE, 1); file_update_time(vma->vm_file); do { retry = false; result = ll_page_mkwrite0(vma, vmf->page, &retry); if (!printed && ++count > 16) { const struct dentry *de = file_dentry(vma->vm_file); CWARN("app(%s): the page %lu of file "DFID" is under" " heavy contention\n", current->comm, vmf->pgoff, PFID(ll_inode2fid(de->d_inode))); printed = true; } } while (retry); switch(result) { case 0: LASSERT(PageLocked(vmf->page)); result = VM_FAULT_LOCKED; break; case -ENODATA: case -EFAULT: result = VM_FAULT_NOPAGE; break; case -ENOMEM: result = VM_FAULT_OOM; break; case -EAGAIN: result = VM_FAULT_RETRY; break; default: result = VM_FAULT_SIGBUS; break; } return result; }
/** * dax_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * * When a page fault occurs, filesystems may call this helper in their * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(sb); file_update_time(vma->vm_file); } result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); return result; }
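As the comment above says, a filesystem hands its ->fault callout straight to this helper together with its own block-mapping routine; dax_fault() then takes care of the freeze protection and timestamp update for write faults itself, unlike __dax_fault(), where the caller does that work (as ext2 does below). A minimal hedged sketch, assuming a hypothetical myfs whose myfs_get_block never returns unwritten extents, so no completion callback is needed:

static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* dax_fault() brackets write faults with sb_start_pagefault()/file_update_time() */
	return dax_fault(vma, vmf, myfs_get_block, NULL);
}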
/* * Common pre-write limit and setup checks. * * Returns with iolock held according to @iolock. */ STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); xfs_fsize_t new_size; int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = 0; return error; } new_size = *pos + *count; if (new_size > ip->i_size) ip->i_new_size = new_size; if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. */ if (*pos > ip->i_size) error = -xfs_zero_eof(ip, *pos, ip->i_size); xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ return file_remove_suid(file); }
/* * The lock ordering for ext2 DAX fault paths is: * * mmap_sem (MM) * sb_start_pagefault (vfs, freeze) * ext2_inode_info->dax_sem * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) * ext2_inode_info->truncate_mutex * * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } down_read(&ei->dax_sem); ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; }
/* * The lock ordering for ext2 DAX fault paths is: * * mmap_sem (MM) * sb_start_pagefault (vfs, freeze) * ext2_inode_info->dax_sem * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) * ext2_inode_info->truncate_mutex * * The default page_lock and i_size verification done by non-DAX fault paths * is sufficient because ext2 doesn't support hole punching. */ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); int ret; if (vmf->flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); } down_read(&ei->dax_sem); ret = __dax_fault(vma, vmf, ext2_get_block, NULL); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; }
static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); int ret; if (flags & FAULT_FLAG_WRITE) { sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); } down_read(&ei->dax_sem); ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL); up_read(&ei->dax_sem); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(inode->i_sb); return ret; }
static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct sysfs_open_file *of = sysfs_of(file); int ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!sysfs_get_active(of->sd)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vma, vmf); else file_update_time(file); sysfs_put_active(of->sd); return ret; }
static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct bin_buffer *bb = file->private_data; struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; int ret; if (!bb->vm_ops) return VM_FAULT_SIGBUS; if (!sysfs_get_active(attr_sd)) return VM_FAULT_SIGBUS; ret = 0; if (bb->vm_ops->page_mkwrite) ret = bb->vm_ops->page_mkwrite(vma, vmf); else file_update_time(file); sysfs_put_active(attr_sd); return ret; }
STATIC ssize_t xfs_file_aio_write_checks( struct file *file, loff_t *pos, size_t *count, int *iolock) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); int error = 0; xfs_rw_ilock(ip, XFS_ILOCK_EXCL); restart: error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); if (error) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); return error; } if (*pos > i_size_read(inode)) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); goto restart; } error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); } xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; if (likely(!(file->f_mode & FMODE_NOCMTIME))) file_update_time(file); return file_remove_suid(file); }
static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); loff_t size; int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); down_read(&ei->dax_sem); /* check that the faulting page hasn't raced with truncate */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); return ret; }
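Taken together, the three ext2 handlers above (ext2_dax_fault, ext2_dax_pmd_fault, ext2_dax_pfn_mkwrite) are wired into a vm_operations_struct and installed at mmap() time only when the inode is DAX-capable. A sketch of that wiring in the style of the ext2 of this era; the ext2_dax_vm_ops and ext2_file_mmap definitions are not part of the excerpts above and are reconstructed here, so handler names and vm_flags details may differ between kernel versions:

static const struct vm_operations_struct ext2_dax_vm_ops = {
	.fault		= ext2_dax_fault,
	.pmd_fault	= ext2_dax_pmd_fault,
	.page_mkwrite	= ext2_dax_fault,	/* write faults can take the same path in this sketch */
	.pfn_mkwrite	= ext2_dax_pfn_mkwrite,
};

static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!IS_DAX(file_inode(file)))
		return generic_file_mmap(file, vma);

	file_accessed(file);
	vma->vm_ops = &ext2_dax_vm_ops;
	/* mixed-map and huge-page flags are needed for the PTE and PMD DAX fault paths */
	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	return 0;
}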
/* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: * * mmap_sem (MM) * sb_start_pagefault(vfs, freeze) * i_mmaplock (XFS - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, enum page_entry_size pe_size, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; trace_xfs_filemap_fault(ip, pe_size, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { pfn_t pfn; ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); else ret = filemap_fault(vmf); } xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); return ret; }
/* * Common pre-write limit and setup checks. * * Called with the iolocked held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ STATIC ssize_t xfs_file_aio_write_checks( struct kiocb *iocb, struct iov_iter *from, int *iolock) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); ssize_t error = 0; size_t count = iov_iter_count(from); bool drained_dio = false; restart: error = generic_write_checks(iocb, from); if (error <= 0) return error; error = xfs_break_layouts(inode, iolock); if (error) return error; /* * For changing security info in file_remove_privs() we need i_rwsem * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); goto restart; } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. * * We need to serialise against EOF updates that occur in IO * completions here. We want to make sure that nobody is changing the * size while we do this check until we have placed an IO barrier (i.e. * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. * The spinlock effectively forms a memory barrier once we have the * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value * and hence be able to correctly determine if we need to run zeroing. */ spin_lock(&ip->i_flags_lock); if (iocb->ki_pos > i_size_read(inode)) { spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* * We now have an IO submission barrier in place, but * AIO can do EOF updates during IO completion and hence * we now need to wait for all of them to drain. Non-AIO * DIO will have drained before we are given the * XFS_IOLOCK_EXCL, and so for most cases this wait is a * no-op. */ inode_dio_wait(inode); drained_dio = true; goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); if (error) return error; } else spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from * xfs_fs_dirty_inode, so we have to call it after dropping the * lock above. Eventually we should look into a way to avoid * the pointless lock roundtrip. */ if (likely(!(file->f_mode & FMODE_NOCMTIME))) { error = file_update_time(file); if (error) return error; } /* * If we're writing the file then make sure to clear the setuid and * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ if (!IS_NOSEC(inode)) return file_remove_privs(file); return 0; }
static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned long last_index; u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; int alloc_required = 0; struct gfs2_holder gh; loff_t size; int ret; sb_start_pagefault(inode->i_sb); /* Update file times before taking page lock */ file_update_time(vma->vm_file); ret = get_write_access(inode); if (ret) goto out; ret = gfs2_rs_alloc(ip); if (ret) goto out_write_access; gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); gfs2_size_hint(inode, pos, PAGE_CACHE_SIZE); ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); if (ret) goto out_unlock; if (!alloc_required) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; unlock_page(page); } goto out_unlock; } ret = gfs2_rindex_update(sdp); if (ret) goto out_unlock; ret = gfs2_quota_lock_check(ip); if (ret) goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; rblocks = RES_DINODE + ind_blocks; if (gfs2_is_jdata(ip)) rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) { rblocks += RES_STATFS + RES_QUOTA; rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); } ret = gfs2_trans_begin(sdp, rblocks, 0); if (ret) goto out_trans_fail; lock_page(page); ret = -EINVAL; size = i_size_read(inode); last_index = (size - 1) >> PAGE_CACHE_SHIFT; /* Check page index against inode size */ if (size == 0 || (page->index > last_index)) goto out_trans_end; ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. */ if (!PageUptodate(page) || page->mapping != inode->i_mapping) goto out_trans_end; /* Unstuff, if required, and allocate backing blocks for page */ ret = 0; if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); if (ret == 0) ret = gfs2_allocate_page_backing(page); out_trans_end: if (ret) unlock_page(page); gfs2_trans_end(sdp); out_trans_fail: gfs2_inplace_release(ip); out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_for_stable_page(page); } out_write_access: put_write_access(inode); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .page_mkwrite = gfs2_page_mkwrite, }; /** * gfs2_mmap - * @file: The file to map * @vma: The VMA which described the mapping * * There is no need to get a lock here unless we should be updating * atime. We ignore any locking errors since the only consequence is * a missed atime update (which will just be deferred until later). 
* * Returns: 0 */ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); if (!(file->f_flags & O_NOATIME) && !IS_NOATIME(&ip->i_inode)) { struct gfs2_holder i_gh; int error; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) return error; /* grab lock to update inode */ gfs2_glock_dq_uninit(&i_gh); file_accessed(file); } vma->vm_ops = &gfs2_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } /** * gfs2_open - open a file * @inode: the inode to open * @file: the struct file for this opening * * Returns: errno */ static int gfs2_open(struct inode *inode, struct file *file) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; struct gfs2_file *fp; int error; fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); if (!fp) return -ENOMEM; mutex_init(&fp->f_fl_mutex); gfs2_assert_warn(GFS2_SB(inode), !file->private_data); file->private_data = fp; if (S_ISREG(ip->i_inode.i_mode)) { error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); if (error) goto fail; if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) { error = -EOVERFLOW; goto fail_gunlock; } gfs2_glock_dq_uninit(&i_gh); } return 0; fail_gunlock: gfs2_glock_dq_uninit(&i_gh); fail: file->private_data = NULL; kfree(fp); return error; } /** * gfs2_release - called to close a struct file * @inode: the inode the struct file belongs to * @file: the struct file being closed * * Returns: errno */ static int gfs2_release(struct inode *inode, struct file *file) { struct gfs2_inode *ip = GFS2_I(inode); kfree(file->private_data); file->private_data = NULL; if (!(file->f_mode & FMODE_WRITE)) return 0; gfs2_rs_delete(ip); return 0; } /** * gfs2_fsync - sync the dirty data for a file (across the cluster) * @file: the file that points to the dentry * @start: the start position in the file to sync * @end: the end position in the file to sync * @datasync: set if we can ignore timestamp changes * * The VFS will flush data for us. We only need to worry * about metadata here. * * Returns: errno */ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; int sync_state = inode->i_state & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret; if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); if (ret) return ret; if (gfs2_is_jdata(ip)) filemap_write_and_wait(inode->i_mapping); gfs2_ail_flush(ip->i_gl, 1); } return 0; } /** * gfs2_file_aio_write - Perform a write to a file * @iocb: The io context * @iov: The data to write * @nr_segs: Number of @iov segments * @pos: The file position * * We have to do a lock/unlock here to refresh the inode size for * O_APPEND writes, otherwise we can land up writing at the wrong * offset. There is still a race, but provided the app is using its * own file locking, this will make O_APPEND work as expected. 
* */ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; size_t writesize = iov_length(iov, nr_segs); struct dentry *dentry = file->f_dentry; struct gfs2_inode *ip = GFS2_I(dentry->d_inode); struct gfs2_sbd *sdp; int ret; sdp = GFS2_SB(file->f_mapping->host); ret = gfs2_rs_alloc(ip); if (ret) return ret; gfs2_size_hint(file->f_dentry->d_inode, pos, writesize); if (file->f_flags & O_APPEND) { struct gfs2_holder gh; ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) return ret; gfs2_glock_dq_uninit(&gh); } return generic_file_aio_write(iocb, iov, nr_segs, pos); } static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { int error; struct inode *inode = out->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); error = gfs2_rs_alloc(ip); if (error) return (ssize_t)error; gfs2_size_hint(inode, *ppos, len); return generic_file_splice_write(pipe, out, ppos, len, flags); }
/* FIXME: Ugliest function of all in LFS, need I say more? */ static ssize_t lfs_file_write( struct file *file, const char __user *buf, size_t count, loff_t *ppos) { loff_t pos; struct page *page; ssize_t res, written, bytes; struct inode *inode = file->f_dentry->d_inode; struct super_block *sb = inode->i_sb; struct segment *segp = LFS_SBI(sb)->s_curr; //dprintk("lfs_file_write called for %lu at pos %Lu\n", inode->i_ino, *ppos); if(file->f_flags & O_DIRECT) { dprintk("The file is requesting direct IO\n"); return -EINVAL; } if (unlikely(count < 0 )) return -EINVAL; if (unlikely(!access_ok(VERIFY_READ, buf, count))) return -EFAULT; //down(&inode->i_sem); /* lock the file */ mutex_lock(&inode->i_mutex); //BrechREiZ: We need this for Kernel 2.6.17 lfs_lock(sb); pos = *ppos; res = generic_write_checks(file, &pos, &count, 0); if (res) goto out; if(count == 0) goto out; res = remove_suid(file->f_dentry); if(res) goto out; //inode_update_time(inode, 1); /* update mtime and ctime */ file_update_time(file); //BrechREiZ: We need this for Kernel 2.6.17 written = 0; do { long offset; size_t copied; int i, siblock, eiblock, boffset; sector_t block; offset = (segp->offset % BUF_IN_PAGE) * LFS_BSIZE; offset += pos & (LFS_BSIZE - 1); /* within block */ bytes = PAGE_CACHE_SIZE - offset; /* number of bytes written in this iteration */ invalidate_old_page(inode, pos); if (bytes > count) bytes = count; //dprintk("1:segp->start=%Lu,segp->offset=%d,segp->end=%Lu,offset=%lu,bytes=%d\n", segp->start, segp->offset, segp->end,offset,bytes); siblock = pos >> LFS_BSIZE_BITS; eiblock = (pos + bytes - 1) >> LFS_BSIZE_BITS; //dprintk("writing %d bytes at offset %ld (pos = %Lu)\n", bytes, offset, pos); //dprintk("siblock = %d, eiblock = %d\n", siblock, eiblock); /* * Bring in the user page that we will copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. 
*/ fault_in_pages_readable(buf, bytes); page = get_seg_page(segp); if (!page) { res = -ENOMEM; break; } /* fill the page with current inode blocks if any */ boffset = offset / LFS_BSIZE;; for(i = siblock; i <= eiblock; ++i, ++boffset) { struct buffer_head *bh; //dprintk("Asking for block %d\n", i); bh = lfs_read_block(inode, i); if(!bh) /* new block */ break; //dprintk("boffset = %d\n", boffset); memcpy(page_address(page) + LFS_BSIZE * boffset, bh->b_data, LFS_BSIZE); brelse(bh); } copied = __copy_from_user(page_address(page) + offset, buf, bytes); flush_dcache_page(page); block = segp->start + segp->offset; for(i = siblock;i <= eiblock; ++i, ++block) segsum_update_finfo(segp, inode->i_ino, i, block); block = segp->start + segp->offset; segp->offset += (bytes - 1)/LFS_BSIZE + 1; //dprintk("2:segp->start=%Lu,segp->offset=%d,segp->end=%Lu,offset=%lu,bytes=%d\n", //segp->start, segp->offset, segp->end,offset,bytes); BUG_ON(segp->start + segp->offset > segp->end); if(segp->start + segp->offset == segp->end) { dprintk("allocating new segment\n"); /* This also is going to write the previous segment */ segment_allocate_new(inode->i_sb, segp, segp->start + segp->offset); segp = LFS_SBI(sb)->s_curr; } /* update the inode */ for(i = siblock;i <= eiblock; ++i, ++block) update_inode(inode, i, block); //dprintk("start=%Lu,offset=%d,end=%Lu\n", segp->start, segp->offset, segp->end); segusetbl_add_livebytes(sb, segp->segnum, bytes); written += bytes; buf += bytes; pos += bytes; count -= bytes; } while(count); *ppos = pos; if(pos > inode->i_size) i_size_write(inode, pos); if(written) mark_inode_dirty(inode); lfs_unlock(sb); //up(&inode->i_sem); mutex_unlock(&inode->i_mutex); //BrechREiZ: and unlocking... return written ? written : res; out: lfs_unlock(sb); //up(&inode->i_sem); mutex_unlock(&inode->i_mutex); //BrechREiZ: and unlocking... return res; }
int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct buffer_head bh; unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; struct block_device *bdev; pgoff_t size, pgoff; sector_t block; int error, result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) return VM_FAULT_FALLBACK; /* Fall back to PTEs if we're going to COW */ if (write && !(vma->vm_flags & VM_SHARED)) { split_huge_pmd(vma, pmd, address); dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; } /* If the PMD would extend outside the VMA */ if (pmd_addr < vma->vm_start) { dax_pmd_dbg(NULL, address, "vma start unaligned"); return VM_FAULT_FALLBACK; } if ((pmd_addr + PMD_SIZE) > vma->vm_end) { dax_pmd_dbg(NULL, address, "vma end unaligned"); return VM_FAULT_FALLBACK; } pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; /* If the PMD would cover blocks out of the file */ if ((pgoff | PG_PMD_COLOUR) >= size) { dax_pmd_dbg(NULL, address, "offset + huge page size > file size"); return VM_FAULT_FALLBACK; } memset(&bh, 0, sizeof(bh)); bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; if (!buffer_mapped(&bh) && write) { if (get_block(inode, block, &bh, 1) != 0) return VM_FAULT_SIGBUS; alloc = true; } bdev = bh.b_bdev; /* * If the filesystem isn't willing to tell us the length of a hole, * just fall back to PTEs. Calling get_block 512 times in a loop * would be silly. */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ if (alloc) { loff_t lstart = pgoff << PAGE_SHIFT; loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ truncate_pagecache_range(inode, lstart, lend); } i_mmap_lock_read(mapping); /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't * take i_mutex here, so just leave them hanging; they'll be freed * when the file is deleted. 
*/ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) { result = VM_FAULT_SIGBUS; goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { dax_pmd_dbg(&bh, address, "offset + huge page size > file size"); goto fallback; } if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); goto fallback; } ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); dax_pmd_dbg(&bh, address, "pmd already present"); goto fallback; } dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: <zero> sect: %llx\n", __func__, current->comm, address, (unsigned long long) to_sector(&bh, inode)); entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { struct blk_dax_ctl dax = { .sector = to_sector(&bh, inode), .size = PMD_SIZE, }; long length = dax_map_atomic(bdev, &dax); if (length < 0) { result = VM_FAULT_SIGBUS; goto out; } if (length < PMD_SIZE) { dax_pmd_dbg(&bh, address, "dax-length too small"); dax_unmap_atomic(bdev, &dax); goto fallback; } if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { dax_pmd_dbg(&bh, address, "pfn unaligned"); dax_unmap_atomic(bdev, &dax); goto fallback; } if (!pfn_t_devmap(dax.pfn)) { dax_unmap_atomic(bdev, &dax); dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; } if (buffer_unwritten(&bh) || buffer_new(&bh)) { clear_pmem(dax.addr, PMD_SIZE); wmb_pmem(); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } dax_unmap_atomic(bdev, &dax); /* * For PTE faults we insert a radix tree entry for reads, and * leave it clean. Then on the first write we dirty the radix * tree entry via the dax_pfn_mkwrite() path. This sequence * allows the dax_pfn_mkwrite() call to be simpler and avoid a * call into get_block() to translate the pgoff to a sector in * order to be able to create a new radix tree entry. * * The PMD path doesn't have an equivalent to * dax_pfn_mkwrite(), though, so for a read followed by a * write we traverse all the way through __dax_pmd_fault() * twice. This means we can just skip inserting a radix tree * entry completely on the initial read and just wait until * the write to insert a dirty entry. */ if (write) { error = dax_radix_entry(mapping, pgoff, dax.sector, true, true); if (error) { dax_pmd_dbg(&bh, address, "PMD radix insertion failed"); goto fallback; } } dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, pfn_t_to_pfn(dax.pfn), (unsigned long long) dax.sector); result |= vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); } out: i_mmap_unlock_read(mapping); if (buffer_unwritten(&bh)) complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); return result; fallback: count_vm_event(THP_FAULT_FALLBACK); result = VM_FAULT_FALLBACK; goto out; } EXPORT_SYMBOL_GPL(__dax_pmd_fault); /** * dax_pmd_fault - handle a PMD fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @get_block: The filesystem method used to translate file offsets to blocks * * When a page fault occurs, filesystems may call this helper in their * pmd_fault handler for DAX files. 
*/ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; if (flags & FAULT_FLAG_WRITE) { sb_start_pagefault(sb); file_update_time(vma->vm_file); } result = __dax_pmd_fault(vma, address, pmd, flags, get_block, complete_unwritten); if (flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); return result; }
STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, const struct iovec *iovp, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; ssize_t ret = 0, error = 0; int ioflags = 0; xfs_fsize_t isize, new_size; int iolock; int eventsent = 0; size_t ocount = 0, count; int need_i_mutex; XFS_STATS_INC(xs_write_calls); BUG_ON(iocb->ki_pos != pos); if (unlikely(file->f_flags & O_DIRECT)) ioflags |= IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= IO_INVIS; error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ); if (error) return error; count = ocount; if (count == 0) return 0; xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; relock: if (ioflags & IO_ISDIRECT) { iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } else { iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); } xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); start: error = -generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (error) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); goto out_unlock_mutex; } if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS) && !eventsent)) { int dmflags = FILP_DELAY_FLAG(file); if (need_i_mutex) dmflags |= DM_FLAGS_IMUX; xfs_iunlock(ip, XFS_ILOCK_EXCL); error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip, pos, count, dmflags, &iolock); if (error) { goto out_unlock_internal; } xfs_ilock(ip, XFS_ILOCK_EXCL); eventsent = 1; /* * The iolock was dropped and reacquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is * what allows the size to change in the first place. */ if ((file->f_flags & O_APPEND) && pos != ip->i_size) goto start; } if (ioflags & IO_ISDIRECT) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; if ((pos & target->bt_smask) || (count & target->bt_smask)) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); return XFS_ERROR(-EINVAL); } if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) { xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock); iolock = XFS_IOLOCK_EXCL; need_i_mutex = 1; mutex_lock(&inode->i_mutex); xfs_ilock(ip, XFS_ILOCK_EXCL|iolock); goto start; } } new_size = pos + count; if (new_size > ip->i_size) ip->i_new_size = new_size; if (likely(!(ioflags & IO_INVIS))) file_update_time(file); /* * If the offset is beyond the size of the file, we have a couple * of things to do. First, if there is already space allocated * we need to either create holes or zero the disk or ... * * If there is a page where the previous size lands, we need * to zero it out up to the new size. */ if (pos > ip->i_size) { error = xfs_zero_eof(ip, pos, ip->i_size); if (error) { xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out_unlock_internal; } } xfs_iunlock(ip, XFS_ILOCK_EXCL); /* * If we're writing the file then make sure to clear the * setuid and setgid bits if the process is not being run * by root. This keeps people from modifying setuid and * setgid binaries. 
*/ error = -file_remove_suid(file); if (unlikely(error)) goto out_unlock_internal; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; if ((ioflags & IO_ISDIRECT)) { if (mapping->nrpages) { WARN_ON(need_i_mutex == 0); error = xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, FI_REMAPF_LOCKED); if (error) goto out_unlock_internal; } if (need_i_mutex) { /* demote the lock now the cached pages are gone */ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); mutex_unlock(&inode->i_mutex); iolock = XFS_IOLOCK_SHARED; need_i_mutex = 0; } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags); ret = generic_file_direct_write(iocb, iovp, &nr_segs, pos, &iocb->ki_pos, count, ocount); /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ if (ret >= 0 && ret != count) { XFS_STATS_ADD(xs_write_bytes, ret); pos += ret; count -= ret; ioflags &= ~IO_ISDIRECT; xfs_iunlock(ip, iolock); goto relock; } } else { int enospc = 0; ssize_t ret2 = 0; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags); ret2 = generic_file_buffered_write(iocb, iovp, nr_segs, pos, &iocb->ki_pos, count, ret); /* * if we just got an ENOSPC, flush the inode now we * aren't holding any page locks and retry *once* */ if (ret2 == -ENOSPC && !enospc) { error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE); if (error) goto out_unlock_internal; enospc = 1; goto write_retry; } ret = ret2; } current->backing_dev_info = NULL; isize = i_size_read(inode); if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize)) iocb->ki_pos = isize; if (iocb->ki_pos > ip->i_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); if (iocb->ki_pos > ip->i_size) ip->i_size = iocb->ki_pos; xfs_iunlock(ip, XFS_ILOCK_EXCL); } if (ret == -ENOSPC && DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip, DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); if (error) goto out_unlock_internal; goto start; } error = -ret; if (ret <= 0) goto out_unlock_internal; XFS_STATS_ADD(xs_write_bytes, ret); /* Handle various SYNC-type writes */ if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { loff_t end = pos + ret - 1; int error2; xfs_iunlock(ip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); error2 = filemap_write_and_wait_range(mapping, pos, end); if (!error) error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(ip, iolock); error2 = -xfs_file_fsync(file, file->f_path.dentry, (file->f_flags & __O_SYNC) ? 0 : 1); if (!error) error = error2; } out_unlock_internal: if (ip->i_new_size) { xfs_ilock(ip, XFS_ILOCK_EXCL); ip->i_new_size = 0; /* * If this was a direct or synchronous I/O that failed (such * as ENOSPC) then part of the I/O may have been written to * disk before the error occured. In this case the on-disk * file size may have been adjusted beyond the in-memory file * size and now needs to be truncated back. */ if (ip->i_d.di_size > ip->i_size) ip->i_d.di_size = ip->i_size; xfs_iunlock(ip, XFS_ILOCK_EXCL); } xfs_iunlock(ip, iolock); out_unlock_mutex: if (need_i_mutex) mutex_unlock(&inode->i_mutex); return -error; }
static ssize_t svfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; struct file *llfs_filp; struct inode *inode = filp->f_dentry->d_inode; struct svfs_inode *si = SVFS_I(inode); const char __user *buf; size_t count; ssize_t ret = 0, bw; int seg; svfs_entry(mdc, "f_mode 0x%x, pos %lu, check 0x%x\n", filp->f_mode, (unsigned long)pos, (si->state & SVFS_STATE_CONN)); if (si->state & SVFS_STATE_DA) { /* create it now */ ASSERT(!(si->state & SVFS_STATE_CONN)); ret = llfs_create(filp->f_dentry); if (ret) goto out; } if (!(si->state & SVFS_STATE_CONN)) { /* open it? */ ret = llfs_lookup(inode); if (ret) goto out; } BUG_ON(iocb->ki_pos != pos); llfs_filp = si->llfs_md.llfs_filp; ASSERT(llfs_filp->f_dentry); ASSERT(llfs_filp->f_dentry->d_inode); /* adjusting the offset */ if (filp->f_flags & O_APPEND) pos = i_size_read(inode); llfs_filp->f_pos = pos; if (!(llfs_filp->f_mode & FMODE_WRITE)) return -EBADF; if (!llfs_filp->f_op || (!llfs_filp->f_op->write && !llfs_filp->f_op->aio_write)) return -EINVAL; for (seg = 0; seg < nr_segs; seg++) { buf = iov[seg].iov_base; count = iov[seg].iov_len; svfs_debug(mdc, "buf %p, len %ld: \n", buf, count); if (llfs_filp->f_op->write) bw = llfs_filp->f_op->write(llfs_filp, buf, count, &llfs_filp->f_pos); else bw = do_sync_write(llfs_filp, buf, count, &llfs_filp->f_pos); if (bw < 0) { ret = bw; goto out; } ret += bw; } if (ret > 0) fsnotify_modify(llfs_filp->f_dentry); if (ret > 0 && ((filp->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; err = sync_page_range(llfs_filp->f_dentry->d_inode, llfs_filp->f_mapping, pos, ret); if (err < 0) ret = err; } iocb->ki_pos += ret; ASSERT(llfs_filp->f_pos == iocb->ki_pos); /* should update the file info */ file_update_time(filp); if (pos + ret > inode->i_size) { svfs_debug(mdc, "update with pos %lu count %ld, " "original i_size %lu\n", (unsigned long)pos, ret, (unsigned long)inode->i_size); i_size_write(inode, pos + ret); mark_inode_dirty(inode); } out: return ret; }
static ssize_t ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); size_t already_written = 0; size_t bufsize; int errno; void *bouncebuffer; off_t pos; ncp_dbg(1, "enter %pD2\n", file); errno = generic_write_checks(iocb, from); if (errno <= 0) return errno; errno = ncp_make_open(inode, O_WRONLY); if (errno) { ncp_dbg(1, "open failed, error=%d\n", errno); return errno; } bufsize = NCP_SERVER(inode)->buffer_size; errno = file_update_time(file); if (errno) goto outrel; bouncebuffer = vmalloc(bufsize); if (!bouncebuffer) { errno = -EIO; /* -ENOMEM */ goto outrel; } pos = iocb->ki_pos; while (iov_iter_count(from)) { int written_this_time; size_t to_write = min_t(size_t, bufsize - (pos % bufsize), iov_iter_count(from)); if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { errno = -EFAULT; break; } if (ncp_write_kernel(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, pos, to_write, bouncebuffer, &written_this_time) != 0) { errno = -EIO; break; } pos += written_this_time; already_written += written_this_time; if (written_this_time != to_write) break; } vfree(bouncebuffer); iocb->ki_pos = pos; if (pos > i_size_read(inode)) { mutex_lock(&inode->i_mutex); if (pos > i_size_read(inode)) i_size_write(inode, pos); mutex_unlock(&inode->i_mutex); } ncp_dbg(1, "exit %pD2\n", file); outrel: ncp_inode_close(inode); return already_written ? already_written : errno; }
static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); struct nilfs_transaction_info ti; int ret = 0; if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info))) return VM_FAULT_SIGBUS; /* -ENOSPC */ sb_start_pagefault(inode->i_sb); lock_page(page); if (page->mapping != inode->i_mapping || page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) { unlock_page(page); ret = -EFAULT; /* make the VM retry the fault */ goto out; } /* * check to see if the page is mapped already (no holes) */ if (PageMappedToDisk(page)) goto mapped; if (page_has_buffers(page)) { struct buffer_head *bh, *head; int fully_mapped = 1; bh = head = page_buffers(page); do { if (!buffer_mapped(bh)) { fully_mapped = 0; break; } } while (bh = bh->b_this_page, bh != head); if (fully_mapped) { SetPageMappedToDisk(page); goto mapped; } } unlock_page(page); /* * fill hole blocks */ ret = nilfs_transaction_begin(inode->i_sb, &ti, 1); /* never returns -ENOMEM, but may return -ENOSPC */ if (unlikely(ret)) goto out; file_update_time(vma->vm_file); ret = __block_page_mkwrite(vma, vmf, nilfs_get_block); if (ret) { nilfs_transaction_abort(inode->i_sb); goto out; } nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits)); nilfs_transaction_commit(inode->i_sb); mapped: wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); }